In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime
from util.parsers import *
from util.modeling import *

In [2]:
# Column schema of the review fact table: three foreign keys plus the rating measure.
cols = ["review_key", "vehicle_key", "date_key", "review_rating"]

In [3]:
# Empty frame that carries the fact-table column schema defined in `cols`.
review_facts = pd.DataFrame(columns=cols)

### Get dimensions

In [4]:
# Load the three dimension tables; in each file the first CSV column is used
# as the DataFrame index.
date_dim = pd.read_csv(
    "data/dateDim.csv",
    index_col=0,
    parse_dates=["date_full_description"],  # parse this column as datetimes
)
review_dim = pd.read_csv("data/reviewDim.csv", index_col=0)
car_dim = pd.read_csv("data/vehicleDim.csv", index_col=0)

In [5]:
# Coerce every stored review hash to a Python int, element by element.
review_hash_as_int = review_dim["review_hash"].apply(int)
review_dim["review_hash"] = review_hash_as_int

### Get data to process

In [6]:
# Directory prefix (relative to this notebook) that the input CSV name is
# appended to; the trailing slash is required because it is joined with `+`.
PATH = "../stage_1/data/processedData/"

In [7]:
# Read the review table; the first CSV column becomes the index.
reviews_csv = PATH + "review.csv"
reviews = pd.read_csv(reviews_csv, index_col=0)

In [8]:
# Parse the "Date" column to datetimes in one vectorized call instead of
# invoking np.datetime64 once per row through Series.apply.
# NOTE(review): assumes the column holds ISO-style date strings, as in the
# preview output (e.g. 2012-08-12) — confirm no other formats occur.
reviews["Date"] = pd.to_datetime(reviews["Date"])

In [9]:
# Preview the first rows of the loaded reviews as a sanity check.
reviews.head()

Unnamed: 0,Company,Model,Year,Reviewer,Date,Title,Rating,Review
0,acura,ilx,2013,mahowald,2012-08-12,Getting 43 Mpg on daily commute,5,"I love this car.\r\nGas mileage, suspension, a..."
1,acura,ilx,2013,VIP,2016-07-26,New to me ILX,5,I purchased my 2013 ILX from the dealer used w...
2,acura,ilx,2013,bybchucky,2013-09-14,Excellet Commuter Car,4,I recently purchased a 2013 ILX with the Tech ...
3,acura,ilx,2013,bayareamom,2015-01-26,Peppy Hybrid that Turns Heads,4,We bought our ILX used and have been incredibl...
4,acura,ilx,2013,Kerry Olson,2016-03-26,2013 Acura ILX Hybrid w/ Tech Package,5,In April of 2015 we were in need of another ca...


In [10]:
# (rows, columns) of the loaded reviews table.
reviews.shape

(262923, 8)

In [13]:
# Resolve date foreign keys for the reviews against the date dimension,
# using the reviews' "Date" column.
# NOTE(review): the helper comes from one of the star-imported util modules;
# its exact matching rule and return type are not visible here — confirm.
date_keys = get_date_foreign_key(reviews, date_dim, "Date")

100%|████████████████████████████████████████████████████████████████████████| 262923/262923 [02:03<00:00, 2125.16it/s]


In [98]:
# Resolve vehicle foreign keys for the reviews against the vehicle dimension,
# telling the helper which review columns hold make, model and year.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, yet
# `car_key` is consumed when the fact table is built — the value in use must
# have come from an earlier run. Re-run this cell to completion before
# trusting downstream results.
car_key = get_car_foreign_keys_np(
    reviews,
    car_dim,
    make_label="Company",
    model_label="Model",
    year_label="Year",
)

2632it [00:07, 340.39it/s]


KeyboardInterrupt: 

In [92]:
# Resolve review foreign keys against the review dimension; the three
# positional labels name the review-text, reviewer and title columns of
# `reviews`.
# NOTE(review): presumably the helper matches rows via `review_hash` in
# `review_dim` — verify against the util implementation.
review_keys = get_review_foreign_key_np(
    reviews, review_dim, "Review", "Reviewer", "Title"
)

262923it [22:59, 190.55it/s]


In [100]:
# Assemble the fact table from the rating column and the three key lookups.
# The rows are concatenated onto a *fresh* empty schema frame rather than onto
# the current value of `review_facts`: the previous self-referential concat
# appended another full copy of the rows every time this cell was re-run.
# A first run produces exactly the same frame as before.
review_fact_rows = get_review_fact_table(
    reviews, "Rating", car_key, date_keys, review_keys
)
review_facts = pd.concat((pd.DataFrame(columns=cols), review_fact_rows))

In [101]:
# Preview the first rows of the assembled fact table.
review_facts.head()

Unnamed: 0,review_key,vehicle_key,date_key,review_rating
0,0,360479,3060,5
1,1,360479,2663,5
2,2,360479,3692,4
3,3,360479,3693,4
4,4,360479,3354,5


In [102]:
# (rows, columns) of the fact table; compare the row count with `reviews.shape`
# shown earlier to spot duplicated or dropped rows.
review_facts.shape

(262923, 4)

In [103]:
# Persist the fact table; the DataFrame index is written as the first CSV
# column (pandas default).
review_facts.to_csv(path_or_buf="data/review_facts.csv")