In [18]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime
from util.parsers import *
from util.modeling import *

In [19]:
# Columns of the review fact table that this notebook assembles.
cols = ["review_key", "vehicle_key", "date_key", "review_rating"]

In [20]:
# Empty frame carrying only the fact-table column names; no rows are added in the cells shown here.
review_facts = pd.DataFrame(columns=cols)

### Get dimensions

In [55]:
# Load the three dimension tables; the first CSV column of each file becomes the frame index.
date_dim = pd.read_csv(
    "data/dateDim.csv",
    index_col=0,
    parse_dates=["date_full_description"],
)
review_dim = pd.read_csv(
    "data/reviewDim.csv",
    index_col=0,
)
car_dim = pd.read_csv(
    "data/vehicleDim.csv",
    index_col=0,
)

In [61]:
# Cast the stored hashes to Python ints so they compare equal to int(md5(...).hexdigest(), 16).
# NOTE(review): presumably the CSV reader yields these ~39-digit values as strings — confirm.
review_dim["review_hash"] = review_dim.review_hash.apply(int)

### Get data to process

In [5]:
# Directory (relative to this notebook) that holds the stage-1 processed CSV files.
PATH = "../stage_1/data/processedData/"

In [6]:
# Reviews to turn into fact rows; the first CSV column is used as the index.
reviews = pd.read_csv(PATH + "review.csv", index_col=0)

In [7]:
# Parse the review dates into a datetime64 column with one vectorized call
# instead of a per-row np.datetime64 apply; missing values become NaT.
reviews["Date"] = pd.to_datetime(reviews["Date"])

In [8]:
# Preview the first rows to check the loaded columns and the parsed Date values.
reviews.head()

Unnamed: 0,Company,Model,Year,Reviewer,Date,Title,Rating,Review
0,acura,ilx,2013,mahowald,2012-08-12,Getting 43 Mpg on daily commute,5,"I love this car.\r\nGas mileage, suspension, a..."
1,acura,ilx,2013,VIP,2016-07-26,New to me ILX,5,I purchased my 2013 ILX from the dealer used w...
2,acura,ilx,2013,bybchucky,2013-09-14,Excellet Commuter Car,4,I recently purchased a 2013 ILX with the Tech ...
3,acura,ilx,2013,bayareamom,2015-01-26,Peppy Hybrid that Turns Heads,4,We bought our ILX used and have been incredibl...
4,acura,ilx,2013,Kerry Olson,2016-03-26,2013 Acura ILX Hybrid w/ Tech Package,5,In April of 2015 we were in need of another ca...


In [41]:
# (rows, columns) of the loaded reviews — this is the number of keys each lookup below has to produce.
reviews.shape

(262923, 8)

In [42]:
# Resolve the "Date" column of every review against the date dimension.
# NOTE(review): the helper is not defined in this notebook (presumably from util.*) — confirm what it returns.
# The saved run of this cell was interrupted, so date_keys may be unset in that session.
date_keys = get_date_foreign_key(reviews, date_dim, "Date")

  2%|██▌                                                                                                                          | 5512/262923 [00:02<02:15, 1900.03it/s]


KeyboardInterrupt: 

In [47]:
# Scratch check: an MD5 hex digest read as a base-16 integer is a ~39-digit Python int.
# NOTE(review): md5 is not imported in any visible cell — presumably it arrives via a star import; confirm.
int(md5("asd".encode()).hexdigest(), 16)

159618536159443155471728566704729305870

In [40]:
# Resolve each review's (make, model, year) against the vehicle dimension.
# NOTE(review): the helper is not defined in this notebook (presumably from util.*) — confirm what it returns.
# The saved run took about 14 minutes for the full review set.
car_key = get_car_foreign_keys(
    reviews,
    car_dim,
    make_label="Company",
    model_label="Model",
    year_label="Year",
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 262923/262923 [14:12<00:00, 308.52it/s]


In [72]:
# Resolve each review against the review dimension; the three trailing arguments are the
# description, username and title column labels, in that order.
review_keys = get_review_foreign_key(reviews, review_dim, "Review", "Reviewer", "Title")

334it [00:05, 66.32it/s]


KeyboardInterrupt: 

In [59]:
# Sanity check, part 1: the hash stored on the first review-dimension record.
int(review_dim.iloc[0].review_hash)

85416213507445856046016537860046552941

In [54]:
# Sanity check, part 2: MD5 of the first review's text as a base-16 integer.
# In the saved run it equals the hash stored on the first review-dimension record.
int( md5(reviews.iloc[0].Review.encode() ).hexdigest(), 16)

85416213507445856046016537860046552941

In [52]:
# Show the vehicle-dimension record(s) for a 2013 acura ilx whose mileage is "Unknown".
car_dim[
    (car_dim["vehicle_year"] == 2013)
    & (car_dim["vehicle_make"] == "acura")
    & (car_dim["vehicle_model"] == "ilx")
    & (car_dim["vehicle_mileage"] == "Unknown")
]

Unnamed: 0_level_0,vehicle_year,vehicle_age,vehicle_number_cylinders,vehicle_mileage,vehicle_make,vehicle_model,vehicle_engine,vehicle_fuel_type,vehicle_transmission
vehicle_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
360479,2013.0,10.0,Unknown,Unknown,acura,ilx,Unknown,Unknown,Unknown


In [33]:
# Confirm that vehicle 360479 carries the "Unknown" mileage placeholder.
car_dim.loc[360479, "vehicle_mileage"] == "Unknown"

True

In [71]:
def get_review_foreign_key(
    df,
    review_dim,
    description_label=None,
    username_label=None,
    title_label=None
):
    """Return the ``review_dim`` index label that matches each row of ``df``.

    A row matches a dimension record when title, username and description are
    all equal. A field whose label is ``None`` is matched against the literal
    ``"Unknown"``. When ``description_label`` is given, the MD5 digest of the
    description (read as a base-16 integer) pre-selects candidate records via
    ``review_dim["review_hash"]``; without it, every record is a candidate.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to resolve. Description values must be ``str`` because they are
        ``.encode()``-d for hashing. Any index is accepted.
    review_dim : pandas.DataFrame
        Needs the columns ``review_hash``, ``review_title``,
        ``review_username`` and ``review_description``.
    description_label, username_label, title_label : str or None
        Column names in ``df`` for the corresponding review fields.

    Returns
    -------
    list
        One ``review_dim`` index label per row of ``df``, in row order. If
        several records match a row, the first one in ``review_dim`` wins.

    Raises
    ------
    IndexError
        If a row of ``df`` has no matching record in ``review_dim``.
    """
    # Local import so the function does not depend on a star import providing md5.
    from hashlib import md5

    n_rows = len(df)

    def field_values(label):
        # Positional list of the field, or the "Unknown" placeholder for every row.
        if label is None:
            return ["Unknown"] * n_rows
        return df[label].tolist()

    descriptions = field_values(description_label)
    users = field_values(username_label)
    titles = field_values(title_label)

    use_hash = description_label is not None
    if use_hash:
        hashes = [int(md5(text.encode()).hexdigest(), 16) for text in descriptions]
        # Index the dimension by hash once, so each row inspects only its few
        # candidates instead of scanning the whole dimension.
        positions_by_hash = {}
        for dim_pos, dim_hash in enumerate(review_dim["review_hash"].tolist()):
            positions_by_hash.setdefault(dim_hash, []).append(dim_pos)

    keys = []
    # Iterating a range lets the progress bar show a total and an ETA.
    for row_pos in tqdm(range(n_rows)):
        if use_hash:
            candidates = review_dim.iloc[positions_by_hash.get(hashes[row_pos], [])]
        else:
            candidates = review_dim

        matches = candidates[
            (candidates["review_title"] == titles[row_pos])
            & (candidates["review_username"] == users[row_pos])
            & (candidates["review_description"] == descriptions[row_pos])
        ]
        if matches.empty:
            raise IndexError(
                "no review_dim record matches row {} of df (title={!r}, username={!r})".format(
                    row_pos, titles[row_pos], users[row_pos]
                )
            )
        keys.append(matches.index[0])
    return keys