In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from util.parsers import *

In [2]:
cols = [
    #key
    "date_posted_key",
    "date_sold_key",
    "location_key",
    "sales_price",
    "fuel_price"
]

In [3]:
sales_facts = pd.DataFrame(columns=cols)

### Get Data Dimensions and utils

In [4]:
PATH = "../stage_1/data/processedData/"

In [70]:
#load dimensions
car_dim = pd.read_csv("data/vehicleDim.csv", index_col=0)
date_dim = pd.read_csv("data/dateDim.csv", index_col=0, parse_dates=["date_full_description"])
loc_dim = pd.read_csv("data/locationDim.csv", index_col=0)
fuel_prices = pd.read_csv(PATH + "/fuel_prices.csv", index_col=0, parse_dates=["Start Date", "Stop Date"])

In [6]:
car_dim.head()

Unnamed: 0_level_0,vehicle_year,vehicle_age,vehicle_mileage,vehicle_make,vehicle_model,vehicle_fuel_type
vehicle_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2013.0,10.0,"[98500, 143400)",ford,f150,gas
1,2012.0,11.0,"[60500, 98500)",gmc,sierra 2500,gas
2,2016.0,7.0,"[26000, 60500)",chevrolet,c1500,gas
3,2019.0,4.0,"[26000, 60500)",toyota,tacoma,gas
4,2016.0,7.0,"[0, 26000)",chevrolet,colorado,gas


In [55]:
def get_car_foreign_keys(
    df,
    year_label,
    mileage_label, 
    make_label,
    model_label,
    fuel_label,
):
    keys=[]
    for row in tqdm(df.index):
        try:
            temp_df = car_dim[(car_dim.vehicle_year == df.loc[row, year_label])]
            temp_df = temp_df[(temp_df.vehicle_make == df.loc[row, make_label])]
            temp_df = temp_df[(temp_df.vehicle_model == df.loc[row, model_label])]
            temp_df = temp_df[(temp_df.vehicle_mileage == df.loc[row, mileage_label])]

            keys.append(
                temp_df[(temp_df.vehicle_fuel_type==df.loc[row, fuel_label])].index[0]
            )
        except:
            print(row)
            break
    return keys

In [8]:
def get_date_foreign_key(
    df,
    date_label
):
    keys=[]
    for row in tqdm(df.index):
        keys.append(
            date_dim[
                (date_dim.date_full_description==df.loc[row, date_label])
            ].index[0]
        )
    return keys

In [38]:
def get_loc_foreign_key(
    df,
    lat_label,
    long_label,
    state_label
):
    keys=[]
    for row in tqdm(df.index):
        item = df.loc[row]
        temp_df = loc_dim[(loc_dim.location_state == item[state_label])]
        keys.append(
            temp_df[
                #(loc_dim.location_state == df.loc[row, state_label]) &
                (temp_df.location_lat==item[lat_label]) &
                (temp_df.location_long==item[long_label])
            ].index[0]
        )
    return keys

In [10]:
def get_fuel_prices(
    df,
    date_label,
    fuel_label,
):
    prices = []
    for row in tqdm(df.index):
        price = fuel_prices[
            (fuel_prices["Start Date"]<=df.loc[row, date_label]) &
            (fuel_prices["Stop Date"]>df.loc[row, date_label])
        ]
        
        if df.loc[row, fuel_label]=="gas":
            prices.append(
                price["Gasoline Price"]
            )
        elif df.loc[row, fuel_label]=="diesel":
            prices.append(
                price["Diesel Price"]
            )
        else:
            prices.append(0)
    return prices

In [88]:
def get_fact_table(
    df,
    price_label, 
    fuel_prices, 
    car_keys, 
    date_sold_key, 
    date_post_keys, 
    location_keys
):
    facts = pd.DataFrame(columns =["vehicle_key", "date_sold_key", "location_key", "fuel_price", "sales_price", "date_posted_key"])
    
    facts["vehicle_key"] = car_keys
    facts["location_key"] = location_keys
    facts["date_sold_key"] = date_sold_key
    facts["sales_price"] = df[price_label].tolist()
    facts["fuel_price"] = fuel_prices
    facts["date_post_key"] = date_post_keys
    
    return facts

### Generate Sales facts

In [74]:
def process_table(
    df,
    mileage_label,
    fuel_label,
    post_date_label,
    sale_date_label,
    car_dim_labels,
    loc_dim_labels,
    date_dim_labels,
    post_dim_labels=None
):
    post_filters=None
    df[mileage_label] = df[mileage_label].apply(mileage_to_miles_band)
    df[fuel_label] = df[fuel_label].apply(lambda f: fuel_map[f])
    
    df_ve[sale_date_label] = df_ve[sale_date_label].apply(lambda x: x[:10])
    df_ve[sale_date_label] = df_ve[sale_date_label].apply(np.datetime64)
    
    if post_date_label!=None:
        df_ve[post_date_label] = df_ve[post_date_label].apply(lambda x: x[:10])
        df_ve[post_date_label] = df_ve[post_date_label].apply(np.datetime64)
        post_filters = df_ve[post_dim_labels]
        
    car_filters = df_ve[car_dim_labels]
    loc_filters = df_ve[loc_dim_labels]
    date_filters = df_ve[date_dim_labels]
    
    return car_filters, loc_filters, date_filters, post_filters

- Vehicles

In [12]:
df_ve = pd.read_csv(PATH + "vehicles.csv", index_col=0)
df_ve.head(1)

Unnamed: 0,id,region,region_url,price,year,manufacturer,model,cylinders,fuel,odometer,title_status,transmission,VIN,drive,type,description,state,lat,long,posting_date
31,7316356412,auburn,https://auburn.craigslist.org,15000,2013.0,ford,f150,6 cylinders,gas,128000.0,clean,automatic,,rwd,truck,2013 F-150 XLT V6 4 Door. Good condition. Leve...,al,32.592,-85.5189,2021-05-03 14:02:03-05:00


In [13]:
df_ve.odometer = df_ve.odometer.apply(mileage_to_miles_band)

In [14]:
df_ve.fuel = df_ve.fuel.apply(lambda f: fuel_map[f])

In [15]:
df_ve.posting_date = df_ve.posting_date.apply(lambda x: x[:10])
df_ve.posting_date = df_ve.posting_date.apply(np.datetime64)

In [16]:
car_filters = df_ve[["year", "manufacturer", "model", "odometer", "fuel"]]
loc_filters = df_ve[["lat", "long", "state"]]
date_filters = df_ve[["posting_date", "fuel"]]

In [17]:
car_filters.iloc[0]

year                     2013.0
manufacturer               ford
model                      f150
odometer        [98500, 143400)
fuel                        gas
Name: 31, dtype: object

In [18]:
vehicle_key = get_car_foreign_keys(
    car_filters,
    car_dim,
    "year",
    "odometer",
    "manufacturer",
    "model",
    "fuel"
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 157431/157431 [06:15<00:00, 419.39it/s]


In [23]:
date_key = get_date_foreign_key(
    date_filters,
    date_dim,
    "posting_date"
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 157431/157431 [01:27<00:00, 1799.17it/s]


In [39]:
loc_key = get_loc_foreign_key(
    loc_filters,
    location_dim,
    "lat",
    "long",
    "state",
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 157431/157431 [11:25<00:00, 229.56it/s]


In [40]:
prices_of_fuel = get_fuel_prices(
    date_filters,
    "posting_date",
    "fuel"
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 157431/157431 [02:52<00:00, 910.52it/s]


In [42]:
sales_facts = pd.concat(
    (sales_facts, get_fact_table(df_ve,"price", prices_of_fuel, vehicle_key, date_key, np.nan,  loc_key))
).reset_index(drop=True)

In [43]:
sales_facts.head()

Unnamed: 0,date_posted_key,date_sold_key,location_key,sales_price,fuel_price,vehicle_key,date_post_key
0,,0,0,15000,"1602 2.981 Name: Gasoline Price, dtype: flo...",0.0,
1,,0,1,27990,"1602 2.981 Name: Gasoline Price, dtype: flo...",1.0,
2,,0,1,34590,"1602 2.981 Name: Gasoline Price, dtype: flo...",2.0,
3,,0,2,35000,"1602 2.981 Name: Gasoline Price, dtype: flo...",3.0,
4,,0,1,29990,"1602 2.981 Name: Gasoline Price, dtype: flo...",4.0,


- used cars

In [77]:
df_ve = pd.read_csv(PATH + "used_car_sales.csv")
df_ve.head(1)

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Engine,BodyType,NumCylinders,DriveType,datesold,lat,long,state,fuel
0,137178,7500,2020,78611,84430,ford,mustang,1988.0,5.0l gas v8,sedan,8,RWD,2020-03-19 00:00:00,30.767327,-98.30109,tx,gas


In [78]:
(car_filters,
 loc_filters,
 date_filters,
 _) = process_table(
    df_ve,
    "Mileage",
    "fuel",
    None,
    "datesold",
    ["Year", "Make", "Model", "Mileage", "fuel"],
    ["lat", "long", "state"],
    ["datesold", "fuel"]
)

In [80]:
vehicle_key = get_car_foreign_keys(
    car_filters,
    "Year",
    "Mileage",
    "Make",
    "Model",
    "fuel"
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 72590/72590 [03:00<00:00, 403.18it/s]


In [81]:
date_key = get_date_foreign_key(
    date_filters,
    "datesold"
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 72590/72590 [00:42<00:00, 1697.93it/s]


In [82]:
loc_key = get_loc_foreign_key(
    loc_filters,
    "lat",
    "long",
    "state",
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 72590/72590 [05:06<00:00, 236.55it/s]


In [83]:
prices_of_fuel = get_fuel_prices(
    date_filters,
    "datesold",
    "fuel"
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 72590/72590 [01:05<00:00, 1115.28it/s]


In [84]:
sales_facts = pd.concat(
    (sales_facts, get_fact_table(df_ve,"pricesold", prices_of_fuel, vehicle_key, date_key, np.nan,  loc_key))
).reset_index(drop=True)

In [91]:
sales_facts.isna().sum()

date_posted_key    230021
date_sold_key           0
location_key            0
sales_price             0
fuel_price              0
vehicle_key             0
dtype: int64

- tn_mvr

In [97]:
df_ve = pd.read_csv(PATH + "tn_mvr.csv", index_col=0)
df_ve.head(1)

Unnamed: 0,vin,price,odometer_type,mileage,county,zip,model_year,make,model,vehicle_type,new_used,title_issue_date,purchase_date,lat,long,state,fuel
1013,137ZA8434TE173571,31000.0,1,0.0,Tipton,38053,1996,am-general,hummer,AUTO,U,2019-01-17,2019-01-04,35.347965,-89.90668,tn,gas


In [98]:
(car_filters,
 loc_filters,
 date_filters,
 post_filters) = process_table(
    df_ve,
    "mileage",
    "fuel",
    "title_issue_date",
    "purchase_date",
    ["model_year", "make", "model", "mileage", "fuel"],
    ["lat", "long", "state"],
    ["purchase_date", "fuel"],
    ["title_issue_date"]
)

In [99]:
vehicle_key = get_car_foreign_keys(
    car_filters,
    "model_year",
    "mileage",
    "make",
    "model",
    "fuel"
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535786/535786 [22:12<00:00, 402.04it/s]


In [101]:
date_key = get_date_foreign_key(
    date_filters,
    "purchase_date"
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535786/535786 [06:07<00:00, 1456.30it/s]


In [102]:
loc_key = get_loc_foreign_key(
    loc_filters,
    "lat",
    "long",
    "state",
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535786/535786 [40:18<00:00, 221.50it/s]


In [104]:
prices_of_fuel = get_fuel_prices(
    date_filters,
    "purchase_date",
    "fuel"
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 535786/535786 [07:22<00:00, 1210.88it/s]


In [107]:
post_key = get_date_foreign_key(
    post_filters,
    "title_issue_date"
)

  2%|██▊                                                                                                                         | 12146/535786 [00:06<04:21, 2005.61it/s]


IndexError: index 0 is out of bounds for axis 0 with size 0

In [108]:
sales_facts = pd.concat((sales_facts, get_fact_table(df_ve, "price", prices_of_fuel, vehicle_key, date_key, np.nan, loc_key)))