In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from util.parsers import *
from util.modeling import *

In [2]:
cols = [
    #key
    "date_posted_key",
    "date_sold_key",
    "location_key",
    "sales_price",
    "fuel_price"
]

In [3]:
sales_facts = pd.DataFrame(columns=cols)

### Get Data Dimensions and utils

In [4]:
PATH = "../stage_1/data/processedData/"

In [5]:
#load dimensions
car_dim = pd.read_csv("data/vehicleDim.csv", index_col=0)
date_dim = pd.read_csv("data/dateDim.csv", index_col=0, parse_dates=["date_full_description"])
loc_dim = pd.read_csv("data/locationDim.csv", index_col=0)
fuel_prices = pd.read_csv(PATH + "/fuel_prices.csv", index_col=0, parse_dates=["Start Date", "Stop Date"])

### Generate Sales facts

In [6]:
def process_table(
    df,
    mileage_label,
    fuel_label,
    post_date_label,
    sale_date_label,
    car_dim_labels,
    loc_dim_labels,
    date_dim_labels,
    post_dim_labels=None
):
    post_filters=None
    df[mileage_label] = df[mileage_label].apply(mileage_to_miles_band)
    df[fuel_label] = df[fuel_label].apply(lambda f: fuel_map[f])
    
    df_ve[sale_date_label] = df_ve[sale_date_label].apply(lambda x: x[:10])
    df_ve[sale_date_label] = df_ve[sale_date_label].apply(np.datetime64)
    
    if post_date_label!=None:
        df_ve[post_date_label] = df_ve[post_date_label].apply(lambda x: x[:10])
        df_ve[post_date_label] = df_ve[post_date_label].apply(np.datetime64)
        post_filters = df_ve[post_dim_labels]
        
    car_filters = df_ve[car_dim_labels]
    loc_filters = df_ve[loc_dim_labels]
    date_filters = df_ve[date_dim_labels]
    
    return car_filters, loc_filters, date_filters, post_filters

- Vehicles

In [7]:
df_ve = pd.read_csv(PATH + "vehicles.csv", index_col=0)
df_ve.head(1)

Unnamed: 0,id,region,region_url,price,year,manufacturer,model,cylinders,fuel,odometer,title_status,transmission,VIN,drive,type,description,state,lat,long,posting_date
31,7316356412,auburn,https://auburn.craigslist.org,15000,2013.0,ford,f150,6 cylinders,gas,128000.0,clean,automatic,,rwd,truck,2013 F-150 XLT V6 4 Door. Good condition. Leve...,al,32.592,-85.5189,2021-05-03 14:02:03-05:00


In [8]:
df_ve.odometer = df_ve.odometer.apply(mileage_to_miles_band)

In [9]:
df_ve.fuel = df_ve.fuel.apply(lambda f: fuel_map[f])

In [10]:
df_ve.posting_date = df_ve.posting_date.apply(lambda x: x[:10])
df_ve.posting_date = df_ve.posting_date.apply(np.datetime64)

In [17]:
car_filters = df_ve[["year", "manufacturer", "model", "odometer", "fuel", "transmission", "cylinders"]]
loc_filters = df_ve[["lat", "long", "state"]]
date_filters = df_ve[["posting_date", "fuel"]]

In [18]:
car_filters.iloc[0]

year                     2013.0
manufacturer               ford
model                      f150
odometer        [98500, 143400)
fuel                        gas
transmission          automatic
cylinders           6 cylinders
Name: 31, dtype: object

In [31]:
vehicle_key = get_car_foreign_keys(
    car_filters,
    car_dim,
    year_label="year",
    mileage_label="odometer",
    make_label="manufacturer",
    model_label="model",
    fuel_label="fuel",
    cylinders_label="cylinders",
    transmission_label="transmission"
)

  1%|▍                                                                          | 1023/157431 [00:03<09:55, 262.84it/s]

3016





In [30]:
def get_car_foreign_keys(
    df,
    car_dim,
    year_label=None,
    mileage_label=None, 
    make_label=None,
    model_label=None,
    fuel_label=None,
    cylinders_label=None,
    engine_label=None,
    transmission_label=None
):
    keys=[]
    for row in tqdm(df.index):
        try:
            year = df.loc[row, year_label] if year_label!=None else EMPTY_VALUE
            mileage = df.loc[row, mileage_label] if mileage_label!=None else EMPTY_VALUE
            make = df.loc[row, make_label] if make_label!=None else EMPTY_VALUE
            model = df.loc[row, model_label] if model_label!=None else EMPTY_VALUE
            fuel = df.loc[row, fuel_label] if fuel_label!=None else EMPTY_VALUE
            cyl = df.loc[row, cylinders_label] if cylinders_label!=None else EMPTY_VALUE
            eng = df.loc[row, engine_label] if engine_label!=None else EMPTY_VALUE
            trans = df.loc[row, transmission_label] if transmission_label!=None else EMPTY_VALUE
            
            temp_df = car_dim[(car_dim.vehicle_year == year)]
            temp_df = temp_df[(temp_df.vehicle_make == make)]
            temp_df = temp_df[(temp_df.vehicle_model == model)]
            temp_df = temp_df[(temp_df.vehicle_mileage == mileage)]
            temp_df = temp_df[temp_df.vehicle_transmission==trans]
            temp_df = temp_df[temp_df.vehicle_engine==eng]
            temp_df=temp_df[temp_df.vehicle_number_cylinders==cyl]

            keys.append(
                temp_df[(temp_df.vehicle_fuel_type==fuel)].index[0]
            )
        except:
            print(row)
            break
    return keys

In [16]:
date_key = get_date_foreign_key(
    date_filters,
    date_dim,
    "posting_date"
)

100%|████████████████████████████████████████████████████████████████████████| 157431/157431 [01:16<00:00, 2047.44it/s]


In [19]:
df_ve.iloc[0]["lat"]

32.592

In [21]:
loc_key = get_loc_foreign_key(
    loc_filters,
    loc_dim,
    "lat",
    "long",
    "state",
)

100%|█████████████████████████████████████████████████████████████████████████| 157431/157431 [09:17<00:00, 282.23it/s]


In [65]:
prices_of_fuel = get_fuel_prices(
    date_filters,
    fuel_prices,
    "posting_date",
    "fuel"
)

100%|████████████████████████████████████████████████████████████████████████| 157431/157431 [02:10<00:00, 1210.96it/s]


In [67]:
sales_facts = pd.concat(
    (sales_facts, get_fact_table(
        df=df_ve,
        price_label="price", 
        price_of_fuel=prices_of_fuel, 
        car_keys=vehicle_key, 
        date_sold_keys=date_key, 
        date_post_keys=np.nan,  
        location_keys=loc_key))
).reset_index(drop=True)

In [68]:
sales_facts.head()

Unnamed: 0,date_posted_key,date_sold_key,location_key,sales_price,fuel_price,vehicle_key
0,,0,0,15000,2.981,0.0
1,,0,1,27990,2.981,1.0
2,,0,1,34590,2.981,2.0
3,,0,2,35000,2.981,3.0
4,,0,1,29990,2.981,4.0


- used cars

In [69]:
df_ve = pd.read_csv(PATH + "used_car_sales.csv")
df_ve.head(1)

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Engine,BodyType,NumCylinders,DriveType,datesold,lat,long,state,fuel
0,137178,7500,2020,78611,84430,ford,mustang,1988.0,5.0l gas v8,sedan,8,RWD,2020-03-19 00:00:00,30.767327,-98.30109,tx,gas


In [70]:
(car_filters,
 loc_filters,
 date_filters,
 _) = process_table(
    df_ve,
    "Mileage",
    "fuel",
    None,
    "datesold",
    ["Year", "Make", "Model", "Mileage", "fuel"],
    ["lat", "long", "state"],
    ["datesold", "fuel"]
)

In [72]:
vehicle_key = get_car_foreign_keys(
    car_filters,
    car_dim,
    "Year",
    "Mileage",
    "Make",
    "Model",
    "fuel"
)

100%|███████████████████████████████████████████████████████████████████████████| 72590/72590 [03:14<00:00, 373.91it/s]


In [73]:
date_key = get_date_foreign_key(
    date_filters,
    date_dim,
    "datesold"
)

100%|██████████████████████████████████████████████████████████████████████████| 72590/72590 [00:36<00:00, 2001.83it/s]


In [74]:
loc_key = get_loc_foreign_key(
    loc_filters,
    loc_dim,
    "lat",
    "long",
    "state",
)

100%|███████████████████████████████████████████████████████████████████████████| 72590/72590 [04:19<00:00, 279.41it/s]


In [75]:
prices_of_fuel = get_fuel_prices(
    date_filters,
    fuel_prices,
    "datesold",
    "fuel"
)

100%|██████████████████████████████████████████████████████████████████████████| 72590/72590 [00:59<00:00, 1215.78it/s]


In [76]:
sales_facts = pd.concat(
    (sales_facts, get_fact_table(df_ve,"pricesold", prices_of_fuel, vehicle_key, date_key, np.nan,  loc_key))
).reset_index(drop=True)

In [78]:
sales_facts.head()

Unnamed: 0,date_posted_key,date_sold_key,location_key,sales_price,fuel_price,vehicle_key
0,,0,0,15000,2.981,0.0
1,,0,1,27990,2.981,1.0
2,,0,1,34590,2.981,2.0
3,,0,2,35000,2.981,3.0
4,,0,1,29990,2.981,4.0


- tn_mvr

In [79]:
df_ve = pd.read_csv(PATH + "tn_mvr.csv", index_col=0)
df_ve.head(1)

Unnamed: 0,vin,price,odometer_type,mileage,county,zip,model_year,make,model,vehicle_type,new_used,title_issue_date,purchase_date,lat,long,state,fuel
1013,137ZA8434TE173571,31000.0,1,0.0,Tipton,38053,1996,am-general,hummer,AUTO,U,2019-01-17,2019-01-04,35.347965,-89.90668,tn,gas


In [80]:
(car_filters,
 loc_filters,
 date_filters,
 post_filters) = process_table(
    df_ve,
    "mileage",
    "fuel",
    "title_issue_date",
    "purchase_date",
    ["model_year", "make", "model", "mileage", "fuel"],
    ["lat", "long", "state"],
    ["purchase_date", "fuel"],
    ["title_issue_date"]
)

In [81]:
vehicle_key = get_car_foreign_keys(
    car_filters,
    car_dim,
    "model_year",
    "mileage",
    "make",
    "model",
    "fuel"
)

100%|█████████████████████████████████████████████████████████████████████████| 535786/535786 [24:52<00:00, 358.93it/s]


In [82]:
date_key = get_date_foreign_key(
    date_filters,
    date_dim,
    "purchase_date"
)

100%|████████████████████████████████████████████████████████████████████████| 535786/535786 [04:25<00:00, 2019.69it/s]


In [101]:
loc_key = get_loc_foreign_key_np(
    loc_filters,
    loc_dim,
    "lat",
    "long",
    "state",
)

100%|█████████████████████████████████████████████████████████████████████████| 535786/535786 [09:32<00:00, 935.22it/s]


In [90]:
prices_of_fuel = get_fuel_prices(
    date_filters,
    fuel_prices,
    "purchase_date",
    "fuel"
)

100%|████████████████████████████████████████████████████████████████████████| 535786/535786 [07:29<00:00, 1191.80it/s]


In [91]:
post_key = get_date_foreign_key(
    post_filters,
    date_dim,
    "title_issue_date"
)

100%|████████████████████████████████████████████████████████████████████████| 535786/535786 [04:21<00:00, 2046.74it/s]


In [114]:
prices_of_fuel[:10]

[2.358, 2.639, 2.464, 1.951, 2.954, 4.879, 2.702, 3.495, 2.984, 2.86]

In [115]:
sales_facts = pd.concat(
    (
        sales_facts, 
         get_fact_table(df_ve, "price", prices_of_fuel, vehicle_key, date_key, np.nan, loc_key)
    )
)

In [116]:
sales_facts.shape

(765807, 6)

In [117]:
sales_facts.to_csv("data/sales_facts.csv")