In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import time

from datetime import datetime

In [2]:
cols = [
    #"date_key",
    "full_description",
    "year",
    "month",
    "written_month",
    "day",
    "written_day_of_week",
    "season",
    "day_of_week",
    "week_number_in_year",
    "weekend_indicator",
    "month_end_indicator"
]
cols = list(map(lambda x: "date_" + x, cols))

In [3]:
date_dim = pd.DataFrame(columns=cols)

In [4]:
date_dim.shape

(0, 11)

In [5]:
PATH = "../stage_1/data/processedData/"

In [6]:
def parse_data(df, date_label, chars=10, fmt="%Y-%m-%d"):
    df = df[[date_label]]
    print("Original shape=", df.shape)
    
    df.columns = ["date_full_description"]
    df["date_full_description"] = df.date_full_description.apply(
        lambda x: x[:chars]
    )
    df = df.drop_duplicates()
    
    print("Distinct shape=", df.shape)
    df["date_full_description"] = df.date_full_description.apply(
        lambda d: datetime.strptime(d, fmt)
    )
    return df
    
    

In [7]:
def merge_dfs(df_a, df_b):
    return pd.concat((df_a, df_b)).reset_index(drop=True).drop_duplicates()

- Vehicles

In [8]:
df_vehicles = pd.read_csv(f"{PATH}/vehicles.csv", index_col=0)
df_vehicles.head(1)

Unnamed: 0,id,region,region_url,price,year,manufacturer,model,cylinders,fuel,odometer,title_status,transmission,VIN,drive,type,description,state,lat,long,posting_date
31,7316356412,auburn,https://auburn.craigslist.org,15000,2013.0,ford,f150,6 cylinders,gas,128000.0,clean,automatic,,rwd,truck,2013 F-150 XLT V6 4 Door. Good condition. Leve...,al,32.592,-85.5189,2021-05-03 14:02:03-05:00


In [9]:
df_vehicles = parse_data(df_vehicles, "posting_date")

Original shape= (157431, 1)
Distinct shape= (31, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["date_full_description"] = df.date_full_description.apply(


In [10]:
date_dim = merge_dfs(date_dim, df_vehicles)

In [11]:
date_dim.shape

(31, 11)

- Used cars

In [12]:
df_used_cars = pd.read_csv(f"{PATH}/used_car_sales.csv")
df_used_cars.head(1)

Unnamed: 0,ID,pricesold,yearsold,zipcode,Mileage,Make,Model,Year,Engine,BodyType,NumCylinders,DriveType,datesold,lat,long,state,fuel
0,137178,7500,2020,78611,84430,ford,mustang,1988.0,5.0l gas v8,sedan,8,RWD,2020-03-19 00:00:00,30.767327,-98.30109,tx,gas


In [13]:
df_used_cars = parse_data(df_used_cars, "datesold")

Original shape= (72590, 1)
Distinct shape= (1040, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["date_full_description"] = df.date_full_description.apply(


In [14]:
date_dim = merge_dfs(date_dim, df_used_cars)

In [15]:
date_dim.shape

(1071, 11)

- tn_mvr

In [16]:
df_tn = pd.read_csv(f"{PATH}/tn_mvr.csv", index_col=0)
df_tn.head(1)

Unnamed: 0,vin,price,odometer_type,mileage,county,zip,model_year,make,model,vehicle_type,new_used,title_issue_date,purchase_date,lat,long,state,fuel
1013,137ZA8434TE173571,31000.0,1,0.0,Tipton,38053,1996,am-general,hummer,AUTO,U,2019-01-17,2019-01-04,35.347965,-89.90668,tn,gas


In [17]:
df_t1 = parse_data(df_tn, "purchase_date")

Original shape= (535786, 1)
Distinct shape= (3677, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["date_full_description"] = df.date_full_description.apply(


In [18]:
df_t2 = parse_data(df_tn, "title_issue_date")

Original shape= (535786, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["date_full_description"] = df.date_full_description.apply(


Distinct shape= (1722, 1)


In [19]:
date_dim = merge_dfs(date_dim, df_t1).drop_duplicates()

In [20]:
date_dim = merge_dfs(date_dim, df_t2).drop_duplicates()

In [21]:
date_dim.shape

(3690, 11)

- reviews

In [22]:
df_rev = pd.read_csv(f"{PATH}/review.csv", index_col=0)
df_rev.head(1)

Unnamed: 0,Company,Model,Year,Reviewer,Date,Title,Rating,Review
0,acura,ilx,2013,mahowald,2012-08-12,Getting 43 Mpg on daily commute,5,"I love this car.\r\nGas mileage, suspension, a..."


In [23]:
df_rev = parse_data(df_rev, "Date")

Original shape= (262923, 1)
Distinct shape= (6911, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["date_full_description"] = df.date_full_description.apply(


In [24]:
df_rev.dropna(inplace=True)

In [25]:
date_dim = merge_dfs(date_dim, df_rev)

In [26]:
date_dim.shape

(7695, 11)

In [27]:
date_dim = date_dim.drop_duplicates()

In [28]:
date_dim.shape

(7695, 11)

- Infere other fields

In [29]:
date_dim.head()

Unnamed: 0,date_full_description,date_year,date_month,date_written_month,date_day,date_written_day_of_week,date_season,date_day_of_week,date_week_number_in_year,date_weekend_indicator,date_month_end_indicator
0,2021-05-03,,,,,,,,,,
1,2021-05-02,,,,,,,,,,
2,2021-05-01,,,,,,,,,,
3,2021-04-29,,,,,,,,,,
4,2021-04-27,,,,,,,,,,


In [30]:
temp = date_dim.loc[0, "date_full_description"]

In [31]:
date_dim.date_year=date_dim.date_full_description.apply(lambda d: d.year)
date_dim.date_month=date_dim.date_full_description.apply(lambda d: d.month)
date_dim.date_day=date_dim.date_full_description.apply(lambda d: d.day)

In [32]:
date_dim.date_weekend_indicator = date_dim.date_full_description.apply(lambda d: d.isoweekday()!=1)

In [33]:
date_dim.date_written_month = date_dim.date_full_description.apply(lambda d: d.month_name().lower())

In [34]:
date_dim.date_week_number_in_year = date_dim.date_full_description.apply(lambda d: d.weekofyear)

In [35]:
date_dim.date_month_end_indicator = date_dim.date_full_description.apply(lambda d: d.is_month_end)

In [36]:
date_dim.date_day_of_week = date_dim.date_full_description.apply(lambda d: d.weekday())

In [37]:
date_dim.date_written_day_of_week = date_dim.date_full_description.apply(lambda d: d.day_name().lower())

In [38]:
def get_season(date):
    day = date.day_of_year
    spring = range(80, 172)
    summer = range(172, 264)
    fall = range(264, 355)

    if day in spring:
        return 'spring'
    elif day in summer:
        return 'summer'
    elif day in fall:
        return 'fall'
    else:
        return 'winter'

In [39]:
date_dim.date_season = date_dim.date_full_description.apply(get_season)

In [40]:
date_dim.head()

Unnamed: 0,date_full_description,date_year,date_month,date_written_month,date_day,date_written_day_of_week,date_season,date_day_of_week,date_week_number_in_year,date_weekend_indicator,date_month_end_indicator
0,2021-05-03,2021,5,may,3,monday,spring,0,18,False,False
1,2021-05-02,2021,5,may,2,sunday,spring,6,17,True,False
2,2021-05-01,2021,5,may,1,saturday,spring,5,17,True,False
3,2021-04-29,2021,4,april,29,thursday,spring,3,17,True,False
4,2021-04-27,2021,4,april,27,tuesday,spring,1,17,True,False


In [41]:
temp.day_of_year

123

In [None]:
date_dim.fillna("Unknown", inplace=True)

In [42]:
date_dim.to_csv("data/dateDim.csv")