In [98]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Functions

#### Preprocessing

In [53]:
def fillWithMean(col):
    c = col.dropna(axis=0).tolist()
    mean = np.mean(c)
    
    return [mean if pd.isnull(x) else x for x in col]

In [3]:
def fillWithMode(col):
    col = col.tolist()
    occ = dict()
    
    for val in col:
        occ[val] = 0
        
    for val in col:
        occ[val] += 1
        
    occ = list(sorted(occ.items(), key=lambda x: x[1], reverse=True))
        
    return [occ[0][0] if pd.isnull(x) else x for x in col]

In [61]:
def convertBool(col):
    col = col.tolist()
    for index in range(len(col)):
        if pd.isnull(col[index]) or not col[index]:
            col[index] = 0
        else:
            col[index] = 1

    return col

In [17]:
def rescaleNumericalValues(col):
    col = np.array(col.tolist()).reshape(-1,1)
    ss = StandardScaler()
    return ss.fit_transform(col).reshape(-1)

In [6]:
def encodeCategoricalValues(col):
    col = col.tolist()
    
    mapping = dict()
    s = set(col)
    
    i = 0
    for val in s:
        mapping[val] = i
        i += 1
    
    l = []
    
    for val in col:
        l.append(mapping[val])
    
    return l

In [7]:
def processYears(yrs):
    # These variables can be tweaked
    startYear = 1960
    endYear = 2010
    
    yrs = df["year"].tolist()
    yrs = list(filter(lambda x: x < 2010, yrs))
    yrs = list(map(lambda x: x - startYear, yrs))
    yrs = list(filter(lambda x: x > 0, yrs))

    return yrs

#### Algorithms

In [110]:
def predictPrice(X_train, X_test, Y_train, Y_test):
    linReg = LinearRegression().fit(X_train, Y_train)
    
    train_pred = linReg.predict(X_train)
    test_pred = linReg.predict(X_test)
    
    print("Train MSE: ", mean_squared_error(Y_train, train_pred))
    print("Test MSE: ", mean_squared_error(Y_test, test_pred))

## Code

In [56]:
begin = time.time()
original_df = pd.read_csv('used_cars_data.csv')
end = time.time()

print("Time taken: ", end - begin)

  original_df = pd.read_csv('used_cars_data.csv')


Time taken:  185.64249396324158


In [113]:
# Sample a portion of the rows

df = original_df.sample(frac=0.30, replace=False)
print(df.shape)
df.head()

(900012, 66)


Unnamed: 0,vin,back_legroom,bed,bed_height,bed_length,body_type,cabin,city,city_fuel_economy,combine_fuel_economy,...,transmission,transmission_display,trimId,trim_name,vehicle_damage_category,wheel_system,wheel_system_display,wheelbase,width,year
2138893,1GTR9CED7LZ287038,35.2 in,,--,79.4 in,Pickup Truck,,Broken Arrow,,,...,A,Automatic,t86406,Elevation Double Cab 4WD,,4WD,Four-Wheel Drive,147.4 in,81.2 in,2020
2112512,1FMCU0GD6HUD65261,37.3 in,,,,SUV / Crossover,,Independence,23.0,,...,A,6-Speed Automatic,t66217,SE FWD,,FWD,Front-Wheel Drive,105.9 in,81.8 in,2017
1716887,55SWF8EBXLU327804,35.2 in,,,,Sedan,,Little Rock,23.0,,...,A,Automatic,t87100,C 300 4MATIC Sedan AWD,,AWD,All-Wheel Drive,111.8 in,79.4 in,2020
1757792,LRBFXBSA6LD159656,37.5 in,,,,SUV / Crossover,,Cut Off,22.0,,...,A,6-Speed Automatic,t85032,Preferred FWD,,FWD,Front-Wheel Drive,108.3 in,72.4 in,2020
655960,YV4A22PK5M1677799,37 in,,,,SUV / Crossover,,Northfield,19.0,,...,A,Automatic,t93613,T6 Momentum 7-Passenger AWD,,AWD,All-Wheel Drive,117.5 in,84.3 in,2021


#### Initial cleaning

In [114]:
# Remove rows with duplicate vehicle id numbers

df.drop_duplicates(subset=["vin"], inplace=True)
print(df.shape)

(900007, 66)


In [115]:
# Take subset of columns (put back listed_date and year later)
df = df[["daysonmarket",  "engine_cylinders",  "frame_damaged",  "fuel_type",  "has_accidents",  "horsepower",  "is_new",  "mileage",  "make_name",  "model_name",  "salvage",  "wheel_system",  "exterior_color", "price"]]
print(df.shape)
df.head()

(900007, 14)


Unnamed: 0,daysonmarket,engine_cylinders,frame_damaged,fuel_type,has_accidents,horsepower,is_new,mileage,make_name,model_name,salvage,wheel_system,exterior_color,price
2138893,65,V8,,Gasoline,,355.0,True,,GMC,Sierra 1500,,4WD,Onyx Black,39900.0
2112512,42,I4,False,Gasoline,False,245.0,False,41421.0,Ford,Escape,False,FWD,Shadow Black,16750.0
1716887,364,I4,False,Gasoline,False,255.0,False,5998.0,Mercedes-Benz,C-Class,False,AWD,MOJAVE SILVER,38490.0
1757792,100,I4,,Gasoline,,197.0,True,,Buick,Envision,,FWD,Dark Moon Blue Metallic,28190.0
655960,17,I4,,Gasoline,,316.0,True,0.0,Volvo,XC90,,AWD,Savile Gray,64515.0


In [116]:
# Remove all NaN columns

df = df.dropna(axis=1, how='all')
print(df.shape)
df.head()

(900007, 14)


Unnamed: 0,daysonmarket,engine_cylinders,frame_damaged,fuel_type,has_accidents,horsepower,is_new,mileage,make_name,model_name,salvage,wheel_system,exterior_color,price
2138893,65,V8,,Gasoline,,355.0,True,,GMC,Sierra 1500,,4WD,Onyx Black,39900.0
2112512,42,I4,False,Gasoline,False,245.0,False,41421.0,Ford,Escape,False,FWD,Shadow Black,16750.0
1716887,364,I4,False,Gasoline,False,255.0,False,5998.0,Mercedes-Benz,C-Class,False,AWD,MOJAVE SILVER,38490.0
1757792,100,I4,,Gasoline,,197.0,True,,Buick,Envision,,FWD,Dark Moon Blue Metallic,28190.0
655960,17,I4,,Gasoline,,316.0,True,0.0,Volvo,XC90,,AWD,Savile Gray,64515.0


#### Process bool columns

In [117]:
bool_cols = ["has_accidents", "salvage", "is_new", "frame_damaged"]

for c in bool_cols:
    df[c] = convertBool(df[c])
    
df.head()

Unnamed: 0,daysonmarket,engine_cylinders,frame_damaged,fuel_type,has_accidents,horsepower,is_new,mileage,make_name,model_name,salvage,wheel_system,exterior_color,price
2138893,65,V8,0,Gasoline,0,355.0,1,,GMC,Sierra 1500,0,4WD,Onyx Black,39900.0
2112512,42,I4,0,Gasoline,0,245.0,0,41421.0,Ford,Escape,0,FWD,Shadow Black,16750.0
1716887,364,I4,0,Gasoline,0,255.0,0,5998.0,Mercedes-Benz,C-Class,0,AWD,MOJAVE SILVER,38490.0
1757792,100,I4,0,Gasoline,0,197.0,1,,Buick,Envision,0,FWD,Dark Moon Blue Metallic,28190.0
655960,17,I4,0,Gasoline,0,316.0,1,0.0,Volvo,XC90,0,AWD,Savile Gray,64515.0


#### Process columns with years

In [118]:
# TODO - list length not same
# yr = df["year"]
# yr = processYears(yr)
# df["year"] = yr

In [119]:
# TODO - list length not same
# listDate = df["listed_date"]
# listDate = list(map(lambda x: int(x.split('-')[0]) , listDate)) # Only take years
# listDate = processYears(listDate)
# df["listed_date"] = listDate

#### Encode categorical values to integers

In [120]:
categorical_cols = ["make_name", "wheel_system", "engine_cylinders", "model_name", "fuel_type", "exterior_color"]

for c in categorical_cols:
    fillWithMode(df[c])
    df[c] = encodeCategoricalValues(df[c])

df.head()

Unnamed: 0,daysonmarket,engine_cylinders,frame_damaged,fuel_type,has_accidents,horsepower,is_new,mileage,make_name,model_name,salvage,wheel_system,exterior_color,price
2138893,65,5,0,7,0,355.0,1,,11,79,0,5,2309,39900.0
2112512,42,8,0,7,0,245.0,0,41421.0,45,721,0,4,10999,16750.0
1716887,364,8,0,7,0,255.0,0,5998.0,60,1147,0,2,3078,38490.0
1757792,100,8,0,7,0,197.0,1,,62,11,0,4,7455,28190.0
655960,17,8,0,7,0,316.0,1,0.0,32,65,0,2,1325,64515.0


#### Process numerical columns

In [121]:
numerical_cols = ["daysonmarket", "mileage", "horsepower", "make_name", "wheel_system", "engine_cylinders", "model_name", "fuel_type", "exterior_color"]

for c in numerical_cols:
    df[c] = fillWithMean(df[c])
    df[c] = rescaleNumericalValues(df[c])
    
df.head()

Unnamed: 0,daysonmarket,engine_cylinders,frame_damaged,fuel_type,has_accidents,horsepower,is_new,mileage,make_name,model_name,salvage,wheel_system,exterior_color,price
2138893,-0.101481,-0.824722,0,0.249651,0,1.21653,1,-6.365944000000001e-17,-1.22417,-1.382022,0,1.194059,-1.113349,39900.0
2112512,-0.313974,-0.545451,0,0.249651,0,-0.03457,0,0.08906543,0.511725,0.461142,0,0.470193,0.675286,16750.0
1716887,2.660931,-0.545451,0,0.249651,0,0.079166,0,-0.2208605,1.277561,1.684177,0,-0.977539,-0.955068,38490.0
1757792,0.221878,-0.545451,0,0.249651,0,-0.580505,1,-6.365944000000001e-17,1.379672,-1.577248,0,0.470193,-0.054164,28190.0
655960,-0.544945,-0.545451,0,0.249651,0,0.772958,1,-0.2733388,-0.152,-1.422215,0,-0.977539,-1.315882,64515.0


#### Run algorithm

In [122]:
cols = df.columns
X_columns = cols[:-1]
Y_column = cols[-1]

X_train, X_test, Y_train, Y_test = train_test_split(df[X_columns].to_numpy(), df[Y_column].to_numpy(), test_size=0.2)

predictPrice(X_train, X_test, Y_train, Y_test)

Train MSE:  207740864.76831123
Test MSE:  295066294.022312
