In [71]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

## Functions

#### Preprocessing

In [120]:
def fillWithMean(col):
    """Return the column's values as a list with missing entries replaced by the mean.

    Parameters
    ----------
    col : pandas.Series or list-like of numbers
        Numeric values, possibly containing NaN/None.

    Returns
    -------
    list
        Same length and order as ``col``; every null entry is replaced by the
        mean of the non-null entries. If every entry is null the mean is NaN,
        so the nulls are effectively left as NaN.
    """
    series = pd.Series(col)
    # Series.mean() skips NaN by default, so only observed values contribute.
    mean = series.mean()

    return [mean if pd.isnull(x) else x for x in series.tolist()]

In [121]:
def fillWithMode(col):
    """Return the column's values as a list with missing entries replaced by the mode.

    The mode is the most frequent non-null value; ties are broken in favour of
    the value that appears first in the column.

    Parameters
    ----------
    col : pandas.Series
        Values of any hashable type, possibly containing NaN/None.

    Returns
    -------
    list
        Same length and order as ``col`` with nulls replaced by the mode.
        If the column has no non-null value, the values are returned unchanged.
    """
    values = col.tolist()

    # Count only observed values: a null must never be chosen as the fill value.
    occ = dict()
    for val in values:
        if not pd.isnull(val):
            occ[val] = occ.get(val, 0) + 1

    if not occ:
        # Nothing to fill with (empty or all-null column).
        return values

    # max() returns the first key reaching the highest count (dicts keep
    # insertion order), i.e. the earliest-seen value among ties.
    mode = max(occ, key=occ.get)

    return [mode if pd.isnull(x) else x for x in values]

In [39]:
def convertBool(col):
    """Map a column to a list of 0/1 integer flags.

    An entry becomes 0 when it is null (NaN/None) or falsy, and 1 otherwise.

    Parameters
    ----------
    col : pandas.Series
        Column to convert.

    Returns
    -------
    list of int
        One 0/1 flag per entry, in the original order.
    """
    return [
        0 if (pd.isnull(flag) or not flag) else 1
        for flag in col.tolist()
    ]

In [141]:
def rescaleNumericalValues(col):
    """Standardize a numeric column with scikit-learn's ``StandardScaler``.

    Parameters
    ----------
    col : pandas.Series or list-like of numbers
        Values to rescale.

    Returns
    -------
    list of float
        The standardized values, one per input entry and in the same order,
        so the result can be assigned straight back to a DataFrame column.
    """
    # StandardScaler expects a 2-D array of shape (n_samples, n_features);
    # a single column is therefore reshaped to (n_samples, 1).
    values = np.asarray(col, dtype=float).reshape(-1, 1)
    ss = StandardScaler()
    # Flatten the (n_samples, 1) result back to one value per row.
    return ss.fit_transform(values).ravel().tolist()

In [74]:
def encodeCategoricalValues(col):
    """Encode each distinct value of a column as an integer code.

    Codes are assigned in order of first appearance (0 for the first distinct
    value seen, 1 for the next, ...), so the encoding is reproducible across
    runs instead of depending on set iteration order.

    Parameters
    ----------
    col : pandas.Series
        Column of hashable values.

    Returns
    -------
    list of int
        One integer code per entry, in the original order.
    """
    values = col.tolist()

    mapping = dict()
    for val in values:
        if val not in mapping:
            # The next unused code equals the number of values seen so far.
            mapping[val] = len(mapping)

    return [mapping[val] for val in values]

In [136]:
def processYears(yrs, startYear=1960, endYear=2010):
    """Convert years to offsets from ``startYear``, keeping only years inside the window.

    Only years strictly between ``startYear`` and ``endYear`` are kept; each
    kept year is returned as ``year - startYear``.

    Parameters
    ----------
    yrs : pandas.Series or iterable of numbers
        Years to process.
    startYear : int, optional
        Exclusive lower bound and the origin of the returned offsets.
    endYear : int, optional
        Exclusive upper bound.

    Returns
    -------
    list
        Offsets for the years that fall inside the window. Years outside the
        window are dropped, so the result can be shorter than ``yrs``.
    """
    # Work on the argument itself (Series or plain list), not on a global frame.
    values = yrs.tolist() if hasattr(yrs, "tolist") else list(yrs)

    return [x - startYear for x in values if startYear < x < endYear]

#### Algorithms

In [10]:
def predictPrice(X_train, Y_train, X_test):
    """Fit a linear regression on the training data and predict for the test data.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training feature matrix.
    Y_train : array-like of shape (n_samples,)
        Training targets.
    X_test : array-like of shape (n_test_samples, n_features)
        Feature matrix to predict for.

    Returns
    -------
    numpy.ndarray
        Predicted target values for ``X_test``.
    """
    # fit() needs both the features and the targets.
    linReg = LinearRegression().fit(X_train, Y_train)
    return linReg.predict(X_test)

## Code

In [137]:
# Load the listings CSV and report how long the read took.
# perf_counter() is a monotonic clock intended for measuring intervals, so the
# result is not affected by system clock adjustments during the load.
begin = time.perf_counter()
df = pd.read_csv('used_cars_data.csv')
end = time.perf_counter()

print("Time taken: ", end - begin)

  df = pd.read_csv('used_cars_data.csv')


Time taken:  201.77767896652222


#### Initial cleaning

In [138]:
# Keep only the first row for each value of the "vin" column, so that every
# vehicle id number appears at most once.
df = df.drop_duplicates(subset=["vin"])
print(df.shape)

(3000000, 66)


In [139]:
# Keep only the listed columns, in this order, for the rest of the notebook.
df = df.loc[:, [
    "daysonmarket",
    "engine_cylinders",
    "frame_damaged",
    "fuel_type",
    "has_accidents",
    "horsepower",
    "is_new",
    "listed_date",
    "mileage",
    "make_name",
    "model_name",
    "fuel_tank_volume",
    "owner_count",
    "salvage",
    "wheel_system",
    "exterior_color",
    "year",
]]
print(df.shape)
df.head()

(3000000, 17)


Unnamed: 0,daysonmarket,engine_cylinders,frame_damaged,fuel_type,has_accidents,horsepower,is_new,listed_date,mileage,make_name,model_name,fuel_tank_volume,owner_count,salvage,wheel_system,exterior_color,year
0,522,I4,,Gasoline,,177.0,True,2019-04-06,7.0,Jeep,Renegade,12.7 gal,,,FWD,Solar Yellow,2019
1,207,I4,,Gasoline,,246.0,True,2020-02-15,8.0,Land Rover,Discovery Sport,17.7 gal,,,AWD,Narvik Black,2020
2,1233,H4,False,Gasoline,False,305.0,False,2017-04-25,,Subaru,WRX STI,15.9 gal,3.0,False,AWD,,2016
3,196,V6,,Gasoline,,340.0,True,2020-02-26,11.0,Land Rover,Discovery,23.5 gal,,,AWD,Eiger Gray,2020
4,137,I4,,Gasoline,,246.0,True,2020-04-25,7.0,Land Rover,Discovery Sport,17.7 gal,,,AWD,Narvik Black,2020


In [140]:
# Drop every column whose values are all NaN; columns with at least one
# non-null value are kept.
df = df.dropna(axis="columns", how="all")
print(df.shape)
df.head()

(3000000, 17)


Unnamed: 0,daysonmarket,engine_cylinders,frame_damaged,fuel_type,has_accidents,horsepower,is_new,listed_date,mileage,make_name,model_name,fuel_tank_volume,owner_count,salvage,wheel_system,exterior_color,year
0,522,I4,,Gasoline,,177.0,True,2019-04-06,7.0,Jeep,Renegade,12.7 gal,,,FWD,Solar Yellow,2019
1,207,I4,,Gasoline,,246.0,True,2020-02-15,8.0,Land Rover,Discovery Sport,17.7 gal,,,AWD,Narvik Black,2020
2,1233,H4,False,Gasoline,False,305.0,False,2017-04-25,,Subaru,WRX STI,15.9 gal,3.0,False,AWD,,2016
3,196,V6,,Gasoline,,340.0,True,2020-02-26,11.0,Land Rover,Discovery,23.5 gal,,,AWD,Eiger Gray,2020
4,137,I4,,Gasoline,,246.0,True,2020-04-25,7.0,Land Rover,Discovery Sport,17.7 gal,,,AWD,Narvik Black,2020


#### Process numerical columns

In [142]:
# df["daysonmarket"] = rescaleNumericalValues(df["daysonmarket"])

In [143]:
# df["mileage"] = rescaleNumericalValues(df["mileage"])

#### Process bool columns

In [144]:
# TODO
# df["has_accidents"] = convertBool(df["has_accidents"])
# df["salvage"] = convertBool(df["salvage"])
# df["is_new"] = convertBool(df["is_new"])

#### Process columns with years

In [145]:
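# TODO - processYears filters out years outside its window, so the returned list is shorter than the column and cannot be assigned back as-is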
# yr = df["year"]
# yr = processYears(yr)
# df["year"] = yr

In [146]:
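# TODO - processYears filters out years outside its window, so the returned list is shorter than the column and cannot be assigned back as-is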
# listDate = df["listed_date"]
# listDate = list(map(lambda x: int(x.split('-')[0]) , listDate)) # Only take years
# listDate = processYears(listDate)
# df["listed_date"] = listDate

#### Encode categorical values to integers

In [147]:
# fillWithMode(df["make_name"])
# df["make_name"] = encodeCategoricalValues(df["make_name"])

In [148]:
# fillWithMode(df["wheel_system"])
# df["wheel_system"] = encodeCategoricalValues(df["wheel_system"])

In [149]:
# fillWithMode(df["engine_cylinders"])
# df["engine_cylinders"] = encodeCategoricalValues(df["engine_cylinders"])

In [150]:
# fillWithMode(df["model_name"])
# df["model_name"] = encodeCategoricalValues(df["model_name"])

#### Run algorithm