In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

#Load up the craigslist car dataset
carPrices = pd.read_csv('craigslistVehicles.csv')

#And clean it up


In [2]:
print(carPrices.columns)

Index(['url', 'city', 'city_url', 'price', 'year', 'manufacturer', 'make',
       'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'VIN', 'drive', 'size', 'type', 'paint_color',
       'image_url', 'desc', 'lat', 'long'],
      dtype='object')


In [14]:
carPrices = carPrices[['price', 'year', 'manufacturer', 'make', 'condition', 'cylinders', 'fuel', 'odometer', 'transmission', 'drive', 'size', 'type', 'paint_color']]


In [15]:
carPrices = carPrices.dropna()

In [16]:
price = carPrices['price']
carData = carPrices.loc[:, carPrices.columns != 'price']

In [17]:
#Isolate dummy variables-need to convert categorical variables so they usable for regression

cylinders = pd.get_dummies(carData['cylinders'])
fuel = pd.get_dummies(carData['fuel'])
#manufacturer = pd.get_dummies(carData['manufacturer'])
color = pd.get_dummies(carData['paint_color'])
#size = pd.get_dummies(carData['size'])
#type_of_car = pd.get_dummies(carData['type'])
#transmission = pd.get_dummies(carData['transmission'])
#drive = pd.get_dummies(carData['drive'])
condition = pd.get_dummies(carData['condition'])
odometer = carData['odometer']
age = (carData['year'] - 2019)*-1


In [18]:
#normalize the only numerical value we're using
#normalize odometer
oMax = odometer.max()
print("oMax: ",oMax)
oMin = odometer.min()
print("oMin: ", oMin)
odometer = (odometer - oMin) * (1/(oMax - oMin))

#normalize age
aMax = age.max()
print("aMax: ", aMax)
aMin = age.min()
print("aMin: ", aMin)
age = (age - aMin) * (1/(aMax - aMin))

oMax:  10000000.0
oMin:  0.0
aMax:  103.0
aMin:  -1.0


In [20]:
carData = pd.concat([odometer, age, cylinders, price], axis = 1)
print(carData.columns)


Index(['odometer', 'year', '10 cylinders', '12 cylinders', '3 cylinders',
       '4 cylinders', '5 cylinders', '6 cylinders', '8 cylinders', 'other',
       'price'],
      dtype='object')


In [10]:
#using the typical sklearn train test split yields a memory error so we have to manually shuffle and split the data
price = carData['price']
carData = carData.loc[:, carData.columns != 'price']
#carData = carData.to_numpy()
#price = price.to_numpy()
X_train, X_test, Y_train, Y_test = train_test_split(carData, price, test_size= 0.4, random_state = 42)

In [11]:
#def split(matrix, target, test_proportion):
 #   amount = int(matrix.shape[0]/test_proportion) #should be int
 #   X_train = matrix[amount:,:]
 #   X_test =  matrix[:amount,:]
 #   Y_train = target[amount:]
 #   Y_test =  target[:amount]
 #   return X_train, X_test, Y_train, Y_test

#X_train, X_test, Y_train, Y_test = split(carData, price, 3)

In [12]:
from sklearn.pipeline import make_pipeline

model = linear_model.Ridge()
model.fit(X_train, Y_train)
Y_predict = model.predict(X_test)
Y_predict = Y_predict * 1/100
Y_predict[Y_predict < 0] = 0



In [13]:
from sklearn.metrics import explained_variance_score
r2 = explained_variance_score(Y_test, Y_predict)
print(r2)

-0.000917689963436219


In [151]:
with open('car_pricing_model', 'wb') as handle:
    pickle.dump(model, handle, protocol = pickle.HIGHEST_PROTOCOL)