In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Disable this warning as I am overwritting the reference of the orginal data frame
pd.options.mode.chained_assignment = None  # default='warn'

In [48]:
df = pd.read_csv('../data/explored_data.csv')
#Dropping make, model, state and city
df.drop(['make','model','state','city'], axis=1, inplace=True)

In [49]:
df.head()

Unnamed: 0,price,millage,location,year,accidents,owners,use,name
0,13130,34770,"Jersey City, NJ",2019,0,1.0,Personal,Hyundai Elantra
1,22990,53762,"New York, NY",2018,0,1.0,Personal,Honda Accord
2,15788,24317,"Bronx, NY",2019,0,1.0,Personal,Ford Fusion
3,4950,146988,"Brooklyn, NY",2007,2,4.0,Fleet,BMW X3
4,4950,92136,"Brooklyn, NY",2010,0,3.0,Fleet,Dodge Journey


In [50]:
#Creating the training and test sets
from sklearn.model_selection import train_test_split
X = df.drop('price', axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)
print(X_train.shape); print(X_test.shape)

(693, 7)
(297, 7)


### Features Preprocessing

In [51]:
# Categorical Encoding:
X_train.describe(include=np.object)

Unnamed: 0,location,use,name
count,693,693,693
unique,68,2,194
top,"New York, NY",Personal,Honda Accord
freq,127,584,32


In [52]:
#It seems that use has only 2 possible categories we can use one hot encoding to encode this features.
from sklearn.preprocessing import OneHotEncoder
enc_use = OneHotEncoder(drop='first', )
X_train.loc[:,'use'] = enc_use.fit_transform(X_train[['use']]).toarray()
#Where 1 = personal and 0 = fleet


In [53]:
#For location and name the numbers seems high if I have to one hot encode those it will generate 113 extra columns total.
#Instead I'll use a different strategy:
#For location I'll use frequency encoding replacing the names with the frequency proportion in the train test; assuming this is true in the whole data set.
city_encoder = dict(X_train['location'].value_counts()/len(X_train))
X_train['location'] = X_train['location'].map(city_encoder)

In [54]:
#For make I'll use mean encoding; this is assigning the mean price for the car name everytime it shows in the data.
make_encode = dict(pd.concat([X_train,y_train], axis=1).groupby('name')['price'].mean())
X_train['name'] = X_train['name'].map(make_encode)

### Features Scaling

In [55]:
#Lets scale the features using standard scaler
# Z = x-u/s
cols = list(X_train.columns)

for c in cols:    
    u = X_train[c].mean()
    s = X_train[c].std()    
    X_train[c] = (X_train[c] - u)/s

In [56]:
# And this is How the data looks preproccesed.
X_train.head()

Unnamed: 0,millage,location,year,accidents,owners,use,name
559,-0.50017,1.772508,0.664492,-0.426569,-0.604553,0.431711,0.181943
388,-0.453438,1.772508,-1.101783,1.036859,1.390473,0.431711,-0.569103
129,1.372485,-0.322112,-0.597133,-0.426569,0.39296,0.431711,-0.373604
594,1.316007,-1.065364,0.159842,-0.426569,0.39296,0.431711,-0.379208
797,-0.698466,-0.975273,0.412167,-0.426569,-0.604553,0.431711,0.01967


In [57]:
# Creating at script that facilitate the preprocessing for new data:
def car_preproc(df):
    data_set = df.copy()
    from sklearn.preprocessing import OneHotEncoder   
   
    
    enc_use = OneHotEncoder(drop='first', )
    data_set.loc[:,'use'] = enc_use.fit_transform(data_set[['use']]).toarray()
    #Where 1 = personal and 0 = fleet
    city_encoder = dict(data_set['location'].value_counts()/len(data_set))
    data_set['location'] = data_set['location'].map(city_encoder).fillna(0,axis=0)
    
    # Add condition in case the city and the make does not show, to be equal to zero.    
    data_set['name'] = data_set['name'].map(make_encode).fillna(0,axis=0)
    
    cols = list(X_train.columns)

    for c in cols:    
        u = data_set[c].mean()
        s = data_set[c].std()    
        data_set[c] = (data_set[c] - u)/s
    
    return data_set

### Model Building and Evalution

In [58]:
# Preproccesing test set:
X_test = car_preproc(X_test)

In [59]:
X_test

Unnamed: 0,millage,location,year,accidents,owners,use,name
463,-0.388639,1.425833,0.906485,-0.381205,-0.561652,0.421865,-0.586969
272,-1.105354,1.425833,0.906485,-0.381205,-0.561652,0.421865,1.234323
42,-0.813106,0.954260,0.906485,-0.381205,-0.561652,0.421865,-0.341076
894,-0.480546,-0.760550,0.363691,-0.381205,-0.561652,0.421865,-1.281178
239,-1.150160,1.425833,0.363691,-0.381205,0.349882,0.421865,6.995616
...,...,...,...,...,...,...,...
737,3.030549,-1.103512,-2.893077,-0.381205,2.172951,0.421865,-1.281178
602,1.721520,-0.717680,0.363691,-0.381205,-0.561652,0.421865,-0.392642
893,-0.624540,-0.932031,0.635088,-0.381205,-0.561652,0.421865,0.645147
53,-0.384941,0.954260,0.906485,-0.381205,-0.561652,0.421865,-0.270891


In [60]:
#Importing evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score

In [61]:
# Using Linear Regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

pred_test_lr= lr.predict(X_test)
print(mean_absolute_error(y_test,pred_test_lr)) 
print(r2_score(y_test, pred_test_lr))
print(cross_val_score(lr, X_train, y_train, cv=10, scoring='neg_mean_absolute_error').mean())

9144.744749453448
0.3034487711603354
-3899.6274368566046


In [62]:
#Ridge Regression alpha = 0.01
from sklearn.linear_model import Ridge
rr = Ridge(alpha=0.9)
rr.fit(X_train, y_train) 

pred_test_rr= rr.predict(X_test)
print(mean_absolute_error(y_test,pred_test_rr)) 
print(r2_score(y_test, pred_test_rr))
print(cross_val_score(rr, X_train, y_train, cv=10, scoring='neg_mean_absolute_error').mean())

9133.393598483342
0.30495212757947
-3900.6409911206356


In [63]:
#Lasson Regression = 0.01
from sklearn.linear_model import Lasso
ls = Lasso(alpha=0.1)
ls.fit(X_train, y_train) 

pred_test_lasso= ls.predict(X_test)
print(mean_absolute_error(y_test,pred_test_lasso)) 
print(r2_score(y_test, pred_test_lasso))
print(cross_val_score(ls, X_train, y_train, cv=10, scoring='neg_mean_absolute_error').mean())

9144.6714312337
0.3034574832443112
-3899.5920428144855


In [64]:
#Using random forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(X_train,y_train)
pred_rf = rf.predict(X_test)
print(mean_absolute_error(y_test,pred_rf)) 
print(r2_score(y_test, pred_rf))
print(cross_val_score(rf, X_train, y_train, cv=10, scoring='neg_mean_absolute_error').mean())

6999.993063973064
0.5526452186006175
-2928.3682598343685


In [65]:
# It seems that random forest regressor has the best score
# Let's do a grid search to find the optimal parameters
from sklearn.model_selection import GridSearchCV
param = {'n_estimators':range(10,200,5), 'criterion':('mse', 'mae'), 'random_state':[0]}
gs = GridSearchCV(rf, param, scoring='neg_mean_absolute_error')
gs.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestRegressor(random_state=0),
             param_grid={'criterion': ('mse', 'mae'),
                         'n_estimators': range(10, 200, 5),
                         'random_state': [0]},
             scoring='neg_mean_absolute_error')

In [66]:
gs.best_score_

-3110.648263441421

In [67]:
gs.best_params_

{'criterion': 'mse', 'n_estimators': 150, 'random_state': 0}

In [68]:
#With the best parameters.
rf = RandomForestRegressor(n_estimators=150, random_state=0)
rf.fit(X_train,y_train)
pred_rf = rf.predict(X_test)
print(mean_squared_error(y_test,pred_rf)) 
print(r2_score(y_test, pred_rf))
print(cross_val_score(rf, X_train, y_train, cv=10, scoring='neg_mean_squared_error').mean())

146980384.298255
0.556677888028854
-35828985.71851291


In [69]:
# Lets save this model:
import pickle

# open a file, where you ant to store the data
file = open('../models/best_forest.pkl', 'wb')

# dump information to that file
pickle.dump(gs, file)

# close the file
file.close()

In [24]:
##How to open:
# rf_pkl = open('../models/best_forest.pkl', 'rb')
# best_forest = pickle.load(rf_pkl)