In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [96]:
from sklearn.preprocessing import MultiLabelBinarizer

In [97]:
import sys
sys.path.append('modules')

import importlib
import data

In [98]:
importlib.reload(sys.modules['data'])

<module 'data' from '/Users/vladparakhin/Documents/GitHub/ML_Gradient_Boosting_Classifier/modules/data.py'>

In [99]:
from data import set_filepath, test_loc, percent_missing, display_test, parse_features

## Train

In [100]:
stage = 'train'

In [101]:
file_path_lists = set_filepath(stage)
transform_final = file_path_lists[4]

In [102]:
price_train_final_df = pd.read_csv(transform_final)

In [103]:
price_train_final_df.columns

Index(['VehListdays', 'VehMileage', 'Dealer_Listing_Price', 'VehYear',
       'ListingID', 'VehEngine', 'SellerListSrc', 'SellerState', 'VehFuel',
       'VehColorExt', 'VehColorInt', 'VehTransmission', 'VehFeats',
       'VehHistory', 'VehPriceLabel', 'VehDriveTrain', 'Vehicle_Trim',
       'VehMake', 'VehModel', 'RatingCategory', 'SellerCategory',
       'SellerIsPriv', 'SourceCategory', 'VehCertified'],
      dtype='object')

In [104]:
percent_nan = percent_missing(price_train_final_df)
percent_nan

Series([], dtype: float64)

In [93]:
price_train_final_df.drop(columns=['ListingID'], inplace=True)

In [105]:
X = price_train_final_df.drop('Dealer_Listing_Price',axis=1)

In [106]:
# Ensure VehFeats is in the correct list format
X['VehFeats'] = X['VehFeats'].apply(parse_features)

In [107]:
X['VehHistory'] = X['VehHistory'].apply(parse_features)

#one_hot_encoded_features : veh_feats_encoded, veh_history_encoded

In [108]:
mlb_feats = MultiLabelBinarizer()

In [109]:
mlb_history = MultiLabelBinarizer()

In [110]:
encoded_feats = mlb_feats.fit_transform(X['VehFeats'])

In [111]:
encoded_history = mlb_history.fit_transform(X['VehHistory'])

In [112]:
history_col_names = mlb_history.classes_

In [113]:
history_col_names[:10]

array([' Accident(s) Reported', ' Buyback Protection Eligible',
       ' Non-Personal Use Reported', ' Title Issue(s) Reported',
       '0 Owners', '1 Owner', '2 Owners', '3 Owners', '4 Owners'],
      dtype=object)

In [114]:
# Convert the encoded matrices to DataFrames
encoded_history_df = pd.DataFrame(encoded_history, columns=mlb_history.classes_)
encoded_feats_df = pd.DataFrame(encoded_feats, columns=mlb_feats.classes_)

In [115]:
# Drop the 'mlb-trnsformed' column(s) from the original dataset
X = X.drop(['VehFeats', 'VehHistory'], axis=1)

In [116]:
X = pd.get_dummies(X, drop_first=True)

In [117]:
X = pd.concat([X, encoded_feats_df, encoded_history_df], axis=1)

In [121]:
for i in X.columns:
   print(i) 

VehListdays
VehMileage
VehYear
ListingID
SellerIsPriv
VehCertified
VehEngine_3.6L
VehEngine_3.6L V6
VehEngine_3.6L V6 24V GDI DOHC
VehEngine_3.6L V6 24V MPFI DOHC
VehEngine_5.7L V8
VehEngine_6.2L V8
VehEngine_6.4L V8
VehEngine_V6
VehEngine_V8
VehEngine_unknown
SellerListSrc_Digital Motorworks (DMi)
SellerListSrc_Five Star Certified Program
SellerListSrc_HomeNet Automotive
SellerListSrc_Inventory Command Center
SellerListSrc_Jeep Certified Program
SellerListSrc_My Dealer Center
SellerListSrc_Sell It Yourself
SellerListSrc_unknown
SellerState_AL
SellerState_AR
SellerState_AZ
SellerState_CA
SellerState_CO
SellerState_CT
SellerState_DE
SellerState_FL
SellerState_GA
SellerState_HI
SellerState_IA
SellerState_ID
SellerState_IL
SellerState_IN
SellerState_KS
SellerState_KY
SellerState_LA
SellerState_MA
SellerState_MD
SellerState_ME
SellerState_MI
SellerState_MN
SellerState_MO
SellerState_MS
SellerState_MT
SellerState_NC
SellerState_ND
SellerState_NE
SellerState_NH
SellerState_NJ
SellerState_NM


In [20]:
#X_clean = price_train_final_df.drop('Dealer_Listing_Price',axis=1)
#y_clean = price_train_final_df['Dealer_Listing_Price']

In [122]:
y = price_train_final_df['Dealer_Listing_Price']

In [123]:
from sklearn.model_selection import train_test_split

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=101)

In [125]:
from sklearn.preprocessing import StandardScaler

In [126]:
scaler = StandardScaler()

In [127]:
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [128]:
scaled_X_train.shape, scaled_X_test.shape

((5668, 957), (630, 957))

In [129]:
from sklearn.linear_model import ElasticNet

In [130]:
base_elastic_model = ElasticNet()

In [131]:
param_grid = {'alpha':[0.1,1,5,10,50,100],
              'l1_ratio':[.1, .5, .7, .9, .95, .99, 1]}

In [132]:
from sklearn.model_selection import GridSearchCV

In [133]:
grid_model = GridSearchCV(estimator=base_elastic_model,
                          param_grid=param_grid,
                          scoring='neg_mean_squared_error',
                          cv=5,
                          verbose=1)

In [134]:
grid_model.fit(scaled_X_train,y_train)

Fitting 5 folds for each of 42 candidates, totalling 210 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [135]:
grid_model.best_params_

{'alpha': 10, 'l1_ratio': 1}

In [136]:
y_pred = grid_model.predict(scaled_X_test)

In [137]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [138]:
mean_absolute_error(y_test,y_pred)

1644.7152749827094

In [139]:
np.sqrt(mean_squared_error(y_test,y_pred))

2248.032513905424

In [140]:
np.mean(price_train_final_df['Dealer_Listing_Price'])

32265.05287392823

 ## Test

In [141]:
stage = 'test'

In [142]:
file_path_lists = set_filepath(stage)
transform_final = file_path_lists[4]

In [143]:
test_final_df = pd.read_csv(transform_final)

FileNotFoundError: [Errno 2] No such file or directory: 'data/final/test_final_clean.csv'

In [70]:
test_final_df.columns

Index(['VehListdays', 'VehMileage', 'VehYear', 'ListingID', 'VehEngine',
       'SellerListSrc', 'SellerState', 'VehFuel', 'VehColorExt', 'VehColorInt',
       'VehTransmission', 'VehPriceLabel', 'VehDriveTrain', 'VehMake',
       'VehModel', 'RatingCategory', 'SellerCategory', 'SellerIsPriv',
       'SourceCategory', 'VehCertified'],
      dtype='object')

In [71]:
X_test_final = pd.get_dummies(test_final_df, drop_first=True)

In [72]:
# Cross check and recon the columns
missing_cols = set(X_train_clean.columns) - set(X_test_final.columns)
for col in missing_cols:
    X_test_final[col] = 0

In [73]:
X_test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 123 entries, VehListdays to VehFuel_unknown
dtypes: bool(2), float64(2), int64(22), uint8(97)
memory usage: 284.3 KB


In [74]:
X_test_final = X_test_final[X_train_clean.columns]

In [75]:
scaled_X_test_final = scaler.transform(X_test_final)

In [76]:
predictions = grid_model.predict(scaled_X_test_final)

In [77]:
report_df = pd.read_csv('data/report/report.csv')

In [78]:
report_df['Dealer_Listing_Price'] = predictions

In [79]:
report_df.to_csv('report.csv', index=False)

In [80]:
report_df.head()

Unnamed: 0,ListingID,Vehicle_Trim,Dealer_Listing_Price
0,8622015,FWD,34368.255796
1,8625693,Limited,26989.157438
2,8625750,Limited,19956.414154
3,8626885,Limited,23242.270791
4,8627430,Luxury,35480.710098
