In [541]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import sklearn as skl
import tensorflow as tf


In [542]:
# Read in the cleaned data from a path relative to the notebook's working
# directory, then preview the first rows to check which columns were loaded.
df = pd.read_csv("Data/carsAvgPrice.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Brand,Model,Year,Status,Mileage,Age,AveragePrice
0,0,Acura,ILX,2013,Used,86557.0,11,13900.0
1,1,Acura,ILX,2014,Used,148266.0,10,10996.0
2,2,Acura,ILX,2015,Used,77223.0,9,13650.0
3,3,Acura,ILX,2016,Used,56546.0,8,16900.0
4,4,Acura,ILX,2016,Used,72301.0,8,16999.0


In [543]:
# Drop the duplicate index column that was written into the CSV.
# errors='ignore' keeps this cell safe to re-run: on a second execution the
# column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,Brand,Model,Year,Status,Mileage,Age,AveragePrice
0,Acura,ILX,2013,Used,86557.0,11,13900.0
1,Acura,ILX,2014,Used,148266.0,10,10996.0
2,Acura,ILX,2015,Used,77223.0,9,13650.0
3,Acura,ILX,2016,Used,56546.0,8,16900.0
4,Acura,ILX,2016,Used,72301.0,8,16999.0


In [544]:
# Tally rows per brand, then show only the brands with fewer than 1000 rows
brandCount = df["Brand"].value_counts()
brandCount.loc[brandCount.lt(1000)]

Brand
Subaru                 982
Volvo                  916
Tesla                  881
Lincoln                867
Infiniti               817
RAM                    775
Genesis                516
Jaguar                 331
Buick                  324
Chrysler               307
Mitsubishi             279
Maserati               191
MINI                   181
Alfa Romeo             149
Bentley                119
Pontiac                105
Ferrari                 93
Lamborghini             85
FIAT                    77
Rolls-Royce             64
Aston Martin            61
Scion                   52
Hummer                  47
Polestar                45
McLaren                 40
Rivian                  32
Saturn                  30
Mercury                 18
Lucid                   13
Saab                    13
Lotus                   11
Smart                    8
Suzuki                   6
Karma                    6
Plymouth                 4
Oldsmobile               3
International Scout   

In [545]:
# Eliminate noisy data: keep only brands that have at least 1000 rows.
brands_to_replace = list(brandCount[brandCount<1000].index)

# One vectorised membership test removes every rare brand at once. No
# placeholder label is written into the column, so a brand that is literally
# called "Other" is only removed if it is itself rare.
# .copy() yields an independent frame, so later in-place edits of df do not
# trigger chained-assignment warnings.
df = df.loc[~df['Brand'].isin(brands_to_replace)].copy()
df["Brand"].value_counts()

Brand
Toyota        6461
Ford          6124
Chevrolet     4546
BMW           3506
Lexus         3061
Honda         2665
Mercedes      2606
Audi          2385
Jeep          2054
GMC           1949
Porsche       1691
Nissan        1578
Cadillac      1342
Kia           1273
Volkswagen    1212
Mazda         1208
Acura         1187
Dodge         1091
Hyundai       1077
Land Rover    1057
Name: count, dtype: int64

In [546]:
# Tally rows per model, then show only the models with fewer than 500 rows
modelCount = df["Model"].value_counts()
modelCount.loc[modelCount.lt(500)]

Model
Camaro          485
MDX             469
ES              436
Bronco          434
CR-V            432
               ... 
Pickup Truck      1
Uplander          1
Spectra5          1
Venture           1
560 SL            1
Name: count, Length: 413, dtype: int64

In [547]:
# Eliminate noisy data: keep only models that have at least 500 rows.
models_to_replace = list(modelCount[modelCount<500].index)

# One vectorised membership test removes every rare model at once. No
# placeholder label is written into the column, so a model that is literally
# called "Other" is only removed if it is itself rare.
# .copy() yields an independent frame, so later in-place edits of df do not
# trigger chained-assignment warnings.
df = df.loc[~df['Model'].isin(models_to_replace)].copy()
df["Model"].value_counts()

Model
F-150             1796
Corvette           954
4Runner            940
Tundra             932
Mustang            871
Wrangler           807
Yukon              801
Silverado          798
RX                 774
Tacoma             773
Sierra             760
Grand Cherokee     744
Tahoe              720
Range Rover        714
Highlander         694
F-250              653
RAV4               650
Expedition         582
Camry              557
X5                 555
911                555
Accord             554
Escalade           533
Civic              512
GX                 500
Name: count, dtype: int64

In [548]:
# One-hot encode the categorical columns as 0/1 integer indicator columns
df_dummy = pd.get_dummies(data=df, dtype=int)
df_dummy.head()

Unnamed: 0,Year,Mileage,Age,AveragePrice,Brand_BMW,Brand_Cadillac,Brand_Chevrolet,Brand_Ford,Brand_GMC,Brand_Honda,...,Model_Silverado,Model_Tacoma,Model_Tahoe,Model_Tundra,Model_Wrangler,Model_X5,Model_Yukon,Status_Certified,Status_New,Status_Used
6062,2004,44671.0,20,12500.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
6063,2009,108871.0,15,9495.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
6064,2010,63146.0,14,8900.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
6065,2011,39873.0,13,16850.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
6066,2011,46114.0,13,27588.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [549]:
# Features: everything except the target and Year
# (presumably Year is dropped because Age carries the same information - confirm).
X = df_dummy.drop(labels=['AveragePrice','Year'], axis="columns")
# Target: the price column the model is trained to predict
y = df_dummy['AveragePrice']

In [550]:
# Inspect the target series (pandas abbreviates the display of long series)
y

6062     12500.000000
6063      9495.000000
6064      8900.000000
6065     16850.000000
6066     27588.000000
             ...     
54248    48490.000000
54249    50849.000000
54250    58492.000000
54251    64771.000000
54252    64178.888889
Name: AveragePrice, Length: 18729, dtype: float64

In [551]:
# Inspect the feature matrix (pandas abbreviates the display of large frames)
X

Unnamed: 0,Mileage,Age,Brand_BMW,Brand_Cadillac,Brand_Chevrolet,Brand_Ford,Brand_GMC,Brand_Honda,Brand_Jeep,Brand_Land Rover,...,Model_Silverado,Model_Tacoma,Model_Tahoe,Model_Tundra,Model_Wrangler,Model_X5,Model_Yukon,Status_Certified,Status_New,Status_Used
6062,44671.0,20,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
6063,108871.0,15,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
6064,63146.0,14,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
6065,39873.0,13,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
6066,46114.0,13,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54248,33696.0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
54249,105.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
54250,186.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
54251,1455.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [552]:
#Train Test split
# random_state=42 fixes the shuffle so the split is reproducible; no test_size
# is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

In [553]:
# Create the (still unfitted) standardiser used to scale the feature columns
scaler = StandardScaler()

In [554]:
# Scale data: the scaler is fitted on the training split only and the fitted
# scaler is then applied to both splits, so the test split does not influence it.
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [555]:
# Model: gradient boosting regressor.
# NOTE: despite the `rf_model` name, the estimator built here is a
# GradientBoostingRegressor, not a random forest.
from sklearn.ensemble import GradientBoostingRegressor
# random_state is fixed so repeated runs use the same seed; 200 boosting stages
rf_model = GradientBoostingRegressor(random_state=13,n_estimators=200)
# Train on the scaled training features, then predict for the scaled test features
rf_model = rf_model.fit(X_train_scaled,y_train)
predictions = rf_model.predict(X_test_scaled)

In [556]:
# View prediction results: test features plus actual price, predicted price
# and the absolute percentage error of each prediction.
predictions_df = X_test.assign(Actual_price=y_test, Predict_price=predictions)
predictions_df["Error %"] = (
    predictions_df["Predict_price"]
    .sub(predictions_df["Actual_price"])
    .abs()
    .div(predictions_df["Actual_price"])
    .mul(100)
)
predictions_df.head()

Unnamed: 0,Mileage,Age,Brand_BMW,Brand_Cadillac,Brand_Chevrolet,Brand_Ford,Brand_GMC,Brand_Honda,Brand_Jeep,Brand_Land Rover,...,Model_Tundra,Model_Wrangler,Model_X5,Model_Yukon,Status_Certified,Status_New,Status_Used,Actual_price,Predict_price,Error %
35456,38931.0,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,24394.0,31006.201817,27.105853
48281,24569.0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,37450.0,39137.909371,4.507101
16234,28365.0,3,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,38491.0,52700.316975,36.915947
50655,64819.0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,42500.0,34566.110481,18.667975
6313,44030.0,4,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,52199.0,43978.920405,15.747581


In [557]:
# Mean absolute percentage error across the test rows
predictions_df['Error %'].mean()

15.708303777128597

In [558]:
#Result Reporting
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Pull the two price columns out once; every metric below compares them.
actual_prices = predictions_df["Actual_price"]
predicted_prices = predictions_df["Predict_price"]

r2 = r2_score(actual_prices, predicted_prices)
mse = mean_squared_error(actual_prices, predicted_prices)
rmse = np.sqrt(mse)
# Spread of the actual prices, printed next to the RMSE for scale
std = np.std(actual_prices)

# Print relevant metrics.
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The r2 is 0.8134768833942372.
The mean squared error is 164713259.17336223.
The root mean squared error is 12834.066353785234.
The standard deviation is 29716.519950064983.


In [559]:
# Prep result for graph: re-attach the original (un-encoded) columns by row
# index, then keep only the columns wanted for plotting. Mileage exists in
# both frames, so the copy coming from df carries the "_y" suffix.
result_df = (
    predictions_df
    .merge(df, how='left', left_index=True, right_index=True)
    .loc[:, ["Brand","Model","Year","Status","Mileage_y","AveragePrice","Actual_price","Predict_price","Error %"]]
)
result_df

Unnamed: 0,Brand,Model,Year,Status,Mileage_y,AveragePrice,Actual_price,Predict_price,Error %
35456,Lexus,RX,2015,Used,38931.0,24394.0,24394.0,31006.201817,27.105853
48281,Toyota,4Runner,2020,Used,24569.0,37450.0,37450.0,39137.909371,4.507101
16234,Ford,Expedition,2021,Used,28365.0,38491.0,38491.0,52700.316975,36.915947
50655,Toyota,Highlander,2021,Used,64819.0,42500.0,42500.0,34566.110481,18.667975
6313,BMW,X5,2020,Used,44030.0,52199.0,52199.0,43978.920405,15.747581
...,...,...,...,...,...,...,...,...,...
43787,Porsche,911,2020,Certified,17678.0,133293.0,133293.0,129764.703085,2.647023
53723,Toyota,Tundra,2020,Certified,21548.0,39296.0,39296.0,44967.132870,14.431832
35966,Lexus,RX,2021,Used,45131.0,44394.0,44394.0,41820.599299,5.796731
20888,Ford,Mustang,2022,Used,5697.0,38499.0,38499.0,45621.384793,18.500181


In [560]:
#Actual vs Prediction graph
# Imported for its side effect: it provides the .hvplot accessor used below.
import hvplot.pandas
# Title and axis labels are set explicitly so the figure is readable on its
# own instead of showing the raw column names.
result_df.hvplot.scatter(
    x='Actual_price',
    y='Predict_price',
    color='Brand',
    hover_cols=['Model'],
    xlabel='Actual price',
    ylabel='Predicted price',
    title='Actual vs. predicted price by brand',
)