In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer,make_column_selector

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


Loading Carvana Dataset
https://www.kaggle.com/datasets/ravishah1/carvana-predict-car-prices

In [3]:
df = pd.read_csv('carvana.csv')
df.head(8)


Unnamed: 0,Name,Year,Miles,Price
0,Chevrolet Trax,2018,41946,16990
1,GMC Terrain,2020,45328,23990
2,Jeep Wrangler,2012,81068,21590
3,Jeep Renegade,2019,35372,21590
4,BMW X,20173,68992,22990
5,Buick Encore,2019,47973,18590
6,Jeep Compass,2016,57159,17590
7,Jeep Compass,2017,47619,18590


Checking for NA Values

In [4]:
df.isna().sum()

Name     0
Year     0
Miles    0
Price    0
dtype: int64

In [5]:
#Some of the years in the data were miss-inputted and five digits
df['Year'] = df['Year'].apply(lambda x: x if (x >= 2009 and x <= 2023) else np.nan)

In [6]:
df.isna().sum()

Name        0
Year     2851
Miles       0
Price       0
dtype: int64

In [7]:
df.dropna(inplace=True)

In [8]:
#Splitting the make and model
df['Name'] = df['Name'].apply(lambda x: x.split())
df['Make'] = df['Name'].apply(lambda x: x[0])
def splitFunc(x):
    if len(x) > 1:
        x.pop(0)
        list = []
        str = ''
        for mod in x:
            list.append(mod)
            str = str + ' ' + mod
            
        return str
  


df["Model"] = df["Name"].map(splitFunc)
df = df.drop(['Name'], axis = 1)


In [9]:
df.head(8)


Unnamed: 0,Year,Miles,Price,Make,Model
0,2018.0,41946,16990,Chevrolet,Trax
1,2020.0,45328,23990,GMC,Terrain
2,2012.0,81068,21590,Jeep,Wrangler
3,2019.0,35372,21590,Jeep,Renegade
5,2019.0,47973,18590,Buick,Encore
6,2016.0,57159,17590,Jeep,Compass
7,2017.0,47619,18590,Jeep,Compass
8,2015.0,51472,17590,Jeep,Patriot


In [10]:
print(f"{'Make':30} : {df['Make'].nunique()} unique values")
print(f"{'Model':30} : {df['Model'].nunique()} unique values")


Make                           : 34 unique values
Model                          : 345 unique values


In [11]:
#Taking a look at the top five models
makes_sum = df.groupby(['Make']).agg(count=('Make', 'count'))
makes_sum = makes_sum.sort_values(by = "count", ascending = 0)
makes_sum.head()

Unnamed: 0_level_0,count
Make,Unnamed: 1_level_1
Nissan,2467
Toyota,2447
Honda,1972
Hyundai,1758
Kia,1673


In [12]:
#Takes categorical variables and converts it to a nuimerical representation
oh=OneHotEncoder(drop='first',handle_unknown='ignore')

ms,ss=MinMaxScaler(),StandardScaler()


ct=ColumnTransformer([
    ('cat_encoder',oh,make_column_selector(dtype_include='object')),
    ('num_encoder',ms,make_column_selector(dtype_include='int64'))
],remainder='passthrough',n_jobs=-1)

In [13]:
#Dropping price as thats to be predicted
X=df.drop(['Price'],axis=1)
y=df[['Price']]

In [14]:
#Splitting training and testing data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=101)

In [15]:
X_train=ct.fit_transform(X_train)
X_test=ct.transform(X_test)

In [16]:
#Random Forest Model
clf=RandomForestClassifier(n_estimators=50, max_depth = 32)
clf.fit(X_train, y_train)

y_pred=clf.predict(X_test)

  clf.fit(X_train, y_train)


In [17]:
#Printing the accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.762402088772846
              precision    recall  f1-score   support

       11990       0.86      0.60      0.71        10
       12590       0.77      0.87      0.82        23
       12990       0.76      0.72      0.74        39
       13590       0.90      0.72      0.80        61
       13990       0.78      0.86      0.82       113
       14590       0.84      0.85      0.84       126
       14990       0.81      0.82      0.81       222
       15491       1.00      1.00      1.00         1
       15590       0.80      0.79      0.79       118
       15990       0.73      0.80      0.76       151
       16503       0.00      0.00      0.00         1
       16590       0.82      0.85      0.83       151
       16793       0.00      0.00      0.00         0
       16803       0.00      0.00      0.00         1
       16990       0.74      0.76      0.75       206
       16999       1.00      1.00      1.00         1
       17200       1.00      1.00      1.00         1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
#Dropping model to see if it changes result
X=df.drop(['Price'],axis=1)
X=df.drop(['Model'],axis=1)
y=df[['Price']]

In [20]:
#Splitting training and testing data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=101)

In [21]:
X_train=ct.fit_transform(X_train)
X_test=ct.transform(X_test)

In [22]:
#Random Forest Model
clf=RandomForestClassifier(n_estimators=50, max_depth = 32)
clf.fit(X_train, y_train)

y_pred=clf.predict(X_test)

  clf.fit(X_train, y_train)


In [24]:
#Printing the accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.9099216710182768
              precision    recall  f1-score   support

       11990       1.00      0.70      0.82        10
       12590       0.87      0.87      0.87        23
       12990       0.95      0.90      0.92        39
       13590       0.98      0.95      0.97        61
       13990       0.94      0.98      0.96       113
       14590       0.97      0.99      0.98       126
       14990       1.00      1.00      1.00       222
       15491       1.00      1.00      1.00         1
       15590       0.99      0.98      0.99       118
       15990       0.99      0.99      0.99       151
       16503       0.00      0.00      0.00         1
       16590       1.00      1.00      1.00       151
       16793       0.00      0.00      0.00         0
       16803       0.00      0.00      0.00         1
       16990       1.00      1.00      1.00       206
       16999       1.00      1.00      1.00         1
       17200       1.00      1.00      1.00         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
#Dropping make to see if it changes result
X=df.drop(['Price'],axis=1)
X=df.drop(['Make'],axis=1)
y=df[['Price']]

In [26]:
#Splitting training and testing data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=101)

In [27]:
X_train=ct.fit_transform(X_train)
X_test=ct.transform(X_test)

In [28]:
#Random Forest Model
clf=RandomForestClassifier(n_estimators=50, max_depth = 32)
clf.fit(X_train, y_train)

y_pred=clf.predict(X_test)

  clf.fit(X_train, y_train)


In [29]:
#Printing the accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.8422976501305482
              precision    recall  f1-score   support

       11990       1.00      0.70      0.82        10
       12590       0.90      0.83      0.86        23
       12990       0.82      0.85      0.84        39
       13590       0.98      0.77      0.86        61
       13990       0.87      0.98      0.92       113
       14590       0.98      0.93      0.96       126
       14990       0.92      0.97      0.95       222
       15491       1.00      1.00      1.00         1
       15590       0.97      0.86      0.91       118
       15990       0.89      0.99      0.93       151
       16503       0.00      0.00      0.00         1
       16590       0.99      0.95      0.97       151
       16793       0.00      0.00      0.00         0
       16803       0.00      0.00      0.00         1
       16990       0.95      0.99      0.97       206
       16999       1.00      1.00      1.00         1
       17200       1.00      1.00      1.00         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
#Dropping miles to see if it changes result
X=df.drop(['Price'],axis=1)
X=df.drop(['Miles'],axis=1)
y=df[['Price']]

In [31]:
#Splitting training and testing data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=101)

In [32]:
X_train=ct.fit_transform(X_train)
X_test=ct.transform(X_test)

In [33]:
#Random Forest Model
clf=RandomForestClassifier(n_estimators=50, max_depth = 32)
clf.fit(X_train, y_train)

y_pred=clf.predict(X_test)

  clf.fit(X_train, y_train)


In [34]:
#Printing the accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.866579634464752
              precision    recall  f1-score   support

       11590       0.00      0.00      0.00         0
       11990       0.70      0.70      0.70        10
       12590       0.91      0.87      0.89        23
       12990       0.97      0.87      0.92        39
       13590       0.98      0.85      0.91        61
       13990       0.91      0.98      0.94       113
       14590       0.94      0.95      0.94       126
       14990       0.96      0.96      0.96       222
       15491       1.00      1.00      1.00         1
       15590       0.97      0.90      0.93       118
       15990       0.88      0.99      0.93       151
       16503       0.00      0.00      0.00         1
       16590       0.96      0.93      0.94       151
       16803       0.00      0.00      0.00         1
       16990       0.96      0.95      0.95       206
       16999       1.00      1.00      1.00         1
       17200       1.00      1.00      1.00         1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
#Dropping miles to see if it changes result
X=df.drop(['Price'],axis=1)
X=df.drop(['Year'],axis=1)
y=df[['Price']]

In [36]:
#Splitting training and testing data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=101)

In [37]:
X_train=ct.fit_transform(X_train)
X_test=ct.transform(X_test)

In [38]:
#Random Forest Model
clf=RandomForestClassifier(n_estimators=50, max_depth = 32)
clf.fit(X_train, y_train)

y_pred=clf.predict(X_test)

  clf.fit(X_train, y_train)


In [39]:
#Printing the accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.8663185378590078
              precision    recall  f1-score   support

       11990       1.00      0.60      0.75        10
       12590       0.91      0.87      0.89        23
       12990       0.89      0.87      0.88        39
       13590       0.94      0.77      0.85        61
       13990       0.89      0.95      0.92       113
       14590       0.99      0.92      0.95       126
       14990       0.90      0.96      0.93       222
       15491       1.00      1.00      1.00         1
       15590       0.97      0.84      0.90       118
       15990       0.86      0.99      0.92       151
       16503       0.00      0.00      0.00         1
       16590       0.97      0.95      0.96       151
       16803       0.00      0.00      0.00         1
       16990       0.94      0.96      0.95       206
       16999       0.00      0.00      0.00         1
       17200       1.00      1.00      1.00         1
       17295       1.00      1.00      1.00         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Insights
Interesting results from removing variables from the random forest classifier.
By using only make, year, and miles, a cars price can be calculated to a 90% accuracy. 
I personally thought that the inclusion of the model of the car would be more valuable to the accuracy of the RFC.