In [261]:
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score, auc, mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import IsolationForest, GradientBoostingRegressor
from sklearn.preprocessing import  OneHotEncoder, scale, StandardScaler
from sklearn.compose import  ColumnTransformer
from sklearn.pipeline import Pipeline


pf = pd.read_csv("/Users/hayden/Downloads/Projects/used_car_project/data/cleaned/used_cars_cleaned.csv")

# Feature and Target

In [262]:
X = pf.drop('price', axis = 1)
y = pf['price']
print(X.shape)
print(y.shape)

(3868, 13)
(3868,)


# Train-test split

In [263]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(3094, 13) (3094,)
(774, 13) (774,)


# Handle string

In [264]:
num_col = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_col = X.select_dtypes(include=['object','category']).columns.tolist()

print('Numeric: ', num_col)
print('Categoric: ', cat_col)

Numeric:  ['mileage', 'accident', 'clean_title', 'car_age', 'price_per_mile']
Categoric:  ['brand', 'model', 'fuel_type', 'engine', 'ext_col', 'int_col', 'mile_bin', 'ext_col_grouped']


# Differentiate numerical columns and categorical columns
    Since the models can not interpret categorical values such as toyota, subaru, we need to differentiate numerical and categorical columns

In [265]:
preprocessor = ColumnTransformer(transformers=[('num', 'passthrough', num_col), ('cat', OneHotEncoder(handle_unknown='ignore'), cat_col)])

# Linear Regression

In [266]:
LinReg = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', LinearRegression())])
LinReg.fit(X_train, y_train)
pred_LinReg = LinReg.predict(X_test)

print('R^2:', r2_score(y_test, pred_LinReg))
print('RMSE:', np.sqrt(mean_squared_error(y_test, pred_LinReg)))
print('MAE:', mean_absolute_error(y_test, pred_rid))

R^2: 0.6743167151860241
RMSE: 21330.945926332144
MAE: 30136.387037690285


# Ridge

In [267]:
ridge = Pipeline([('preprocessor', preprocessor), ('model', Ridge(alpha=4))])
ridge.fit(X_train, y_train)
pred_rid = ridge.predict(X_test)

print('R^2:', r2_score(y_test, pred_rid))
print('RMSE:', np.sqrt(mean_squared_error(y_test, pred_rid)))
print('MAE:', mean_absolute_error(y_test, pred_rid))

R^2: 0.3552548934186752
RMSE: 30012.79261267522
MAE: 18338.083534536076


# Lasso

In [268]:
lasso = Pipeline([('preprocessor', preprocessor), ('model', Lasso(alpha=0.01, max_iter=10000))])
lasso.fit(X_train, y_train)
pred_las = lasso.predict(X_test)

print('R^2:', r2_score(y_test, pred_las))
print('RMSE:', np.sqrt(mean_squared_error(y_test, pred_las)))
print('MAE:', mean_absolute_error(y_test, pred_las))

R^2: 0.8067421059143987
RMSE: 16431.643401836944
MAE: 9071.290710130868


  model = cd_fast.sparse_enet_coordinate_descent(


# Random Forest
    Better than linear/ridge/lasso with handling nonlinearities

In [269]:
ran_for = Pipeline([('preprocessor', preprocessor), ('model', RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42, n_jobs=-1))])
ran_for.fit(X_train, y_train)
pred_ran_for = ran_for.predict(X_test)

print('R^2:', r2_score(y_test, pred_ran_for))
print('RMSE:', np.sqrt(mean_squared_error(y_test, pred_ran_for)))
print('MAE:', mean_absolute_error(y_test, pred_ran_for))

R^2: 0.7121253276710504
RMSE: 20054.60480755181
MAE: 9106.237726098192


# GradBoost
    handles skewed value well

In [270]:
gb = Pipeline([('preprocessor', preprocessor), ('model', GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42))])
gb.fit(X_train, y_train)
pred_gb = gb.predict(X_test)

print('R^2:', r2_score(y_test, pred_gb))
print('RMSE:', np.sqrt(mean_squared_error(y_test, pred_gb)))
print('MAE:', mean_absolute_error(y_test, pred_gb))

R^2: 0.963482006447604
RMSE: 7142.756313927941
MAE: 3423.8317932260447


In [271]:
# XGBoost or LightGBM

# Comparisons

### R^2
    R^2 shows how much variation of the target the model explains
    Therefore higher R^2 value that covers more variation is the better than the lower R^2 value
### MAE
    MAE gives average value of the difference between predicted and actual prices.
    Therefore smaller the MAE is, the smaller the error between predicted and actual prices is
### RMSE
    RMSE is simliar to MAE but RMSE squares the error value before calculation the average and takes the square root from the resulted average
    Meaning, RMSE penalizes larger errors heavily.
    Therefore, the smaller the RMSE is, the better the model is.

# Result table

In [272]:
comparisons = {"Model": [ 'Linear Regression', 'Ridge', 'Lasso', 'Random Forest', 'Gradient Boosting'],
               'R^2': [0.63, 0.28 , 0.78, 0.70, 0.97],
               'RMSE' : [23160.49, 33622.45, 18561.23, 21533.38, 7175.24],
               'MAE': [2071048268.65, 1130469424.44 , 3445192004.44, 463686401.87, 51484124.65],
               }

df_comp = pd.DataFrame(comparisons)

print(df_comp)

               Model   R^2      RMSE           MAE
0  Linear Regression  0.63  23160.49  2.071048e+09
1              Ridge  0.28  33622.45  1.130469e+09
2              Lasso  0.78  18561.23  3.445192e+09
3      Random Forest  0.70  21533.38  4.636864e+08
4  Gradient Boosting  0.97   7175.24  5.148412e+07


# Conclusion

- 5 predictive model were implementd and compared using R^2, RMSE, and MAE.
- Gradient Boosting model dominated other 4 models in all aspects by having highest R^2 and lowest RMSE and MAE.
- This shows the Gradient Boosting is the most effective model to predict used car prices for this dataset
