In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from math import sqrt
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline, Pipeline
import xgboost as xgb
import numpy as np

In [2]:
df = pd.read_csv("/data/no_null_values.csv")

In [3]:
df

Unnamed: 0,Make,Model,Year,MSRP,Body Size,Body Style,Cylinders,Engine Aspiration,Drivetrain,Transmission,Horsepower,Torque
0,Aston Martin,DBX707,2024,242000.0,Large,SUV,V8,Twin-Turbo,AWD,automatic,697.0,663.0
1,Audi,A3,2024,35800.0,Compact,Sedan,I4,Turbocharged,FWD,automatic,201.0,221.0
2,Audi,A3,2024,37800.0,Compact,Sedan,I4,Turbocharged,AWD,automatic,201.0,221.0
3,Audi,A3,2024,41400.0,Compact,Sedan,I4,Turbocharged,AWD,automatic,201.0,221.0
4,Audi,A3,2024,39400.0,Compact,Sedan,I4,Turbocharged,FWD,automatic,201.0,221.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1605,Nissan,Z,2023,50990.0,Compact,Coupe,V6,Twin-Turbo,RWD,automatic,400.0,350.0
1606,Nissan,Z,2023,53990.0,Compact,Coupe,V6,Twin-Turbo,RWD,automatic,400.0,350.0
1607,Nissan,Z,2023,53990.0,Compact,Coupe,V6,Twin-Turbo,RWD,manual,400.0,350.0
1608,Nissan,Z,2023,40990.0,Compact,Coupe,V6,Twin-Turbo,RWD,manual,400.0,350.0


In [4]:
df.describe()

Unnamed: 0,Year,MSRP,Horsepower,Torque
count,1610.0,1610.0,1610.0,1610.0
mean,2023.450932,72542.032298,345.475155,364.257255
std,0.497741,54903.549349,120.476477,129.61687
min,2023.0,15980.0,122.0,103.0
25%,2023.0,46501.25,261.0,265.0
50%,2023.0,55945.0,318.0,339.0
75%,2024.0,73848.75,405.0,445.0
max,2024.0,391100.0,831.0,811.0


## Data split Model 1

In [5]:
X = df.drop(columns="MSRP")
y = df["MSRP"]
print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


(1610, 11)
(1610,)


## Pipe encoding + data scaling + model

In [6]:
# 1. Define the column list
onh_cols = ["Make", "Model", "Body Style", "Cylinders", "Engine Aspiration", "Drivetrain", "Transmission"]
ordinal_cols = ["Body Size"]
numeric_cols = ["Horsepower", "Torque"]

# 2. define the categoryes for the enconder
body_size = ["Compact", "Midsize", "Large"]

In [7]:
nominal_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ordinal_transformer = OrdinalEncoder(categories=[body_size])

In [8]:
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_cols),
        ('ohe', nominal_transformer, onh_cols),
        ('ord', ordinal_transformer, ordinal_cols)
    ],
    remainder='passthrough'
)


model = xgb.XGBRegressor(random_state=42)

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', model)])

## Train + metrics Model 1

In [9]:
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))

0.9461222530250076


In [10]:
y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R² : {r2:.4f}")

mse = mean_squared_error(y_test, y_pred)
print(f"MSE (Mean Squared Error): ${mse:,.2f}")

rmse = sqrt(mse)
print(f"RMSE (Root Mean Square Error):${rmse:,.2f}")

mae = mean_absolute_error(y_test, y_pred)
print(f"MAE (Mean Absolute Error): ${mae:,.2f}")

mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"MAPE (Mean Absolute Percentage Error): {mape*100:.2f}%")

R² : 0.9461
MSE (Mean Squared Error): $151,937,314.71
RMSE (Root Mean Square Error):$12,326.29
MAE (Mean Absolute Error): $6,018.44
MAPE (Mean Absolute Percentage Error): 7.52%


## GridSearchCV Model 1

In [11]:
scorer_to_minimize = 'neg_mean_absolute_error'
param_grid = {
    
    'regressor__n_estimators': [100, 300], 
    'regressor__learning_rate': [0.05, 0.1],

    'regressor__max_depth': [3, 5],
    'regressor__min_child_weight': [1, 3],
    
    'regressor__colsample_bytree': [0.7, 1.0], 
}

grid_search = GridSearchCV(
    estimator=pipe,           
    param_grid=param_grid,     
    scoring=scorer_to_minimize, 
    cv=3,                     
    verbose=1,                 
    n_jobs=-1                 
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'regressor__colsample_bytree': [0.7, 1.0], 'regressor__learning_rate': [0.05, 0.1], 'regressor__max_depth': [3, 5], 'regressor__min_child_weight': [1, 3], ...}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('ohe', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Compact', 'Midsize', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7
,device,
,early_stopping_rounds,
,enable_categorical,False


In [12]:
best_grid = grid_search.best_estimator_
y_pred_grid = best_grid.predict(X_test)

mse = mean_squared_error(y_test, y_pred_grid)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_grid)

print("Best Model Evaluation (Optimized for Neg_MAE)")
print(f"Best parameter combination: {grid_search.best_params_}")
print(f"\nBest average MAE: {-grid_search.best_score_:.2f}")
print(f"Best Model RMSE on the TEST Set: ${rmse:,.2f}")
print(f"R² : {r2:.4f}")

Best Model Evaluation (Optimized for Neg_MAE)
Best parameter combination: {'regressor__colsample_bytree': 0.7, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 5, 'regressor__min_child_weight': 1, 'regressor__n_estimators': 300}

Best average MAE: 6980.15
Best Model RMSE on the TEST Set: $10,605.90
R² : 0.9601


Although the score is very high, and we see that the model is correct 5% of the time, we also see that on average, when it is wrong, it is wrong by $6,018.44. Our model is wrong on average by $6,018.44, which would be 7.52%. We also see that the RMSE is $12,326.29. This means that our model is more wrong in the high-end cars. This difference between the MAE and the RMSE means that the model is making very large errors in price predictions. It is almost certain that these large errors are occurring in high-end cars.

To solve this, I've considered using logarithmic scales for the column and also creating new features so the model can distinguish between high, hyper, and standard-end cars to make a better price prediction.

After extensive exploration, I encountered a problem with the dataset. The dataset has duplicate rows, but the price column changes. This confuses the machine learning model when predicting prices because the exact same features have different prices. So, I'm going to change that as well.