# Modelling

This notebook will implement various models to find best model for predicting price of used casr in our dataset

In [1]:
# Library imports
import pandas as pd
import numpy as np
import joblib
import pickle
import random
import os

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
#from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump, load


# Disabling warnings:
import warnings
warnings.filterwarnings('ignore') 

In [2]:
# Seeding:
def seed_all(seed):
    
    ''' A function to seed everything for getting stable results and reproducibility'''
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed = 786    
seed_all(seed)

## load and pre process data

In [3]:
# read the data from local file that i had already downloaded from Kaggle
data = pd.read_csv('bmw.csv')

# Transform value of categorical columns to remove any white spaces spaces
data['model'] = data['model'].apply(lambda x : x.strip().replace(' ', '_'))

# Merge Other and Electric
data['fuelType'] = data['fuelType'].replace({'Electric':'Other'})
# Construct new feature named age from year by subtracting year from 2022.
data['age'] = 2022 - data['year']
#remove the year as we have encode that value into age 
del data['year']

# get index of car to remove
idx_to_remove = data[data.price==123456].index
# drop the car
data.drop(idx_to_remove, inplace= True)
# reset index as we have removed some rows
data.reset_index(inplace=True, drop=True)

In [4]:
# Apply One Hot Encoder for categorical variables and PowerTransformer for numerical features
categorical_features = ['model', 'transmission', 'fuelType']
numerical_features = ['mileage', 'mpg', 'engineSize', 'age']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', PowerTransformer(method='yeo-johnson'), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False,handle_unknown='ignore'), categorical_features)
    ])

In [5]:
# Split the data for training and testing and use seed= 786 keep results reproducible
X = data.drop('price', axis=1)
y = data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=786)

In [7]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [19]:
print("data is ",data.shape)
print("X is ",X.shape)
print("X_train is ",X_train.shape)
print("y_train is ",y_train.shape)
print("X_test is ",X_test.shape)
print("y_test is ",y_test.shape)
print("\n---After Preprocessing--\n")
print("X_train is ",X_train_processed.shape)
print("X_test is ",X_test_processed.shape)

data is  (10780, 9)
X is  (10780, 8)
X_train is  (8624, 8)
y_train is  (8624,)
X_test is  (2156, 8)
y_test is  (2156,)

---After Preprocessing--

X_train is  (8624, 35)
X_test is  (2156, 35)


In [8]:
# What do the features look like
preprocessor.get_feature_names_out()

array(['num__mileage', 'num__mpg', 'num__engineSize', 'num__age',
       'cat__model_1_Series', 'cat__model_2_Series',
       'cat__model_3_Series', 'cat__model_4_Series',
       'cat__model_5_Series', 'cat__model_6_Series',
       'cat__model_7_Series', 'cat__model_8_Series', 'cat__model_M2',
       'cat__model_M3', 'cat__model_M4', 'cat__model_M5', 'cat__model_M6',
       'cat__model_X1', 'cat__model_X2', 'cat__model_X3', 'cat__model_X4',
       'cat__model_X5', 'cat__model_X6', 'cat__model_X7', 'cat__model_Z3',
       'cat__model_Z4', 'cat__model_i3', 'cat__model_i8',
       'cat__transmission_Automatic', 'cat__transmission_Manual',
       'cat__transmission_Semi-Auto', 'cat__fuelType_Diesel',
       'cat__fuelType_Hybrid', 'cat__fuelType_Other',
       'cat__fuelType_Petrol'], dtype=object)

## Apply Models with Hyperparameter tuning

We are testing following three model using GridSeachCV to detrmine best model

1. DecisionTreeRegressor
2. BaggingRegressor
3. RandomForestRegressor

Note: The criterion to choose best model will be based on R2 metric

https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

https://insidelearningmachines.com/tune_hyperparameters_in_decision_trees/#Regression-3

In [14]:
# Define models
models = [
    DecisionTreeRegressor(random_state=786),
    BaggingRegressor(random_state=786, n_jobs=-1),
    RandomForestRegressor(random_state=786, n_jobs=-1)
]

# Define hyperparameters
params = [
    {'criterion': ['squared_error', 'friedman_mse', 'poisson'], 'max_depth': [2, 4, 6, 8], 'min_samples_split': [2, 4, 6, 8], 'max_leaf_nodes': [2, 4, 6, 8]},
    {'n_estimators': [50, 100,150, 200], 'max_samples': [0.5, 0.7, 1.0], 'max_features': [0.5, 0.7, 1.0]},
    {'n_estimators': [50, 100,150, 200], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [2, 4, 6, 8], 'criterion': ['squared_error', 'poisson', 'friedman_mse']}
]

# Initialize dataframe to store results
results_df = pd.DataFrame(columns=['Model', 'Best Params', 'MSE', 'R2', 'MAE'])

# Iterate over models
for model, param in zip(models, params):
    # Perform grid search
    grid_search = GridSearchCV(model, param, cv=5, scoring='r2')
    grid_search.fit(X_train_processed, y_train)

    # Get best model
    best_model = grid_search.best_estimator_

    # Predict on test set
    y_pred = best_model.predict(X_test_processed)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Append results to dataframe
    result = {'Model': type(model).__name__, 'Best Params': grid_search.best_params_, 'MSE': mse, 'R2': r2, 'MAE': mae}
    print(result)
    results_df = pd.concat([results_df, pd.DataFrame(result)], ignore_index=True)
    #results_df = results_df.append({'Model': type(model).__name__, 'Best Params': grid_search.best_params_, 'MSE': mse, 'R2': r2, 'MAE': mae}, ignore_index=True)

    # Save best model
    joblib.dump(best_model, f'{type(model).__name__}_model.pkl')

# Print results
#print(results_df)

                    Model Best Params           MSE        R2          MAE
0   DecisionTreeRegressor     poisson  3.150054e+07  0.759436  4007.284149
1   DecisionTreeRegressor           4  3.150054e+07  0.759436  4007.284149
2   DecisionTreeRegressor           8  3.150054e+07  0.759436  4007.284149
3   DecisionTreeRegressor           2  3.150054e+07  0.759436  4007.284149
4        BaggingRegressor         1.0  5.209281e+06  0.960218  1505.883353
5        BaggingRegressor         0.7  5.209281e+06  0.960218  1505.883353
6        BaggingRegressor        50.0  5.209281e+06  0.960218  1505.883353
7   RandomForestRegressor     poisson  1.315890e+07  0.899508  2535.768768
8   RandomForestRegressor           8  1.315890e+07  0.899508  2535.768768
9   RandomForestRegressor        sqrt  1.315890e+07  0.899508  2535.768768
10  RandomForestRegressor         100  1.315890e+07  0.899508  2535.768768


In [21]:
results_df.style.highlight_max(subset=["R2"], axis=0)

Unnamed: 0,Model,Best Params,MSE,R2,MAE
0,DecisionTreeRegressor,poisson,31500543.075444,0.759436,4007.284149
1,DecisionTreeRegressor,4,31500543.075444,0.759436,4007.284149
2,DecisionTreeRegressor,8,31500543.075444,0.759436,4007.284149
3,DecisionTreeRegressor,2,31500543.075444,0.759436,4007.284149
4,BaggingRegressor,1.000000,5209281.118136,0.960218,1505.883353
5,BaggingRegressor,0.700000,5209281.118136,0.960218,1505.883353
6,BaggingRegressor,50.000000,5209281.118136,0.960218,1505.883353
7,RandomForestRegressor,poisson,13158904.960952,0.899508,2535.768768
8,RandomForestRegressor,8,13158904.960952,0.899508,2535.768768
9,RandomForestRegressor,sqrt,13158904.960952,0.899508,2535.768768


## Conclusion

After looking at the table above we can infer that BaggingRegressor is the best model so we will use it for predictions

The parameters we will use are : {'max_features': 1.0, 'max_samples': 0.7, 'n_estimators': 50}