# Model Training

In [None]:
!pip install optuna
!pip install category_encoders
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import gc
import os
import statistics
import warnings

import numpy as np
import optuna
import pandas as pd
from category_encoders import OneHotEncoder, TargetEncoder
from google.colab import drive
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# NOTE(review): this silences every warning for the rest of the session,
# which also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Mount Google Drive at ./drive; pass force_remount=True to refresh an existing mount.
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [None]:
def import_data(file):
    """Read a CSV into a DataFrame, using the file's first column as the index.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    return pd.read_csv(file, index_col=0)

In [None]:
# Google Drive folder that holds the project's CSV files
folder = "drive/MyDrive/Colab Notebooks/Autotrader Portfolio"

# Cleaned training set; import_data uses the CSV's first column as the index
df = import_data(f"{folder}/train_clean.csv")

In [None]:
# Define the function that will help us measure our model's performance
def measure_model_performance(pred_validation, y_validation, model, pred_train=None, y_train=None, verbose=True):
    """Compute RMSE and MAPE on validation predictions and optionally print them.

    Args:
        pred_validation: Predicted target values for the validation set.
        y_validation: True target values for the validation set.
        model: Not used by this function; kept so existing call sites keep working.
        pred_train: Optional predictions on the training set. When given (and
            ``verbose`` is truthy) the training metrics are printed as well.
        y_train: True training targets; required whenever ``pred_train`` is given.
        verbose: When truthy, print the metrics.

    Returns:
        Tuple ``(rmse_validation, mape_validation)``: RMSE rounded to 0 decimals
        and MAPE rounded to 3 decimals.

    Raises:
        ValueError: If ``pred_train`` is supplied without ``y_train`` while
            ``verbose`` is truthy.
    """
    def _rmse(y_true, y_pred):
        # Take the square root of the MSE ourselves: the `squared=False` flag of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

    mape_validation = round(mean_absolute_percentage_error(y_validation, pred_validation), 3)
    rmse_validation = round(_rmse(y_validation, pred_validation), 0)

    if verbose:
        # Print train data metrics only if train predictions were supplied
        if pred_train is not None:
            if y_train is None:
                raise ValueError("y_train is required when pred_train is provided")
            print("--TRAIN--")
            print(f"MAPE: {round(mean_absolute_percentage_error(y_train, pred_train),3)}")
            print(f"RMSE: {round(_rmse(y_train, pred_train),0)}")

        print("--VALIDATION--")
        print(f"MAPE: {mape_validation}")
        print(f"RMSE: {rmse_validation}")

    return rmse_validation, mape_validation

# Memory optimization on the dataframe
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the smallest dtype that holds their range.

    The dataframe is modified in place and also returned. Memory usage before
    and after the conversion is printed.

    Only plain NumPy signed-integer, unsigned-integer and floating-point columns
    are converted; each stays within its own family (signed stays signed,
    unsigned stays unsigned, float stays float). Every other dtype (object,
    string, bool, category, datetime, pandas extension dtypes, ...) is left
    untouched.

    Note:
        Floats are narrowed purely on value range, so a column may end up as
        float16/float32 and lose precision. That matches the previous behaviour.

    Args:
        df: DataFrame to optimise.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (nullable ints, category, string, ...) are skipped:
        # they are not NumPy dtypes and may hold values astype() cannot convert.
        if not isinstance(col_type, np.dtype):
            continue

        # Dispatch on the dtype kind rather than on the dtype's name, so that
        # unsigned ints and bools are not mistaken for floats.
        if col_type.kind == 'i':
            candidates, limits_of = signed_types, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits_of = unsigned_types, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits_of = float_types, np.finfo
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Pick the first (narrowest) candidate whose range contains the data.
        # Bounds are inclusive: a value equal to the type's limit still fits.
        # Empty / all-NaN / infinite columns match nothing and stay unchanged.
        for candidate in candidates:
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                if col_type != candidate:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-sized frame so the percentage never divides by zero
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

Memory usage of dataframe is 4.33 MB
Memory usage after optimization is: 3.30 MB
Decreased by 23.9%


In [None]:
# Downcast the dataframe's numeric columns to smaller dtypes.
# reduce_mem_usage prints the memory footprint before and after the conversion.
df = reduce_mem_usage(df)

Memory usage of dataframe is 3.30 MB
Memory usage after optimization is: 3.30 MB
Decreased by 0.0%


## XGBoost - Base model

In [None]:
# Display the training dataframe (rendered as the cell's last expression)
df

Unnamed: 0,make,model,odometer,price,bodytype,trim,year,drivetrain,transmission_manual,province
0,Chevrolet,Silverado 1500,70103,45950,truck,,2019,AWD,0,quebec
1,Chevrolet,Silverado 1500,74951,62200,truck,,2020,AWD,0,ontario
2,Chevrolet,Silverado 1500,71012,32999,truck,Silverado 1500-Lt,2015,AWD,0,ontario
3,Chevrolet,Silverado 1500,118010,38589,truck,,2018,AWD,0,ontario
4,Chevrolet,Silverado 1500,122000,29995,truck,Silverado 1500-2Lt,2015,AWD,0,ontario
...,...,...,...,...,...,...,...,...,...,...
52785,Chevrolet,City Express,101409,26995,minivan,,2015,FWD,0,ontario
52786,Mercedes-Benz,Amg Gle 53,37000,99950,suv,,2021,AWD,0,quebec
52787,Mercedes-Benz,Amg Gle 53,1781,145770,suv,,2022,AWD,0,quebec
52788,Mercedes-Benz,E53 Amg,37258,100995,convertible,,2019,AWD,0,quebec


In [None]:
# Split the frame into the feature matrix and the target (listing price)
X = df.drop(columns=['price'])
y = df['price']

# Object-typed columns are the categorical features. 'make', 'model' and 'trim'
# are target-encoded into numbers inside the pipeline, so they are left out of
# the list that gets one-hot encoded.
cat_features = [
    column
    for column in X.select_dtypes(include=["object"]).columns.to_list()
    if column not in {'make', 'model', 'trim'}
]

# Column-wise preprocessing; columns not named below pass through unchanged
target_encoder = TargetEncoder(cols=['make', 'model', 'trim'], smoothing=1, min_samples_leaf=1)
one_hot_encoder = OneHotEncoder(use_cat_names=True, handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('target_encode', target_encoder, ['make', 'model', 'trim']),
        ('one_hot_encode', one_hot_encoder, cat_features),
    ],
    remainder='passthrough',
)

# Baseline regressor with mostly default hyperparameters
# NOTE(review): tree_method='gpu_hist' is deprecated as of XGBoost 2.0 in favour
# of tree_method='hist' with device='cuda' -- revisit when upgrading.
xgb = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
    learning_rate=0.15,
    n_estimators=2000,
    tree_method='gpu_hist',
)

# Chain preprocessing and the model so both are fitted together
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb),
])

In [None]:
# Shuffled 5-fold cross-validation of the base pipeline, scored with MAPE
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    my_pipeline,
    X,
    y,
    scoring='neg_mean_absolute_percentage_error',
    cv=kf,
    n_jobs=-1,
)

# The scorer returns the negated error, so flip the sign before averaging
mape_scores = [-fold_score for fold_score in scores]
average_mape = round(statistics.mean(mape_scores), 3)
print(f'MAPE mean: {average_mape}')

MAPE mean: 0.077


In [None]:
# Run a full garbage-collection pass; as the cell's last expression, the
# value returned by gc.collect() is displayed as the cell output.
gc.collect()

596

## XGBoost Hyperparameter tuning using Optuna library

In [102]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated MAPE of the full pipeline.

    Builds a target-encoder + one-hot-encoder + XGBoost pipeline from the
    hyperparameters sampled for this trial and evaluates it on the global
    training dataframe ``df``.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        Mean MAPE across the 5 folds, rounded to 4 decimals (lower is better).
    """
    target_encoded_cols = ['make', 'model', 'trim']

    # Prepare the features and target dataframes
    X = df.drop(['price'], axis=1)
    y = df.price

    # Target-encoder hyperparameters
    param_grid = {
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 8),
        "smoothing": trial.suggest_int("smoothing", 1, 8),
        "cols": target_encoded_cols,
    }

    # Identify categorical features, leaving out the columns that the target
    # encoder turns into numerical features inside the pipeline
    cat_features = X.select_dtypes(include=["object"]).columns.to_list()
    cat_features = [ele for ele in cat_features if ele not in set(target_encoded_cols)]

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('target_encode', TargetEncoder(**param_grid), target_encoded_cols),
            ('one_hot_encode', OneHotEncoder(use_cat_names=True, handle_unknown='ignore'), cat_features),
        ], remainder='passthrough')

    # Regressor hyperparameters.
    # * suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    #   and samples from the same log-uniform range.
    # * "booster" replaces the former "model__booster" key: that is Pipeline
    #   set_params syntax, not an XGBRegressor argument ('gbtree' is the default).
    # * early_stopping_rounds is not set: cross_val_score passes no eval_set to
    #   the model, so early stopping has nothing to monitor (recent XGBoost
    #   releases raise when it is requested without a validation set).
    xgb_params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.4, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 3000),
        "max_depth": trial.suggest_int("max_depth", 4, 8),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 35),
        "alpha": trial.suggest_float("alpha", 0.02, 5, log=True),
        "lambda": trial.suggest_float("lambda", 0.01, 0.20, log=True),
        "n_jobs": -1,
        "tree_method": 'gpu_hist',
        "booster": 'gbtree',
        "objective": 'reg:squarederror',
        "random_state": 42,
    }
    xgb = XGBRegressor(**xgb_params)

    # Bundle preprocessing and modeling in a pipeline
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', xgb)
                                  ])

    # 5 Fold Cross validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(my_pipeline, X, y, scoring='neg_mean_absolute_percentage_error', cv=kf, n_jobs=-1)

    # The scorer returns negated MAPE; flip the sign and average over the folds
    mape_scores = list(-scores)
    score = round(statistics.mean(mape_scores), 4)

    return score

In [None]:
# Search for the hyperparameters that minimise the cross-validated MAPE.
# The search stops after 200 trials or 3 hours, whichever comes first.
study = optuna.create_study(direction="minimize")
study.optimize(
    objective,
    n_trials=200,
    timeout=3 * 60 * 60,
    n_jobs=-1,
)

In [104]:
# Report the best trial found by the study
best_trial = study.best_trial
print('Best params:', best_trial.params)
print("Best Score: ", best_trial.value)

Best params: {'min_samples_leaf': 1, 'smoothing': 2, 'learning_rate': 0.14177629544827303, 'n_estimators': 1760, 'max_depth': 5, 'min_child_weight': 2, 'alpha': 0.04666093503628314, 'lambda': 0.1407597525574459}
Best Score:  0.0715


In [105]:
# plot_slice shows the evolution of the search: where in the hyperparameter
# space the search went and which parts of the space were explored more.
optuna.visualization.plot_slice(study)

In [106]:
# Rebuild the pipeline with the best hyperparameters found by the Optuna study
# (values copied, rounded, from the recorded best trial).

# Target-encoder settings
param_grid = {
  'min_samples_leaf': 1,
  'smoothing': 2,
  "cols": ['make','model','trim']
}

# `cat_features` is the list of one-hot-encoded columns defined with the base model
preprocessor = ColumnTransformer(
transformers=[
    ('target_encode', TargetEncoder(**param_grid), ['make','model','trim']),
    ('one_hot_encode', OneHotEncoder(use_cat_names=True, handle_unknown='ignore'), cat_features),
], remainder='passthrough')

# Regressor settings.
# * "booster" replaces the former "model__booster" key, which is Pipeline
#   set_params syntax rather than an XGBRegressor argument ('gbtree' is the default).
# * early_stopping_rounds is not set: neither fit(X, y) nor cross_validate passes
#   an eval_set, so early stopping has nothing to monitor (recent XGBoost
#   releases raise when it is requested without a validation set).
params = {
  'learning_rate': 0.1417,
  'n_estimators': 1760,
  'max_depth': 5,
  'min_child_weight':2,
  'alpha':0.0466,
  'lambda':0.14,
  'subsample': 1,
  'colsample_bytree': 1,
  'tree_method' : 'gpu_hist',
  "booster": 'gbtree',
  "objective" : 'reg:squarederror',
  'random_state': 42
}

xgb = XGBRegressor(**params)

# Bundle preprocessing and model in a pipeline, then fit on the full training set;
# this fitted pipeline is the one used for the predictions further down.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', xgb)
                        ])
my_pipeline.fit(X, y)

# 5 Fold Cross validation (cross_validate is imported in the notebook's import cell)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Both scorers return negated errors; the 'rmse' entry holds negated MSE per fold
scoring = {'rmse': 'neg_mean_squared_error',
           'mape': 'neg_mean_absolute_percentage_error'}
scores = cross_validate(my_pipeline, X, y, scoring=scoring, cv=kf, n_jobs=-1)

# Square root of the mean fold MSE (not the mean of the per-fold RMSEs)
mean_rmse = np.sqrt(pd.Series(scores['test_rmse']).mean()*(-1))
mean_mape = pd.Series(scores['test_mape']).mean()*(-1)

print(f'Avg. RMSE: {mean_rmse}')
print(f'Avg. MAPE: {mean_mape}')

Avg. RMSE: 4813.301846128091
Avg. MAPE: 0.07148179438813994


In [107]:
# Get a prediction for a single, hand-written listing
# NOTE(review): "trim" is the literal string "nan", not a missing value, so the
# encoder sees it as a trim label. Confirm whether a real missing value was intended.
test_dict = pd.DataFrame({"make":"Porsche","model":'Macan', "bodytype": "suv", "odometer":104000, "trim":"nan", 
                          "year":2017, "drivetrain":"AWD", "transmission_manual": 0, "province": "ontario"},index=[1])

# predict() returns a one-element array. Take the element explicitly, because
# calling int() on an array with ndim > 0 is deprecated since NumPy 1.25.
print(f"Prediction: {int(my_pipeline.predict(test_dict)[0])}")

Prediction: 43756


## Model predictions on test data

In [111]:
# Load the hold-out set and renumber its rows from 0
test_df = import_data(f"{folder}/test_clean.csv").reset_index(drop=True)

# Keep only rows without missing values.
# NOTE(review): the recorded output shows 4625 rows left with index labels up to
# 9475, so a large share of the test rows is dropped here. The training frame is
# not filtered this way -- confirm this is intended.
test_df.dropna(inplace=True)

test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4625 entries, 1 to 9475
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   make                 4625 non-null   object
 1   model                4625 non-null   object
 2   odometer             4625 non-null   int64 
 3   price                4625 non-null   int64 
 4   bodytype             4625 non-null   object
 5   trim                 4625 non-null   object
 6   year                 4625 non-null   int64 
 7   drivetrain           4625 non-null   object
 8   province             4625 non-null   object
 9   transmission_manual  4625 non-null   int64 
dtypes: int64(4), object(6)
memory usage: 397.5+ KB


In [112]:
# Prepare the features and target dataframes
X_test = test_df.drop(['price'], axis=1)
y_test = test_df.price

# Run the fitted pipeline (preprocessing + model) on the test features
pred_test = my_pipeline.predict(X_test)

# Evaluation metrics. RMSE is the square root of the MSE: the `squared=False`
# flag of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
mape_test = round(mean_absolute_percentage_error(y_test, pred_test),3)
rmse_test = round(float(np.sqrt(mean_squared_error(y_test, pred_test))),0)

print(f"MAPE: {mape_test}")
print(f"RMSE: {rmse_test}")

MAPE: 0.067
RMSE: 3330.0


### Save predictions to csv file

In [None]:
# Attach the model's predictions as a new column on the test dataframe
test_df['predicted_price'] = pred_test

# Write the test set with its predictions to the project folder on Drive
filepath = "drive/MyDrive/Colab Notebooks/Autotrader Portfolio"
output_csv = filepath + "/pred_test.csv"
test_df.to_csv(output_csv)