# Uber Fares Dataset - Fine-Tunning Our Models  
In this fifth notebook, our aim is to fine tune our three best models used in the earlier notebook. After tunned them, we will have the best set of parameters for each model and, after that, we are ready to test them on the test set.

## Imports 

In [1]:
# basic libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

# scikit-learn libraries 
from sklearn.model_selection import RandomizedSearchCV # gridsearch CV 
from sklearn.ensemble import RandomForestRegressor # random forest regression 
from sklearn.metrics import mean_squared_error # mean squared error is the metric to be used 

# xgboost and lightgbm 
import xgboost as xgb 
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

# joblib and pickle to save models
import joblib

## Loading the Data and Models

In [2]:
root_path = '../../uber-fares-prediction/data/processed/'

# prepared training set 
X_train_prepared = (
    pd.read_csv(root_path + 'uber_prepared_train_set.csv')
)

# prepared validation set 
X_test_prepared = (
    pd.read_csv(root_path + 'uber_prepared_validation_set.csv')
)

# target validation set 
y_train = (
    pd.read_csv(root_path + 'uber_validation_target.csv')
)

In [3]:
# converting into an array
y_train = np.ravel(y_train)

In [4]:
# Random Forest Regression 
random_forest_reg = joblib.load('../models/interim/random_forest_regression.pkl')

In [5]:
# XGBoost Regression 
xgboost_reg = joblib.load('../models/interim/xgboost_regression.pkl')

In [6]:
# LGBM Regression 
lgbm_reg = joblib.load('../models/interim/lgbm_regression.pkl')

## Randomized Search Parameters 

### Random Forest Regression

Let's first search for the best set of hyperparameters for the Random Forest model using a Randomized Search CV. We will define a large set of parameters to be searched on and we expect that the results give the best metric in comparison to the earlier vanilla model to us.

#### Defining a large set of hyperparameter grid for Random Forest

In [20]:
# defining the parameter grid for Randomized Search CV
param_grid = {
    'n_estimators': [None] + list(np.random.randint(100, 300, 50)),  # Number of trees in the forest
    'max_features': [1.0, 'sqrt', 'log2'],  # Number of features to consider at every split
    'max_depth': [None] + list(np.random.randint(5, 25, 5)),  # Maximum depth of the tree
    'min_samples_split': np.random.randint(2, 11, 10),  # Minimum number of samples required to split an internal node
    'min_samples_leaf': np.random.randint(1, 11, 10),  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # Method of selecting samples for training each tree
}

In [21]:
# performing randomized search cv 
random_search_random_forest_reg = RandomizedSearchCV(
    random_forest_reg,
    param_distributions=param_grid,
    n_iter=60,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42
)

In [22]:
# fit in our training data 
random_search_random_forest_reg.fit(X_train_prepared, y_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dty

Now, let us display the best results in a dataframe: 

In [33]:
# displaying the results in a dataframe
results_random_forest_df = pd.DataFrame(random_search_random_forest_reg.cv_results_)[['params', 'mean_test_score', 'std_test_score']]
results_random_forest_df['rmse'] = np.sqrt(-results_random_forest_df['mean_test_score'])
results_random_forest_df = results_random_forest_df.sort_values(by='rmse', ascending=True).reset_index(drop=True)

In [34]:
# showing the ten best results 
results_random_forest_df.head(10)

Unnamed: 0,params,mean_test_score,std_test_score,rmse
0,"{'n_estimators': 179, 'min_samples_split': 5, ...",-17.754965,4.130691,4.213664
1,"{'n_estimators': 266, 'min_samples_split': 6, ...",-17.843136,4.183933,4.224114
2,"{'n_estimators': 189, 'min_samples_split': 3, ...",-17.852615,4.115142,4.225235
3,"{'n_estimators': 117, 'min_samples_split': 6, ...",-17.907727,4.13455,4.231752
4,"{'n_estimators': 142, 'min_samples_split': 3, ...",-17.917168,4.157494,4.232868
5,"{'n_estimators': 142, 'min_samples_split': 8, ...",-17.944253,4.152275,4.236066
6,"{'n_estimators': 168, 'min_samples_split': 3, ...",-17.99663,4.173543,4.242244
7,"{'n_estimators': 110, 'min_samples_split': 2, ...",-18.023534,4.219607,4.245413
8,"{'n_estimators': 159, 'min_samples_split': 2, ...",-18.058879,4.225692,4.249574
9,"{'n_estimators': 165, 'min_samples_split': 3, ...",-18.147698,4.141965,4.260012


Let's see what were the selected sets of parameters for the five best results:

In [35]:
# creating a dictionary with the five best results 
dict_best_results = {}
key_list = ['first', 'second', 'third', 'fourth', 'fifth']
for i, key in zip(range(5), key_list):
    dict_best_results[key] = results_random_forest_df['params'][i]

In [36]:
dict_best_results

{'first': {'n_estimators': 179,
  'min_samples_split': 5,
  'min_samples_leaf': 2,
  'max_features': 'sqrt',
  'max_depth': 22,
  'bootstrap': False},
 'second': {'n_estimators': 266,
  'min_samples_split': 6,
  'min_samples_leaf': 2,
  'max_features': 'log2',
  'max_depth': 18,
  'bootstrap': False},
 'third': {'n_estimators': 189,
  'min_samples_split': 3,
  'min_samples_leaf': 4,
  'max_features': 'log2',
  'max_depth': None,
  'bootstrap': False},
 'fourth': {'n_estimators': 117,
  'min_samples_split': 6,
  'min_samples_leaf': 2,
  'max_features': 'log2',
  'max_depth': 22,
  'bootstrap': True},
 'fifth': {'n_estimators': 142,
  'min_samples_split': 3,
  'min_samples_leaf': 2,
  'max_features': 'sqrt',
  'max_depth': 17,
  'bootstrap': False}}

Let us save the best model - which is the Random Forest Regression model trained using the best set of parameters that we have obtained.

In [37]:
# getting the best set of parameters 
best_params_random_forest = random_search_random_forest_reg.best_params_

In [38]:
# initializing a random forest regression model using these set of parameters 
best_model_random_forest =  RandomForestRegressor(**best_params_random_forest)

In [30]:
# training the best model on the training data 
best_model_random_forest.fit(X_train_prepared, y_train)

In [31]:
# saving the best model 
best_model_random_forest_path = '../models/interim/best_model_random_forest.pkl'
joblib.dump(best_model_random_forest, best_model_random_forest_path)

['../models/interim/best_model_random_forest.pkl']

### XGBoost Regression 

Now we will repeat the process for the XGBoost Regression model, i.e., we will define a large set of hyperparameters and use the Randomized Search CV to find the best set. 

#### Defining a large set of hyperparameter grid for XGBoost

In [43]:
# defining the parameter grid for XGBoost Regression
param_grid = {
    'learning_rate': [None] + list(np.linspace(0.01, 0.3, 10)),
    'n_estimators': np.random.randint(100, 300, 50),
    'max_depth': [None] + list(np.random.randint(5, 25, 5)),
    'subsample': [None] + list(np.linspace(0.5, 1.0, 6)),
    'colsample_bytree': [None] + list(np.linspace(0.5, 1.0, 6)),
}

In [44]:
# performing randomized search cv for xgboost
random_search_xgboost_reg = RandomizedSearchCV(
    xgboost_reg,
    param_distributions=param_grid,
    n_iter=60,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42
)

In [45]:
# fit in our training data 
random_search_xgboost_reg.fit(X_train_prepared, y_train)

  if is_sparse(dtype):
  if is_sparse(dtype):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  elif is_categorical_dtype(dtype) and enable_categorical:
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_catego

Now, let us display the best results in a dataframe: 

In [46]:
# displaying the results in a dataframe
results_xgboost_reg_df = pd.DataFrame(random_search_xgboost_reg.cv_results_)[['params', 'mean_test_score', 'std_test_score']]
results_xgboost_reg_df['rmse'] = np.sqrt(-results_xgboost_reg_df['mean_test_score'])
results_xgboost_reg_df = results_xgboost_reg_df.sort_values(by='rmse', ascending=True).reset_index(drop=True)

In [47]:
# showing the ten best results 
results_xgboost_reg_df.head(10)

Unnamed: 0,params,mean_test_score,std_test_score,rmse
0,"{'subsample': None, 'n_estimators': 144, 'max_...",-17.596144,4.233402,4.194776
1,"{'subsample': 1.0, 'n_estimators': 171, 'max_d...",-17.693469,4.263764,4.206361
2,"{'subsample': 0.9, 'n_estimators': 180, 'max_d...",-17.857323,4.211055,4.225793
3,"{'subsample': None, 'n_estimators': 109, 'max_...",-17.872571,4.206534,4.227596
4,"{'subsample': 0.8, 'n_estimators': 248, 'max_d...",-17.895121,4.219283,4.230262
5,"{'subsample': 0.8, 'n_estimators': 169, 'max_d...",-17.926677,4.073916,4.233991
6,"{'subsample': 0.8, 'n_estimators': 152, 'max_d...",-17.936296,4.192941,4.235126
7,"{'subsample': 0.9, 'n_estimators': 249, 'max_d...",-18.047197,4.053039,4.248199
8,"{'subsample': 0.8, 'n_estimators': 101, 'max_d...",-18.051254,4.206613,4.248677
9,"{'subsample': None, 'n_estimators': 289, 'max_...",-18.08059,4.258816,4.252128


Let's see what were the selected sets of parameters for the five best results:

In [48]:
# creating a dictionary with the five best results 
dict_best_results = {}
key_list = ['first', 'second', 'third', 'fourth', 'fifth']
for i, key in zip(range(5), key_list):
    dict_best_results[key] = results_xgboost_reg_df['params'][i]

In [49]:
dict_best_results

{'first': {'subsample': None,
  'n_estimators': 144,
  'max_depth': None,
  'learning_rate': 0.10666666666666666,
  'colsample_bytree': 0.9},
 'second': {'subsample': 1.0,
  'n_estimators': 171,
  'max_depth': None,
  'learning_rate': 0.10666666666666666,
  'colsample_bytree': 0.6},
 'third': {'subsample': 0.9,
  'n_estimators': 180,
  'max_depth': 6,
  'learning_rate': 0.1711111111111111,
  'colsample_bytree': 0.9},
 'fourth': {'subsample': None,
  'n_estimators': 109,
  'max_depth': 6,
  'learning_rate': 0.07444444444444444,
  'colsample_bytree': 1.0},
 'fifth': {'subsample': 0.8,
  'n_estimators': 248,
  'max_depth': 6,
  'learning_rate': 0.07444444444444444,
  'colsample_bytree': 0.9}}

Let us save the best model - which is the XGBoost Regression model trained using the best set of parameters that we have obtained.

In [50]:
# getting the best set of parameters 
best_params_xgboost_reg = random_search_xgboost_reg.best_params_

In [53]:
# initializing a random forest regression model using these set of parameters 
best_model_xgboost_reg =  xgb.XGBRegressor(**best_params_xgboost_reg)

In [54]:
# training the best model on the training data 
best_model_xgboost_reg.fit(X_train_prepared, y_train)

In [55]:
# saving the best model 
best_model_xgboost_reg_path = '../models/interim/best_model_xgboost_reg.pkl'
joblib.dump(best_model_xgboost_reg, best_model_xgboost_reg_path)

['../models/interim/best_model_xgboost_reg.pkl']

### LightGBM Regression

Finally, we will repeat the process for the LGBM model. Again, one defines a large set of hyperparameters and realize a Randomized Search CV to search the best set of parameters. 

#### Defining a large set of hyperparameter grid for LGBM

In [58]:
# defining the parameter grid for LGBM Regression
param_grid = {
    'n_estimators': np.random.randint(100, 300, 50), 
    'learning_rate': [None] + list(np.linspace(0.01, 0.3, 10)),
    'max_depth': [None] + list(np.random.randint(3, 10, 7)), 
    'subsample': np.linspace(0.5, 1.0, 10), 
    'colsample_bytree': np.linspace(0.5, 1.0, 10), 
    'reg_alpha': np.logspace(-3, 3, 10),  # Logarithmic space between 0.001 and 1000
    'reg_lambda': np.logspace(-3, 3, 10)  # Logarithmic space between 0.001 and 1000
}

In [59]:
# performing randomized search cv for xgboost
random_search_lgbm_reg = RandomizedSearchCV(
    lgbm_reg,
    param_distributions=param_grid,
    n_iter=60,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42
)

In [60]:
# fit in our training data 
random_search_lgbm_reg.fit(X_train_prepared, y_train)

Now, let us display the best results in a dataframe: 

In [61]:
# displaying the results in a dataframe
results_lgbm_reg_df = pd.DataFrame(random_search_lgbm_reg.cv_results_)[['params', 'mean_test_score', 'std_test_score']]
results_lgbm_reg_df['rmse'] = np.sqrt(-results_lgbm_reg_df['mean_test_score'])
results_lgbm_reg_df = results_lgbm_reg_df.sort_values(by='rmse', ascending=True).reset_index(drop=True)

In [62]:
# showing the ten best results 
results_lgbm_reg_df.head(10)

Unnamed: 0,params,mean_test_score,std_test_score,rmse
0,"{'subsample': 0.5, 'reg_lambda': 46.4158883361...",-18.252741,4.347122,4.272323
1,"{'subsample': 0.7222222222222222, 'reg_lambda'...",-18.292071,4.227597,4.276923
2,"{'subsample': 0.9444444444444444, 'reg_lambda'...",-18.322519,4.314351,4.280481
3,"{'subsample': 0.8333333333333333, 'reg_lambda'...",-18.394858,4.216225,4.288923
4,"{'subsample': 1.0, 'reg_lambda': 10.0, 'reg_al...",-18.444379,4.188824,4.294692
5,"{'subsample': 0.7222222222222222, 'reg_lambda'...",-18.461958,4.226941,4.296738
6,"{'subsample': 0.5555555555555556, 'reg_lambda'...",-18.492653,4.149495,4.300308
7,"{'subsample': 0.9444444444444444, 'reg_lambda'...",-18.499837,4.162446,4.301144
8,"{'subsample': 0.8888888888888888, 'reg_lambda'...",-18.538816,4.25497,4.305672
9,"{'subsample': 0.6111111111111112, 'reg_lambda'...",-18.543537,4.232943,4.306221


Let's see what were the selected sets of parameters for the five best results:

In [63]:
# creating a dictionary with the five best results 
dict_best_results = {}
key_list = ['first', 'second', 'third', 'fourth', 'fifth']
for i, key in zip(range(5), key_list):
    dict_best_results[key] = results_lgbm_reg_df['params'][i]

In [64]:
dict_best_results

{'first': {'subsample': 0.5,
  'reg_lambda': 46.41588833612773,
  'reg_alpha': 215.44346900318823,
  'n_estimators': 296,
  'max_depth': 8,
  'learning_rate': 0.2677777777777778,
  'colsample_bytree': 0.6666666666666666},
 'second': {'subsample': 0.7222222222222222,
  'reg_lambda': 2.154434690031882,
  'reg_alpha': 215.44346900318823,
  'n_estimators': 261,
  'max_depth': 8,
  'learning_rate': None,
  'colsample_bytree': 0.6666666666666666},
 'third': {'subsample': 0.9444444444444444,
  'reg_lambda': 10.0,
  'reg_alpha': 46.41588833612773,
  'n_estimators': 213,
  'max_depth': None,
  'learning_rate': 0.1711111111111111,
  'colsample_bytree': 0.8888888888888888},
 'fourth': {'subsample': 0.8333333333333333,
  'reg_lambda': 10.0,
  'reg_alpha': 0.1,
  'n_estimators': 243,
  'max_depth': 5,
  'learning_rate': 0.1711111111111111,
  'colsample_bytree': 0.6111111111111112},
 'fifth': {'subsample': 1.0,
  'reg_lambda': 10.0,
  'reg_alpha': 46.41588833612773,
  'n_estimators': 259,
  'max_dep

Let us save the best model - which is the XGBoost Regression model trained using the best set of parameters that we have obtained.

In [65]:
# getting the best set of parameters 
best_params_lgbm_reg = random_search_lgbm_reg.best_params_

In [66]:
# initializing a random forest regression model using these set of parameters 
best_model_lgbm_reg =  xgb.XGBRegressor(**best_params_lgbm_reg)

In [67]:
# training the best model on the training data 
best_model_lgbm_reg.fit(X_train_prepared, y_train)

In [68]:
# saving the best model 
best_model_lgbm_reg_path = '../models/interim/best_model_lgbm_reg.pkl'
joblib.dump(best_model_lgbm_reg, best_model_lgbm_reg_path)

['../models/interim/best_model_lgbm_reg.pkl']

The next step will be apply these best models in our test set and explore the final results. 