In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

pd.set_option('display.max_columns', None)

In [2]:
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.stattools import adfuller

from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pmdarima as pm
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae, mean_absolute_percentage_error as mape
import time

import warnings

from statsmodels.tools.sm_exceptions import ValueWarning
warnings.filterwarnings("ignore", category=ValueWarning)

from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score

# Loading Datasets

In [4]:
# Load the final time-series dataset from the artifacts directory and
# display its (rows, columns) shape as a quick sanity check.
df_ts_final = pd.read_csv('./data_artifacts/df_ts_final.csv')
df_ts_final.shape

(37422, 13)

In [5]:
# Load the pickled time-series splits; later cells index this dict as
# [ps_idx]['train'] / [ps_idx]['test'].
# NOTE(review): pickle.load can execute arbitrary code - only load artifacts produced by this project.
with open('./data_artifacts/ts_train_test_dict.pkl', 'rb') as f:
    ts_train_test_splitted_dict = pickle.load(f)

In [6]:
# Load the two pickled regression datasets (v1 and v2); both are indexed as
# [ps_idx]['train'] / [ps_idx]['test'] by the cells below.
# NOTE(review): pickle.load can execute arbitrary code - only load artifacts produced by this project.
with open('./data_artifacts/reg_v1_train_test_dict.pkl', 'rb') as f:
    reg_final_v1_train_test_dict = pickle.load(f)
    
with open('./data_artifacts/reg_v2_train_test_dict.pkl', 'rb') as f:
    reg_final_v2_train_test_dict = pickle.load(f)

In [7]:
# Dropping 'Capacity' columns
def drop_cols(reg_train_test_dict, col):
    """Return a new train/test dict in which column `col` has been removed.

    Parameters
    ----------
    reg_train_test_dict : dict
        Mapping ``ps_idx -> {'train': DataFrame, 'test': DataFrame}``.
    col : str
        Name of the column to drop from every train and test frame.

    Returns
    -------
    dict
        Same keys as the input; each value holds only the 'train' and 'test'
        frames, without `col`. The input frames are left untouched because
        ``DataFrame.drop`` returns a new frame.
    """
    return {
        ps_idx: {
            'train': splits['train'].drop(columns=[col]),
            'test': splits['test'].drop(columns=[col]),
        }
        for ps_idx, splits in reg_train_test_dict.items()
    }
    

# Build copies of both regression datasets (v1 and v2) without the 'Capacity' column.
reg_final_v1_mod_train_test_dict = drop_cols(reg_train_test_dict=reg_final_v1_train_test_dict, col='Capacity')
reg_final_v2_mod_train_test_dict = drop_cols(reg_train_test_dict=reg_final_v2_train_test_dict, col='Capacity') 
    
# Sanity check: train/test shapes for parking space 1 in each dataset variant.
print(reg_final_v1_mod_train_test_dict[1]['train'].shape)
print(reg_final_v1_mod_train_test_dict[1]['test'].shape)
print(reg_final_v2_mod_train_test_dict[1]['train'].shape)
print(reg_final_v2_mod_train_test_dict[1]['test'].shape)

(1260, 14)
(126, 14)
(1260, 15)
(126, 15)


# Time based Train-Test Split across All Parking Spaces

In [8]:
# ts_train_test_splitted_dict.keys()

In [9]:
# Parking space 1: training portion of the Occupancy_Rate series (indexed by TimeStamp)
ts_train_test_splitted_dict[1]['train']

TimeStamp
2016-10-04 08:00:00    10.5719
2016-10-04 08:30:00    11.0919
2016-10-04 09:00:00    13.8648
2016-10-04 09:30:00    18.5442
2016-10-04 10:00:00    25.9965
                        ...   
2016-12-12 14:30:00    28.5962
2016-12-12 15:00:00    27.5563
2016-12-12 15:30:00    22.7036
2016-12-12 16:00:00    19.9307
2016-12-12 16:30:00    16.4645
Name: Occupancy_Rate, Length: 1260, dtype: float64

In [10]:
# Parking space 1: held-out test portion of the Occupancy_Rate series (indexed by TimeStamp)
ts_train_test_splitted_dict[1]['test']

TimeStamp
2016-12-13 08:00:00     2.4263
2016-12-13 08:30:00     3.4662
2016-12-13 09:00:00     5.7192
2016-12-13 09:30:00    10.2253
2016-12-13 10:00:00    15.7712
                        ...   
2016-12-19 14:30:00    53.5529
2016-12-19 15:00:00    51.9931
2016-12-19 15:30:00    47.4870
2016-12-19 16:00:00    39.8614
2016-12-19 16:30:00    33.4489
Name: Occupancy_Rate, Length: 126, dtype: float64

# Helper Functions

## Evaluation Metric:

In [11]:
# Creating a function to print values of all these metrics.
def ts_performance_metrics(actual, predicted):
    """Print a short forecasting report and return the two error metrics.

    Parameters
    ----------
    actual : array-like
        Observed values.
    predicted : array-like
        Forecast values aligned with `actual`.

    Returns
    -------
    tuple
        ``(mae, rmse)``, each rounded to 3 decimals.
    """
    rule = '-'*50
    # Header is printed before the metrics are computed, as a visual section marker.
    print(rule, 'Forecasting Metrics', rule, sep='\n')

    metric_mae = round(mae(actual, predicted), 3)
    # RMSE is obtained as the square root of the mean squared error.
    metric_rmse = round(mse(actual, predicted)**0.5, 3)

    for label, value in (('MAE :', metric_mae), ('RMSE :', metric_rmse)):
        print(label, value)

    return metric_mae, metric_rmse

# HyperParameter Tuning: Exponential Smoothing (TES)

In [12]:
def cross_val_exp_smooth(train_data, cv_folds, alpha_vals, beta_vals, gamma_vals, season_period):
    """Grid-search the smoothing parameters of an additive Holt-Winters model.

    Every (alpha, beta, gamma) combination is scored with an expanding-window
    ``TimeSeriesSplit`` on `train_data`; the combination with the lowest mean
    validation RMSE is returned.

    Parameters
    ----------
    train_data : pandas.Series
        Training series; sliced positionally with ``.iloc``.
    cv_folds : int
        Number of splits passed to ``TimeSeriesSplit``.
    alpha_vals, beta_vals, gamma_vals : iterable of float
        Candidate values for ``smoothing_level``, ``smoothing_trend`` and
        ``smoothing_seasonal`` respectively.
    season_period : int
        ``seasonal_periods`` of the model.

    Returns
    -------
    dict
        Keys 'alpha', 'beta', 'gamma', 'cv_rmse', 'cv_mae' for the best
        combination. 'cv_rmse' / 'cv_mae' are NaN if no fold of that
        combination produced a usable forecast (such rows sort last).
    """
    dict_hyp = {'alpha':[], 'beta':[], 'gamma':[], 'cv_rmse':[], 'cv_mae':[]}

    # The fold boundaries depend only on the length of the series, so compute
    # them once instead of once per hyper-parameter combination.
    tscv = TimeSeriesSplit(n_splits=cv_folds)
    fold_indices = list(tscv.split(train_data))

    for alpha in alpha_vals:
        for beta in beta_vals:
            for gamma in gamma_vals:

                cv_rmse_lst = []
                cv_mae_lst = []

                for train_index, val_index in fold_indices:
                    train_fold = train_data.iloc[train_index]
                    val_fold = train_data.iloc[val_index]

                    # Fit the model with the current (alpha, beta, gamma) held fixed
                    model = ExponentialSmoothing(train_fold, trend='add', seasonal='add',
                                                 seasonal_periods=season_period).fit(smoothing_level=alpha,
                                                                                     smoothing_trend=beta,
                                                                                     smoothing_seasonal=gamma)

                    # Forecast over the validation window
                    forecast = model.forecast(steps=len(val_fold))

                    # A forecast containing any NaN cannot be scored (the metric
                    # functions reject NaN input), so the fold is skipped.
                    if np.isnan(forecast).any():
                        continue

                    cv_rmse_lst.append(round(mse(val_fold, forecast)**0.5, 3))
                    cv_mae_lst.append(round(mae(val_fold, forecast), 3))

                # Average validation error for the current combination; NaN when
                # every fold was skipped (avoids np.mean([]) and its RuntimeWarning).
                dict_hyp['alpha'].append(alpha)
                dict_hyp['beta'].append(beta)
                dict_hyp['gamma'].append(gamma)
                dict_hyp['cv_rmse'].append(round(np.mean(cv_rmse_lst), 3) if cv_rmse_lst else np.nan)
                dict_hyp['cv_mae'].append(round(np.mean(cv_mae_lst), 3) if cv_mae_lst else np.nan)

    # Lowest RMSE first; sort_values places NaN rows last by default
    df_hyp = pd.DataFrame(dict_hyp).sort_values(by='cv_rmse')
    df_hyp_best_combo_dict = df_hyp.iloc[0].to_dict()
    print(f'Best parameters found: {df_hyp_best_combo_dict}')

    return df_hyp_best_combo_dict
    
    

In [13]:
def tune_exp_smooth_all_park_lots(train_test_splitted_dict, cv, alpha_values, beta_values, gamma_values,  seasonal_period):
    """Tune Holt-Winters smoothing parameters for every parking space.

    Parameters
    ----------
    train_test_splitted_dict : dict
        Mapping ``ps_idx -> {'train': Series, 'test': Series}``. This argument
        is the data source (previously a notebook-level global was read instead).
    cv : int
        Number of time-series CV folds forwarded to `cross_val_exp_smooth`.
    alpha_values, beta_values, gamma_values : iterable of float or None
        Candidate grids. The supplied grids are used as-is (previously they were
        overwritten inside the loop); ``None`` selects the former hard-coded grid
        ``np.round(np.arange(0, 0.9, 0.15), 2)``.
    seasonal_period : int
        Seasonal period forwarded to `cross_val_exp_smooth`.

    Returns
    -------
    pandas.DataFrame
        One row per parking space with columns
        'ps_idx', 'alpha', 'beta', 'gamma', 'cv_rmse', 'cv_mae'.
    """
    # Fallback grid, identical to the values that used to be hard-coded in the loop
    default_grid = np.round(np.arange(0, 0.9, 0.15), 2)
    alpha_values = default_grid if alpha_values is None else alpha_values
    beta_values = default_grid if beta_values is None else beta_values
    gamma_values = default_grid if gamma_values is None else gamma_values

    tuned_exp_smooth_models_dict = {'ps_idx':[], 'alpha':[], 'beta':[], 'gamma':[],
                                    'cv_rmse': [], 'cv_mae': []}

    for ps_idx, splits in train_test_splitted_dict.items():

        print(f'Finding Best Parameters for Park_Space_ID: {ps_idx}')
        print('-'*50)
        train_series = splits['train']
        test_series = splits['test']
        print(f'Train series length: {train_series.shape[0]},  Test series length: {test_series.shape[0]}')

        # Start Tuning
        print('Finding best parameters start')
        start_time = time.time()

        tes_hyp_dict = cross_val_exp_smooth(train_data=train_series,
                                            cv_folds=cv,
                                            alpha_vals=alpha_values,
                                            beta_vals=beta_values,
                                            gamma_vals=gamma_values,
                                            season_period=seasonal_period)

        time_taken = time.time() - start_time
        print('Finding best parameters end')
        # End Tuning

        # Storing the results
        tuned_exp_smooth_models_dict['ps_idx'].append(ps_idx)
        for key in ('alpha', 'beta', 'gamma', 'cv_rmse', 'cv_mae'):
            tuned_exp_smooth_models_dict[key].append(tes_hyp_dict[key])

        print(('-'*50)+f'Completed, Time Taken: {round(time_taken, 2)} '+('-'*50))

    return pd.DataFrame(tuned_exp_smooth_models_dict)


In [14]:
# # Define parameter search space
# alpha_vals = np.round(np.arange(0, 0.9, 0.15), 2)  # Alpha values from 0 to 0.9
# beta_vals = np.round(np.arange(0, 0.9, 0.15), 2)  # Beta values from 0 to 0.9
# gamma_vals = np.round(np.arange(0, 0.9, 0.15), 2)  # Gamma values from 0 to 0.9

# df_hyp_tuned_exp_smooth_models = tune_exp_smooth_all_park_lots(train_test_splitted_dict=ts_train_test_splitted_dict, cv=4, 
#                                                                alpha_values=alpha_vals, 
#                                                                beta_values=beta_vals, 
#                                                                gamma_values=gamma_vals, 
#                                                                seasonal_period=126)

In [15]:
# df_hyp_tuned_exp_smooth_models.head()

In [16]:
# df_hyp_tuned_exp_smooth_models.cv_rmse.describe()

In [17]:
# # Saving best parameters for the exp_smoothing_models
# df_hyp_tuned_exp_smooth_models.to_csv(path_or_buf='./data_artifacts/df_hyp_tuned_params_tes_models.csv', index=False)

# HyperParameter Tuning: SARIMA (using Auto-ARIMA)

In [18]:
def tune_SARIMAX_v2(train_data, 
                    p_max, q_max, P_max, Q_max, s_val,   
                    random_srch=False, random_srch_fits=10, random_state=None):
    """Select a seasonal ARIMA model with pmdarima's ``auto_arima``.

    Parameters
    ----------
    train_data : array-like
        Training series.
    p_max, q_max : int
        Upper bounds of the non-seasonal AR / MA orders (search starts at 0).
    P_max, Q_max : int
        Upper bounds of the seasonal AR / MA orders (search starts at 0).
    s_val : int
        Seasonal period ``m``.
    random_srch : bool, default False
        If True, fit a random sample of `random_srch_fits` candidate orders.
        pmdarima only honours ``random`` (and ``n_jobs``) when ``stepwise`` is
        False, so the stepwise search is disabled in that case. With the
        default False the stepwise search is used, as before.
    random_srch_fits : int, default 10
        Number of models fitted by the random search.
    random_state : int or None, default None
        Seed forwarded to ``auto_arima`` so a random search can be reproduced.

    Returns
    -------
    The fitted model object returned by ``pm.auto_arima``.
    """
    # Stepwise and random search are mutually exclusive in pmdarima: with
    # stepwise=True the `random`/`n_fits` arguments are ignored and n_jobs is
    # forced back to 1 with a warning.
    use_stepwise = not random_srch

    model = pm.auto_arima(train_data,

                          # Parameter space: Non-seasonal
                          start_p=0, max_p=p_max,
                          start_q=0, max_q=q_max,

                          # Parameter space: Seasonal
                          seasonal=True,
                          start_P=0, max_P=P_max,
                          start_Q=0, max_Q=Q_max,
                          m=s_val,

                          # Differencing orders are fixed here (not estimated):
                          # no regular differencing, one seasonal difference
                          stationary=False,
                          d=0, D=1,

                          # Search strategy: stepwise vs. random sample of the grid
                          stepwise=use_stepwise,
                          random=random_srch,
                          random_state=random_state,
                          n_fits=random_srch_fits,
                          n_jobs=1 if use_stepwise else -1,   # parallel fits only apply to non-stepwise search

                          trace=True,                 # print the progress
                          error_action='ignore',      # ignore if a model does not work
                          suppress_warnings=True,     # suppress warnings
                          maxiter = 25
                         )
    return model  

In [19]:
def tune_SARIMAX_all_park_lots(train_test_splitted_dict, SARIMAX_tuner_fn):
    """Tune a SARIMA model per parking space and score it on the test split.

    Parameters
    ----------
    train_test_splitted_dict : dict
        Mapping ``ps_idx -> {'train': Series, 'test': Series}``. This argument
        is the data source (previously a notebook-level global was read instead).
    SARIMAX_tuner_fn : callable
        Tuner called with keyword arguments ``train_data, p_max, q_max, P_max,
        Q_max, s_val, random_srch, random_srch_fits``. Its return value must
        expose ``order``, ``seasonal_order`` and ``predict(n_periods=...)``.

    Returns
    -------
    dict
        Lists keyed by 'ps_idx', 'tuned_model', 'pdq', 'PDQs',
        'forecast_mae', 'forecast_rmse' (one entry per parking space).
    """
    tuned_sarima_models_dict = {'ps_idx':[], 'tuned_model':[], 'pdq':[], 'PDQs':[], 'forecast_mae':[], 'forecast_rmse':[]}

    for ps_idx, splits in train_test_splitted_dict.items():

        print(f'Finding Best Parameters for Park_Space_ID: {ps_idx}')
        print('-'*50)
        train_series = splits['train']
        test_series = splits['test']
        print(f'Train series length: {train_series.shape},  Test series length: {test_series.shape}')

        # Start Tuning
        print('-'*25+'Model Fitting start'+'-'*25)
        start_time = time.time()
        tuned_sarimax_model_with_summary = SARIMAX_tuner_fn(train_data=train_series,
                                                            p_max=1, q_max=1, # Non-Seasonal AR, MA orders set to [0, 1]
                                                            P_max=0, Q_max=0, # Seasonal AR, MA orders set to 0
                                                            s_val=18*7,       # Seasonal Period
                                                            random_srch=True, random_srch_fits=10)
        time_taken = time.time() - start_time
        print('-'*25+'Model Fitting end'+'-'*25)
        # End Tuning

        print(f'Best non-seasonal order: (p,d,q): {tuned_sarimax_model_with_summary.order}')
        print(f'Best seasonal order: (P,D,Q): {tuned_sarimax_model_with_summary.seasonal_order}')

        # Forecasting: re-wrap the predictions on the test index so the result
        # is a Series whether predict() returns an ndarray or a Series.
        raw_forecast = tuned_sarimax_model_with_summary.predict(n_periods=len(test_series))
        tuned_sarimax_model_forecast = pd.Series(np.asarray(raw_forecast), index=test_series.index)
        fr_mae, fr_rmse = ts_performance_metrics(actual=test_series, predicted=tuned_sarimax_model_forecast)

        # Storing the results
        tuned_sarima_models_dict['ps_idx'].append(ps_idx)
        tuned_sarima_models_dict['tuned_model'].append(tuned_sarimax_model_with_summary)
        tuned_sarima_models_dict['pdq'].append(tuned_sarimax_model_with_summary.order)
        tuned_sarima_models_dict['PDQs'].append(tuned_sarimax_model_with_summary.seasonal_order)
        tuned_sarima_models_dict['forecast_mae'].append(fr_mae)
        tuned_sarima_models_dict['forecast_rmse'].append(fr_rmse)

        print(('-'*50)+f'Completed, Time Taken: {round(time_taken, 2)} '+('-'*50))

    return tuned_sarima_models_dict

# hyp_tuned_sarima_models_dict = tune_SARIMAX_all_park_lots(train_test_splitted_dict=ts_train_test_splitted_dict, 
#                                                           SARIMAX_tuner_fn=tune_SARIMAX_v2)

# Helper Functions: Regression Based

In [20]:
# Feature Engineering
def features_lagged(df_inp):
    """Add lagged-target features and a numeric weekend flag to a copy of `df_inp`.

    Parameters
    ----------
    df_inp : pandas.DataFrame
        Must contain the columns 'Occupancy_Rate' and 'isWeekend'.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with columns 'lag_1', 'lag_2', 'lag_3', 'lag_18',
        'lag_19', 'lag_20' appended and 'isWeekend' cast to int. The leading
        rows of each lag column, which have no history, are filled with the
        mean of 'Occupancy_Rate' over the given frame.

    Notes
    -----
    The lag columns are built by direct assignment rather than
    ``df.lag_k.fillna(..., inplace=True)``: in-place fills on an attribute
    access are chained assignment, which does not update the parent frame
    under pandas Copy-on-Write.
    """
    df = df_inp.copy()

    target = df['Occupancy_Rate']
    # Mean imputation value for the rows that have no lagged observation
    target_mean = target.mean()

    # Short lags (1-3) plus lags 18-20
    # NOTE(review): 18-20 presumably correspond to the same time on the previous
    # day (18 half-hourly samples per day) - confirm against the data prep.
    for lag in (1, 2, 3, 18, 19, 20):
        df[f'lag_{lag}'] = target.shift(lag).fillna(target_mean)

    # Encoding Week-Weekend as numeric
    df['isWeekend'] = df['isWeekend'].astype('int')

    return df



# Feature Scaling
def feat_scaler(X_train_inp, X_test_inp):
    """Standardize features using statistics learned from the training set only.

    Parameters
    ----------
    X_train_inp : array-like
        Training features; the scaler is fitted on these.
    X_test_inp : array-like
        Test features; transformed with the training-set scaler.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)``.
    """
    std_scaler = StandardScaler().fit(X_train_inp)
    return std_scaler.transform(X_train_inp), std_scaler.transform(X_test_inp)


# X-y splitter
def preprocess_pipe(df_train, df_test):
    """Split train/test frames into features and target, and scale the features.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing the target column 'Occupancy_Rate' plus features.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_train_scl, X_test_scl, y_train, y_test)`` where
        the ``*_scl`` entries are the outputs of `feat_scaler`.
    """
    target_col = 'Occupancy_Rate'

    X_train, y_train = df_train.drop(columns=[target_col]), df_train[target_col]
    X_test, y_test = df_test.drop(columns=[target_col]), df_test[target_col]

    # Scaler is fitted on the training features and applied to both sets
    X_train_scl, X_test_scl = feat_scaler(X_train_inp=X_train, X_test_inp=X_test)

    return X_train, X_test, X_train_scl, X_test_scl, y_train, y_test


# Generate Regression Report
def get_regression_report(model, df_train, X_train, y_train, X_test, y_test):
    """Fit `model`, print train/test RMSE and return the error metrics.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    df_train : pandas.DataFrame
        Not used by the current implementation.
    X_train, y_train : array-like
        Training features and target.
    X_test, y_test : array-like
        Test features and target.

    Returns
    -------
    numpy.ndarray
        ``[train_rmse, train_mae, test_rmse, test_mae]``, each rounded to 3 decimals.
    """
    rule = '-'*50

    print('REGRESSION REPORT')
    print(rule)

    # Fit, then predict on both splits
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    def _rmse(y_true, y_hat):
        # Root of the mean squared error, rounded for display
        return round(mse(y_true, y_hat)**0.5, 3)

    def _mae(y_true, y_hat):
        return round(mae(y_true, y_hat), 3)

    train_rmse, test_rmse = _rmse(y_train, train_pred), _rmse(y_test, test_pred)
    train_mae, test_mae = _mae(y_train, train_pred), _mae(y_test, test_pred)

    print('Training RMSE:', train_rmse)
    print('Testing RMSE:', test_rmse)
    print(rule)

    return np.array([train_rmse, train_mae, test_rmse, test_mae])


# Plot output post model training
def plot_test_set_org_predict(y_test, y_pred):
    """Plot the actual test series and the predictions on a shared axis.

    Parameters
    ----------
    y_test : pandas.Series
        Actual test values; its index is reused for the predictions.
    y_pred : array-like
        Predicted values, same length as `y_test`.
    """
    # Put the predictions on the test index so both lines share the x-axis
    y_pred_ser = pd.Series(y_pred)
    y_pred_ser.index = y_test.index

    plt.figure(figsize=(20, 4))
    for series, label in ((y_test, 'TestSet-Original'), (y_pred_ser, 'TestSet-Predicted')):
        series.plot(marker='o', label=label)
    plt.legend()
    plt.show()
    
    
    
def get_cross_val_score_summary(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error'):
    """Return the mean cross-validated RMSE of `model`.

    Parameters
    ----------
    model : estimator
        Unfitted estimator passed to ``cross_val_score``.
    X_train, y_train : array-like
        Training features and target.
    cv : int, default 3
        Cross-validation setting forwarded to ``cross_val_score``.
    scoring : str, default 'neg_mean_squared_error'
        Scorer name. The result is only an RMSE when the scorer returns a
        negated squared error, because each fold score is sign-flipped and
        square-rooted.

    Returns
    -------
    float
        Mean over folds of ``sqrt(-score)``.
    """
    neg_fold_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring)

    # Flip the sign to recover the MSE per fold, then take the root
    fold_rmse = (-1 * neg_fold_scores) ** 0.5

    return np.mean(fold_rmse)

# HyperParameter Tuning:

## Random Forest

In [21]:
def hyp_param_tune_rf(X_train_scl, y_train, search_random_state=42):
    """Randomized hyper-parameter search for a RandomForestRegressor.

    Parameters
    ----------
    X_train_scl : array-like
        Scaled training features.
    y_train : array-like
        Training target.
    search_random_state : int or None, default 42
        Seed for the candidate sampling of ``RandomizedSearchCV``. Without it
        a different subset of the grid is drawn on every run, so the selected
        parameters are not reproducible. Pass ``None`` for unseeded sampling.

    Returns
    -------
    dict
        ``best_params_`` of the search (lowest cross-validated MSE).
    """
    rfr_parameters = {
        "n_estimators":[50, 100, 150, 200],
        "max_features":[2, 4, 7, 10, 13],
        "max_depth":[6, 9, 12, 15],
        'criterion' :['squared_error',],
        "ccp_alpha":[0.0001, 0.001, 0.01],
        "random_state":[42]      # seed of the forest itself (single value, so always 42)
    }

    rand_search1 = RandomizedSearchCV(
        estimator = RandomForestRegressor(),
        param_distributions = rfr_parameters,
        n_iter=100,                          # 100 sampled candidates out of the grid above
        scoring = "neg_mean_squared_error",
        n_jobs = -1,
        refit=True,
        cv=3,
        verbose=1,
        random_state=search_random_state     # makes the sampled candidates reproducible
    )

    rand_search1.fit(X_train_scl, y_train)

    return rand_search1.best_params_

In [22]:
# Train Regression Models: Random Forest
def find_best_params_rf(reg_final_train_test_splitted_dict):
    """Tune a Random Forest for every parking space and collect the results.

    For each parking space the features are preprocessed, the best parameters
    are found with `hyp_param_tune_rf`, and a fresh forest built from them is
    scored with 5-fold cross-validation on the scaled training data.

    Parameters
    ----------
    reg_final_train_test_splitted_dict : dict
        Mapping ``ps_idx -> {'train': DataFrame, 'test': DataFrame}``.

    Returns
    -------
    pandas.DataFrame
        One row per parking space with columns 'ps_idx', 'random_state',
        'criterion', 'n_estimators', 'max_features', 'max_depth',
        'ccp_alpha', 'cv_rmse'.
    """
    rf_param_names = ['random_state', 'criterion', 'n_estimators',
                      'max_features', 'max_depth', 'ccp_alpha']
    fitted_models_dict = {col: [] for col in ['ps_idx'] + rf_param_names + ['cv_rmse']}

    for ps_idx, splits in reg_final_train_test_splitted_dict.items():

        print(f'PARK SPACE: {ps_idx}')

        X_train, X_test, X_train_scl, X_test_scl, y_train, y_test = preprocess_pipe(df_train=splits['train'],
                                                                                    df_test=splits['test'])

        # Timing covers the search and the follow-up cross-validation
        start_time = time.time()
        best_params_ps = hyp_param_tune_rf(X_train_scl=X_train_scl, y_train=y_train)

        # Fresh (unfitted) forest configured with the selected parameters
        chosen_params = {name: best_params_ps[name] for name in rf_param_names}
        rfr = RandomForestRegressor(**chosen_params)

        # Mean CV RMSE of the selected configuration
        mean_cv_rmse = get_cross_val_score_summary(model=rfr, X_train=X_train_scl, y_train=y_train,
                                                   cv=5, scoring='neg_mean_squared_error')

        end_time = time.time()
        print('-'*25+f'Completed Tuning: {round(end_time-start_time, 3)}'+'-'*25)

        # Record this parking space's result
        fitted_models_dict['ps_idx'].append(ps_idx)
        for name, value in chosen_params.items():
            fitted_models_dict[name].append(value)
        fitted_models_dict['cv_rmse'].append(mean_cv_rmse)

    return pd.DataFrame(fitted_models_dict)

## XGBoost Regressor

In [23]:
def hyp_param_tune_xgb(X_train_scl, y_train, search_random_state=42):
    """Randomized hyper-parameter search for an XGBRegressor.

    Parameters
    ----------
    X_train_scl : array-like
        Scaled training features.
    y_train : array-like
        Training target.
    search_random_state : int or None, default 42
        Seed for the candidate sampling of ``RandomizedSearchCV``. Without it
        a different subset of the grid is drawn on every run, so the selected
        parameters are not reproducible. Pass ``None`` for unseeded sampling.

    Returns
    -------
    dict
        ``best_params_`` of the search (lowest cross-validated MSE).
    """
    xgbr_parameters = {
        "n_estimators":[50, 100, 150, 200],
        "learning_rate": [0.05, 0.1, 0.2, 0.3],
        "gamma": [0, 0.25, 0.5, 0.75, 1, 1.25],
        "max_depth":[2, 4, 6, 8],
        'subsample': [0.3, 0.5, 0.7, 0.9],
        "colsample_bytree" : [0.3, 0.5, 0.7, 0.9],
        "random_state":[42]      # seed of the booster itself (single value, so always 42)
    }

    rand_search2 = RandomizedSearchCV(
        estimator = xgb.XGBRegressor(),
        param_distributions = xgbr_parameters,
        n_iter=100,                          # 100 sampled candidates out of the grid above
        scoring = "neg_mean_squared_error",
        n_jobs = -1,
        refit=True,
        cv=3,
        verbose=1,
        random_state=search_random_state     # makes the sampled candidates reproducible
    )

    rand_search2.fit(X_train_scl, y_train)

    return rand_search2.best_params_

In [24]:
# Train Regression Models: XGBoost
def find_best_params_xgb(reg_final_train_test_splitted_dict):
    """Tune an XGBoost regressor for every parking space and collect the results.

    For each parking space the features are preprocessed, the best parameters
    are found with `hyp_param_tune_xgb`, and a fresh regressor built from them
    is scored with 5-fold cross-validation on the scaled training data.

    Parameters
    ----------
    reg_final_train_test_splitted_dict : dict
        Mapping ``ps_idx -> {'train': DataFrame, 'test': DataFrame}``.

    Returns
    -------
    pandas.DataFrame
        One row per parking space with columns 'ps_idx', 'random_state',
        'n_estimators', 'learning_rate', 'gamma', 'max_depth', 'subsample',
        'colsample_bytree', 'cv_rmse'.
    """
    xgb_param_names = ['random_state', 'n_estimators', 'learning_rate', 'gamma',
                       'max_depth', 'subsample', 'colsample_bytree']
    fitted_models_dict = {col: [] for col in ['ps_idx'] + xgb_param_names + ['cv_rmse']}

    for ps_idx, splits in reg_final_train_test_splitted_dict.items():

        print(f'PARK SPACE: {ps_idx}')

        X_train, X_test, X_train_scl, X_test_scl, y_train, y_test = preprocess_pipe(df_train=splits['train'],
                                                                                    df_test=splits['test'])

        # Timing covers the search and the follow-up cross-validation
        start_time = time.time()
        best_params_ps = hyp_param_tune_xgb(X_train_scl=X_train_scl, y_train=y_train)

        # Fresh (unfitted) XGBoost regressor configured with the selected parameters
        chosen_params = {name: best_params_ps[name] for name in xgb_param_names}
        xgbr = xgb.XGBRegressor(**chosen_params)

        # Mean CV RMSE of the selected configuration
        mean_cv_rmse = get_cross_val_score_summary(model=xgbr, X_train=X_train_scl, y_train=y_train,
                                                   cv=5, scoring='neg_mean_squared_error')

        end_time = time.time()
        print('-'*25+f'Completed Tuning: {round(end_time-start_time, 3)}'+'-'*25)

        # Record this parking space's result
        fitted_models_dict['ps_idx'].append(ps_idx)
        for name, value in chosen_params.items():
            fitted_models_dict[name].append(value)
        fitted_models_dict['cv_rmse'].append(mean_cv_rmse)

    return pd.DataFrame(fitted_models_dict)

# HyperParameter Tuning: Random Forest Regressor (Without TES O/P)

In [25]:
# Tune a Random Forest per parking space on the v1 regression data ('Capacity' dropped)
df_hyp_tuned_rf_models_v1 = find_best_params_rf(reg_final_train_test_splitted_dict=reg_final_v1_mod_train_test_dict)

PARK SPACE: 1
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 9.287-------------------------
PARK SPACE: 2
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 9.88-------------------------
PARK SPACE: 3
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 9.955-------------------------
PARK SPACE: 4
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 8.861-------------------------
PARK SPACE: 5
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 9.539-------------------------
PARK SPACE: 6
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 8.571-------------------------
PARK SPACE: 7
Fitting 3 folds for each of 100 candidates, totalling 300 fits
------------------

In [26]:
# Preview the tuned Random Forest parameters (v1 data) for the first parking spaces
df_hyp_tuned_rf_models_v1.head()

Unnamed: 0,ps_idx,random_state,criterion,n_estimators,max_features,max_depth,ccp_alpha,cv_rmse
0,1,42,squared_error,100,10,9,0.0001,2.376033
1,2,42,squared_error,200,7,15,0.0001,5.035085
2,3,42,squared_error,150,10,15,0.001,3.339578
3,4,42,squared_error,150,13,9,0.0001,4.219668
4,5,42,squared_error,200,7,15,0.001,4.180338


In [27]:
# Distribution of the 5-fold CV RMSE across parking spaces (Random Forest, v1 data)
df_hyp_tuned_rf_models_v1.cv_rmse.describe()

count    27.000000
mean      2.619489
std       0.928895
min       1.001775
25%       2.080783
50%       2.387356
75%       2.921339
max       5.035085
Name: cv_rmse, dtype: float64

In [28]:
# Saving the best Random Forest parameters (v1 data) per parking space
df_hyp_tuned_rf_models_v1.to_csv(path_or_buf='./data_artifacts/df_hyp_tuned_params_rf_models_v1.csv', index=False)

# HyperParameter Tuning: Random Forest Regressor (With TES O/P)

In [29]:
# Tune a Random Forest per parking space on the v2 regression data ('Capacity' dropped)
df_hyp_tuned_rf_models_v2 = find_best_params_rf(reg_final_train_test_splitted_dict=reg_final_v2_mod_train_test_dict)

PARK SPACE: 1
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 11.146-------------------------
PARK SPACE: 2
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 8.61-------------------------
PARK SPACE: 3
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 8.931-------------------------
PARK SPACE: 4
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 9.722-------------------------
PARK SPACE: 5
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 8.866-------------------------
PARK SPACE: 6
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 10.656-------------------------
PARK SPACE: 7
Fitting 3 folds for each of 100 candidates, totalling 300 fits
----------------

In [30]:
# Preview the tuned Random Forest parameters (v2 data) for the first parking spaces
df_hyp_tuned_rf_models_v2.head()

Unnamed: 0,ps_idx,random_state,criterion,n_estimators,max_features,max_depth,ccp_alpha,cv_rmse
0,1,42,squared_error,150,10,15,0.001,2.078415
1,2,42,squared_error,100,4,12,0.01,4.785509
2,3,42,squared_error,50,7,15,0.001,3.731414
3,4,42,squared_error,200,10,12,0.0001,5.335787
4,5,42,squared_error,200,4,12,0.0001,4.487857


In [31]:
# Distribution of the 5-fold CV RMSE across parking spaces (Random Forest, v2 data)
df_hyp_tuned_rf_models_v2.cv_rmse.describe()

count    27.000000
mean      2.827754
std       1.077564
min       1.290961
25%       2.131648
50%       2.670427
75%       3.382621
max       5.335787
Name: cv_rmse, dtype: float64

In [32]:
# Saving the best Random Forest parameters (v2 data) per parking space
df_hyp_tuned_rf_models_v2.to_csv(path_or_buf='./data_artifacts/df_hyp_tuned_params_rf_models_v2.csv', index=False)

# HyperParameter Tuning: XGBoost Regressor (Without TES O/P)

In [33]:
# Tune an XGBoost regressor per parking space on the v1 regression data ('Capacity' dropped)
df_hyp_tuned_xgb_models_v3 = find_best_params_xgb(reg_final_train_test_splitted_dict=reg_final_v1_mod_train_test_dict)

PARK SPACE: 1
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 1.988-------------------------
PARK SPACE: 2
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 1.566-------------------------
PARK SPACE: 3
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 1.502-------------------------
PARK SPACE: 4
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 1.741-------------------------
PARK SPACE: 5
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 2.638-------------------------
PARK SPACE: 6
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 2.237-------------------------
PARK SPACE: 7
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-----------------

In [34]:
# Preview the tuned XGBoost parameters (v1 data) for the first parking spaces
df_hyp_tuned_xgb_models_v3.head()

Unnamed: 0,ps_idx,random_state,n_estimators,learning_rate,gamma,max_depth,subsample,colsample_bytree,cv_rmse
0,1,42,200,0.3,1.0,2,0.9,0.3,3.056324
1,2,42,200,0.05,1.25,2,0.7,0.7,4.852581
2,3,42,100,0.2,0.5,2,0.7,0.9,4.512991
3,4,42,100,0.1,0.0,4,0.3,0.7,5.094888
4,5,42,200,0.05,1.0,6,0.5,0.7,4.099668


In [35]:
# Distribution of the 5-fold CV RMSE across parking spaces (XGBoost, v1 data)
df_hyp_tuned_xgb_models_v3.cv_rmse.describe()

count    27.000000
mean      2.769192
std       1.009056
min       1.032222
25%       2.103388
50%       2.558400
75%       3.084212
max       5.094888
Name: cv_rmse, dtype: float64

In [36]:
# Saving the best XGBoost parameters (v1 data) per parking space
df_hyp_tuned_xgb_models_v3.to_csv(path_or_buf='./data_artifacts/df_hyp_tuned_params_xgb_models_v3.csv', index=False)

# HyperParameter Tuning: XGBoost Regressor (With TES O/P)

In [37]:
# Tune an XGBoost regressor per parking space on the v2 regression data ('Capacity' dropped)
df_hyp_tuned_xgb_models_v4 = find_best_params_xgb(reg_final_train_test_splitted_dict=reg_final_v2_mod_train_test_dict)

PARK SPACE: 1
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 1.775-------------------------
PARK SPACE: 2
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 2.25-------------------------
PARK SPACE: 3
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 3.028-------------------------
PARK SPACE: 4
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 1.987-------------------------
PARK SPACE: 5
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 2.617-------------------------
PARK SPACE: 6
Fitting 3 folds for each of 100 candidates, totalling 300 fits
-------------------------Completed Tuning: 2.839-------------------------
PARK SPACE: 7
Fitting 3 folds for each of 100 candidates, totalling 300 fits
------------------

In [38]:
# Preview the tuned XGBoost parameters (v2 data) for the first parking spaces
df_hyp_tuned_xgb_models_v4.head()

Unnamed: 0,ps_idx,random_state,n_estimators,learning_rate,gamma,max_depth,subsample,colsample_bytree,cv_rmse
0,1,42,200,0.1,1.0,2,0.9,0.7,2.158876
1,2,42,150,0.05,0.0,6,0.3,0.9,4.965143
2,3,42,200,0.05,1.0,6,0.5,0.7,3.707578
3,4,42,100,0.1,0.5,4,0.7,0.7,5.731208
4,5,42,150,0.1,0.5,6,0.5,0.7,4.462775


In [39]:
# Distribution of the 5-fold CV RMSE across parking spaces (XGBoost, v2 data)
df_hyp_tuned_xgb_models_v4.cv_rmse.describe()

count    27.000000
mean      2.919733
std       1.073855
min       1.380360
25%       2.274084
50%       2.642384
75%       3.431730
max       5.731208
Name: cv_rmse, dtype: float64

In [40]:
# Saving the best XGBoost parameters (v2 data) per parking space
df_hyp_tuned_xgb_models_v4.to_csv(path_or_buf='./data_artifacts/df_hyp_tuned_params_xgb_models_v4.csv', index=False)