## Train Best Model

- input: split data (pre-computed train/test folds)
- get best parameters for best model
- evaluate model
- save best model

In [124]:
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [174]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are converted to NumPy arrays, so lists and pandas objects
    are compared position by position.

    Parameters
    ----------
    y_true : array-like
        Observed values. They are used as the denominator, so a zero entry
        produces an infinite or NaN result.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        ``mean(|(y_true - y_pred) / y_true|) * 100``.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    relative_errors = np.abs((actual - forecast) / actual)
    return np.mean(relative_errors) * 100

def smape(real, predicted):
    """Return the symmetric mean absolute percentage error (SMAPE), in percent.

    Computes ``100 / n * sum(2 * |predicted - real| / (|real| + |predicted|))``.

    Both inputs are converted to NumPy arrays first, mirroring ``mape``.
    This lets plain lists be passed, and makes pandas objects compare
    position by position instead of being aligned on their index labels.

    Parameters
    ----------
    real : array-like
        Observed values.
    predicted : array-like
        Predicted values, same length as ``real``.

    Returns
    -------
    float
        The SMAPE in percent. A position where both values are zero
        contributes 0/0 and therefore makes the result NaN.
    """
    real, predicted = np.array(real), np.array(predicted)
    return 100 / len(real) * np.sum(2 * np.abs(predicted - real) / (np.abs(real) + np.abs(predicted)))

In [217]:
def get_best_model(data_splitted_path_file, param_grid, random_seed=0, save_in=None):
    """Grid-search a RandomForestRegressor over pre-computed folds.

    The pickle at ``data_splitted_path_file`` must contain a mapping
    ``fold_id -> {'X_train', 'X_test', 'y_train', ...}`` of pandas objects.
    The training parts of all folds are concatenated and de-duplicated by
    index label to build the full ``X`` / ``y``. Each fold is then translated
    into positional (train, test) index arrays and handed to ``GridSearchCV``
    as an explicit ``cv`` iterable, so the search is scored on exactly the
    stored splits.

    Every label in a fold's ``X_test`` must also appear in some fold's
    ``X_train``; otherwise the label-to-position lookup raises ``KeyError``.

    Parameters
    ----------
    data_splitted_path_file : str
        Path of the pickle file holding the folds.
    param_grid : dict
        Hyper-parameter grid forwarded to ``GridSearchCV``.
    random_seed : int, default 0
        Seed for NumPy's global RNG and for the forest's ``random_state``.
    save_in : str or None, default None
        If given, the best estimator is pickled to this path and ``X`` / ``y``
        are pickled next to it as ``X.pkl`` / ``y.pkl``. Missing parent
        directories are created.

    Returns
    -------
    tuple
        ``(grid_search, X, y)``: the fitted ``GridSearchCV`` object and the
        de-duplicated feature and target data it was fitted on.
    """
    np.random.seed(random_seed)

    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # point this at files produced by this project's own pipeline.
    with open(data_splitted_path_file, 'rb') as f:
        splitted_data = pickle.load(f)

    folds = list(splitted_data.values())

    # Union of all training rows; a row shared by several folds is kept once.
    X = pd.concat([fold['X_train'] for fold in folds])
    X = X[~X.index.duplicated(keep='first')]

    y = pd.concat([fold['y_train'] for fold in folds])
    y = y[~y.index.duplicated(keep='first')]

    # GridSearchCV wants positional indices, while the folds carry index labels.
    position_of = {label: position for position, label in enumerate(X.index)}

    cv = [(np.array([position_of[label] for label in fold['X_train'].index]),
           np.array([position_of[label] for label in fold['X_test'].index]))
          for fold in folds]

    # Base model whose hyper-parameters are searched.
    rf = RandomForestRegressor(random_state=random_seed)

    grid_search = GridSearchCV(estimator=rf,
                               param_grid=param_grid,
                               cv=cv,
                               n_jobs=-1,
                               verbose=2)

    grid_search.fit(X, y)
    print("Best Parameters are:", grid_search.best_params_)

    if save_in is not None:
        directory = os.path.dirname(save_in)
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        if directory:
            os.makedirs(directory, exist_ok=True)

        with open(save_in, 'wb') as f:
            pickle.dump(grid_search.best_estimator_, f)

        with open(os.path.join(directory, "X.pkl"), 'wb') as f:
            pickle.dump(X, f)

        with open(os.path.join(directory, "y.pkl"), 'wb') as f:
            pickle.dump(y, f)

    return grid_search, X, y

In [218]:
def evaluate(splitted_data, best_grid):
    """Score a fitted model on the train and test part of every fold.

    Parameters
    ----------
    splitted_data : mapping
        ``fold_id -> {'X_train', 'y_train', 'X_test', 'y_test'}``.
    best_grid : object
        Fitted model exposing ``predict``.

    Returns
    -------
    dict
        ``fold_id -> {"MSE", "SMAPE", "MAPE", "MSE-train", "SMAPE-train",
        "MAPE-train"}``. The un-suffixed keys are test scores.
    """
    def _score(observed, predicted):
        # (MSE, SMAPE, MAPE) for one pair of observed/predicted values.
        return (mean_squared_error(observed, predicted),
                smape(observed, predicted),
                mape(observed, predicted))

    metrics = {}
    for i_fold in splitted_data.keys():
        fold = splitted_data[i_fold]

        test_predictions = best_grid.predict(fold['X_test'])
        train_predictions = best_grid.predict(fold['X_train'])

        mse_train, smape_train, mape_train = _score(fold['y_train'], train_predictions)
        mse_test, smape_test, mape_test = _score(fold['y_test'], test_predictions)

        metrics[i_fold] = {
            "MSE": mse_test,
            "SMAPE": smape_test,
            "MAPE": mape_test,
            "MSE-train": mse_train,
            "SMAPE-train": smape_train,
            "MAPE-train": mape_train,
        }

    return metrics

## TO MAIN


In [219]:
# Project root. Defaults to the original developer's checkout; set the
# PROJECT_DIR environment variable to run the notebook from another location.
project_dir = os.environ.get(
    "PROJECT_DIR",
    "/Users/lalachaimaenaciri/PycharmProjects/SCORE_LOW_HIGH_CAPSADSTR_INTENSITY",
)
data_splitted_path_file = os.path.join(project_dir, "data", "ready", "splitted_data.pkl")
save_model_in = os.path.join(project_dir, "models", "rf_model.pkl")

# Hyper-parameter grid: 1 * 3 * 3 * 1 * 2 * 3 * 1 = 54 candidates.
param_grid = {
    'bootstrap': [True],
    'max_depth': [2, 4, 6],
    'max_features': [1, 3, 5],
    'min_samples_leaf': [1],
    'min_samples_split': [2, 4],
    'n_estimators': [50, 100, 200],
    'criterion': ["squared_error"],
}

# Run the search on the stored folds and persist the best estimator.
grid_search, X, y = get_best_model(
    data_splitted_path_file=data_splitted_path_file,
    param_grid=param_grid,
    save_in=save_model_in,
)


Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best Parameters are: {'bootstrap': True, 'criterion': 'squared_error', 'max_depth': 6, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [220]:
# `splitted_data` is only a local variable inside get_best_model, so load the
# folds explicitly here; otherwise this cell fails on a fresh kernel.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(data_splitted_path_file, 'rb') as f:
    splitted_data = pickle.load(f)

# Per-fold train/test scores of the refitted best estimator.
metrcis = evaluate(splitted_data, grid_search.best_estimator_)
metrcis

{0: {'MSE': 0.027290032251858335,
  'SMAPE': 7.949226912581428,
  'MAPE': 10.656085356589275,
  'MSE-train': 0.017253068307872946,
  'SMAPE-train': 4.884852852841844,
  'MAPE-train': 5.029730787659908},
 1: {'MSE': 0.01851205693081037,
  'SMAPE': 5.416938641452723,
  'MAPE': 5.636899769291746,
  'MSE-train': 0.02164205596839693,
  'SMAPE-train': 6.150996988406194,
  'MAPE-train': 7.53932358130867},
 2: {'MSE': 0.01599407968493552,
  'SMAPE': 4.35276706423096,
  'MAPE': 4.422561806028068,
  'MSE-train': 0.02290104459133435,
  'SMAPE-train': 6.683082777017076,
  'MAPE-train': 8.146492562940512}}

In [221]:
# Mean MAPE over the held-out (test) part of each fold. The "MAPE" key holds
# the test score; the training score is stored under "MAPE-train".
print("Test MAPE", np.mean([fold_metrics["MAPE"] for fold_metrics in metrcis.values()]))

Train MAPE 6.905182310636363
