In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from pycaret.regression import *

In [38]:
# Directory holding the CSV files and the name prefix of the file to load.
folder = 'mice'
root = os.path.join('..', 'data', 'imputed')

# The file is expected at "<root>/<folder>_df.csv".
csv_path = os.path.join(root, f'{folder}_df.csv')
df = pd.read_csv(csv_path)

In [39]:
# Replace the 'date' column with its parsed datetime values.
df = df.assign(date=pd.to_datetime(df['date']))

In [40]:
# Restrict the frame to the day/date fields plus the zone B/C columns.
zone_bc_columns = [
    'day',
    'date',
    'zone_b/c_patients',
    'zone_b/c_cat_2',
    'zone_b/c_cat_3',
    'zone_b/c_sets_of_emews',
    'zone_b/c_deescalations',
    'zone_b/c_escalations',
]
df = df[zone_bc_columns]

In [41]:
# Display the column labels the frame currently holds.
df.columns

Index(['day', 'date', 'zone_b/c_patients', 'zone_b/c_cat_2', 'zone_b/c_cat_3',
       'zone_b/c_sets_of_emews', 'zone_b/c_deescalations',
       'zone_b/c_escalations'],
      dtype='object')

## Load Model (optional)

In [42]:
# pipeline_path = '../pipelines/base_models/'
# if not os.path.exists(pipeline_path):
#     print("Path to pipeline does not exist")
# else:
#     model_names_to_load = ['br', 'lasso', 'llar', 'en', 'huber']

#     created_models = {}

#     for name in model_names_to_load:
#         created_models[name] = load_model(os.path.join(pipeline_path, name)).named_steps['trained_model']
    
#     print(created_models)

## PyCaret Setup

In [43]:
# Initialise the PyCaret regression experiment on `df`.
s = setup(
    df,
    target='zone_b/c_sets_of_emews',  # column the models learn to predict
    train_size=0.8,                   # 80% train / 20% test, per the setup summary
    session_id=123,                   # fixed session id so reruns use the same setup
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,zone_b/c_sets_of_emews
2,Target type,Regression
3,Original data shape,"(599, 8)"
4,Transformed data shape,"(599, 16)"
5,Transformed train set shape,"(479, 16)"
6,Transformed test set shape,"(120, 16)"
7,Numeric features,5
8,Date features,1
9,Categorical features,1


In [44]:
# Benchmark the available regressors, leaving out LightGBM.
# errors='raise' asks PyCaret to stop on a failing model instead of skipping it.
compare_models(
    errors='raise',
    exclude=['lightgbm'],
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,5.2395,81.9006,8.8488,0.2575,0.1567,0.082,0.045
rf,Random Forest Regressor,5.2335,83.0502,8.9061,0.2475,0.1694,0.0854,0.076
catboost,CatBoost Regressor,5.1641,80.25,8.7926,0.243,0.1641,0.0818,0.175
omp,Orthogonal Matching Pursuit,5.5171,80.7923,8.8703,0.2208,0.1794,0.0916,0.018
lasso,Lasso Regression,5.4515,81.8914,8.8916,0.2198,0.1801,0.0907,0.021
llar,Lasso Least Angle Regression,5.4515,81.8913,8.8916,0.2198,0.1801,0.0907,0.018
en,Elastic Net,5.4496,82.9561,8.9449,0.2101,0.1816,0.091,0.025
huber,Huber Regressor,5.2801,88.4381,9.0613,0.1905,0.1825,0.0889,0.02
br,Bayesian Ridge,5.4922,84.2499,8.9977,0.1885,0.1774,0.0901,0.022
ada,AdaBoost Regressor,5.6566,88.9117,9.2658,0.1723,0.1638,0.0886,0.033


## Custom Metrics

In [45]:
from sklearn.metrics import r2_score, mean_squared_error

def r2_rounded(y_true, y_pred):
    """Return the R2 score of `y_true` against predictions rounded with `np.round`."""
    nearest_int_pred = np.round(y_pred)
    return r2_score(y_true, nearest_int_pred)

def rmse_rounded(y_true, y_pred):
    """Return the RMSE of `y_true` against predictions rounded with `np.round`."""
    # Mean squared error first, then its square root to get back to target units.
    squared_error = mean_squared_error(y_true, np.round(y_pred))
    return np.sqrt(squared_error)

def r2_ceil(y_true, y_pred):
    """Return the R2 score of `y_true` against predictions rounded up with `np.ceil`."""
    rounded_up_pred = np.ceil(y_pred)
    return r2_score(y_true, rounded_up_pred)

def rmse_ceil(y_true, y_pred):
    """Return the RMSE of `y_true` against predictions rounded up with `np.ceil`."""
    # Mean squared error first, then its square root to get back to target units.
    squared_error = mean_squared_error(y_true, np.ceil(y_pred))
    return np.sqrt(squared_error)

In [46]:
# Register the integer-prediction metrics with the active PyCaret experiment.
# Each row: (metric id, display name, scoring function, higher is better?).
# NOTE(review): the tuning CV grids in this notebook show 0.0 / -0.0 for all four
# of these metrics in every fold, while the hold-out tables show real values.
# Confirm the custom scorers are actually evaluated during cross-validation.
custom_metrics = [
    ('R2_Rounded', 'R2_RND', r2_rounded, True),
    ('RMSE_Rounded', 'RMSE_RND', rmse_rounded, False),
    ('R2_Ceil', 'R2_CEIL', r2_ceil, True),
    ('RMSE_Ceil', 'RMSE_CEIL', rmse_ceil, False),
]

for metric_id, metric_label, score_func, higher_is_better in custom_metrics:
    add_metric(metric_id, metric_label, score_func, greater_is_better=higher_is_better)

## Analyze Model

In [47]:
# PyCaret model ids of the base estimators to fit.
model_names_to_create = ['et', 'rf', 'catboost', 'omp', 'lasso']

# Fit each one without printing its score grid and keep it under its id.
created_models = {
    model_id: create_model(model_id, verbose=False)
    for model_id in model_names_to_create
}

### Bayesian Ridge

In [48]:
# evaluate_model(created_models['br'])

### Lasso Regression

In [49]:
# evaluate_model(created_models['lasso'])

### Lasso Least Angle Regression

In [50]:
# evaluate_model(created_models['llar'])

### Elastic Net

In [51]:
# evaluate_model(created_models['en'])

### Huber Regressor

In [52]:
# evaluate_model(created_models['huber'])

## Prediction

In [53]:
# Metrics table per base model, keyed by model id.
holdout_predictions_metric = {}

for name in created_models:
    # predict_model is called for the score grid it generates; with
    # verbose=False nothing is displayed, so pull() is used to fetch that grid.
    predict_model(created_models[name], verbose=False)
    holdout_predictions_metric[name] = pull()

In [54]:
# pipeline_path = '../pipelines/base_models/'
# os.makedirs(pipeline_path, exist_ok=True)

# for model_name, model_object in created_models.items():
#     save_model(model_object, os.path.join(pipeline_path, model_name));

## Tuning

In [55]:
# param_grids = {
#     'br': {
#         'max_iter': np.arange(50, 301, 50),
#         'alpha_1': np.logspace(-8, -3, 6),
#         'lambda_1': np.logspace(-8, -3, 6),
#         'alpha_2': np.logspace(-8, -3, 6),
#         'lambda_2': np.logspace(-8, -3, 6)
#     },
#     'lasso': {
#         'alpha': np.logspace(-4, 1, 6),
#         'selection': ['cyclic', 'random']
#     },
#     'llar': {
#         'alpha': np.logspace(-4, 1, 6),
#         'jitter': list(np.logspace(-7, -4, 4)) + [None]
#     },
#     'en': {
#         'alpha': np.logspace(-4, 1, 6),
#         'l1_ratio': np.linspace(0, 1, 9),
#         'max_iter': np.arange(100, 1001, 100),
#         'selection': ['cyclic', 'random']
#     },
#     'huber': {
#         'epsilon': [1.0, 1.1, 1.2, 1.25, 1.3, 1.35, 1.4, 1.5, 2.0],
#         'max_iter': np.arange(10, 101, 10),
#         'alpha': np.logspace(-6, -2, 5)
#     }
# }

In [56]:
# Tune every base model through the scikit-optimize search backend
# (20 search iterations each) and keep the tuned estimators keyed by model id.
tuned_models = {}

for model_name, model_object in created_models.items():
    print(f"Tuning model: {model_name}")
    tuned_models[model_name] = tune_model(
        model_object,
        search_library='scikit-optimize',
        n_iter=20,
        # NOTE(review): PyCaret documents early_stopping as ignored for estimators
        # without `partial_fit` — confirm it has any effect for these models.
        early_stopping=True,
        # Silence the search backend's per-iteration "Fitting 10 folds for each
        # of 1 candidates" lines, which otherwise bury the CV score grids and
        # bloat the saved notebook output.
        tuner_verbose=False,
    )
    print("\r")

Tuning model: et


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4.8732,75.9545,8.7152,0.3274,0.1733,0.1011,0.0,-0.0,0.0,-0.0
1,5.2301,65.629,8.1012,0.5654,0.2268,0.1382,0.0,-0.0,0.0,-0.0
2,4.9613,59.8314,7.7351,0.1014,0.1206,0.0775,0.0,-0.0,0.0,-0.0
3,3.9574,29.8451,5.4631,0.138,0.0736,0.0537,0.0,-0.0,0.0,-0.0
4,6.6655,133.9213,11.5724,0.4663,0.5264,0.0739,0.0,-0.0,0.0,-0.0
5,4.3376,55.575,7.4549,0.2679,0.092,0.0603,0.0,-0.0,0.0,-0.0
6,4.4445,50.7846,7.1263,0.1222,0.0975,0.0621,0.0,-0.0,0.0,-0.0
7,5.784,116.8838,10.8113,0.2404,0.159,0.0946,0.0,-0.0,0.0,-0.0
8,5.7724,112.9042,10.6256,0.473,0.2005,0.1199,0.0,-0.0,0.0,-0.0
9,5.3108,115.9541,10.7682,-0.0487,0.1389,0.0777,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4.9389,80.2342,8.9574,0.2895,0.1785,0.1044,0.0,-0.0,0.0,-0.0
1,5.0425,62.9179,7.9321,0.5834,0.2142,0.1261,0.0,-0.0,0.0,-0.0
2,4.6253,55.8073,7.4704,0.1618,0.1187,0.0737,0.0,-0.0,0.0,-0.0
3,3.7849,26.8088,5.1777,0.2256,0.0696,0.0514,0.0,-0.0,0.0,-0.0
4,6.7777,125.8684,11.2191,0.4984,0.507,0.0774,0.0,-0.0,0.0,-0.0
5,4.3449,63.6558,7.9785,0.1615,0.0991,0.061,0.0,-0.0,0.0,-0.0
6,4.4388,53.3973,7.3073,0.077,0.1011,0.0622,0.0,-0.0,0.0,-0.0
7,5.9417,126.516,11.2479,0.1778,0.1662,0.0972,0.0,-0.0,0.0,-0.0
8,5.501,110.0324,10.4896,0.4864,0.2014,0.117,0.0,-0.0,0.0,-0.0
9,5.1573,98.7524,9.9374,0.1069,0.1271,0.0756,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,5.4882,97.0293,9.8503,0.1408,0.1908,0.1151,0.0,-0.0,0.0,-0.0
1,5.9591,76.2043,8.7295,0.4954,0.2581,0.1734,0.0,-0.0,0.0,-0.0
2,5.073,62.9478,7.934,0.0546,0.1247,0.0816,0.0,-0.0,0.0,-0.0
3,4.088,30.3417,5.5083,0.1236,0.0749,0.0556,0.0,-0.0,0.0,-0.0
4,6.7223,134.8683,11.6133,0.4625,0.5174,0.0764,0.0,-0.0,0.0,-0.0
5,4.0027,61.7411,7.8576,0.1867,0.0919,0.0529,0.0,-0.0,0.0,-0.0
6,4.6973,47.0288,6.8578,0.1871,0.0961,0.0671,0.0,-0.0,0.0,-0.0
7,6.9043,142.7782,11.949,0.0721,0.18,0.1119,0.0,-0.0,0.0,-0.0
8,6.2101,107.1983,10.3537,0.4997,0.1907,0.1222,0.0,-0.0,0.0,-0.0
9,5.278,104.7921,10.2368,0.0522,0.1366,0.0796,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,5.5447,93.4844,9.6687,0.1722,0.1872,0.1144,0.0,-0.0,0.0,-0.0
1,5.6162,58.662,7.6591,0.6115,0.1978,0.1259,0.0,-0.0,0.0,-0.0
2,5.4628,66.2017,8.1364,0.0057,0.1245,0.0843,0.0,-0.0,0.0,-0.0
3,4.5973,39.4471,6.2807,-0.1394,0.0845,0.0621,0.0,-0.0,0.0,-0.0
4,6.7587,120.6849,10.9857,0.519,0.4997,0.0794,0.0,-0.0,0.0,-0.0
5,6.0219,84.1986,9.176,-0.1091,0.1162,0.0849,0.0,-0.0,0.0,-0.0
6,4.2552,41.3133,6.4275,0.2859,0.0925,0.0622,0.0,-0.0,0.0,-0.0
7,5.2808,92.4047,9.6127,0.3995,0.1424,0.0859,0.0,-0.0,0.0,-0.0
8,5.764,95.9641,9.7961,0.5521,0.2012,0.1246,0.0,-0.0,0.0,-0.0
9,5.8152,108.711,10.4265,0.0168,0.1323,0.0834,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,5.3231,93.7225,9.681,0.1701,0.1882,0.1116,0.0,-0.0,0.0,-0.0
1,5.708,70.1083,8.3731,0.5357,0.2503,0.1633,0.0,-0.0,0.0,-0.0
2,4.998,61.3548,7.8329,0.0785,0.1222,0.078,0.0,-0.0,0.0,-0.0
3,4.4434,38.4994,6.2048,-0.112,0.0831,0.0598,0.0,-0.0,0.0,-0.0
4,7.0868,143.7973,11.9916,0.4269,0.5454,0.0803,0.0,-0.0,0.0,-0.0
5,5.6424,71.9535,8.4825,0.0522,0.1082,0.0799,0.0,-0.0,0.0,-0.0
6,4.3081,40.4428,6.3595,0.301,0.0879,0.0609,0.0,-0.0,0.0,-0.0
7,5.2758,100.75,10.0374,0.3452,0.1482,0.0864,0.0,-0.0,0.0,-0.0
8,5.8256,114.872,10.7178,0.4639,0.2254,0.1368,0.0,-0.0,0.0,-0.0
9,5.31,100.8475,10.0423,0.0879,0.1277,0.0767,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

In [57]:
# pipeline_path = '../pipelines/tuned_models/'
# if not os.path.exists(pipeline_path):
#     print("Path to pipeline does not exist")
# else:
#     model_names_to_load = ['br', 'lasso', 'llar', 'en', 'huber']

#     tuned_models = {}

#     for name in model_names_to_load:
#         tuned_models[name] = load_model(os.path.join(pipeline_path, name)).named_steps['trained_model']
    
#     print(tuned_models)

In [58]:
# pipeline_path = '../pipelines/tuned_models/'
# os.makedirs(pipeline_path, exist_ok=True)

# for model_name, model_object in tuned_models.items():
#     save_model(model_object, os.path.join(pipeline_path, model_name));

In [59]:
# Metrics table per tuned model, keyed by model id.
tuning_predictions_metric = {}

for name in tuned_models:
    # predict_model is called for the score grid it generates; with
    # verbose=False nothing is displayed, so pull() is used to fetch that grid.
    predict_model(tuned_models[name], verbose=False)
    tuning_predictions_metric[name] = pull()

In [60]:
# Combine the per-model metric tables collected earlier, print them, and export
# them to one Excel workbook with a sheet for base models and one for tuned models.
file_name = 'temp_excel.xlsx'

# (sheet name, console banner, {model id -> metrics DataFrame})
metric_tables = [
    ('Base Model Metrics', "--- Holdout Predictions DataFrame ---", holdout_predictions_metric),
    ('Tuned Model Metrics', "\n--- Tuned Predictions DataFrame ---", tuning_predictions_metric),
]

# Build and print every table before touching the file, so a failure while
# assembling a table never leaves a half-written workbook behind.
results_by_sheet = {}
for sheet_name, banner, metrics_by_model in metric_tables:
    print(banner, end='\n\n')
    # Stack the per-model frames into a single table, one row block per model.
    list_of_metric_dfs = list(metrics_by_model.values())
    results_df = pd.concat(list_of_metric_dfs, ignore_index=True)
    print(results_df.to_string())
    results_by_sheet[sheet_name] = results_df

# Write all sheets through a single writer. Reopening the file with mode='a'
# only works with the openpyxl engine and fails when pandas selects xlsxwriter.
with pd.ExcelWriter(file_name) as writer:
    for sheet_name, sheet_df in results_by_sheet.items():
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

--- Holdout Predictions DataFrame ---

                         Model     MAE       MSE     RMSE      R2   RMSLE    MAPE  R2_RND  RMSE_RND  R2_CEIL  RMSE_CEIL
0        Extra Trees Regressor  6.3524  168.4311  12.9781  0.1954  0.1513  0.0823  0.1955   12.9772   0.1999    12.9415
1      Random Forest Regressor  6.4262  171.7083  13.1038  0.1797  0.1498  0.0816  0.1796   13.1044   0.1864    13.0499
2           CatBoost Regressor  6.5761  176.5000  13.2853  0.1568  0.1550  0.0848  0.1517   13.3257   0.1630    13.2363
3  Orthogonal Matching Pursuit  6.4794  152.1385  12.3344  0.2732  0.1401  0.0834  0.2706   12.3565   0.2794    12.2821
4             Lasso Regression  6.3786  156.5715  12.5129  0.2520  0.1400  0.0816  0.2523   12.5103   0.2589    12.4556

--- Tuned Predictions DataFrame ---

                         Model     MAE       MSE     RMSE      R2   RMSLE    MAPE  R2_RND  RMSE_RND  R2_CEIL  RMSE_CEIL
0        Extra Trees Regressor  6.1853  156.4877  12.5095  0.2524  0.1413  0.0777  