In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from pycaret.regression import *

In [53]:
# The input CSV is <root>/<folder>_df.csv, resolved relative to this notebook.
folder = 'mice'
root = os.path.join('..', 'data', 'imputed')
df = pd.read_csv(os.path.join(root, folder + '_df.csv'))

In [54]:
# Parse the 'date' column into pandas datetimes (it is read from CSV as text).
df = df.assign(date=pd.to_datetime(df['date']))

In [55]:
# Keep only the calendar columns and the zone A measurements used for modelling.
zone_a_columns = [
    'day',
    'date',
    'zone_a__patients',
    'zone_a__cat_2',
    'zone_a__cat_3',
    'zone_a__sets_of_emews',
    'zone_a__deescalations',
    'zone_a__escalations',
]
df = df.loc[:, zone_a_columns]

In [56]:
# Sanity check: display the column labels left after the selection.
df.columns

Index(['day', 'date', 'zone_a__patients', 'zone_a__cat_2', 'zone_a__cat_3',
       'zone_a__sets_of_emews', 'zone_a__deescalations',
       'zone_a__escalations'],
      dtype='object')

## Load Model (optional)

In [57]:
# pipeline_path = '../pipelines/base_models/'
# if not os.path.exists(pipeline_path):
#     print("Path to pipeline does not exist")
# else:
#     model_names_to_load = ['br', 'lasso', 'llar', 'en', 'huber']

#     created_models = {}

#     for name in model_names_to_load:
#         created_models[name] = load_model(os.path.join(pipeline_path, name)).named_steps['trained_model']
    
#     print(created_models)

## Pycaret Setup

In [58]:
# Initialise the PyCaret regression experiment: 80 % of the rows go to the
# training set, the rest is the hold-out set; session_id fixes the random seed.
s = setup(
    data=df,
    target='zone_a__sets_of_emews',
    train_size=0.8,
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,zone_a__sets_of_emews
2,Target type,Regression
3,Original data shape,"(599, 8)"
4,Transformed data shape,"(599, 16)"
5,Transformed train set shape,"(479, 16)"
6,Transformed test set shape,"(120, 16)"
7,Numeric features,5
8,Date features,1
9,Categorical features,1


In [59]:
# Cross-validate every available regressor except LightGBM and show the
# leaderboard; errors='raise' makes a failing model stop the run instead of
# being skipped.
compare_models(
    errors='raise',
    exclude=['lightgbm'],
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,20.6837,730.6975,26.8333,0.4216,0.3236,0.2756,0.058
llar,Lasso Least Angle Regression,20.5237,735.568,26.9409,0.4205,0.5026,0.2701,0.023
lasso,Lasso Regression,20.5224,735.5303,26.94,0.4205,0.5025,0.2701,0.021
en,Elastic Net,20.5623,737.7339,26.9926,0.4183,0.5035,0.2713,0.016
br,Bayesian Ridge,20.5725,738.5953,27.0156,0.4172,0.5035,0.2716,0.015
gbr,Gradient Boosting Regressor,20.6013,736.7935,26.9161,0.4147,0.3802,0.272,0.027
ada,AdaBoost Regressor,21.3303,738.0026,26.9585,0.4129,0.5639,0.2914,0.032
ridge,Ridge Regression,20.7287,750.3814,27.241,0.4083,0.4996,0.2729,0.022
lar,Least Angle Regression,20.7347,750.7053,27.247,0.408,0.4996,0.2729,0.016
lr,Linear Regression,20.7347,750.7053,27.247,0.408,0.4996,0.2729,0.03


## Custom Metrics

In [60]:
from sklearn.metrics import r2_score, mean_squared_error

def r2_rounded(y_true, y_pred):
    """Return the R2 score of the predictions after rounding them with ``np.round``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Raw model predictions; rounded before scoring.
    """
    whole_number_pred = np.round(y_pred)
    return r2_score(y_true, whole_number_pred)

def rmse_rounded(y_true, y_pred):
    """Return the RMSE of the predictions after rounding them with ``np.round``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Raw model predictions; rounded before scoring.
    """
    whole_number_pred = np.round(y_pred)
    squared_error = mean_squared_error(y_true, whole_number_pred)
    return np.sqrt(squared_error)

def r2_ceil(y_true, y_pred):
    """Return the R2 score of the predictions after rounding them up with ``np.ceil``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Raw model predictions; each is raised to the next whole number
        before scoring.
    """
    ceiled_pred = np.ceil(y_pred)
    return r2_score(y_true, ceiled_pred)

def rmse_ceil(y_true, y_pred):
    """Return the RMSE of the predictions after rounding them up with ``np.ceil``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Raw model predictions; each is raised to the next whole number
        before scoring.
    """
    ceiled_pred = np.ceil(y_pred)
    squared_error = mean_squared_error(y_true, ceiled_pred)
    return np.sqrt(squared_error)

In [61]:
# Custom metrics to register with the experiment:
# (id, display name, score function, greater_is_better).
custom_metrics = [
    ('R2_Rounded', 'R2_RND', r2_rounded, True),
    ('RMSE_Rounded', 'RMSE_RND', rmse_rounded, False),
    ('R2_Ceil', 'R2_CEIL', r2_ceil, True),
    ('RMSE_Ceil', 'RMSE_CEIL', rmse_ceil, False),
]

# add_metric refuses an id that is already registered, so re-running this cell
# in a live kernel used to fail. Remove any stale registration first; this also
# makes sure the metric points at the current version of the score function.
already_registered = set(get_metrics().index)

for metric_id, metric_name, score_func, greater_is_better in custom_metrics:
    if metric_id in already_registered:
        remove_metric(metric_id)
    add_metric(metric_id, metric_name, score_func, greater_is_better=greater_is_better)

# NOTE(review): the cross-validation tables recorded in this notebook show
# 0.0 / -0.0 for all four of these metrics in every fold, while the hold-out
# tables show real values. That looks like the scorers failing inside CV and a
# fallback score being reported - cause not established here; investigate
# before relying on the CV columns.

## Analyze Model

In [62]:
# PyCaret model ids to train with default hyper-parameters.
model_names_to_create = ['llar', 'rf', 'lasso', 'et', 'huber']

# Map each id to its trained estimator; verbose=False suppresses the per-fold
# score grid that create_model would otherwise display.
created_models = {
    name: create_model(name, verbose=False)
    for name in model_names_to_create
}

### Bayesian Ridge

In [63]:
# evaluate_model(created_models['br'])  # NOTE: 'br' is not in model_names_to_create, so this key does not exist unless it is added there

### Lasso Regression

In [64]:
# evaluate_model(created_models['lasso'])

### Lasso Least Angle Regression

In [65]:
# evaluate_model(created_models['llar'])

### Elastic Net

In [66]:
# evaluate_model(created_models['en'])  # NOTE: 'en' is not in model_names_to_create, so this key does not exist unless it is added there

### Huber Regressor

In [67]:
# evaluate_model(created_models['huber'])

## Prediction

In [68]:
# Metrics table of every base model, keyed by model id.
holdout_predictions_metric = {}

for base_name in created_models:
    # predict_model is called only for the metrics table it produces;
    # pull() then fetches that table. The returned predictions are not used.
    predict_model(created_models[base_name], verbose=False)
    holdout_predictions_metric[base_name] = pull()

In [69]:
# pipeline_path = '../pipelines/base_models/'
# os.makedirs(pipeline_path, exist_ok=True)

# for model_name, model_object in created_models.items():
#     save_model(model_object, os.path.join(pipeline_path, model_name));

## Tuning

In [70]:
# param_grids = {
#     'br': {
#         'max_iter': np.arange(50, 301, 50),
#         'alpha_1': np.logspace(-8, -3, 6),
#         'lambda_1': np.logspace(-8, -3, 6),
#         'alpha_2': np.logspace(-8, -3, 6),
#         'lambda_2': np.logspace(-8, -3, 6)
#     },
#     'lasso': {
#         'alpha': np.logspace(-4, 1, 6),
#         'selection': ['cyclic', 'random']
#     },
#     'llar': {
#         'alpha': np.logspace(-4, 1, 6),
#         'jitter': list(np.logspace(-7, -4, 4)) + [None]
#     },
#     'en': {
#         'alpha': np.logspace(-4, 1, 6),
#         'l1_ratio': np.linspace(0, 1, 9),
#         'max_iter': np.arange(100, 1001, 100),
#         'selection': ['cyclic', 'random']
#     },
#     'huber': {
#         'epsilon': [1.0, 1.1, 1.2, 1.25, 1.3, 1.35, 1.4, 1.5, 2.0],
#         'max_iter': np.arange(10, 101, 10),
#         'alpha': np.logspace(-6, -2, 5)
#     }
# }

In [71]:
# Tuned counterpart of every base model, keyed by model id.
tuned_models = {}

for model_name, model_object in created_models.items():
    print(f"Tuning model: {model_name}")
    tuned_models[model_name] = tune_model(
        model_object,
        search_library='scikit-optimize',
        n_iter=20,
        # NOTE(review): the PyCaret docs say early_stopping is ignored for
        # estimators without `partial_fit`, which probably covers all five
        # models tuned here - confirm and drop the argument if so.
        early_stopping=True,
        # Silence the search library's own progress log. It printed one
        # "Fitting 10 folds ..." line per iteration and buried the score
        # grids; the grids themselves are still displayed.
        tuner_verbose=False,
    )
    print("\r")

Tuning model: llar


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,22.6373,982.7263,31.3485,0.306,0.3661,0.334,0.0,-0.0,0.0,-0.0
1,16.575,432.2318,20.7902,0.6171,0.6639,0.2108,0.0,-0.0,0.0,-0.0
2,23.1639,756.1326,27.4979,0.5067,0.731,0.3873,0.0,-0.0,0.0,-0.0
3,19.274,687.5247,26.2207,0.483,0.5352,0.2562,0.0,-0.0,0.0,-0.0
4,22.9949,907.6263,30.1268,0.46,0.6804,0.2568,0.0,-0.0,0.0,-0.0
5,18.5077,564.6795,23.763,0.492,0.6478,0.2035,0.0,-0.0,0.0,-0.0
6,21.8425,850.6053,29.1651,0.1216,0.3319,0.2993,0.0,-0.0,0.0,-0.0
7,18.3867,564.9171,23.768,0.4072,0.2731,0.2341,0.0,-0.0,0.0,-0.0
8,22.1236,819.5603,28.628,0.3609,0.3131,0.2646,0.0,-0.0,0.0,-0.0
9,19.7288,789.7049,28.1017,0.4505,0.4829,0.2542,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,23.0749,946.5049,30.7653,0.3316,0.3552,0.327,0.0,-0.0,0.0,-0.0
1,16.4808,418.8575,20.466,0.6289,0.5718,0.2207,0.0,-0.0,0.0,-0.0
2,21.1909,691.3984,26.2945,0.549,0.64,0.3408,0.0,-0.0,0.0,-0.0
3,19.1153,661.7768,25.725,0.5024,0.4494,0.2421,0.0,-0.0,0.0,-0.0
4,21.5819,891.3626,29.8557,0.4697,0.5739,0.2454,0.0,-0.0,0.0,-0.0
5,17.6184,513.1748,22.6534,0.5383,0.5334,0.1983,0.0,-0.0,0.0,-0.0
6,23.733,855.2906,29.2454,0.1168,0.3458,0.3282,0.0,-0.0,0.0,-0.0
7,17.145,516.3605,22.7236,0.4582,0.267,0.2213,0.0,-0.0,0.0,-0.0
8,22.3621,811.8448,28.4929,0.3669,0.3204,0.2768,0.0,-0.0,0.0,-0.0
9,18.0309,556.6568,23.5936,0.6127,0.4178,0.2396,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,22.6182,977.1983,31.2602,0.3099,0.3661,0.3342,0.0,-0.0,0.0,-0.0
1,16.5724,431.5336,20.7734,0.6177,0.6658,0.2101,0.0,-0.0,0.0,-0.0
2,23.1865,757.2323,27.5179,0.506,0.7327,0.3882,0.0,-0.0,0.0,-0.0
3,19.2616,686.3019,26.1974,0.4839,0.5369,0.2564,0.0,-0.0,0.0,-0.0
4,23.008,909.5833,30.1593,0.4588,0.6829,0.2568,0.0,-0.0,0.0,-0.0
5,18.5077,564.2939,23.7549,0.4923,0.6506,0.2036,0.0,-0.0,0.0,-0.0
6,21.8321,849.3585,29.1438,0.1229,0.3318,0.2992,0.0,-0.0,0.0,-0.0
7,18.3782,566.005,23.7909,0.4061,0.2736,0.2341,0.0,-0.0,0.0,-0.0
8,22.1673,821.7268,28.6658,0.3592,0.3135,0.2652,0.0,-0.0,0.0,-0.0
9,19.7463,788.3317,28.0772,0.4515,0.4861,0.255,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,22.9586,944.9678,30.7403,0.3327,0.3589,0.3298,0.0,-0.0,0.0,-0.0
1,16.4817,438.3726,20.9373,0.6117,0.5401,0.2171,0.0,-0.0,0.0,-0.0
2,21.7132,714.4002,26.7283,0.534,0.6318,0.3592,0.0,-0.0,0.0,-0.0
3,19.3014,687.3483,26.2173,0.4831,0.4416,0.2515,0.0,-0.0,0.0,-0.0
4,21.3252,906.3525,30.1057,0.4607,0.5583,0.2417,0.0,-0.0,0.0,-0.0
5,17.4067,493.1279,22.2065,0.5564,0.5289,0.1956,0.0,-0.0,0.0,-0.0
6,22.8611,768.4026,27.7201,0.2065,0.3282,0.3143,0.0,-0.0,0.0,-0.0
7,16.5116,487.1383,22.0712,0.4888,0.2632,0.2174,0.0,-0.0,0.0,-0.0
8,22.3669,804.2277,28.3589,0.3729,0.3174,0.275,0.0,-0.0,0.0,-0.0
9,17.6394,512.4991,22.6384,0.6434,0.4102,0.236,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,22.5619,1043.0638,32.2965,0.2634,0.3487,0.3048,0.0,-0.0,0.0,-0.0
1,16.2459,439.7366,20.9699,0.6104,0.5696,0.2057,0.0,-0.0,0.0,-0.0
2,22.4646,747.0884,27.3329,0.5126,0.6264,0.37,0.0,-0.0,0.0,-0.0
3,20.1387,745.0609,27.2958,0.4397,0.4486,0.2529,0.0,-0.0,0.0,-0.0
4,22.0184,857.7186,29.2868,0.4897,0.5087,0.2488,0.0,-0.0,0.0,-0.0
5,18.2576,572.4947,23.9269,0.4849,0.5217,0.2055,0.0,-0.0,0.0,-0.0
6,20.997,844.414,29.0588,0.128,0.3292,0.285,0.0,-0.0,0.0,-0.0
7,18.5809,570.6229,23.8877,0.4012,0.2683,0.2323,0.0,-0.0,0.0,-0.0
8,21.5666,812.1324,28.4979,0.3667,0.3091,0.2548,0.0,-0.0,0.0,-0.0
9,19.3472,762.7296,27.6176,0.4693,0.4272,0.2351,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

In [72]:
# pipeline_path = '../pipelines/tuned_models/'
# if not os.path.exists(pipeline_path):
#     print("Path to pipeline does not exist")
# else:
#     model_names_to_load = ['br', 'lasso', 'llar', 'en', 'huber']

#     tuned_models = {}

#     for name in model_names_to_load:
#         tuned_models[name] = load_model(os.path.join(pipeline_path, name)).named_steps['trained_model']
    
#     print(tuned_models)

In [73]:
# pipeline_path = '../pipelines/tuned_models/'
# os.makedirs(pipeline_path, exist_ok=True)

# for model_name, model_object in tuned_models.items():
#     save_model(model_object, os.path.join(pipeline_path, model_name));

In [74]:
# Metrics table of every tuned model, keyed by model id.
tuning_predictions_metric = {}

for tuned_name, tuned_estimator in tuned_models.items():
    # Score the tuned estimator, then grab the metrics table it produced.
    predict_model(tuned_estimator, verbose=False)
    metrics_table = pull()
    tuning_predictions_metric[tuned_name] = metrics_table

In [75]:
file_name = 'temp_excel.xlsx'

# Stack the per-model metrics tables into one table per stage.
holdout_results_df = pd.concat(
    list(holdout_predictions_metric.values()), ignore_index=True
)
tuned_results_df = pd.concat(
    list(tuning_predictions_metric.values()), ignore_index=True
)

print("--- Holdout Predictions DataFrame ---", end='\n\n')
print(holdout_results_df.to_string())

print("\n--- Tuned Predictions DataFrame ---", end='\n\n')
print(tuned_results_df.to_string())

# Write both sheets through one writer. The earlier version re-opened the
# workbook with mode='a', which only works with an append-capable engine
# (xlsxwriter rejects append mode) and needed a second pass over the file.
with pd.ExcelWriter(file_name) as writer:
    holdout_results_df.to_excel(writer, sheet_name='Base Model Metrics', index=False)
    tuned_results_df.to_excel(writer, sheet_name='Tuned Model Metrics', index=False)

# Backward-compatible names: after this cell they held the tuned-model data,
# so keep them bound to the same values for any later cell that reads them.
list_of_metric_dfs = list(tuning_predictions_metric.values())
results_df = tuned_results_df

--- Holdout Predictions DataFrame ---

                          Model      MAE        MSE     RMSE      R2   RMSLE    MAPE  R2_RND  RMSE_RND  R2_CEIL  RMSE_CEIL
0  Lasso Least Angle Regression  22.4764   808.0530  28.4263  0.2859  0.3397  0.3038  0.2842   28.4612   0.2861    28.4236
1       Random Forest Regressor  24.8310  1014.2894  31.8479  0.1037  0.3752  0.3329  0.1051   31.8222   0.1035    31.8512
2              Lasso Regression  22.4764   808.0561  28.4263  0.2859  0.3397  0.3038  0.2842   28.4612   0.2861    28.4236
3         Extra Trees Regressor  24.4420   968.2271  31.1163  0.1444  0.3727  0.3270  0.1414   31.1706   0.1440    31.1235
4               Huber Regressor  22.6155   839.5115  28.9743  0.2581  0.3398  0.2991  0.2581   28.9756   0.2566    29.0045

--- Tuned Predictions DataFrame ---

                          Model      MAE       MSE     RMSE      R2   RMSLE    MAPE  R2_RND  RMSE_RND  R2_CEIL  RMSE_CEIL
0  Lasso Least Angle Regression  22.4764  808.0530  28.4263  0.