In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from pycaret.regression import *

In [2]:
# Imputed datasets live under ../data/imputed; `folder` selects which
# imputation variant's CSV (<folder>_df.csv) is read.
root = os.path.join('..', 'data', 'imputed')
folder = 'mice'
csv_path = os.path.join(root, folder + '_df.csv')
df = pd.read_csv(csv_path)

In [3]:
# Convert the 'date' column to pandas datetime values (safe to re-run).
df = df.assign(date=pd.to_datetime(df['date']))

## Load Model (optional)

In [4]:
# Optional shortcut: reuse base models saved by an earlier run of this notebook.
# NOTE(review): the "Analyze Model" cell later rebinds created_models = {} and
# retrains, so whatever is loaded here is replaced if that cell is run.
pipeline_path = '../pipelines/base_models/'
if os.path.exists(pipeline_path):
    model_names_to_load = ['br', 'lasso', 'llar', 'en', 'huber']
    created_models = {}

    for name in model_names_to_load:
        # Keep only the 'trained_model' step of each loaded pipeline.
        saved_pipeline = load_model(os.path.join(pipeline_path, name))
        created_models[name] = saved_pipeline.named_steps['trained_model']

    print(created_models)
else:
    print("Path to pipeline does not exist")

Path to pipeline does not exist


## PyCaret Setup

In [5]:
# Initialise the PyCaret regression experiment on `df`: 80% of the rows are
# used for training and session_id=123 is passed so reruns use the same session.
s = setup(
    data=df,
    target='total_number_of_patients',
    session_id=123,
    train_size=0.8,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,total_number_of_patients
2,Target type,Regression
3,Original data shape,"(599, 22)"
4,Transformed data shape,"(599, 30)"
5,Transformed train set shape,"(479, 30)"
6,Transformed test set shape,"(120, 30)"
7,Numeric features,19
8,Date features,1
9,Categorical features,1


In [6]:
# Compare the available regressors, leaving LightGBM out; errors='raise' is
# passed so a failing model stops the comparison instead of going unnoticed.
compare_models(
    exclude=['lightgbm'],
    errors='raise',
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,3.8421,42.9591,6.1021,0.9236,0.2633,0.1242,0.018
en,Elastic Net,3.8935,42.9581,6.0962,0.9235,0.2709,0.1266,0.022
lasso,Lasso Regression,3.9411,42.986,6.0983,0.9234,0.2742,0.1286,0.182
llar,Lasso Least Angle Regression,3.941,42.9855,6.0983,0.9234,0.2741,0.1286,0.021
lr,Linear Regression,3.9291,44.3605,6.2316,0.921,0.2702,0.1258,0.53
ridge,Ridge Regression,3.928,44.3317,6.2296,0.921,0.269,0.1259,0.02
lar,Least Angle Regression,3.9744,45.7741,6.3464,0.9183,0.267,0.1268,0.024
gbr,Gradient Boosting Regressor,4.1546,46.9709,6.6321,0.9145,0.291,0.104,0.033
huber,Huber Regressor,3.1171,51.4485,6.4555,0.9091,0.2587,0.0773,0.021
catboost,CatBoost Regressor,4.0442,53.2284,7.03,0.9043,0.2978,0.0953,0.789


## Custom Metrics

In [7]:
from sklearn.metrics import r2_score, mean_squared_error

def r2_rounded(y_true, y_pred):
    """Return the R2 score of the predictions after rounding them with np.round."""
    rounded_pred = np.round(y_pred)
    return r2_score(y_true, rounded_pred)

def rmse_rounded(y_true, y_pred):
    """Return the RMSE of the predictions after rounding them with np.round."""
    mse = mean_squared_error(y_true, np.round(y_pred))
    return np.sqrt(mse)

def r2_ceil(y_true, y_pred):
    """Return the R2 score of the predictions after rounding them up with np.ceil."""
    ceiled_pred = np.ceil(y_pred)
    return r2_score(y_true, ceiled_pred)

def rmse_ceil(y_true, y_pred):
    """Return the RMSE of the predictions after rounding them up with np.ceil."""
    mse = mean_squared_error(y_true, np.ceil(y_pred))
    return np.sqrt(mse)

In [8]:
# Register the rounded / ceiled variants of R2 and RMSE as extra PyCaret metrics.
# NOTE(review): in the recorded tune_model fold tables these four columns read
# 0.0 / -0.0 for every fold, while the holdout tables show real values —
# confirm the custom metrics are actually evaluated during cross-validation.
custom_metrics = [
    # (id, display name, scoring function, greater_is_better)
    ('R2_Rounded', 'R2_RND', r2_rounded, True),
    ('RMSE_Rounded', 'RMSE_RND', rmse_rounded, False),
    ('R2_Ceil', 'R2_CEIL', r2_ceil, True),
    ('RMSE_Ceil', 'RMSE_CEIL', rmse_ceil, False),
]

for metric_id, metric_name, metric_func, higher_is_better in custom_metrics:
    add_metric(metric_id, metric_name, metric_func, greater_is_better=higher_is_better)

## Analyze Model

In [9]:
# Train each shortlisted model (no extra arguments beyond verbose=False) and
# keep the fitted estimators keyed by their PyCaret model id.
model_names_to_create = ['br', 'lasso', 'llar', 'en', 'huber']
created_models = {
    model_name: create_model(model_name, verbose=False)
    for model_name in model_names_to_create
}

### Bayesian Ridge

In [10]:
# Open PyCaret's interactive evaluation widget for the Bayesian Ridge model.
evaluate_model(estimator=created_models['br'])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

### Lasso Regression

In [11]:
# Open PyCaret's interactive evaluation widget for the Lasso model.
evaluate_model(estimator=created_models['lasso'])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

### Lasso Least Angle Regression

In [12]:
# Open PyCaret's interactive evaluation widget for the Lasso Least Angle model.
evaluate_model(estimator=created_models['llar'])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

### Elastic Net

In [13]:
# Open PyCaret's interactive evaluation widget for the Elastic Net model.
evaluate_model(estimator=created_models['en'])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

### Huber Regressor

In [14]:
# Open PyCaret's interactive evaluation widget for the Huber Regressor model.
evaluate_model(estimator=created_models['huber'])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

## Prediction

In [15]:
# Run predict_model for every base model and store the metrics table that
# pull() returns straight afterwards, keyed by model id.
holdout_predictions_metric = {}

for model_name in created_models:
    predict_model(created_models[model_name], verbose=False)
    # pull() must follow predict_model directly so it captures that call's table.
    holdout_predictions_metric[model_name] = pull()

In [16]:
# Persist every base model under ../pipelines/base_models/<model id>,
# creating the directory first if it is missing.
pipeline_path = '../pipelines/base_models/'
os.makedirs(pipeline_path, exist_ok=True)

for model_name in created_models:
    target_path = os.path.join(pipeline_path, model_name)
    save_model(created_models[model_name], target_path)

Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved


## Tuning

In [17]:
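# Candidate search spaces per model id, kept for reference only: the tuning
# cell below calls tune_model without a custom grid, so these are not used.
# param_grids = {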
#     'br': {
#         'max_iter': np.arange(50, 301, 50),
#         'alpha_1': np.logspace(-8, -3, 6),
#         'lambda_1': np.logspace(-8, -3, 6),
#         'alpha_2': np.logspace(-8, -3, 6),
#         'lambda_2': np.logspace(-8, -3, 6)
#     },
#     'lasso': {
#         'alpha': np.logspace(-4, 1, 6),
#         'selection': ['cyclic', 'random']
#     },
#     'llar': {
#         'alpha': np.logspace(-4, 1, 6),
#         'jitter': list(np.logspace(-7, -4, 4)) + [None]
#     },
#     'en': {
#         'alpha': np.logspace(-4, 1, 6),
#         'l1_ratio': np.linspace(0, 1, 9),
#         'max_iter': np.arange(100, 1001, 100),
#         'selection': ['cyclic', 'random']
#     },
#     'huber': {
#         'epsilon': [1.0, 1.1, 1.2, 1.25, 1.3, 1.35, 1.4, 1.5, 2.0],
#         'max_iter': np.arange(10, 101, 10),
#         'alpha': np.logspace(-6, -2, 5)
#     }
# }

In [18]:
# Tune every base model with the scikit-optimize search backend, 20 iterations
# per model, and keep the returned estimators keyed by model id.
tuned_models = {}

for model_name, model_object in created_models.items():
    print(f"Tuning model: {model_name}")
    tuned_models[model_name] = tune_model(
        model_object,
        search_library='scikit-optimize',
        n_iter=20,
        # NOTE(review): PyCaret documents early_stopping as ignored for
        # estimators without partial_fit — confirm it has any effect here.
        early_stopping=True,
        # Silence the tuner's repeated "Fitting 10 folds for each of 1
        # candidates" log lines, which otherwise bury the score tables.
        tuner_verbose=False,
    )
    print("\r")

Tuning model: br


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3.4893,23.682,4.8664,0.9541,0.2549,0.1415,0.0,-0.0,0.0,-0.0
1,4.3508,62.042,7.8767,0.8692,0.2243,0.1362,0.0,-0.0,0.0,-0.0
2,4.5716,33.4238,5.7813,0.9416,0.5324,0.1637,0.0,-0.0,0.0,-0.0
3,2.8724,11.2653,3.3564,0.9814,0.3133,0.0998,0.0,-0.0,0.0,-0.0
4,4.4493,96.2345,9.8099,0.8464,0.155,0.1078,0.0,-0.0,0.0,-0.0
5,3.6795,18.619,4.315,0.9628,0.24,0.1094,0.0,-0.0,0.0,-0.0
6,3.1848,12.7432,3.5698,0.9697,0.1311,0.1135,0.0,-0.0,0.0,-0.0
7,3.6604,38.3216,6.1904,0.929,0.1547,0.1183,0.0,-0.0,0.0,-0.0
8,4.7943,110.9769,10.5346,0.8192,0.3738,0.1347,0.0,-0.0,0.0,-0.0
9,3.3703,22.2271,4.7146,0.9628,0.2545,0.117,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3.4321,22.3782,4.7306,0.9566,0.2645,0.1407,0.0,-0.0,0.0,-0.0
1,4.3325,61.2512,7.8263,0.8708,0.2267,0.1362,0.0,-0.0,0.0,-0.0
2,4.5543,33.3649,5.7762,0.9417,0.5414,0.1645,0.0,-0.0,0.0,-0.0
3,2.8398,11.388,3.3746,0.9812,0.3227,0.0993,0.0,-0.0,0.0,-0.0
4,4.4846,96.5138,9.8241,0.8459,0.1557,0.1091,0.0,-0.0,0.0,-0.0
5,3.7208,19.3577,4.3997,0.9613,0.2451,0.1114,0.0,-0.0,0.0,-0.0
6,3.1748,12.7079,3.5648,0.9698,0.1312,0.1135,0.0,-0.0,0.0,-0.0
7,3.7255,38.9651,6.2422,0.9278,0.1558,0.1199,0.0,-0.0,0.0,-0.0
8,4.7192,111.0712,10.539,0.8191,0.3653,0.1329,0.0,-0.0,0.0,-0.0
9,3.4364,22.4274,4.7358,0.9625,0.258,0.119,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3.4398,22.4689,4.7401,0.9564,0.2647,0.1407,0.0,-0.0,0.0,-0.0
1,4.3449,61.3124,7.8302,0.8707,0.2267,0.1367,0.0,-0.0,0.0,-0.0
2,4.5442,33.1108,5.7542,0.9422,0.5417,0.1644,0.0,-0.0,0.0,-0.0
3,2.8451,11.401,3.3765,0.9811,0.3228,0.0995,0.0,-0.0,0.0,-0.0
4,4.4864,96.4719,9.822,0.846,0.1557,0.1092,0.0,-0.0,0.0,-0.0
5,3.7168,19.2751,4.3903,0.9615,0.245,0.1114,0.0,-0.0,0.0,-0.0
6,3.1782,12.7117,3.5653,0.9698,0.1312,0.1135,0.0,-0.0,0.0,-0.0
7,3.7255,38.9144,6.2381,0.9279,0.1558,0.1199,0.0,-0.0,0.0,-0.0
8,4.7182,111.1003,10.5404,0.819,0.365,0.1329,0.0,-0.0,0.0,-0.0
9,3.4207,22.3948,4.7323,0.9625,0.2582,0.1183,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3.484,23.0139,4.7973,0.9554,0.2645,0.1411,0.0,-0.0,0.0,-0.0
1,4.3846,61.9295,7.8695,0.8694,0.2274,0.138,0.0,-0.0,0.0,-0.0
2,4.4783,31.6734,5.6279,0.9447,0.5489,0.1633,0.0,-0.0,0.0,-0.0
3,2.8708,11.4097,3.3778,0.9811,0.3231,0.1004,0.0,-0.0,0.0,-0.0
4,4.5095,96.721,9.8347,0.8456,0.1558,0.1096,0.0,-0.0,0.0,-0.0
5,3.6961,18.7889,4.3346,0.9625,0.2436,0.1115,0.0,-0.0,0.0,-0.0
6,3.2046,12.9342,3.5964,0.9692,0.1315,0.1141,0.0,-0.0,0.0,-0.0
7,3.7166,38.7099,6.2217,0.9283,0.1559,0.1199,0.0,-0.0,0.0,-0.0
8,4.7237,111.3221,10.5509,0.8187,0.3646,0.1333,0.0,-0.0,0.0,-0.0
9,3.3881,21.8238,4.6716,0.9635,0.2573,0.1177,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2_RND,RMSE_RND,R2_CEIL,RMSE_CEIL
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2.6319,23.8182,4.8804,0.9538,0.3702,0.0796,0.0,-0.0,0.0,-0.0
1,4.2342,76.9602,8.7727,0.8377,0.2561,0.1189,0.0,-0.0,0.0,-0.0
2,3.84,34.6827,5.8892,0.9394,0.6016,0.114,0.0,-0.0,0.0,-0.0
3,1.9694,10.6563,3.2644,0.9824,0.3718,0.0499,0.0,-0.0,0.0,-0.0
4,4.5244,128.6978,11.3445,0.7946,0.167,0.0874,0.0,-0.0,0.0,-0.0
5,2.1645,10.6823,3.2684,0.9787,0.1093,0.0551,0.0,-0.0,0.0,-0.0
6,2.2413,11.8852,3.4475,0.9717,0.099,0.0669,0.0,-0.0,0.0,-0.0
7,3.0498,47.6227,6.9009,0.9117,0.1408,0.0654,0.0,-0.0,0.0,-0.0
8,4.1043,129.8662,11.3959,0.7884,0.2624,0.0879,0.0,-0.0,0.0,-0.0
9,2.3388,13.7358,3.7062,0.977,0.1736,0.0688,0.0,-0.0,0.0,-0.0


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

In [19]:
# Optional shortcut: load previously saved tuned models instead of re-tuning.
#
# The load is skipped when `tuned_models` was already populated in this
# session. Otherwise, once the directory exists, freshly tuned models would be
# overwritten by stale copies from disk and then re-saved by the next cell.
pipeline_path = '../pipelines/tuned_models/'
model_names_to_load = ['br', 'lasso', 'llar', 'en', 'huber']

if globals().get('tuned_models'):
    print("Tuned models already available in this session; skipping load from disk")
elif not os.path.exists(pipeline_path):
    print("Path to pipeline does not exist")
else:
    tuned_models = {}

    for name in model_names_to_load:
        # Keep only the 'trained_model' step of each loaded pipeline.
        tuned_models[name] = load_model(os.path.join(pipeline_path, name)).named_steps['trained_model']

    print(tuned_models)

Path to pipeline does not exist


In [20]:
# Persist every tuned model under ../pipelines/tuned_models/<model id>,
# creating the directory first if it is missing.
pipeline_path = '../pipelines/tuned_models/'
os.makedirs(pipeline_path, exist_ok=True)

for model_name in tuned_models:
    target_path = os.path.join(pipeline_path, model_name)
    save_model(tuned_models[model_name], target_path)

Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved


In [21]:
# Run predict_model for every tuned model and store the metrics table that
# pull() returns straight afterwards, keyed by model id.
tuning_predictions_metric = {}

for model_name in tuned_models:
    predict_model(tuned_models[model_name], verbose=False)
    # pull() must follow predict_model directly so it captures that call's table.
    tuning_predictions_metric[model_name] = pull()

In [22]:
file_name = 'temp_excel.xlsx'


def _combine_metrics(metrics_by_model):
    """Stack the per-model metric tables into one frame with a fresh index.

    Args:
        metrics_by_model: dict mapping a model id to the metrics DataFrame
            collected for it.

    Returns:
        A single DataFrame with the tables concatenated in dict order.

    Raises:
        ValueError: if the dict is empty (raised by pd.concat).
    """
    return pd.concat(list(metrics_by_model.values()), ignore_index=True)


holdout_results_df = _combine_metrics(holdout_predictions_metric)
tuned_results_df = _combine_metrics(tuning_predictions_metric)

print("--- Holdout Predictions DataFrame ---", end='\n\n')
print(holdout_results_df.to_string())

print("\n--- Tuned Predictions DataFrame ---", end='\n\n')
print(tuned_results_df.to_string())

# Both sheets go through a single writer: reopening the workbook with mode='a'
# is rejected by the xlsxwriter engine, which pandas prefers for .xlsx files
# when it is installed.
with pd.ExcelWriter(file_name) as writer:
    holdout_results_df.to_excel(writer, sheet_name='Base Model Metrics', index=False)
    tuned_results_df.to_excel(writer, sheet_name='Tuned Model Metrics', index=False)

# Keep the original name bound to the tuned results, as before, in case later
# cells read it.
results_df = tuned_results_df

--- Holdout Predictions DataFrame ---

                          Model     MAE      MSE    RMSE      R2   RMSLE    MAPE  R2_RND  RMSE_RND  R2_CEIL  RMSE_CEIL
0                Bayesian Ridge  3.5377  19.1429  4.3753  0.9753  0.3481  0.1180  0.9750    4.4064   0.9741     4.4852
1              Lasso Regression  3.7385  20.2916  4.5046  0.9738  0.3562  0.1253  0.9737    4.5157   0.9721     4.6485
2  Lasso Least Angle Regression  3.7381  20.2889  4.5043  0.9738  0.3562  0.1253  0.9737    4.5157   0.9721     4.6485
3                   Elastic Net  3.6556  19.3864  4.4030  0.9750  0.3510  0.1227  0.9750    4.4036   0.9733     4.5470
4               Huber Regressor  3.5040  40.7143  6.3808  0.9475  0.5210  0.0649  0.9475    6.3829   0.9456     6.4949

--- Tuned Predictions DataFrame ---

                          Model     MAE      MSE    RMSE      R2   RMSLE    MAPE  R2_RND  RMSE_RND  R2_CEIL  RMSE_CEIL
0                Bayesian Ridge  3.5377  19.1429  4.3753  0.9753  0.3481  0.1180  0.9750  