## 1. Configuration

Set the paths and flags for the modeling process below. 
- `TRAIN_NEW_MODELS`: Set to `True` to run training and tuning. Set to `False` to load existing models.
- `INPUT_DIR`: The directory where your source data CSV is located.
- `DATAFRAME_NAME`: The name of your CSV file (without the `.csv` extension).
- `OUTPUT_DIR_WINDOWS`: The root folder on your Windows drive where model pipelines will be saved. The path is written in WSL format (e.g., `/mnt/d/` for the `D:` drive).

In [7]:
import os

# Set to True to run the full training and tuning process.
# Set to False to load pre-existing models from the output directory.
TRAIN_NEW_MODELS = True

# PATHS
INPUT_DIR = os.path.join('..', '..', 'data', 'upsampled')
DATAFRAME_NAME = 'mean_df' # Name of the .csv file without the extension

# Output directory for saving model pipelines (WSL format for Windows D: drive)
# This path corresponds to D:\ML_Pipelines in Windows
OUTPUT_ROOT_DIR_WINDOWS = '/mnt/d/EMEWS_ML_Pipelines_Output/patient_only_regression'
DATAFRAME_SPECIFIC_PATH = os.path.join(OUTPUT_ROOT_DIR_WINDOWS, DATAFRAME_NAME)
BASE_MODEL_PATH = os.path.join(DATAFRAME_SPECIFIC_PATH, 'base_models')
TUNED_MODEL_PATH = os.path.join(DATAFRAME_SPECIFIC_PATH, 'tuned_models')

# Create directories if they don't exist
if TRAIN_NEW_MODELS:
    print(f"Creating directory for base models: {BASE_MODEL_PATH}")
    os.makedirs(BASE_MODEL_PATH, exist_ok=True)
    print(f"Creating directory for tuned models: {TUNED_MODEL_PATH}")
    os.makedirs(TUNED_MODEL_PATH, exist_ok=True)
else:
    print("TRAIN_NEW_MODELS is False. Will attempt to load existing models.")

Creating directory for base models: /mnt/d/EMEWS_ML_Pipelines_Output/patient_only_regression/mean_df/base_models
Creating directory for tuned models: /mnt/d/EMEWS_ML_Pipelines_Output/patient_only_regression/mean_df/tuned_models


## 2. Setup and Data Loading

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from pycaret.regression import *

In [9]:
df = pd.read_csv(os.path.join(INPUT_DIR, f'{DATAFRAME_NAME}.csv'))
df['date'] = pd.to_datetime(df['date'])
df = df[['date', 'total_number_of_patients']]

## 3. Pycaret Setup

In [10]:
fh = 60
split_point = df.shape[0] - fh

train_data = df.iloc[:split_point]
test_data = df.iloc[split_point:]

In [11]:
setup(
    data=train_data,
    target='total_number_of_patients',
    test_data=test_data,
    fold_strategy='timeseries',
    data_split_shuffle=False,
    fold_shuffle=False,
    session_id=123
    );

Unnamed: 0,Description,Value
0,Session id,123
1,Target,total_number_of_patients
2,Target type,Regression
3,Original data shape,"(618, 2)"
4,Transformed data shape,"(618, 4)"
5,Transformed train set shape,"(558, 4)"
6,Transformed test set shape,"(60, 4)"
7,Date features,1
8,Preprocess,True
9,Imputation type,simple


In [12]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


## 4. Model Training or Loading

Based on the `TRAIN_NEW_MODELS` flag, this section will either train and save new models or load existing ones.

In [13]:
created_models = {}
tuned_models = {}

if TRAIN_NEW_MODELS:
    print("Starting Model Training and Tuning")
    
    # Step 1: Compare base models
    compare_models(exclude=['lightgbm', 'par', 'dummy', 'lar'], errors='raise')
    best_models_df = pull()
    model_names_to_process = best_models_df.index[:5].tolist()

    # Step 2: Create, save, and tune base models
    for model_name in model_names_to_process:
        print(f'\n--- Processing Model: {model_name} ---')
        
        # Create base model
        print(f'Creating base model: {model_name}')
        base_model = create_model(model_name, verbose=False)
        created_models[model_name] = base_model
        
        # Save base model pipeline
        save_path_base = os.path.join(BASE_MODEL_PATH, model_name)
        print(f'Saving base model to: {save_path_base}')
        save_model(base_model, save_path_base)
        
        # Tune model
        print(f'Tuning model: {model_name}')
        tuned_model = tune_model(
            base_model,
            search_library='scikit-optimize',
            n_iter=20,
            early_stopping=True,
            verbose=False
        )
        tuned_models[model_name] = tuned_model
        
        # Save tuned model pipeline
        save_path_tuned = os.path.join(TUNED_MODEL_PATH, model_name)
        print(f'Saving tuned model to: {save_path_tuned}')
        save_model(tuned_model, save_path_tuned)

else:
    print("--- Loading Existing Models ---")
    # Load base and tuned models if they exist
    if os.path.exists(BASE_MODEL_PATH):
        model_names_to_process = [os.path.splitext(f)[0] for f in os.listdir(BASE_MODEL_PATH) if f.endswith('.pkl')]
        print(f"Found models in {BASE_MODEL_PATH}: {model_names_to_process}")
    else:
        print(f"ERROR: Base model directory not found at {BASE_MODEL_PATH}. Cannot load models.")
        model_names_to_process = []
    
    for name in model_names_to_process:
        base_path = os.path.join(BASE_MODEL_PATH, name)
        tuned_path = os.path.join(TUNED_MODEL_PATH, name)
        
        # Load Base Model
        if os.path.exists(f'{base_path}.pkl'):
            print(f'Loading base model: {name} from {base_path}')
            created_models[name] = load_model(base_path, verbose=False)
        else:
            print(f'WARNING: Base model for {name} not found at {base_path}.pkl')
            
        # Load Tuned Model
        if os.path.exists(f'{tuned_path}.pkl'):
            print(f'Loading tuned model: {name} from {tuned_path}')
            tuned_models[name] = load_model(tuned_path, verbose=False)
        else:
            print(f'WARNING: Tuned model for {name} not found at {tuned_path}.pkl')

print("\nModel processing complete.")
print(f"\nBase models available: {list(created_models.keys())}")
print(f"Tuned models available: {list(tuned_models.keys())}")

Starting Model Training and Tuning


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
ada,AdaBoost Regressor,18.4324,496.7756,22.0516,-0.385,0.5891,0.685,0.012
et,Extra Trees Regressor,17.6821,535.6783,22.7733,-0.4746,0.6052,0.5928,0.038
catboost,CatBoost Regressor,18.3304,542.1815,22.9699,-0.4784,0.6069,0.6194,0.395
rf,Random Forest Regressor,18.0772,537.9417,22.7869,-0.4841,0.6005,0.6193,0.053
knn,K Neighbors Regressor,18.644,555.7389,23.2315,-0.5123,0.6141,0.6551,0.013
gbr,Gradient Boosting Regressor,18.3697,555.4403,23.1354,-0.5146,0.6069,0.6188,0.02
en,Elastic Net,19.3045,589.9751,23.5926,-0.5592,0.6219,0.6539,0.009
xgboost,Extreme Gradient Boosting,18.4192,573.6637,23.6055,-0.5958,0.6305,0.6206,0.234
lasso,Lasso Regression,19.3841,610.9734,23.9255,-0.6122,0.6275,0.6407,0.227
llar,Lasso Least Angle Regression,19.3841,610.9735,23.9255,-0.6122,0.6275,0.6407,0.013



--- Processing Model: ada ---
Creating base model: ada
Saving base model to: /mnt/d/EMEWS_ML_Pipelines_Output/patient_only_regression/mean_df/base_models/ada
Transformation Pipeline and Model Successfully Saved
Tuning model: ada
Saving tuned model to: /mnt/d/EMEWS_ML_Pipelines_Output/patient_only_regression/mean_df/tuned_models/ada
Transformation Pipeline and Model Successfully Saved

--- Processing Model: et ---
Creating base model: et
Saving base model to: /mnt/d/EMEWS_ML_Pipelines_Output/patient_only_regression/mean_df/base_models/et
Transformation Pipeline and Model Successfully Saved
Tuning model: et
Saving tuned model to: /mnt/d/EMEWS_ML_Pipelines_Output/patient_only_regression/mean_df/tuned_models/et
Transformation Pipeline and Model Successfully Saved

--- Processing Model: catboost ---
Creating base model: catboost
Saving base model to: /mnt/d/EMEWS_ML_Pipelines_Output/patient_only_regression/mean_df/base_models/catboost
Transformation Pipeline and Model Successfully Saved
Tu

## 5. Evaluate and Analyze Models

This section provides detailed plots and metrics for each of the models (either newly trained or loaded).

### Bayesian Ridge

In [14]:
if 'br' in created_models:
    print("--- Base Bayesian Ridge ---")
    evaluate_model(created_models['br'])
if 'br' in tuned_models:
    print("--- Tuned Bayesian Ridge ---")
    evaluate_model(tuned_models['br'])

### Lasso Regression

In [15]:
if 'lasso' in created_models:
    print("--- Base Lasso Regression ---")
    evaluate_model(created_models['lasso'])
# if 'lasso' in tuned_models:
#     print("--- Tuned Lasso Regression ---")
#     evaluate_model(tuned_models['lasso'])

### Lasso Least Angle Regression

In [16]:
if 'llar' in created_models:
    print("--- Base Lasso Least Angle Regression ---")
    evaluate_model(created_models['llar'])
if 'llar' in tuned_models:
    print("--- Tuned Lasso Least Angle Regression ---")
    evaluate_model(tuned_models['llar'])

### Elastic Net

In [17]:
if 'en' in created_models:
    print("--- Base Elastic Net ---")
    evaluate_model(created_models['en'])
if 'en' in tuned_models:
    print("--- Tuned Elastic Net ---")
    evaluate_model(tuned_models['en'])

### Huber Regressor

In [18]:
if 'huber' in created_models:
    print("--- Base Huber Regressor ---")
    evaluate_model(created_models['huber'])
if 'huber' in tuned_models:
    print("--- Tuned Huber Regressor ---")
    evaluate_model(tuned_models['huber'])

## 6. Custom Metrics and Final Predictions
This section defines and adds custom metrics for evaluating predictions on the hold-out set, then generates and saves the final performance metrics to an Excel file.

In [19]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def r2_rounded(y_true, y_pred):
    """Calculates R2 score after rounding predictions to the nearest whole number."""
    return r2_score(y_true, np.round(y_pred))

def rmse_rounded(y_true, y_pred):
    """Calculates RMSE after rounding predictions to the nearest whole number."""
    return np.sqrt(mean_squared_error(y_true, np.round(y_pred)))

def r2_ceil(y_true, y_pred):
    """Calculates R2 score after ceiling predictions to the nearest whole number."""
    return r2_score(y_true, np.ceil(y_pred))

def rmse_ceil(y_true, y_pred):
    """Calculates RMSE after ceiling predictions to the nearest whole number."""
    return np.sqrt(mean_squared_error(y_true, np.ceil(y_pred)))

def mae_rounded(y_true, y_pred):
    """Calculates MAE after rounding predictions to the nearest whole number."""
    return mean_absolute_error(y_true, np.round(y_pred))

def mae_ceil(y_true, y_pred):
    """Calculates MAE after ceiling predictions to the nearest whole number."""
    return mean_absolute_error(y_true, np.ceil(y_pred))

In [20]:
try:
    add_metric('R2_Rounded', 'R2_RND', r2_rounded, greater_is_better=True)
    add_metric('RMSE_Rounded', 'RMSE_RND', rmse_rounded, greater_is_better=False)
    add_metric('MAE_Rounded', 'MAE_RND', mae_rounded, greater_is_better=False)
    add_metric('R2_Ceil', 'R2_CEIL', r2_ceil, greater_is_better=True)
    add_metric('RMSE_Ceil', 'RMSE_CEIL', rmse_ceil, greater_is_better=False)
    add_metric('MAE_Ceil', 'MAE_CEIL', mae_ceil, greater_is_better=False)
except ValueError:
    print("Metrics may have already been added in this session.")

In [21]:
# Generate predictions for base models
holdout_predictions_metric = {}
if not created_models:
    print("No base models available to make predictions.")
else:
    for model_name, model_object in created_models.items():
        print(f"Generating predictions for base model: {model_name}")
        predict_model(model_object, verbose=False)
        holdout_predictions_metric[model_name] = pull()

# Generate predictions for tuned models
tuning_predictions_metric = {}
if not tuned_models:
    print("No tuned models available to make predictions.")
else:
    for model_name, model_object in tuned_models.items():
        print(f"Generating predictions for tuned model: {model_name}")
        predict_model(model_object, verbose=False)
        tuning_predictions_metric[model_name] = pull()

Generating predictions for base model: ada
Generating predictions for base model: et
Generating predictions for base model: catboost
Generating predictions for base model: rf
Generating predictions for base model: knn
Generating predictions for tuned model: ada
Generating predictions for tuned model: et
Generating predictions for tuned model: catboost
Generating predictions for tuned model: rf
Generating predictions for tuned model: knn


In [22]:
output_excel_path = os.path.join(DATAFRAME_SPECIFIC_PATH, 'model_performance_metrics.xlsx')
print(f"Saving performance metrics to: {output_excel_path}")

with pd.ExcelWriter(output_excel_path) as writer:
    # --- Process and Save Base Model Metrics ---
    if holdout_predictions_metric:
        list_of_metric_dfs_base = []
        for model_name, metrics_df in holdout_predictions_metric.items():
            list_of_metric_dfs_base.append(metrics_df)
        
        results_df_base = pd.concat(list_of_metric_dfs_base, ignore_index=True).sort_values('R2', ascending=False)
        print("\n--- Base Model Holdout Predictions ---")
        print(results_df_base.to_string())
        results_df_base.to_excel(writer, sheet_name='Base Model Metrics', index=False)
    else:
        print("\nNo base model metrics to save.")

    # --- Process and Save Tuned Model Metrics ---
    if tuning_predictions_metric:
        list_of_metric_dfs_tuned = []
        for model_name, metrics_df in tuning_predictions_metric.items():
            list_of_metric_dfs_tuned.append(metrics_df)
            
        results_df_tuned = pd.concat(list_of_metric_dfs_tuned, ignore_index=True).sort_values('R2', ascending=False)
        print("\n--- Tuned Model Holdout Predictions ---")
        print(results_df_tuned.to_string())
        results_df_tuned.to_excel(writer, sheet_name='Tuned Model Metrics', index=False)
    else:
        print("\nNo tuned model metrics to save.")

Saving performance metrics to: /mnt/d/EMEWS_ML_Pipelines_Output/patient_only_regression/mean_df/model_performance_metrics.xlsx

--- Base Model Holdout Predictions ---
                     Model      MAE       MSE     RMSE      R2   RMSLE    MAPE  R2_RND  RMSE_RND  MAE_RND  R2_CEIL  RMSE_CEIL  MAE_CEIL
0       AdaBoost Regressor  17.0920  452.8652  21.2806 -0.0855  0.4548  0.4453 -0.0877   21.3018  17.1000  -0.0830    21.2564   17.0333
3  Random Forest Regressor  17.0331  513.1463  22.6527 -0.2300  0.4852  0.4759 -0.2381   22.7274  17.1000  -0.2269    22.6245   16.9333
4    K Neighbors Regressor  19.1200  547.0867  23.3899 -0.3114  0.4878  0.4687 -0.3162   23.4329  19.1667  -0.3084    23.3631   19.0667
2       CatBoost Regressor  17.8406  556.1205  23.5822 -0.3330  0.4994  0.4831 -0.3218   23.4826  17.7667  -0.3439    23.6784   17.9000
1    Extra Trees Regressor  18.3403  598.2740  24.4596 -0.4341  0.5259  0.5484 -0.4414   24.5221  18.3667  -0.4525    24.6164   18.4333

--- Tuned Model 

## 7. Final Model Check

As a final step, let's examine the best performing tuned model and its predictions on the test set.

In [23]:
# Example: Check the tuned Bayesian Ridge model
best_model_name = 'br'

if best_model_name in tuned_models:
    print(f"--- Final Check of Tuned Model: {best_model_name} ---")
    predict_model(tuned_models[best_model_name], verbose=True, data=test_data)
    final_metrics = pull()
    print("\nPerformance on Test Data:")
    display(final_metrics)
else:
    print(f"Model '{best_model_name}' not found in the tuned_models dictionary.")

Model 'br' not found in the tuned_models dictionary.
