## 1. Configuration

Set the paths and flags for the modeling process below. 
- `TRAIN_NEW_MODELS`: Set to `True` to run training and tuning. Set to `False` to load existing models.
- `INPUT_DIR`: The directory where your source data CSV is located.
- `DATAFRAME_NAME`: The name of your CSV file (without the `.csv` extension).
- `OUTPUT_ROOT_DIR_WINDOWS`: The root folder on your Windows drive where model pipelines will be saved. The path is written in WSL format (e.g., `/mnt/d/` for the `D:` drive).

In [19]:
import os

# Master switch: True runs the full training and tuning pass, False reuses
# the models already stored in the output directory.
TRAIN_NEW_MODELS = True

# --- Input data ---
INPUT_DIR = os.path.join('..', 'data', 'upsampled')
DATAFRAME_NAME = 'mean_df'  # CSV file name, extension omitted

# --- Output locations ---
# WSL-style path on the Windows D: drive, i.e.
# D:\EMEWS_ML_Pipelines_Output\timeseries when viewed from Windows.
OUTPUT_ROOT_DIR_WINDOWS = '/mnt/d/EMEWS_ML_Pipelines_Output/timeseries'
DATAFRAME_SPECIFIC_PATH = os.path.join(OUTPUT_ROOT_DIR_WINDOWS, DATAFRAME_NAME)
BASE_MODEL_PATH = os.path.join(DATAFRAME_SPECIFIC_PATH, 'base_models')
TUNED_MODEL_PATH = os.path.join(DATAFRAME_SPECIFIC_PATH, 'tuned_models')

# The output folders are only created when this run is going to write models.
if not TRAIN_NEW_MODELS:
    print("TRAIN_NEW_MODELS is False. Will attempt to load existing models.")
else:
    for model_kind, model_dir in (('base', BASE_MODEL_PATH), ('tuned', TUNED_MODEL_PATH)):
        print(f"Creating directory for {model_kind} models: {model_dir}")
        os.makedirs(model_dir, exist_ok=True)

Creating directory for base models: /mnt/d/EMEWS_ML_Pipelines_Output/timeseries/mean_df/base_models
Creating directory for tuned models: /mnt/d/EMEWS_ML_Pipelines_Output/timeseries/mean_df/tuned_models


## 2. Setup and Data Loading

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pycaret.time_series import *

In [21]:
# Read the source CSV with 'date' parsed as datetimes, then index the frame by it.
csv_path = os.path.join(INPUT_DIR, f'{DATAFRAME_NAME}.csv')
df = pd.read_csv(csv_path, parse_dates=['date']).set_index('date')

## 3. PyCaret Setup

In [23]:
# Name of the column in `df` that is passed to the experiment as the forecasting target.
TARGET_COLUMN = 'total_number_of_patients'

In [24]:
# Create the experiment object, then configure it from a single options dict:
# forecast horizon of 60 steps, the target column, and a fixed session id.
exp = TSForecastingExperiment()
setup_options = {
    'data': df,
    'fh': 60,
    'target': TARGET_COLUMN,
    'session_id': 123,
}
exp.setup(**setup_options);

Unnamed: 0,Description,Value
0,session_id,123
1,Target,total_number_of_patients
2,Approach,Univariate
3,Exogenous Variables,Present
4,Original data shape,"(618, 20)"
5,Transformed data shape,"(618, 20)"
6,Transformed train set shape,"(558, 20)"
7,Transformed test set shape,"(60, 20)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


In [25]:
# Display the table of model IDs this experiment offers (the IDs are what
# create_model() is called with further down).
exp.models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
arima,ARIMA,sktime.forecasting.arima.ARIMA,True
auto_arima,Auto ARIMA,sktime.forecasting.arima.AutoARIMA,True
croston,Croston,sktime.forecasting.croston.Croston,True
lr_cds_dt,Linear w/ Cond. Deseasonalize & Detrending,pycaret.containers.models.time_series.BaseCdsD...,True
en_cds_dt,Elastic Net w/ Cond. Deseasonalize & Detrending,pycaret.containers.models.time_series.BaseCdsD...,True
ridge_cds_dt,Ridge w/ Cond. Deseasonalize & Detrending,pycaret.containers.models.time_series.BaseCdsD...,True
lasso_cds_dt,Lasso w/ Cond. Deseasonalize & Detrending,pycaret.containers.models.time_series.BaseCdsD...,True
llar_cds_dt,Lasso Least Angular Regressor w/ Cond. Deseaso...,pycaret.containers.models.time_series.BaseCdsD...,True
br_cds_dt,Bayesian Ridge w/ Cond. Deseasonalize & Detren...,pycaret.containers.models.time_series.BaseCdsD...,True
huber_cds_dt,Huber w/ Cond. Deseasonalize & Detrending,pycaret.containers.models.time_series.BaseCdsD...,True


## 4. Model Training or Loading

Based on the `TRAIN_NEW_MODELS` flag, this section will either train and save new models or load existing ones.

In [26]:
import warnings
# Suppress FutureWarning-category warnings from this point on.
warnings.filterwarnings("ignore", category=FutureWarning)

In [27]:
# Registries filled by whichever branch runs below: model ID -> model object.
created_models = {}
tuned_models = {}

if TRAIN_NEW_MODELS:
    print("Starting Model Training and Tuning")

    # The model IDs are fixed here; choosing them from a compare_models()
    # leaderboard was left disabled in this notebook.
    model_names_to_process = ['arima', 'auto_arima']

    # Create, save, and tune each model in turn.
    for model_name in model_names_to_process:
        print(f'\n--- Processing Model: {model_name} ---')

        # Create base model
        print(f'Creating base model: {model_name}')
        base_model = exp.create_model(model_name, verbose=False)
        created_models[model_name] = base_model

        # Save base model (model_only=True stores the model object itself)
        save_path_base = os.path.join(BASE_MODEL_PATH, model_name)
        print(f'Saving base model to: {save_path_base}')
        exp.save_model(base_model, save_path_base, model_only=True)

        # Tune model
        print(f'Tuning model: {model_name}')

        if model_name == 'auto_arima':
            # The base fit is stored as the "tuned" model so both registries
            # contain an entry for every processed model ID.
            print('Auto Arima model already tuned - skipping tuning step.')
            tuned_model = base_model
        else:
            tuned_model = exp.tune_model(base_model)
        tuned_models[model_name] = tuned_model

        # Save tuned model
        save_path_tuned = os.path.join(TUNED_MODEL_PATH, model_name)
        print(f'Saving tuned model to: {save_path_tuned}')
        exp.save_model(tuned_model, save_path_tuned, model_only=True)

else:
    print("--- Loading Existing Models ---")
    # Model IDs are discovered from the .pkl files in the base model directory.
    if os.path.isdir(BASE_MODEL_PATH):
        # os.listdir returns entries in arbitrary order; sorting keeps the load
        # order (and so the order of the registries and reports) stable across runs.
        model_names_to_process = sorted(
            os.path.splitext(f)[0] for f in os.listdir(BASE_MODEL_PATH) if f.endswith('.pkl')
        )
        print(f"Found models in {BASE_MODEL_PATH}: {model_names_to_process}")
    else:
        print(f"ERROR: Base model directory not found at {BASE_MODEL_PATH}. Cannot load models.")
        model_names_to_process = []

    # NOTE(review): the saved models are .pkl files; loading a pickle can execute
    # arbitrary code, so only load from an output directory you trust.
    for name in model_names_to_process:
        # load_model is given the path without the '.pkl' extension.
        base_path = os.path.join(BASE_MODEL_PATH, name)
        tuned_path = os.path.join(TUNED_MODEL_PATH, name)

        # Load Base Model
        if os.path.exists(f'{base_path}.pkl'):
            print(f'Loading base model: {name} from {base_path}')
            created_models[name] = exp.load_model(base_path, verbose=False)
        else:
            print(f'WARNING: Base model for {name} not found at {base_path}.pkl')

        # Load Tuned Model
        if os.path.exists(f'{tuned_path}.pkl'):
            print(f'Loading tuned model: {name} from {tuned_path}')
            tuned_models[name] = exp.load_model(tuned_path, verbose=False)
        else:
            print(f'WARNING: Tuned model for {name} not found at {tuned_path}.pkl')

print("\nModel processing complete.")
print(f"\nBase models available: {list(created_models.keys())}")
print(f"Tuned models available: {list(tuned_models.keys())}")

Starting Model Training and Tuning

--- Processing Model: arima ---
Creating base model: arima
Saving base model to: /mnt/d/EMEWS_ML_Pipelines_Output/timeseries/mean_df/base_models/arima
Model Successfully Saved
Tuning model: arima


Unnamed: 0,cutoff,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,2024-07-10 12:00,0.164,0.2394,3.031,5.8664,0.0757,0.0966,0.8956
1,2024-08-09 12:00,0.2749,0.3239,5.1984,8.0942,0.1347,0.1609,0.8693
2,2024-09-08 12:00,0.2332,0.232,4.5613,5.9551,0.1106,0.1122,0.9223
Mean,NaT,0.224,0.2651,4.2636,6.6386,0.107,0.1232,0.8957
SD,NaT,0.0457,0.0417,0.9095,1.0299,0.0242,0.0274,0.0217


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  4.2min finished


Saving tuned model to: /mnt/d/EMEWS_ML_Pipelines_Output/timeseries/mean_df/tuned_models/arima
Model Successfully Saved

--- Processing Model: auto_arima ---
Creating base model: auto_arima
Saving base model to: /mnt/d/EMEWS_ML_Pipelines_Output/timeseries/mean_df/base_models/auto_arima
Model Successfully Saved
Tuning model: auto_arima
Auto Arima model already tuned - skipping tuning step.
Saving tuned model to: /mnt/d/EMEWS_ML_Pipelines_Output/timeseries/mean_df/tuned_models/auto_arima
Model Successfully Saved

Model processing complete.

Base models available: ['arima', 'auto_arima']
Tuned models available: ['arima', 'auto_arima']


## 5. Custom Metrics and Final Predictions
This section defines and adds custom metrics for evaluating predictions on the hold-out set, then generates and saves the final performance metrics to an Excel file.

In [28]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def r2_rounded(y_true, y_pred):
    """Return the R2 score of `y_true` against `y_pred` rounded with `np.round`.

    Note: `np.round` sends exact halves to the nearest even integer.
    """
    whole_number_preds = np.round(y_pred)
    return r2_score(y_true, whole_number_preds)

def rmse_rounded(y_true, y_pred):
    """Return the RMSE of `y_true` against `y_pred` rounded with `np.round`.

    Note: `np.round` sends exact halves to the nearest even integer.
    """
    whole_number_preds = np.round(y_pred)
    squared_error = mean_squared_error(y_true, whole_number_preds)
    return np.sqrt(squared_error)

def r2_ceil(y_true, y_pred):
    """Return the R2 score of `y_true` against `y_pred` rounded up with `np.ceil`."""
    rounded_up_preds = np.ceil(y_pred)
    return r2_score(y_true, rounded_up_preds)

def rmse_ceil(y_true, y_pred):
    """Return the RMSE of `y_true` against `y_pred` rounded up with `np.ceil`."""
    rounded_up_preds = np.ceil(y_pred)
    squared_error = mean_squared_error(y_true, rounded_up_preds)
    return np.sqrt(squared_error)

def mae_rounded(y_true, y_pred):
    """Return the MAE of `y_true` against `y_pred` rounded with `np.round`.

    Note: `np.round` sends exact halves to the nearest even integer.
    """
    whole_number_preds = np.round(y_pred)
    return mean_absolute_error(y_true, whole_number_preds)

def mae_ceil(y_true, y_pred):
    """Return the MAE of `y_true` against `y_pred` rounded up with `np.ceil`."""
    rounded_up_preds = np.ceil(y_pred)
    return mean_absolute_error(y_true, rounded_up_preds)

In [29]:
# Custom metrics to register on the experiment, in display order:
# (metric id, display name, scoring function, greater_is_better).
CUSTOM_METRICS = [
    ('R2_Rounded', 'R2_RND', r2_rounded, True),
    ('RMSE_Rounded', 'RMSE_RND', rmse_rounded, False),
    ('MAE_Rounded', 'MAE_RND', mae_rounded, False),
    ('R2_Ceil', 'R2_CEIL', r2_ceil, True),
    ('RMSE_Ceil', 'RMSE_CEIL', rmse_ceil, False),
    ('MAE_Ceil', 'MAE_CEIL', mae_ceil, False),
]

# Register each metric on its own so that one rejected registration (e.g. the
# cell being re-run after some metrics were already added) does not prevent
# the remaining metrics from being registered.
metrics_not_added = []
for metric_id, metric_label, metric_func, higher_is_better in CUSTOM_METRICS:
    try:
        exp.add_metric(metric_id, metric_label, metric_func, greater_is_better=higher_is_better)
    except ValueError:
        metrics_not_added.append(metric_id)

if metrics_not_added:
    print("Metrics may have already been added in this session.")

In [30]:
# Metric tables pulled after predicting with each model, keyed by model ID:
# one store for the base models, one for the tuned models.
holdout_predictions_metric = {}
tuning_predictions_metric = {}

# Run the same predict-then-pull routine for both groups, base models first.
for stage, fitted_models, metric_store in (
    ("base", created_models, holdout_predictions_metric),
    ("tuned", tuned_models, tuning_predictions_metric),
):
    if not fitted_models:
        print(f"No {stage} models available to make predictions.")
        continue

    for model_name, model_object in fitted_models.items():
        print(f"Generating predictions for {stage} model: {model_name}")
        exp.predict_model(model_object, verbose=False)
        # pull() is called right after predict_model() to capture its metric table.
        metric_store[model_name] = exp.pull()

Generating predictions for base model: arima
Generating predictions for base model: auto_arima
Generating predictions for tuned model: arima
Generating predictions for tuned model: auto_arima


In [31]:
# Workbook that receives one sheet per model group (base / tuned).
output_excel_path = os.path.join(DATAFRAME_SPECIFIC_PATH, 'model_performance_metrics.xlsx')

if not holdout_predictions_metric and not tuning_predictions_metric:
    # Nothing to write. Opening an ExcelWriter and closing it without adding a
    # sheet either raises or leaves an empty workbook over any earlier report,
    # depending on the installed Excel engine, so the file is not touched.
    print("\nNo base model metrics to save.")
    print("\nNo tuned model metrics to save.")
else:
    print(f"Saving performance metrics to: {output_excel_path}")

    with pd.ExcelWriter(output_excel_path) as writer:
        # --- Process and Save Base Model Metrics ---
        if holdout_predictions_metric:
            # Stack the per-model metric tables and rank them by R2, best first.
            list_of_metric_dfs_base = list(holdout_predictions_metric.values())
            results_df_base = pd.concat(list_of_metric_dfs_base, ignore_index=True).sort_values('R2', ascending=False)
            print("\n--- Base Model Holdout Predictions ---")
            print(results_df_base.to_string())
            results_df_base.to_excel(writer, sheet_name='Base Model Metrics', index=False)
        else:
            print("\nNo base model metrics to save.")

        # --- Process and Save Tuned Model Metrics ---
        if tuning_predictions_metric:
            # Same stacking and ranking for the tuned models.
            list_of_metric_dfs_tuned = list(tuning_predictions_metric.values())
            results_df_tuned = pd.concat(list_of_metric_dfs_tuned, ignore_index=True).sort_values('R2', ascending=False)
            print("\n--- Tuned Model Holdout Predictions ---")
            print(results_df_tuned.to_string())
            results_df_tuned.to_excel(writer, sheet_name='Tuned Model Metrics', index=False)
        else:
            print("\nNo tuned model metrics to save.")

Saving performance metrics to: /mnt/d/EMEWS_ML_Pipelines_Output/timeseries/mean_df/model_performance_metrics.xlsx

--- Base Model Holdout Predictions ---
        Model    MASE   RMSSE     MAE    RMSE    MAPE   SMAPE      R2  R2_RND  RMSE_RND  MAE_RND  R2_CEIL  RMSE_CEIL  MAE_CEIL
1  Auto ARIMA  0.1634  0.1729  3.2829  4.5522  0.0778  0.0745  0.9503  0.9500    4.5662   3.2833   0.9471     4.6957    3.3833
0       ARIMA  0.1851  0.1818  3.7196  4.7857  0.0753  0.0751  0.9451  0.9459    4.7487   3.6833   0.9437     4.8460    3.7500

--- Tuned Model Holdout Predictions ---
        Model    MASE   RMSSE     MAE    RMSE    MAPE   SMAPE      R2  R2_RND  RMSE_RND  MAE_RND  R2_CEIL  RMSE_CEIL  MAE_CEIL
1  Auto ARIMA  0.1634  0.1729  3.2829  4.5522  0.0778  0.0745  0.9503  0.9500    4.5662   3.2833   0.9471     4.6957    3.3833
0       ARIMA  0.2924  0.2744  5.8763  7.2233  0.1446  0.1264  0.8749  0.8738    7.2549   5.8667   0.8642     7.5266    6.2500


In [32]:
# NOTE(review): this cell reassigns INPUT_DIR and DATAFRAME_NAME, which the
# configuration cell at the top of the notebook already defines. INPUT_DIR gets
# a different value here ('..', '..' instead of '..'), so re-running the data
# loading cell after this one would read from another directory — confirm intended.
INPUT_DIR = os.path.join('..', '..', 'data', 'upsampled')
OUTPUT_DIR_ROOT = '/mnt/d/EMEWS_ML_Pipelines_Output'
# NOTE(review): this points at a regression 'br' model, while the rest of the
# notebook works with time-series models. In the visible cells MODEL_PATH and
# PRE_TRAINED_MODEL_PATH are overwritten by the next cell before being used —
# looks like leftover configuration; verify before removing.
MODEL_PATH = 'regression/mean_df/tuned_models/br'
PRE_TRAINED_MODEL_PATH = os.path.join(OUTPUT_DIR_ROOT, MODEL_PATH)
DATAFRAME_NAME = 'mean_df'

In [33]:
# Path (without file extension) of the base ARIMA model written by the training
# cell: <output root>/timeseries/mean_df/base_models/arima.
OUTPUT_DIR_ROOT = '/mnt/d/EMEWS_ML_Pipelines_Output'
MODEL_PATH = 'timeseries/mean_df/base_models/arima'
PRE_TRAINED_MODEL_PATH = os.path.join(OUTPUT_DIR_ROOT, MODEL_PATH)

In [34]:
# Load the saved base ARIMA model from PRE_TRAINED_MODEL_PATH into `arima`.
arima = exp.load_model(PRE_TRAINED_MODEL_PATH)

Transformation Pipeline and Model Successfully Loaded


In [35]:
# Show the class of the loaded object.
type(arima)

sktime.forecasting.arima.ARIMA

In [36]:
# Display the loaded model's fit summary (coefficients and diagnostics).
arima.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,558.0
Model:,"SARIMAX(1, 0, 0)x(0, 1, 0, 26)",Log Likelihood,-1823.752
Date:,"Fri, 22 Aug 2025",AIC,3691.504
Time:,04:49:20,BIC,3785.59
Sample:,01-04-2024,HQIC,3728.324
,- 10-08-2024,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.1181,0.400,0.295,0.768,-0.666,0.902
zone_a_mwr_patients,0.8537,0.317,2.696,0.007,0.233,1.474
zone_a_mwr_cat_3,-0.0500,0.318,-0.157,0.875,-0.673,0.573
zone_a_mwr_cat_4,-0.2007,0.320,-0.627,0.531,-0.828,0.427
zone_a_mwr_sets_of_emews,-0.2536,0.016,-15.792,0.000,-0.285,-0.222
zone_a_mwr_deescalations,-0.0178,0.055,-0.326,0.744,-0.125,0.089
zone_a_mwr_escalations,-0.1450,0.264,-0.549,0.583,-0.662,0.372
zone_a__patients,0.9899,0.244,4.062,0.000,0.512,1.468
zone_a__cat_2,-0.0434,0.245,-0.177,0.859,-0.523,0.436

0,1,2,3
Ljung-Box (L1) (Q):,0.0,Jarque-Bera (JB):,4161.39
Prob(Q):,1.0,Prob(JB):,0.0
Heteroskedasticity (H):,0.8,Skew:,-0.03
Prob(H) (two-sided):,0.15,Kurtosis:,16.7
