In [32]:
# !pip install autogluon

In [2]:
import numpy as np
import pandas as pd
import pathlib
import pandas as pd
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# AutoGluon

AutoGluon can be used to run automatically or with more precise configurations. Included models are listed here:
https://auto.gluon.ai/stable/tutorials/timeseries/forecasting-model-zoo.html

To fit our experiment design, it may make the most sense to run models separately. This way we can collect all of the forecasts produced by each model type, rather than ending up with forecasts produced by different methods all being called AutoGluon. That said, if we want to treat AutoGluon as a closed method, we can (and maybe should!) do that as well, if for no other reason than to highlight the power of Auto* frameworks.

So to be clear, we can initially experiment with the underlying models separately, e.g. with:
- NaiveModel
- ...
- TemporalFusionTransformerModel

And at the end, we can consider the use of AutoGluon's presets (e.g. `'fast_training'` and `'best_quality'`) where we would just consider those to be 'techniques' themselves to compare in a special analysis to see whether AutoGluon could have just produced the best forecast on its own.


Higher quality presets usually result in better forecasts but take longer to train. The following presets are available:

- "fast_training": fit simple statistical models (ETS, Theta, Naive, SeasonalNaive) + fast tree-based model RecursiveTabular. These models are fast to train but may not be very accurate.

- "medium_quality": all models mentioned above + deep learning model DeepAR. Default setting that produces good forecasts with reasonable training time.

- "high_quality": all models mentioned above + automatically tuned statistical models (AutoETS, AutoARIMA) + tree-based model DirectTabular + deep learning models TemporalFusionTransformer and PatchTST. Much more accurate than medium_quality, but takes longer to train.

- "best_quality": all models mentioned above + more tabular models + training multiple copies of DeepAR. Usually better than high_quality, but takes even longer to train.


In [3]:
# Names that the experiment loops treat as AutoGluon presets (passed to
# `predictor.fit(presets=...)`) rather than as individual model types.
AVAILABLE_PRESETS = ["fast_training", "medium_quality", "high_quality", "best_quality"]

In [4]:
# Experiment configuration.
#
# EXP_BASE is the prefix of the output directory name, and its text also
# selects the code path further down in this notebook:
#   - contains "global"     -> the global-model loop runs, otherwise the per-series loop runs
#   - contains "covariates" -> (global loop only) training data is built with exogenous columns
# Exactly one EXP_BASE line should be left uncommented.
# EXP_BASE = "autogluon_global_with_scaled_covariates_experiment_preset"
# EXP_BASE = "autogluon_global_mean_SWE__covariates_preset"
# EXP_BASE = "autogluon_global_with_scaled_covariates_experiment_preset"
# EXP_BASE = "autogluon_preset_best_quality_global_model"

# EXP_BASE = "autogluon_with_scaled_covariates_Energy_EXCAUS"
# EXP_BASE = "autogluon_global_with_scaled_covariates_experiment"


# Run all of these:
EXP_BASE = "autogluon"
# EXP_BASE = "autogluon_global_with_scaled_covariates_climate_vars"
# EXP_BASE = "autogluon_with_covariates"
# EXP_BASE = "autogluon_global_with_scaled_covariates"
# EXP_BASE = "autogluon_global_with_covariates"

# Each entry is either an AutoGluon model name (used as the single key of the
# `hyperparameters` dict) or one of AVAILABLE_PRESETS (used as `presets`).
# One experiment directory is produced per entry.
EXP_MODEL_LIST = ["NaiveModel", "SeasonalNaiveModel", "ARIMAModel", "ETSModel", "AutoETSModel"]
# EXP_MODEL_LIST = ["DLinearModel"]
# EXP_MODEL_LIST = ["DeepARModel"]
# EXP_MODEL_LIST = ["DirectTabularModel", "RecursiveTabularModel", "PatchTSTModel", "SimpleFeedForwardModel", "TemporalFusionTransformerModel"]
# EXP_MODEL_LIST = ["PatchTSTModel", "SimpleFeedForwardModel", "TemporalFusionTransformerModel"]

# EXP_MODEL_LIST = ["TemporalFusionTransformerModel", "best_quality"]
# EXP_MODEL_LIST = ["TemporalFusionTransformerModel", "DirectTabularModel"]
# EXP_MODEL_LIST = ["best_quality"]
# EXP_MODEL_LIST = ["medium_quality"]

# EXPERIMENT_NAME = "autogluon"

## Load data

In [40]:
# NOTE(review): `prediction_length` is not read by the experiment loops in this
# notebook (they pass a literal horizon to TimeSeriesPredictor) - confirm which
# value is intended. `context_length` is used as a number of weeks when
# selecting the history shown in the global-model plots.
prediction_length = 12
context_length = 52

# Wide table: one row per timestamp (first CSV column), one column per series.
df = pd.read_csv("../data/processed_data_inputs/Connex.csv", index_col=0)
df.index = pd.to_datetime(df.index)

# Preview goes last so the notebook actually renders it; as a mid-cell
# expression its value was silently discarded.
df.head()

In [41]:
# The series for which the experiment loops save forecasts and draw plots.
target_categories = [
    "IS_MENTAL_HEALTH",
    "IS_SUBSTANCE_ABUSE",
]
target_categories

['IS_MENTAL_HEALTH', 'IS_SUBSTANCE_ABUSE']

In [42]:
## As a placeholder, the parallel CONNEX series are used as covariates.
# all_data = df.drop(columns=target_categories)
# NOTE: this binds a second name to the same DataFrame object (no copy), so
# clearing the index name on the next line also clears it on `df`.
all_data = df
# With an unnamed index, `reset_index()` later produces a column called "index",
# which the long-format conversion renames to "timestamp".
all_data.index.name = None

In [43]:
# Every column name of the wide table (the target series are included here;
# get_autogluon_df filters out the one being studied).
all_covariates = list(all_data.columns)
all_covariates

['IS_MENTAL_HEALTH',
 'IS_SUBSTANCE_ABUSE',
 'IS_PROBLEM_GAMBLING',
 'IS_OTHER',
 'IS_MENTAL_HEALTH_Concurrent Disorder Clients',
 'IS_SUBSTANCE_ABUSE_Concurrent Disorder Clients',
 'IS_PROBLEM_GAMBLING_Concurrent Disorder Clients',
 'IS_OTHER_Concurrent Disorder Clients',
 'IS_MENTAL_HEALTH_Concurrent Disorders',
 'IS_SUBSTANCE_ABUSE_Concurrent Disorders',
 'IS_PROBLEM_GAMBLING_Concurrent Disorders',
 'IS_OTHER_Concurrent Disorders',
 'IS_MENTAL_HEALTH_Families',
 'IS_SUBSTANCE_ABUSE_Families',
 'IS_PROBLEM_GAMBLING_Families',
 'IS_OTHER_Families',
 'IS_MENTAL_HEALTH_Clients with Legal Issues',
 'IS_SUBSTANCE_ABUSE_Clients with Legal Issues',
 'IS_PROBLEM_GAMBLING_Clients with Legal Issues',
 'IS_OTHER_Clients with Legal Issues',
 'IS_MENTAL_HEALTH_LGBTQ+',
 'IS_SUBSTANCE_ABUSE_LGBTQ+',
 'IS_PROBLEM_GAMBLING_LGBTQ+',
 'IS_OTHER_LGBTQ+',
 'IS_MENTAL_HEALTH_Homeless Clients',
 'IS_SUBSTANCE_ABUSE_Homeless Clients',
 'IS_PROBLEM_GAMBLING_Homeless Clients',
 'IS_OTHER_Homeless Clients',
 

## Define experiment cutoff dates

Our experiment design uses 6 annual cutoff dates that simulate the generation of a forecast once per year over the last 6 years. We'll consume data up to each cutoff date to fit/train models, and then evaluate over the next 18 months. In this notebook, we're only concerned with producing the retrospective forecasts; we'll do the analysis all together in another notebook.

In [44]:
# Whitespace-separated cutoff dates, one experiment run per date.
# Path.read_text() opens *and closes* the file; a bare open(...).read()
# leaves the handle open until garbage collection.
report_sim_dates = pathlib.Path("../data/utils/experiment_cutoff_dates.txt").read_text().split()
report_sim_dates

['2020-3-1',
 '2020-9-27',
 '2021-3-14',
 '2021-9-5',
 '2021-12-12',
 '2022-4-3',
 '2022-7-31']

## AutoGluon Data Format

Instructions to format data properly for AutoGluon are here:
https://auto.gluon.ai/stable/tutorials/timeseries/forecasting-quick-start.html

In [45]:
# Wide -> long: one row per (timestamp, series) pair, with the series name in
# `item_id` and its value in `target`.
all_data_df_autogluon = (
    all_data
    .reset_index()  # the unnamed index becomes a column called "index"
    .melt(id_vars="index", var_name="item_id", value_name="target")
    .rename(columns={"index": "timestamp"})
)
all_data_df_autogluon

Unnamed: 0,timestamp,item_id,target
0,2015-01-01,IS_MENTAL_HEALTH,40.0
1,2015-01-02,IS_MENTAL_HEALTH,68.0
2,2015-01-03,IS_MENTAL_HEALTH,68.0
3,2015-01-04,IS_MENTAL_HEALTH,89.0
4,2015-01-05,IS_MENTAL_HEALTH,100.0
...,...,...,...
381047,2023-05-27,IS_OTHER_Caribbean,0.0
381048,2023-05-28,IS_OTHER_Caribbean,0.0
381049,2023-05-29,IS_OTHER_Caribbean,0.0
381050,2023-05-30,IS_OTHER_Caribbean,0.0


In [46]:
pd.options.mode.chained_assignment = None  # Suppress SettingWithCopyWarning

def get_autogluon_df(study_category, cutoff_date):
    """Build the long-format training frame for a single series.

    Parameters
    ----------
    study_category : str
        Name of the series to forecast; must be a value of
        ``all_data_df_autogluon.item_id`` and a column of ``all_data``.
    cutoff_date : str or datetime-like
        Last timestamp (inclusive) to keep.

    Returns
    -------
    pandas.DataFrame
        The ``timestamp`` / ``item_id`` / ``target`` rows of the studied series
        up to ``cutoff_date``, followed by one ``exogenous_<name>`` column for
        every other entry of ``all_covariates``, holding that series' values at
        the same timestamps. The row index of the long-format frame is kept.

    Notes
    -----
    Reads the notebook-level ``all_data_df_autogluon``, ``all_data`` and
    ``all_covariates``; none of them is modified.
    """
    is_study_series = all_data_df_autogluon["item_id"] == study_category
    within_cutoff = all_data_df_autogluon["timestamp"] <= cutoff_date
    # Explicit copy: the result is an independent frame, never a view on the
    # shared long-format table.
    study_df = all_data_df_autogluon.loc[is_study_series & within_cutoff].copy()

    other_categories = [name for name in all_covariates if name != study_category]

    # Look up every covariate at the study timestamps in a single selection and
    # attach them with one concat. Inserting the columns one by one fragments
    # the frame and triggers pandas' PerformanceWarning on wide data.
    exogenous = all_data.loc[study_df["timestamp"], other_categories]
    exogenous.columns = [f"exogenous_{name}" for name in other_categories]
    exogenous.index = study_df.index  # align positionally with the study rows

    return pd.concat([study_df, exogenous], axis=1)

In [47]:
pd.options.mode.chained_assignment = None  # Suppress SettingWithCopyWarning

def get_autogluon_global_df(cutoff_date):
    """Return the long-format rows of every series up to ``cutoff_date`` (inclusive).

    The result is a row selection of the notebook-level
    ``all_data_df_autogluon``; no exogenous columns are added.
    """
    on_or_before_cutoff = all_data_df_autogluon.timestamp <= cutoff_date
    return all_data_df_autogluon.loc[on_or_before_cutoff]

In [48]:
pd.options.mode.chained_assignment = None  # Suppress SettingWithCopyWarning

def get_autogluon_global_with_covariates_df(cutoff_date="2023-07-01"):
    """Stack the per-series frames of every column of ``df`` into one frame.

    Each series is built with ``get_autogluon_df(category, cutoff_date)``; the
    frames are concatenated row-wise and every column that contains at least
    one missing value is then dropped.

    NOTE(review): a per-series frame has no ``exogenous_<itself>`` column, so
    after concatenation each exogenous column is NaN for one series and is
    removed by ``dropna(axis=1)``. When the covariates are the sibling series
    this appears to leave only the non-exogenous columns - confirm whether
    that is intended.
    """
    per_series_frames = [get_autogluon_df(name, cutoff_date) for name in df.columns]
    stacked = pd.concat(per_series_frames, axis=0)
    return stacked.dropna(axis=1)

In [49]:
# Spot check: the single-series frame for one target, cut off at 2017-07-01,
# with the other series attached as `exogenous_*` columns.
get_autogluon_df("IS_MENTAL_HEALTH", "2017-07-01")

  self.obj[key] = empty_value


Unnamed: 0,timestamp,item_id,target,exogenous_IS_SUBSTANCE_ABUSE,exogenous_IS_PROBLEM_GAMBLING,exogenous_IS_OTHER,exogenous_IS_MENTAL_HEALTH_Concurrent Disorder Clients,exogenous_IS_SUBSTANCE_ABUSE_Concurrent Disorder Clients,exogenous_IS_PROBLEM_GAMBLING_Concurrent Disorder Clients,exogenous_IS_OTHER_Concurrent Disorder Clients,...,exogenous_IS_PROBLEM_GAMBLING_Spanish,exogenous_IS_OTHER_Spanish,exogenous_IS_MENTAL_HEALTH_Italian,exogenous_IS_SUBSTANCE_ABUSE_Italian,exogenous_IS_PROBLEM_GAMBLING_Italian,exogenous_IS_OTHER_Italian,exogenous_IS_MENTAL_HEALTH_Caribbean,exogenous_IS_SUBSTANCE_ABUSE_Caribbean,exogenous_IS_PROBLEM_GAMBLING_Caribbean,exogenous_IS_OTHER_Caribbean
0,2015-01-01,IS_MENTAL_HEALTH,40.0,30.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2015-01-02,IS_MENTAL_HEALTH,68.0,69.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2015-01-03,IS_MENTAL_HEALTH,68.0,45.0,11.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2015-01-04,IS_MENTAL_HEALTH,89.0,50.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2015-01-05,IS_MENTAL_HEALTH,100.0,107.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908,2017-06-27,IS_MENTAL_HEALTH,135.0,119.0,11.0,43.0,2.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
909,2017-06-28,IS_MENTAL_HEALTH,151.0,105.0,16.0,37.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
910,2017-06-29,IS_MENTAL_HEALTH,118.0,112.0,17.0,26.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
911,2017-06-30,IS_MENTAL_HEALTH,102.0,74.0,21.0,16.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Build the stacked all-series frame with the function's default cutoff date.
# NOTE(review): the recorded output of this cell lists only the timestamp,
# item_id and target columns - no `exogenous_*` columns survive. Confirm that
# this is the intended input for the "covariates" experiments.
autogluon_global_with_covariates_df = get_autogluon_global_with_covariates_df()
autogluon_global_with_covariates_df

  self.obj[key] = empty_value


Unnamed: 0,timestamp,item_id,target
0,2015-01-01,IS_MENTAL_HEALTH,40.0
1,2015-01-02,IS_MENTAL_HEALTH,68.0
2,2015-01-03,IS_MENTAL_HEALTH,68.0
3,2015-01-04,IS_MENTAL_HEALTH,89.0
4,2015-01-05,IS_MENTAL_HEALTH,100.0
...,...,...,...
381047,2023-05-27,IS_OTHER_Caribbean,0.0
381048,2023-05-28,IS_OTHER_Caribbean,0.0
381049,2023-05-29,IS_OTHER_Caribbean,0.0
381050,2023-05-30,IS_OTHER_Caribbean,0.0


In [51]:
# Spot check: long-format rows of every series up to the given cutoff date.
get_autogluon_global_df("2023-07-01")

Unnamed: 0,timestamp,item_id,target
0,2015-01-01,IS_MENTAL_HEALTH,40.0
1,2015-01-02,IS_MENTAL_HEALTH,68.0
2,2015-01-03,IS_MENTAL_HEALTH,68.0
3,2015-01-04,IS_MENTAL_HEALTH,89.0
4,2015-01-05,IS_MENTAL_HEALTH,100.0
...,...,...,...
381047,2023-05-27,IS_OTHER_Caribbean,0.0
381048,2023-05-28,IS_OTHER_Caribbean,0.0
381049,2023-05-29,IS_OTHER_Caribbean,0.0
381050,2023-05-30,IS_OTHER_Caribbean,0.0


## Plot forecast range with context

We could add different elements to plots including some error analysis, emphasis on different quantiles, etc.

In [53]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

def plot_quantile_forecast(category, context_df, forecast_df, actual_df, cutoff_date, save_path=None, show_plots=True, model_name="", wave_num=None, history_label='Historical CPI', ylabel='CPI (% 2002 Prices)'):
    """Plot a quantile forecast against the surrounding observed values.

    Parameters
    ----------
    category : str
        Series name, shown on the first line of the title.
    context_df, actual_df : pandas.Series
        Observed values before / after the cutoff. They are concatenated and
        drawn as one black line.
    forecast_df : pandas.DataFrame
        Must contain the columns ``q_0.05``, ``q_0.5`` and ``q_0.95``; its
        index supplies the x values of the forecast.
    cutoff_date : str or datetime-like
        Drawn as a dashed vertical line and shown in the title.
    save_path : str or path-like, optional
        When given, the figure is written there (300 dpi for ``png`` paths).
    show_plots : bool
        Whether to display the figure.
    model_name : str
        Shown on the last line of the title.
    wave_num : int, optional
        Wave number for the title. When omitted, a module-level ``wave_num``
        is used if one exists (the experiment loops set it); if there is none
        the wave part of the title is left out instead of raising NameError.
    history_label, ylabel : str
        Legend label of the observed line and y-axis label. The defaults keep
        the existing CPI wording; pass explicit values for other data sets.
    """
    if wave_num is None:
        # Backward compatibility with callers that rely on the notebook global.
        wave_num = globals().get("wave_num")

    fig, ax = plt.subplots(figsize=(10, 6))

    # Observed values: history up to the cutoff plus the actuals after it.
    observed = pd.concat((context_df, actual_df))
    ax.plot(observed.index, observed.values, color='black', label=history_label)

    # Band between the 0.05 and 0.95 quantiles, i.e. a 90% interval.
    ax.fill_between(
        forecast_df.index,
        forecast_df["q_0.05"],
        forecast_df["q_0.95"],
        facecolor='purple',
        alpha=0.5,
        label='90% Confidence'
    )

    # Central forecast line.
    ax.plot(forecast_df.index, forecast_df["q_0.5"], color='purple', label='Median Forecast')

    if wave_num is None:
        title = f'{category}\nRetrospective Forecast - {cutoff_date} \n{model_name}'
    else:
        title = f'{category}\nRetrospective Forecast - {cutoff_date} - COVID-19 Wave#{wave_num} \n{model_name}'
    ax.set_title(title)

    ax.set_xlabel('Date')
    ax.set_ylabel(ylabel)
    ax.axvline(pd.to_datetime(cutoff_date), label="Cutoff Date", color='black', ls='--', ms=1, alpha=0.5)
    ax.legend()
    ax.xaxis.set_major_locator(mdates.YearLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.tick_params(axis='x', labelrotation=45)
    ax.grid(axis='y')

    # Save before showing so the written file never depends on what the
    # backend does to the figure during show(). str() also accepts Path objects.
    if save_path:
        fig.savefig(save_path, dpi=300 if str(save_path).endswith("png") else None)  # High res for png

    if show_plots:
        plt.show()

    # Release the figure: this function is called once per series and cutoff
    # date, and unclosed figures accumulate in memory (and are still rendered
    # by inline backends even when show_plots is False).
    plt.close(fig)


## Main experiment loop - non-global models

In [54]:
# Per-series ("local") experiments: one predictor is fitted for every
# (model type, cutoff date, target category) combination.
if "global" not in EXP_BASE:
    for model_type in EXP_MODEL_LIST:
        EXPERIMENT_NAME = f"{EXP_BASE}_{model_type}"

        # `wave_num` is deliberately a notebook-level name: plot_quantile_forecast
        # reads it when building the plot title.
        for wave_num, cutoff_date in enumerate(report_sim_dates, start=1):
            run_dir = f"./output/experiments/{EXPERIMENT_NAME}/{cutoff_date}"
            forecast_output_dir = f"{run_dir}/forecasts"
            plot_output_dir = f"{run_dir}/plots"
            training_output_dir = f"{run_dir}/training_results"
            for output_dir in (forecast_output_dir, plot_output_dir, training_output_dir):
                pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

            for category in target_categories:

                # Target series up to the cutoff, plus the other series as exogenous columns.
                train_data = get_autogluon_df(category, cutoff_date)

                predictor = TimeSeriesPredictor(
                    prediction_length=18,
                    path=f"{run_dir}/model_files/{category}/",
                    target="target",
                    eval_metric="MAPE",
                    # eval_metric="mean_wQuantileLoss",
                    quantile_levels=[0.01, 0.05, 0.1, 0.25, 0.75, 0.9, 0.95, 0.99],
                )

                if model_type in AVAILABLE_PRESETS:
                    predictor.fit(
                        train_data,
                        presets=model_type,
                        excluded_model_types=["DirectTabular"],
                        time_limit=1800,
                    )
                else:
                    # Single named model with its default hyperparameters.
                    predictor.fit(
                        train_data,
                        hyperparameters={model_type: {}},
                        time_limit=1800,
                    )

                with open(f"{training_output_dir}/{category}.txt", 'w') as f:
                    f.write(str(predictor.fit_summary()))

                forecast_df = predictor.predict(train_data).loc[category]
                # 0.5 is not among the requested quantile levels, so the "mean"
                # column is stored as q_0.5; quantile columns get a "q_" prefix.
                forecast_df = forecast_df.rename(
                    {"mean": "q_0.5"} | {col: f"q_{col}" for col in forecast_df.columns if col != "mean"},
                    axis=1
                )

                # Observed values come from the wide table `all_data`. The name
                # `foodprice_df` used here before is not defined anywhere in
                # this notebook and raised NameError.
                observed = all_data[category]
                context_start = pd.to_datetime(cutoff_date) - pd.DateOffset(months=120)
                context_df = observed.loc[(observed.index >= context_start) & (observed.index <= cutoff_date)]
                actual_df = observed.loc[(observed.index > cutoff_date) & (observed.index <= forecast_df.index.max())]

                forecast_df.to_csv(f"{forecast_output_dir}/{category}.csv")
                plot_quantile_forecast(
                    category=category,          # The target category name
                    context_df=context_df,      # Historical data to plot
                    forecast_df=forecast_df,    # Quantile forecast dataframe
                    actual_df=actual_df,        # 'actual' data to plot against forecast
                    save_path=f"{plot_output_dir}/{category}.svg",
                    cutoff_date=cutoff_date,
                    show_plots=True,
                    model_name=predictor.get_model_best()
                )
            #     break
            # break

NameError: name 'EXP_BASE' is not defined

## Main experiment loop - global models

In [None]:
# Global experiments: one predictor per (model type, cutoff date) is fitted on
# every series at once; forecasts are then extracted for the target categories.
if "global" in EXP_BASE:
    for model_type in EXP_MODEL_LIST:
        EXPERIMENT_NAME = f"{EXP_BASE}_{model_type}"

        # `wave_num` is deliberately a notebook-level name: plot_quantile_forecast
        # reads it when building the plot title.
        for wave_num, cutoff_date in enumerate(report_sim_dates, start=1):
            run_dir = f"./output/experiments/{EXPERIMENT_NAME}/{cutoff_date}"
            forecast_output_dir = f"{run_dir}/forecasts"
            plot_output_dir = f"{run_dir}/plots"
            training_output_dir = f"{run_dir}/training_results"
            for output_dir in (forecast_output_dir, plot_output_dir, training_output_dir):
                pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

            if "covariates" in EXP_BASE:
                train_data = get_autogluon_global_with_covariates_df(cutoff_date)
            else:
                train_data = get_autogluon_global_df(cutoff_date)

            predictor = TimeSeriesPredictor(
                prediction_length=18,
                path=f"{run_dir}/model_files",
                target="target",
                eval_metric="MAPE",
                # eval_metric="mean_wQuantileLoss",
                quantile_levels=[0.01, 0.05, 0.1, 0.25, 0.75, 0.9, 0.95, 0.99],
            )

            if model_type in AVAILABLE_PRESETS:
                predictor.fit(
                    train_data,
                    presets=model_type,
                    excluded_model_types=["DirectTabular"],  # Seems to be the cause of weighted ensembles failing due to shape error!
                    time_limit=1800*9,
                )
            else:
                # Single named model with its default hyperparameters.
                predictor.fit(
                    train_data,
                    hyperparameters={model_type: {}},
                    time_limit=1800*9,
                )

            # The predictor and its training data are the same for every
            # category, so predict / summarise once instead of once per category.
            all_forecasts = predictor.predict(train_data)
            fit_summary_text = str(predictor.fit_summary())
            best_model_name = predictor.get_model_best()

            for category in target_categories:

                with open(f"{training_output_dir}/{category}.txt", 'w') as f:
                    f.write(fit_summary_text)

                forecast_df = all_forecasts.loc[category]
                # 0.5 is not among the requested quantile levels, so the "mean"
                # column is stored as q_0.5; quantile columns get a "q_" prefix.
                forecast_df = forecast_df.rename(
                    {"mean": "q_0.5"} | {col: f"q_{col}" for col in forecast_df.columns if col != "mean"},
                    axis=1
                )

                # Observed values come from the wide table `all_data`. The name
                # `foodprice_df` used here before is not defined anywhere in
                # this notebook and raised NameError.
                observed = all_data[category]
                context_start = pd.to_datetime(cutoff_date) - pd.DateOffset(weeks=context_length)
                context_df = observed.loc[(observed.index >= context_start) & (observed.index <= cutoff_date)]
                actual_df = observed.loc[(observed.index > cutoff_date) & (observed.index <= forecast_df.index.max())]

                forecast_df.to_csv(f"{forecast_output_dir}/{category}.csv")
                plot_quantile_forecast(
                    category=category,          # The target category name
                    context_df=context_df,      # Historical data to plot
                    forecast_df=forecast_df,    # Quantile forecast dataframe
                    actual_df=actual_df,        # 'actual' data to plot against forecast
                    save_path=f"{plot_output_dir}/{category}.svg",
                    cutoff_date=cutoff_date,
                    show_plots=True,
                    model_name=best_model_name
                )
            #     break
            # break

In [None]:
# Inspect the last predictor left over from the experiment loops above
# (`predictor` only exists if one of the loops has run in this kernel).
predictor.get_model_best()

'TemporalFusionTransformer'

In [None]:
# Print the fit summary of that same leftover predictor; the call also
# returns the summary as a dict, which the notebook displays.
predictor.fit_summary()

****************** Summary of fit() ******************
Estimated performance of each model:
                       model  score_val  pred_time_val  fit_time_marginal  \
0  TemporalFusionTransformer  -0.029947       0.016716         378.340475   

   fit_order  
0          1  
Number of models trained: 1
Types of models trained:
{'MultiWindowBacktestingModel'}
****************** End of fit() summary ******************


{'model_types': {'TemporalFusionTransformer': 'MultiWindowBacktestingModel'},
 'model_performance': {'TemporalFusionTransformer': -0.029947208326056813},
 'model_best': 'TemporalFusionTransformer',
 'model_paths': {'TemporalFusionTransformer': './output/experiments/autogluon_with_scaled_covariates_TemporalFusionTransformerModel/2022-07-01/model_files/Vegetables and vegetable preparations/models/TemporalFusionTransformer'},
 'model_fit_times': {'TemporalFusionTransformer': 378.34047532081604},
 'model_pred_times': {'TemporalFusionTransformer': 0.016716480255126953},
 'model_hyperparams': {'TemporalFusionTransformer': {}},
 'leaderboard':                        model  score_val  pred_time_val  fit_time_marginal  \
 0  TemporalFusionTransformer  -0.029947       0.016716         378.340475   
 
    fit_order  
 0          1  }