## Data Preprocessing

In [None]:
import logging
import pandas as pd
from src.data import (
    preprocess, 
    make_hierarchy_level_dict, 
    make_hierarchy_agg_dict, 
    inspect_hierarchy_agg_dict,
    split, 
    build_datasets 
)

# Load the raw tourism CSV and preprocess it in one step. preprocess() returns
# the processed frame plus the level counts and prefix indices that the
# hierarchy mappings at the bottom of this cell are built from.
RAW_TOURISM_CSV = '/root/data/raw/TourismData_v3.csv'
tourism_data, level_counts, prefix_idxs = preprocess(pd.read_csv(RAW_TOURISM_CSV))

# Cross-validation splits: horizon of 12, with the training size pinned to 108
# (min_train_size == max_train_size).
HORIZON = 12
TRAIN_SIZE = 108
splits = split(
    tourism_data.values,
    horizon=HORIZON,
    min_train_size=TRAIN_SIZE,
    max_train_size=TRAIN_SIZE,
)

# One dataset entry per split; built once with the default flags and once with
# val=False.
# NOTE(review): presumably val=False holds out the test window rather than a
# validation window -- confirm against src.data.build_datasets.
train_datasets = build_datasets(tourism_data, splits)
test_datasets = build_datasets(tourism_data, splits, val=False)

# Hierarchy mappings consumed later by model fitting and evaluation.
hierarchy_agg_dict = make_hierarchy_agg_dict(prefix_idxs)
hierarchy_level_dict = make_hierarchy_level_dict(level_counts)

## Fit Models 

In [None]:
from src.model import fit_deepar#, fit_predict_arima, serialize_all
from time import time

EPOCHS = 1


def _fit_on_each_fold(**penalty_kwargs):
    """Fit one DeepAR model per entry of ``test_datasets``.

    Every fit shares the same base configuration (categorical variable on,
    cardinality taken from the number of columns in ``tourism_data``, the
    hierarchy aggregation mapping, penalty printing off). ``penalty_kwargs``
    are forwarded untouched to ``fit_deepar`` so each experiment can switch on
    its own penalty term.

    Returns a list holding whatever ``fit_deepar`` returns, one item per fold.
    """
    fitted = []
    for fold_training_data, _ in test_datasets:
        fitted.append(
            fit_deepar(
                fold_training_data,
                epochs=EPOCHS,
                use_cat_var=True,
                # a fresh list per call, exactly as if written inline
                cardinality=[tourism_data.shape[1]],
                hierarchy_agg_dict=hierarchy_agg_dict,
                **penalty_kwargs,
                print_rec_penalty=False,
            )
        )
    return fitted


# DeepAR with the categorical embedding only (no extra penalty term)
fit_models_cat_var = _fit_on_each_fold()

# DeepAR with the embedding aggregation penalty switched on
fit_models_embed_agg = _fit_on_each_fold(embedding_agg_penalty=1)

# DeepAR with the self-supervised penalty switched on
# NOTE(review): 10e-8 is the same float as 1e-7 -- confirm 1e-8 was not intended.
fit_models_self_sup = _fit_on_each_fold(self_supervised_penalty=10e-8)


## Evaluate Models

In [None]:
from glob import glob
from src.model import unserialize_all
from src.evaluation import evaluate_optimal_rec, evaluate_deepar

def _evaluate_each_fold(fitted_models, pred_filenames=None):
    """Run ``evaluate_deepar`` fold by fold and collect the results.

    ``fitted_models`` is a list of 2-tuples whose first item is the predictor;
    it is walked in lockstep with ``test_datasets`` (pairs of train/test data).
    When ``pred_filenames`` is given, the matching filename is passed as the
    final positional argument of ``evaluate_deepar``; otherwise that argument
    is omitted altogether.
    """
    results = []
    if pred_filenames is None:
        for (predictor, _), (train_data, test_data) in zip(fitted_models, test_datasets):
            results.append(
                evaluate_deepar(predictor, train_data, test_data, hierarchy_level_dict)
            )
    else:
        for (predictor, _), (train_data, test_data), filename in zip(
            fitted_models, test_datasets, pred_filenames
        ):
            results.append(
                evaluate_deepar(predictor, train_data, test_data, hierarchy_level_dict, filename)
            )
    return results


# Baseline DeepAR (categorical embedding, no penalty): one prediction file per fold
filenames_cat_var = [
    f'/root/data/test_preds/test_model_cat_var_fold_{i}_preds.csv'
    for i in range(len(test_datasets))
]
evaluations_cat_var = _evaluate_each_fold(fit_models_cat_var, filenames_cat_var)

# Embedding aggregation penalty: one prediction file per fold
filenames_embed_agg = [
    f'/root/data/test_preds/test_model_embed_agg_fold_{i}_preds.csv'
    for i in range(len(test_datasets))
]
evaluations_embed_agg = _evaluate_each_fold(fit_models_embed_agg, filenames_embed_agg)

# Self-supervised penalty: evaluated without passing a filename
evaluations_self_sup = _evaluate_each_fold(fit_models_self_sup)



## Evaluate Reconciled Models

In [None]:
import re


def _natural_key(path):
    """Sort key that compares runs of digits in *path* numerically.

    Plain string sorting would put ``..._10_...`` ahead of ``..._2_...``;
    splitting on digit runs and converting them to ints avoids that.
    """
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r'(\d+)', path)]


def _load_reconciled_preds(pattern):
    """Read every CSV matching *pattern* into a DataFrame, in a stable order.

    glob() makes no promise about the order of its matches, but the frames are
    paired positionally with ``test_datasets`` below, so the paths are sorted
    first to keep each prediction file aligned with the same fold on every run.
    NOTE(review): this assumes the natural filename order is the fold order of
    ``test_datasets`` -- confirm against whatever writes these files.
    """
    return [pd.read_csv(path) for path in sorted(glob(pattern), key=_natural_key)]


# Reconciled predictions for the baseline (cat_var) models, scored fold by fold
reconciled_preds = _load_reconciled_preds('/root/data/test_reconciled_preds/test_model_cat_var*')
evaluations_mint_model = [
    evaluate_optimal_rec(preds, test_data, hierarchy_level_dict)
    for preds, (_, test_data) in zip(reconciled_preds, test_datasets)
]

# Reconciled predictions for the embedding-aggregation models, scored fold by fold
reconciled_preds = _load_reconciled_preds('/root/data/test_reconciled_preds/test_model_embed_agg*')
evaluations_mint_embed_agg = [
    evaluate_optimal_rec(preds, test_data, hierarchy_level_dict)
    for preds, (_, test_data) in zip(reconciled_preds, test_datasets)
]

## Compare Models

In [None]:
from src.evaluation import agg_evaluations, compare_performance

# Aggregate the per-fold evaluations of each model variant
baseline = agg_evaluations(evaluations_cat_var)
embedd_agg = agg_evaluations(evaluations_embed_agg)
reconciled = agg_evaluations(evaluations_mint_model)
reconciled_embedd_agg = agg_evaluations(evaluations_mint_embed_agg)

# Keep each display name next to its aggregated result so the two lists handed
# to compare_performance cannot drift out of step.
labelled_results = [
    ('DeepAR', baseline),
    ('DeepAR-Embed-Agg', embedd_agg),
    ('DeepAR-MinT', reconciled),
    ('DeepAR-Embed-Agg-MinT', reconciled_embedd_agg),
]

compare_performance(
    [result for _, result in labelled_results],
    model_names=[label for label, _ in labelled_results],
    levels=['all', 'country', 'region-by-travel'],
)