In [None]:
# TODO:
#    1. early stopping is not working in the Optuna experiment

In [None]:
# name under which the MLflow pyfunc model is registered
MODEL_NAME = "spp_weis"

In [None]:
import os
import shutil
import pickle
import random
import sys
import numpy as np
import pandas as pd
import duckdb
from typing import List

import requests
from io import StringIO

import ibis
import ibis.selectors as s
from ibis import _
ibis.options.interactive = True

from sklearn.preprocessing import RobustScaler

import torch

from darts import TimeSeries, concatenate
from darts.dataprocessing.transformers import (
    Scaler,
    MissingValuesFiller,
    Mapper,
    InvertibleMapper,
)
from darts.dataprocessing import Pipeline
from darts.metrics import mape, smape, mae, ope, rmse
from darts.utils.statistics import check_seasonality, plot_acf
from darts.datasets import AirPassengersDataset, IceCreamHeaterDataset
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.utils.likelihood_models import QuantileRegression, GumbelLikelihood, GaussianLikelihood

from darts import TimeSeries
from darts.utils.timeseries_generation import (
    gaussian_timeseries,
    linear_timeseries,
    sine_timeseries,
)
from darts.models import (
    TFTModel,
    TiDEModel,
    DLinearModel,
    NLinearModel,
    TSMixerModel,
    NaiveEnsembleModel,
    RegressionEnsembleModel,
)


from torchmetrics import (
    SymmetricMeanAbsolutePercentageError, 
    MeanAbsoluteError, 
    MeanSquaredError,
)

from pytorch_lightning.callbacks.early_stopping import EarlyStopping

import mlflow

import warnings
warnings.filterwarnings("ignore")

# logging
import logging

# define log
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

In [None]:
# https://github.com/Lightning-AI/pytorch-lightning/issues/3431
# raise both rank_zero loggers (both package spellings) to WARNING
for _rank_zero_logger in (
    "lightning.pytorch.utilities.rank_zero",
    "pytorch_lightning.utilities.rank_zero",
):
    logging.getLogger(_rank_zero_logger).setLevel(logging.WARNING)

In [None]:
import optuna
from optuna.integration import PyTorchLightningPruningCallback
from optuna.visualization import (
    plot_optimization_history,
    plot_contour,
    plot_param_importances,
    plot_pareto_front,
)

In [None]:
# Move two levels up so that the `src` package and relative paths such as
# data/spp.ddb resolve from the working directory.
# Guarded on the presence of `src` so that re-running this cell (or starting
# the kernel from the target directory already) does not climb further up.
if not os.path.isdir('src'):
    os.chdir('../..')

In [None]:
# custom modules
import src.data_engineering as de
from src import parameters
from src import plotting
from src.modeling import (
    get_ci_err, build_fit_tsmixerx, build_fit_tide, build_fit_tft, log_pretty
)

## will be loaded from root when deployed
from src.darts_wrapper import DartsGlobalModel

In [None]:
# record the horizon / lookback settings this run uses
for _param_name in ('FORECAST_HORIZON', 'INPUT_CHUNK_LENGTH'):
    log.info(f'{_param_name}: {getattr(parameters, _param_name)}')

In [None]:
# set torch's float32 matrix-multiplication precision mode to 'medium'
torch.set_float32_matmul_precision('medium')

In [None]:
# optuna.delete_study(study_name="spp_weis_tide", storage="sqlite:///spp_trials.db")

## Data prep

In [None]:
# open the DuckDB file data/spp.ddb read-only through ibis and list its tables
con = ibis.duckdb.connect("data/spp.ddb", read_only=True)
con.list_tables()

In [None]:
# prepare the LMP data from the database connection and display it
lmp = de.prep_lmp(con)
lmp

In [None]:
# pandas copy of the LMP data with three columns renamed
# (this frame is later handed to plotting.get_plot_df)
_lmp_column_names = {
    'LMP': 'LMP_HOURLY',
    'unique_id': 'node',
    'timestamp_mst': 'time',
}
lmp_df = lmp.to_pandas().rename(columns=_lmp_column_names)

In [None]:
# prepare the MTRF data via de.prep_mtrf and display it
mtrf = de.prep_mtrf(con)
mtrf

In [None]:
# prepare the MTLF data via de.prep_mtlf and display it
mtlf = de.prep_mtlf(con)
mtlf

In [None]:
# prepare the combined data set via de.prep_all_df and display it
all_df = de.prep_all_df(con)
all_df

In [None]:
# pandas version of the combined data set; used below to build the covariates
# NOTE(review): de.prep_all_df(con) is called a second time here instead of
# reusing `all_df` from the previous cell — confirm whether that is intentional
all_df_pd = de.all_df_to_pandas(de.prep_all_df(con))
all_df_pd

In [None]:
# column dtypes / non-null counts of the combined pandas frame
all_df_pd.info()

## Prep model training data

In [None]:
# four data sets returned by de.get_train_test_all: all, train, test and train+test
# (split semantics are defined in src.data_engineering — TODO confirm)
lmp_all, train_all, test_all, train_test_all = de.get_train_test_all(con)

In [None]:
# display the full LMP data set
lmp_all

In [None]:
# series built from the full LMP data; plot the first one as a visual check
all_series = de.get_series(lmp_all)
all_series[0].plot()

In [None]:
# series built from train_test_all (used as the training input further down);
# plot the first one as a visual check
train_test_all_series = de.get_series(train_test_all)
train_test_all_series[0].plot()

In [None]:
# series built from train_all; plot the first one as a visual check
train_series = de.get_series(train_all)
train_series[0].plot()

In [None]:
# series built from test_all (used for validation and backtesting further down);
# plot the first one as a visual check
test_series = de.get_series(test_all)
test_series[0].plot()

In [None]:
# future covariates derived from the combined pandas frame; plot the first entry
futr_cov = de.get_futr_cov(all_df_pd)
futr_cov[0].plot()

In [None]:
# past covariates derived from the combined pandas frame; plot the first entry
past_cov = de.get_past_cov(all_df_pd)
past_cov[0].plot()

In [None]:
# close the database connection; `con` is not used by any cell below
con.disconnect()

## MLFlow setup

In [None]:
# mlflow.set_tracking_uri("sqlite:///mlruns.db")
log.info(f'mlflow.get_tracking_uri(): {mlflow.get_tracking_uri()}')
exp_name = 'spp_weis'

# create the experiment on first use ...
_experiment_missing = mlflow.get_experiment_by_name(exp_name) is None
if _experiment_missing:
    mlflow.create_experiment(exp_name)

# ... then fetch its metadata object (exp.experiment_id is used by later cells)
exp = mlflow.get_experiment_by_name(exp_name)
exp

## Get model signature

In [None]:
# one example of each model input, taken from the first series / covariates
node_series, future_cov_series, past_cov_series = (
    train_series[0],
    futr_cov[0],
    past_cov[0],
)

# single-row request frame: JSON-serialised series plus scalar forecast settings
data = dict(
    series=[node_series.to_json()],
    past_covariates=[past_cov_series.to_json()],
    future_covariates=[future_cov_series.to_json()],
    n=parameters.FORECAST_HORIZON,
    num_samples=200,
)
df = pd.DataFrame(data)

# the output side of the signature is inferred from a placeholder string
ouput_example = 'the endpoint return json as a string'

from mlflow.models import infer_signature
darts_signature = infer_signature(df, ouput_example)
darts_signature

## Pretrain models with the best params

In [None]:
# parameter sets iterated over by the TSMixer training loop below
parameters.TSMIXER_PARAMS

In [None]:
# Fit one TSMixer model per parameter set in parameters.TSMIXER_PARAMS.
# NOTE(review): the models are fit on train_test_all_series and validated on
# test_series; if the former contains the latter the validation data is not
# held out — confirm this is intended for the final fit.
models_tsmixer = []
for run_idx, tsmixer_kwargs in enumerate(parameters.TSMIXER_PARAMS):
    # banner separating the training output of each parameter set
    print(f'\ni: {run_idx} \t' + '*' * 25, flush=True)
    models_tsmixer.append(
        build_fit_tsmixerx(
            series=train_test_all_series,
            val_series=test_series,
            future_covariates=futr_cov,
            past_covariates=past_cov,
            **tsmixer_kwargs,
        )
    )

In [None]:
# parameter sets iterated over by the TiDE training loop below
parameters.TIDE_PARAMS

In [None]:
# Fit one TiDE model per parameter set in parameters.TIDE_PARAMS.
# NOTE(review): the models are fit on train_test_all_series and validated on
# test_series; if the former contains the latter the validation data is not
# held out — confirm this is intended for the final fit.
models_tide = []
for run_idx, tide_kwargs in enumerate(parameters.TIDE_PARAMS):
    # banner separating the training output of each parameter set
    print(f'\ni: {run_idx} \t' + '*' * 25, flush=True)
    models_tide.append(
        build_fit_tide(
            series=train_test_all_series,
            val_series=test_series,
            future_covariates=futr_cov,
            past_covariates=past_cov,
            **tide_kwargs,
        )
    )

In [None]:
# logging.disable(logging.WARNING)
# logging.basicConfig(level=logging.INFO)

In [None]:
# Build the ensemble from the fitted models, backtest it, and log metrics,
# parameters and the packaged pyfunc model to a new MLflow run.
with mlflow.start_run(experiment_id=exp.experiment_id) as run:

    MODEL_TYPE = 'naive_ens'

    # combine the already-fitted TSMixer and TiDE models;
    # train_forecasting_models=False is passed so the members are used as they are
    model = NaiveEnsembleModel(
        forecasting_models=models_tsmixer + models_tide,
        train_forecasting_models=False
    )

    # extra attributes that are pickled next to the model further down
    model.MODEL_TYPE = MODEL_TYPE
    model.TRAIN_TIMESTAMP = pd.Timestamp.utcnow()

    log.info(f'run.info: \n{run.info}')
    artifact_path = "model_artifacts"
    metrics = {}
    model_params = model.model_params

    # final model back test on validation data
    acc = model.backtest(
        series=test_series,
        past_covariates=past_cov,
        future_covariates=futr_cov,
        retrain=False,
        forecast_horizon=parameters.FORECAST_HORIZON,
        stride=49,
        metric=[mae, rmse, get_ci_err],
        verbose=False,
        num_samples=200,
    )

    # average over the first axis, leaving one value per metric
    # (order follows the `metric` list above)
    mean_acc = np.mean(acc, axis=0)
    log.info(f'FINAL ACC: mae - {mean_acc[0]} | rmse - {mean_acc[1]} | ci_err - {mean_acc[2]}')
    acc_df = pd.DataFrame(
        mean_acc.reshape(1, -1),
        columns=['mae', 'rmse', 'ci_error']
    )

    # add and log metrics (explicit column lookup instead of attribute access)
    metrics['final_mae'] = acc_df['mae'].iloc[0]
    metrics['final_rmse'] = acc_df['rmse'].iloc[0]
    metrics['final_ci_error'] = acc_df['ci_error'].iloc[0]
    mlflow.log_metrics(metrics)

    # set up path to save model
    model_path = '/'.join([artifact_path, model.MODEL_TYPE])

    # start from an empty artifact directory so stale files are not logged
    shutil.rmtree(artifact_path, ignore_errors=True)
    os.makedirs(artifact_path)

    # log params
    mlflow.log_params(model_params)

    # save model files (model, model.ckpt)
    # and load them to artifacts when logging the model
    model.save(model_path)

    # save MODEL_TYPE to artifacts
    # this will be used to load the model from the artifacts
    model_type_path = '/'.join([artifact_path, 'MODEL_TYPE.pkl'])
    with open(model_type_path, 'wb') as handle:
        pickle.dump(model.MODEL_TYPE, handle)

    model_timestamp = '/'.join([artifact_path, 'TRAIN_TIMESTAMP.pkl'])
    with open(model_timestamp, 'wb') as handle:
        pickle.dump(model.TRAIN_TIMESTAMP, handle)

    # map model artifacts in dictionary: every file written to artifact_path,
    # keyed by file name, plus an explicit 'model' entry
    artifacts = {f: f'{artifact_path}/{f}' for f in os.listdir(artifact_path)}
    artifacts['model'] = model_path

    # log model
    # https://www.mlflow.org/docs/latest/tutorials-and-examples/tutorial.html#pip-requirements-example
    mlflow.pyfunc.log_model(
        artifact_path='GlobalForecasting',
        code_path=['src/darts_wrapper.py'],
        signature=darts_signature,
        artifacts=artifacts,
        python_model=DartsGlobalModel(),
        pip_requirements=["-r notebooks/model_training/requirements.txt"],
        registered_model_name=MODEL_NAME,
    )


## Get latest run and test predicting

In [None]:
# Fetch the experiment's runs and put the most recently ended run first.
# experiment_ids is passed as a list, which is the documented argument type.
runs = mlflow.search_runs(
    experiment_ids=[exp.experiment_id],
    # order_by=['metrics.test_mae']
    order_by=['end_time'],
)

# assignment instead of inplace=True so the cell re-runs cleanly
runs = runs.sort_values('end_time', ascending=False)
runs.head()

In [None]:
# first row of `runs`, which was sorted by end_time descending above, so despite
# the name this is the most recently ended run rather than the best-scoring one
best_run_id = runs.run_id.iloc[0]
best_run_id

In [None]:
# artifact location of that same first run
runs['artifact_uri'].iloc[0]

In [None]:
# 'GlobalForecasting' is the artifact_path the pyfunc model was logged under
model_path = runs['artifact_uri'].iloc[0] + '/GlobalForecasting'

In [None]:
# load the logged model back as a generic pyfunc model for test predictions
loaded_model = mlflow.pyfunc.load_model(model_path)

## Plot test predictions

In [None]:
# index of the series to plot; the same index selects from test_series below
plot_ind = 3
plot_series = all_series[plot_ind]

In [None]:
# identifier stored in the series' static covariates (used as the node name when plotting)
plot_series.static_covariates.unique_id.LMP

In [None]:
# plot the selected series
plot_series.plot()

In [None]:
# Ten daily timestamps ending at the end of the selected node's test series;
# each one is used as a cut-off for a test prediction below.
# 'D' is the canonical daily alias (the lowercase form is deprecated in recent pandas).
plot_end_times = pd.date_range(
    end=test_series[plot_ind].end_time(),
    periods=10,
    freq='D',
)

plot_end_times

In [None]:
# Predict from each cut-off in plot_end_times with the loaded pyfunc model and
# plot the forecast. Everything that does not depend on the cut-off is
# prepared once, before the loop.
plot_node_name = plot_series.static_covariates.unique_id.LMP

# NOTE(review): covariates are taken from index 0 while the plotted node is
# all_series[plot_ind] — equivalent only if the covariates are the same for
# every node; confirm against de.get_futr_cov / de.get_past_cov.
future_cov_series = futr_cov[0]
past_cov_series = past_cov[0]
future_cov_json = future_cov_series.to_json()
past_cov_json = past_cov_series.to_json()

# covariate frame with the column names passed on to plotting.get_plot_df
plot_cov_df = (
    future_cov_series.pd_dataframe()
    .reset_index()
    .rename(columns={'timestamp_mst':'time', 're_ratio': 'Ratio'})
)

for plot_end_time in plot_end_times:
    log.info(f'plot_end_time: {plot_end_time}')

    # truncate the node's series at the cut-off
    node_series = plot_series.drop_after(plot_end_time)
    log.info(f'node_series.end_time(): {node_series.end_time()}')

    # single-row request frame in the layout used for the model signature
    data = {
        'series': [node_series.to_json()],
        'past_covariates': [past_cov_json],
        'future_covariates': [future_cov_json],
        'n': parameters.FORECAST_HORIZON,
        'num_samples': 500
    }
    df = pd.DataFrame(data)

    # Predict on a Pandas DataFrame; the prediction is parsed as series JSON.
    pred = loaded_model.predict(df)
    preds = TimeSeries.from_json(pred)

    plot_df = plotting.get_plot_df(
        preds,
        plot_cov_df,
        lmp_df,
        plot_node_name,
    )
    plot_df = plot_df.rename(columns={'mean':'mean_fcast'})

    plotting.plotly_forecast(plot_df, plot_node_name, show_fig=True)

In [None]:
# request frame built in the last iteration of the plotting loop
df