# MLFlow experiment, log, and registering model
To work remotely with Databricks, several environment variables need to be set up.  These are expected to be in a `.env` file in the project root.  If you are working remotely, these environment variables will be loaded by the `load_dotenv()` call in the code block where the Databricks connection is configured.  This file needs to include:

* `DATABRICKS_HOST=https://dbc-beada314-1494.cloud.databricks.com`
* `MLFLOW_TRACKING_URI=databricks`
* `DATABRICKS_TOKEN=<your token>`
* `MLFLOW_TRACKING_TOKEN=<your token>`
* `CLUSTER_ID=0609-202631-imzzg29c`
* `DATABRICKS_HTTP_PATH=<HTTP path of your SQL warehouse>`

In [1]:
# !pip install darts --upgrade

In [3]:
import os
import shutil
import pickle
import random
import sys
import numpy as np
import pandas as pd
from databricks import sql
from scipy import stats
from tqdm import tqdm_notebook as tqdm

import mlflow

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# import torch

from darts import TimeSeries, concatenate
from darts.dataprocessing.transformers import Scaler
from darts.metrics import mape, smape, mae, ope, rmse
from darts.utils.statistics import check_seasonality, plot_acf
from darts.datasets import AirPassengersDataset, IceCreamHeaterDataset
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.utils.likelihood_models import QuantileRegression, GumbelLikelihood, GaussianLikelihood

from darts import TimeSeries
from darts.utils.timeseries_generation import (
    gaussian_timeseries,
    linear_timeseries,
    sine_timeseries,
)
from darts.models import (
    TFTModel,
    TiDEModel,
    DLinearModel,
    NLinearModel,
    TSMixerModel
)


from torchmetrics import MeanAbsolutePercentageError, MeanAbsoluteError, MeanSquaredError
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

import warnings
warnings.filterwarnings("ignore")

import logging

# define log
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
from darts.models import TSMixerModel

In [5]:
import sklearn
sklearn.__version__

'1.3.1'

In [6]:
import torch
torch.cuda.is_available()

True

In [7]:
print(torch.version.cuda)

11.8


In [8]:
## will be loaded from root when deployed
from darts_wrapper import DartsGlobalModel

In [9]:
os.chdir('../..')

In [10]:
# custom modules
import src.data_engineering.data_engineering as de
from src.utils import plotting
from src.utils import utils

# cannot load from location other than root
# from src.modeling.darts_tft_wrapper import DartsGlobalModel

# Spark connection and tracking server

In [20]:
# if working_remotely:
#     # not on databricks
#     # load env variables for tracking uri
#     # and create spark connection
#     from databricks.connect import DatabricksSession
#     from dotenv import load_dotenv
#     load_dotenv()
    
#     spark = DatabricksSession.builder.remote(
#       host       = f"{os.environ['DATABRICKS_HOST']}",
#       token      = os.environ['DATABRICKS_TOKEN'],
#       cluster_id = os.environ['CLUSTER_ID']
#     ).getOrCreate()
    
# else:
#     # we're on databricks
#     mlflow.set_tracking_uri("databricks")

In [19]:
# utils.start_cluster(
#     host       = f"{os.environ['DATABRICKS_HOST']}",
#     token      = os.environ['DATABRICKS_TOKEN'],
#     cluster_id = os.environ['CLUSTER_ID']
# )

In [13]:
# utils.wait_for_cluster(
#     host       = f"{os.environ['DATABRICKS_HOST']}",
#     token      = os.environ['DATABRICKS_TOKEN'],
#     cluster_id = os.environ['CLUSTER_ID']
# )

## start cluster

In [21]:
# # make sure the cluster is started
# utils.start_cluster(
#     host=os.environ['DATABRICKS_HOST'], 
#     cluster_id=os.environ['CLUSTER_ID'], 
#     token=os.environ['DATABRICKS_TOKEN'],
# )

In [15]:
# utils.wait_for_cluster(
#     host=os.environ['DATABRICKS_HOST'], 
#     cluster_id=os.environ['CLUSTER_ID'], 
#     token=os.environ['DATABRICKS_TOKEN'],
# )

In [None]:
###################################################
# get sql warehouse connection
###################################################
from dotenv import load_dotenv

# Load the DATABRICKS_* settings from the project-root `.env` into os.environ.
load_dotenv()

# Fail fast with a readable message instead of handing `None` to the connector.
_required_env = ("DATABRICKS_HOST", "DATABRICKS_HTTP_PATH", "DATABRICKS_TOKEN")
_missing_env = [name for name in _required_env if not os.getenv(name)]
if _missing_env:
    raise EnvironmentError(
        f"missing required environment variables: {', '.join(_missing_env)}"
    )

log.info('creating sql warehouse connection')
connection = sql.connect(
    # DATABRICKS_HOST is documented with an `https://` scheme, while the SQL
    # connector takes a bare hostname: drop the scheme and any trailing slash.
    server_hostname = os.environ["DATABRICKS_HOST"].removeprefix("https://").rstrip("/"),
    http_path       = os.environ["DATABRICKS_HTTP_PATH"],
    access_token    = os.environ["DATABRICKS_TOKEN"],
    )


# get data from Databricks

In [None]:
def get_query_df(query, connection):
    """Run ``query`` on ``connection`` and return the whole result set.

    Parameters
    ----------
    query : str
        SQL text handed unchanged to ``cursor.execute``.
    connection
        Open connection object exposing ``cursor()``; the cursor must provide
        ``execute``, ``fetchall``, ``description`` and ``close``.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record; column names are the first element of
        each entry in ``cursor.description``.

    Raises
    ------
    Exception
        Whatever the cursor raises is propagated; the cursor is closed first.
    """
    log.info('executing query')
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        log.info('fetching results')
        rows = cursor.fetchall()
        columns = [desc[0] for desc in cursor.description]
    finally:
        # release the cursor even when execute/fetch fails
        cursor.close()

    return pd.DataFrame(rows, columns=columns)

In [None]:
# refresh_data = True
# Re-query the warehouse as soon as any cached file is absent from the
# current working directory.
refresh_data = not all(
    cache_file in os.listdir()
    for cache_file in ('lmp_df.parquet', 'mtlf_df.parquet', 'mtrf_df.parquet')
)
refresh_data

In [None]:
## lmp
# Pull the hourly LMP table from the warehouse and cache it locally.
# Skipped when `refresh_data` is False, i.e. all three cache files exist.
if refresh_data:
    query = 'SELECT * FROM prd_landing_zone.spp_weis.lmp_hourly'
    # res = spark.sql(query).collect()
    # df = spark.createDataFrame(res).toPandas()
    # to parquet stopped working with serializeable json error

    df = get_query_df(query, connection)
    
    # NOTE: written in Feather format despite the `.parquet` extension;
    # the file is read back with `pd.read_feather`.
    df.to_feather('lmp_df.parquet')
    display(df.head())


In [None]:
# df.to_feather('test_lmp_df.parquet')

In [None]:
## mtrf
# Pull the `mtrf` table from the warehouse and cache it locally.
# Skipped when `refresh_data` is False, i.e. all three cache files exist.
if refresh_data:
    query = 'SELECT * FROM prd_landing_zone.spp_weis.mtrf'
    # res = spark.sql(query).collect()
    # df = spark.createDataFrame(res).toPandas()

    df = get_query_df(query, connection)
    
    # NOTE: written in Feather format despite the `.parquet` extension;
    # the file is read back with `pd.read_feather`.
    df.to_feather('mtrf_df.parquet')
    display(df.head())

In [None]:
## mtlf
# Pull the `mtlf` table from the warehouse and cache it locally.
# Skipped when `refresh_data` is False, i.e. all three cache files exist.
if refresh_data:
    query = 'SELECT * FROM prd_landing_zone.spp_weis.mtlf'
    # res = spark.sql(query).collect()
    # df = spark.createDataFrame(res).toPandas()
    
    df = get_query_df(query, connection)
    
    # NOTE: written in Feather format despite the `.parquet` extension;
    # the file is read back with `pd.read_feather`.
    df.to_feather('mtlf_df.parquet')
    display(df.head())

# Load dataframes

In [None]:
lmp_df = pd.read_feather('lmp_df.parquet')
mtlf_df = pd.read_feather('mtlf_df.parquet').sort_values('GMTIntervalEnd')
mtrf_df = pd.read_feather('mtrf_df.parquet').sort_values('GMTIntervalEnd')

In [None]:
# TimeSeries.from_dataframe returns the following error if there is a timezone
# TypeError: Cannot interpret 'datetime64[ns, UTC]' as a data type
lmp_df.GMTIntervalEnd = lmp_df.GMTIntervalEnd.dt.tz_localize(None)
lmp_df

In [None]:
mtlf_df.Interval = mtlf_df.Interval.dt.tz_localize(None)
mtlf_df.GMTIntervalEnd = mtlf_df.GMTIntervalEnd.dt.tz_localize(None)
mtlf_df.timestamp  = mtlf_df.timestamp.dt.tz_localize(None)
mtlf_df

In [None]:
mtrf_df.Interval = mtrf_df.Interval.dt.tz_localize(None)
mtrf_df.GMTIntervalEnd = mtrf_df.GMTIntervalEnd.dt.tz_localize(None)
mtrf_df.timestamp  = mtrf_df.timestamp.dt.tz_localize(None)
mtrf_df

# Feature Engineering

In [None]:
# lmp
psco_lmp_df, list_nodes_name, psco_price_df_long = de.get_psco_price_df(lmp_df)
lmp_series = de.create_psco_price_series(psco_lmp_df, list_nodes_name)

# remove duplicates
dups = mtlf_df.GMTIntervalEnd.duplicated()
log.info(f'mtlf_df duplicated: {dups.sum()}')
mtlf_df = mtlf_df[~dups]

dups = mtrf_df.GMTIntervalEnd.duplicated()
log.info(f'mtrf_df duplicated: {dups.sum()}')
mtrf_df = mtrf_df[~dups]
    
# mtlf series
mtlf_series, avg_act_series = de.create_mtlf_series(mtlf_df)
# mtlf_series, avg_act_series = de.create_mtlf_lmp_series(mtlf_df, psco_lmp_df, list_nodes_name)

# mtrf series
mtrf_ratio_df = de.add_enrgy_ratio_to_mtrf(mtlf_df, mtrf_df)
mtrf_ratio_df = de.add_enrgy_ratio_diff_to_mtrf(mtrf_ratio_df)
mtrf_series = de.create_mtrf_series(mtrf_ratio_df)

In [None]:
mtrf_ratio_df

In [None]:
list_nodes_name

# Preprocess series

In [None]:
scalers = {}

start_time = pd.Timestamp('2023-04-02 00:00:00')
### TIME CHANGE ########################################################
N_FCAST_DAYS = 6
input_chunk_length = 2*N_FCAST_DAYS*24
forecast_horizon = 24*N_FCAST_DAYS
# training_cutoff = pd.Timestamp("2023-06-01 06:00:00")
training_cutoff = de.get_train_cutoff(lmp_series, forecast_horizon)
print(f'training_cutoff: {training_cutoff}')
########################################################################
lmp_series = de.lmp_series_drop_horizon(lmp_series, start_time, forecast_horizon)
# training_cutoff = de.get_train_cutoff(lmp_series_drop_horizon)
lmp_series_train, lmp_series_val, lmp_series_all = de.get_lmp_train_test_series(lmp_series, training_cutoff, forecast_horizon, input_chunk_length)
(lmp_series_train_transformed, 
 lmp_series_val_transformed, 
 lmp_series_transformed,
 lmp_scaler) = de.scale_series(lmp_series_train, lmp_series_val, lmp_series_all, global_fit=True)
scalers['series'] = lmp_scaler

print(f'train start: {lmp_series_train.start_time()}')
print(f'train end: {lmp_series_train.end_time()}')
print(f'val start: {lmp_series_val.start_time()}')
print(f'val end: {lmp_series_val.end_time()}')


mtlf_series_train, mtlf_series_val, mtlf_series = de.get_mtlf_train_test_series(
    mtlf_series, start_time, training_cutoff, forecast_horizon, input_chunk_length
    )
# (mtlf_series_train_transformed, 
#  mtlf_series_val_transformed, 
#  mtlf_series_transformed, 
#  mtlf_scaler) = de.scale_series(mtlf_series_train, mtlf_series_val, mtlf_series)
# scalers['mtlf'] = mtlf_scaler

# print(f'train start: {mtlf_series_train.start_time()}')
# print(f'train end: {mtlf_series_train.end_time()}')
# print(f'val start: {mtlf_series_val.start_time()}')
# print(f'val end: {mtlf_series_val.end_time()}')


avg_act_series_train, avg_act_series_val, avg_act_series = de.get_avg_act_train_test_series(avg_act_series, start_time, training_cutoff)
(avg_act_series_train_transformed, 
 avg_act_series_val_transformed, 
 avg_act_series_transformed,
 past_scaler) = de.scale_series(avg_act_series_train, avg_act_series_val, avg_act_series)
scalers['pc'] = past_scaler

print(f'train start: {avg_act_series_train.start_time()}')
print(f'train end: {avg_act_series_train.end_time()}')
print(f'val start: {avg_act_series_val.start_time()}')
print(f'val end: {avg_act_series_val.end_time()}')


mtrf_series_train, mtrf_series_val, mtrf_series = de.get_mtrf_train_test_series(
    mtrf_series, start_time, training_cutoff, forecast_horizon, input_chunk_length
    )
# mtrf_series_train_transformed, mtrf_series_val_transformed, mtrf_series_transformed = de.scale_mtrf_series(mtrf_series_train, mtrf_series_val, mtrf_series)

# print(f'train start: {mtrf_series_train.start_time()}')
# print(f'train end: {mtrf_series_train.end_time()}')
# print(f'val start: {mtrf_series_val.start_time()}')
# print(f'val end: {mtrf_series_val.end_time()}')



In [None]:
past_cov_train = avg_act_series_train_transformed
past_cov_val = avg_act_series_val_transformed
past_cov = avg_act_series_transformed

In [None]:
# Concatenate future training covariates
future_covariates_train = concatenate([mtlf_series_train, mtrf_series_train], axis=1)
future_covariates_train.values().shape

In [None]:
# Concatenate future validation covariates
end_time = mtlf_series_val.end_time() + pd.Timedelta('1H')
mtrf_series_val_end_droped = mtrf_series_val.drop_after(end_time)

future_covariates_val = concatenate([mtlf_series_val, mtrf_series_val_end_droped], axis=1)
future_covariates_val.values().shape

In [None]:
# Concatenate the entire covariate series
mtrf_series_end_droped = mtrf_series.drop_after(end_time)

future_covariates = concatenate([mtlf_series, mtrf_series_end_droped], axis=1)
future_covariates.values().shape

In [None]:
(future_covariates_train_transformed, 
 future_covariates_val_transformed, 
 future_covariates_transformed, 
 future_scaler) = de.scale_series(future_covariates_train, future_covariates_val, future_covariates)
scalers['fc'] = future_scaler

print(f'train start: {future_covariates_train.start_time()}')
print(f'train end: {future_covariates_train.end_time()}')
print(f'val start: {future_covariates_val.start_time()}')
print(f'val end: {future_covariates_val.end_time()}')

In [None]:
# One entry per node name, selected from the scaled train / validation /
# full LMP series respectively (same order as `list_nodes_name`).
lmp_train_all = [lmp_series_train_transformed[node] for node in list_nodes_name]

lmp_val_all = [lmp_series_val_transformed[node] for node in list_nodes_name]

lmp_all = [lmp_series_transformed[node] for node in list_nodes_name]
    

In [None]:
scalers

# Modeling

## Create Global model

**Considerations**
- Target series: `LMP`
- Past Covariate series: `Averaged_actual`
- Future Covariate Series: `MTLF`, `Wind`, and `Solar`. Basically, these are mid-term load forecasts and mid-term renewables forecasts.
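- `input_chunk_length = 2 * N_FCAST_DAYS * 24 = 288` hours (12 days) and `output_chunk_length = forecast_horizon = 24 * N_FCAST_DAYS = 144` hours (6 days), as set in the preprocessing cell above. **Open question: should the forecasting horizon be the length of the output chunk or of the validation set?**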

##### Note: You can pass the covariates to the model as individual series in a list, or you can stack them and pass them as one multivariate series. However, you cannot stack them if they do not have the same length.
##### Note from the Darts TFT example: we can just provide the ***whole covariates series*** as the `future_covariates` argument to the model; the model will slice these covariates and use only what it needs in order to train on forecasting the target `train_transformed`.

## Fit the model using covariate series

In [None]:
# Parameters
num_samples = 200
figsize = (16, 5)
lowest_q, low_q, high_q, highest_q = 0.01, 0.1, 0.9, 0.99
label_q_outer = f"{int(lowest_q * 100)}-{int(highest_q * 100)}th percentiles"
label_q_inner = f"{int(low_q * 100)}-{int(high_q * 100)}th percentiles"

In [None]:
# default quantiles for QuantileRegression
quantiles = [
    0.01,
    0.05,
    0.1,
    0.15,
    0.2,
    0.25,
    0.3,
    0.4,
    0.5,
    0.6,
    0.7,
    0.75,
    0.8,
    0.85,
    0.9,
    0.95,
    0.99,
]



# Log the model

## Set up MLFlow experiment

https://mlflow.org/docs/latest/tracking.html#logging-to-a-tracking-server

need to set env vars `MLFLOW_TRACKING_URI` and `MLFLOW_TRACKING_TOKEN` to access tracking server

In [None]:
mlflow.get_tracking_uri()

In [1]:
# Workspace user whose folder holds the experiment.
user_name = 'Justin.L.Fields@xcelenergy.com'
exp_name = user_name  # kept: this cell originally bound the address to `exp_name`
experiment_name = '.'
experiment_path = f"/Users/{user_name}/{experiment_name}"

# `set_experiment` creates the experiment when it does not exist yet, makes it
# the active experiment for later `mlflow.start_run()` calls, and returns the
# Experiment object whose `experiment_id` / `artifact_location` are read below.
# (`create_experiment` only returns the id string and fails on re-runs.)
exp = mlflow.set_experiment(experiment_path)
exp

NameError: name 'mlflow' is not defined

In [None]:
exp.experiment_id

In [None]:
exp.artifact_location

## Infer signature

In [None]:
# create an input example to infer signature
node_series = lmp_series[list_nodes_name[1]]
past_cov_series = avg_act_series
future_cov_series = future_covariates

data = {
    'series': [node_series.to_json()],
    'past_covariates': [past_cov_series.to_json()],
    'future_covariates': [future_cov_series.to_json()],
    'n': forecast_horizon,
    'num_samples': 200
}
df = pd.DataFrame(data)

ouput_example = 'the endpoint return json as a string'

In [None]:
# this signature will be logged with the model
# registered models must have signatures
from mlflow.models import infer_signature
darts_tft_signature = infer_signature(df, ouput_example)
darts_tft_signature

## Select training series

In [None]:
# select series for training: at most the first 20 node series
num_training_series = min(20, len(lmp_train_all))
training_series = lmp_train_all[:num_training_series]
val_series = lmp_val_all[:num_training_series]
# for final training on all data
all_series = lmp_all[:num_training_series]

## Train and log model

In [None]:
# mlflow.log_table?

In [None]:
# can turn off backtesting to save time for testing code
run_backtest = True

# torch_metrics = MeanAbsolutePercentageError()
torch_metrics = MeanAbsoluteError()
# torch_metrics = MeanSquaredError(squared=False)

artifact_path = "model_artifacts"

encoders = {
    # "cyclic": {
    #     "past": ["month"], 
    #     "future": ["month"]
    # },
    "datetime_attribute": {
        "future": ["hour", "dayofweek", "month"], # 
        "past": ["hour", "dayofweek", "month"], # 
    },
    "position": {
        "past": ["relative"], 
        "future": ["relative"]
    },
    "transformer": Scaler()
}

# MODEL_TYPE = "tft_model"
# MODEL_TYPE = "tide_model"
# MODEL_TYPE = "dlinear_model"
# MODEL_TYPE = "nlinear_model"
MODEL_TYPE = "ts_mixer_model"



# Train the model selected by MODEL_TYPE inside one MLflow run: fit, optionally
# backtest, optionally continue training, then log metrics, params and the
# pyfunc model together with its artifacts.
with mlflow.start_run() as run:
    print(f'run.info: \n{run.info}')

    # common model parameters
    lr = 1e-4
    batch_size = 64
    n_epochs = 4
    n_epochs_final = 2  # extra epochs used by the refit step further down
    refit = True
    final_train_val_only = True

    # set up covariate series
    # the same covariate series is repeated once per training series
    past_covariates = [past_cov for i in range(num_training_series)]
    # NOTE(review): this rebinds the notebook-level name `future_covariates`
    # (a single concatenated series in earlier cells) to a list; earlier cells
    # that read `future_covariates` will see the list if re-run afterwards.
    future_covariates=[future_covariates_transformed for i in range(num_training_series)]

    # common parameters across models
    model_params = {
        'input_chunk_length': input_chunk_length,
        'output_chunk_length': forecast_horizon,
        'batch_size': batch_size,
        'n_epochs': n_epochs,
        'add_encoders': encoders,
        'likelihood': QuantileRegression(quantiles=quantiles),  # QuantileRegression is set per default
        'optimizer_kwargs': {"lr": lr},
        'random_state': 42,
        'torch_metrics': torch_metrics,
    }

    if MODEL_TYPE == "tft_model":
        # https://unit8co.github.io/darts/generated_api/darts.models.forecasting.tft_model.html#temporal-fusion-transformer-tft
        #TFT
        hidden_size = 16 # Hidden state size of the TFT. It is the main hyper-parameter and common across the internal TFT architecture.
        lstm_layers = 1 # Number of layers for the Long Short Term Memory (LSTM) Encoder and Decoder (1 is a good default).
        num_attention_heads = 2 # Number of attention heads (4 is a good default)
    
        model_params.update({
            'hidden_size': hidden_size,
            'lstm_layers': lstm_layers,
            'num_attention_heads': num_attention_heads,
            'dropout': 0.1,
            })
    
        model = TFTModel(**model_params)

    elif MODEL_TYPE == "tide_model":
        # https://unit8co.github.io/darts/generated_api/darts.models.forecasting.tide_model.html#time-series-dense-encoder-tide

        #TiDE
        num_encoder_layers=1 # The number of residual blocks in the encoder.
        num_decoder_layers=1 # The number of residual blocks in the decoder.
        decoder_output_dim=16 # The dimensionality of the output of the decoder.
        hidden_size=64 # The width of the layers in the residual blocks of the encoder and decoder.
        temporal_width_past=0 # The width of the layers in the past covariate projection residual block. If 0, will bypass feature projection and use the raw feature data
        temporal_width_future=0 # The width of the layers in the future covariate projection residual block. If 0, will bypass feature projection and use the raw feature data.
        temporal_decoder_hidden=32 # The width of the layers in the temporal decoder.
    
        model_params.update({
            'num_encoder_layers': num_encoder_layers,
            'num_decoder_layers': num_decoder_layers,
            'decoder_output_dim': decoder_output_dim,
            'hidden_size': hidden_size,
            'temporal_width_past': temporal_width_past,
            'temporal_width_future': temporal_width_future,
            'temporal_decoder_hidden': temporal_decoder_hidden,
            'dropout': 0.1,
            })
        
        model = TiDEModel(**model_params)

    elif MODEL_TYPE == "dlinear_model":
        # https://unit8co.github.io/darts/generated_api/darts.models.forecasting.dlinear.html#d-linear
        # dlinear
        kernel_size=5
    
        model_params.update({
            'kernel_size': kernel_size,
            })
        
        model = DLinearModel(**model_params)

    elif MODEL_TYPE == "ts_mixer_model":
        # https://unit8co.github.io/darts/generated_api/darts.models.forecasting.tsmixer_model.html
        model_params.update({
            'hidden_size': 32,
            'ff_size': 64,
            'num_blocks': 2
            })
        
        model = TSMixerModel(**model_params)

    else:
        # https://unit8co.github.io/darts/generated_api/darts.models.forecasting.nlinear.html#n-linear
        # nlinear doesn't require additional parameters
        # NOTE: any MODEL_TYPE value not matched above (not only
        # "nlinear_model") ends up here
        model = NLinearModel(**model_params)
        
    # the validation covariates are the same lists as the training covariates
    fit_params = {
        'series': training_series,
        'val_series': val_series,
        'past_covariates': past_covariates,
        'val_past_covariates': past_covariates,
        'future_covariates': future_covariates,
        'val_future_covariates': future_covariates,
    }


    
    model.fit(**fit_params)

    # log parameters for the run
    # need to add accuracy results here...
    params = {
        "model_type": MODEL_TYPE,
        "lr":lr,
        "epochs_trained": model.epochs_trained,
        "n_epochs_final": n_epochs_final,
        "refit":refit,
        "final_train_val_only": final_train_val_only,
        "num_training_series": num_training_series,
        }

    # add model params
    # on a key clash the value from model_params wins (right operand of `|`)
    params = params | model_params
    metrics = {}

    # backtesting takes a moment and generates a lot of output
    # we can turn it off for testing
    # NOTE: 'mae' / 'rmse' are only logged when run_backtest is True
    if run_backtest:
        # back test on validation data
        acc = model.backtest(
            series=val_series,
            # series=all_series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
            retrain=False,
            forecast_horizon=forecast_horizon,
            stride=25,
            metric=[mae, rmse],
            verbose=False,
        )
        
        # collapse the backtest result to one (mae, rmse) row
        # presumably `acc` holds one [mae, rmse] pair per series - TODO confirm
        acc_df = pd.DataFrame(
            np.mean(acc, axis=0).reshape(1,-1),
            columns=['mae', 'rmse']
        )

        # log metrics
        metrics['mae'] = acc_df.mae[0]
        metrics['rmse'] = acc_df.rmse[0]


    # finish training on entire data set before logging model
    if final_train_val_only:
        final_train_series = val_series
    else:
        # for final training on all data
        final_train_series = all_series

    if refit:
        log.info('final training')
        model.fit(
                series=final_train_series,
                past_covariates=past_covariates,
                future_covariates=future_covariates,
                verbose=True,
                epochs=n_epochs_final, # continue training
                )
    
    # final model back test on validation data
    # NOTE(review): with refit and final_train_val_only both True the model was
    # just trained on `val_series`, so these "final" metrics are measured on
    # data the model has seen - confirm that is intended.
    acc = model.backtest(
            series=val_series,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
            retrain=False,
            forecast_horizon=forecast_horizon,
            stride=25,
            metric=[mae, rmse],
            verbose=False,
        )
    
    acc_df = pd.DataFrame(
        np.mean(acc, axis=0).reshape(1,-1),
        columns=['mae', 'rmse']
    )

    # log metrics
    metrics['mae_final'] = acc_df.mae[0]
    metrics['rmse_final'] = acc_df.rmse[0]
    mlflow.log_metrics(metrics)

    # set up path to save model
    model_path = '/'.join([artifact_path, MODEL_TYPE])

    # start from an empty local artifact directory on every run
    shutil.rmtree(artifact_path, ignore_errors=True)
    os.makedirs(artifact_path)

    # log params
    mlflow.log_params(params)

    # save model files (model, model.ckpt) 
    # and load them to artifacts when logging the model
    model.save(model_path)

    # save scalers to artifacts
    scaler_name = 'scalers.pkl'
    scaler_path = '/'.join([artifact_path, scaler_name])
    with open(scaler_path, 'wb') as handle:
        pickle.dump(scalers, handle)

    # save MODEL_TYPE to artifacts
    # this will be used to load the model from the artifacts
    model_type_path = '/'.join([artifact_path, 'MODEL_TYPE.pkl'])
    with open(model_type_path, 'wb') as handle:
        pickle.dump(MODEL_TYPE, handle)
    

    # map model artifacts in dictionary
    # the three data files are the locally cached query results
    artifacts = {
        'model': model_path,
        'model.ckpt': model_path+'.ckpt',
        'scalers': scaler_path,
        'MODEL_TYPE': model_type_path,
        'lmp_df': 'lmp_df.parquet',
        'mtlf_df': 'mtlf_df.parquet',
        'mtrf_df': 'mtrf_df.parquet'
    }

    
    # log model
    # https://www.mlflow.org/docs/latest/tutorials-and-examples/tutorial.html#pip-requirements-example
    mlflow.pyfunc.log_model(
        artifact_path='GlobalForecasting',
        code_path=['notebooks/model_training/darts_wrapper.py'],
        signature=darts_tft_signature,
        artifacts=artifacts,
        # model will get loaded from artifacts, we don't need instantiate with one
        python_model=DartsGlobalModel(), 
        pip_requirements=["-r notebooks/model_training/requirements.txt"],
    )

    ## logging the parquet file as an artifact seems to work better
    # mlflow.log_table(data=lmp_df, artifact_file="lmp_df.parquet")
    # mlflow.log_table(data=mtlf_df, artifact_file="mtlf_df.parquet")
    # mlflow.log_table(data=mtrf_df, artifact_file="mtrf_df.parquet")

In [None]:
mlflow.end_run()

In [None]:
scalers

In [None]:
params

## Explainer

In [None]:
# ?TFTExplainer

In [None]:
if MODEL_TYPE == "tft_model":
    i=3
    test_node_name = list_nodes_name[i]
    print(test_node_name)
    
    test_series = all_series[i]
    
    
    future_covariates = concatenate([mtlf_series, mtrf_series_end_droped], axis=1)
    future_covariates.values().shape
    
    past_cov_series = avg_act_series
    future_cov_series = future_covariates

    background_series = test_series.drop_after(test_series.end_time() - pd.Timedelta('200H'))
    
    from darts.explainability import TFTExplainer
    explainer = TFTExplainer(
        model=model, 
        background_series=background_series, 
        background_past_covariates=past_cov_series,
        background_future_covariates=future_cov_series)

    explainability_result = explainer.explain()

In [None]:
if MODEL_TYPE == "tft_model":
    explainer.plot_variable_selection(explainability_result)

In [None]:
if MODEL_TYPE == "tft_model":
    explainer.plot_attention(explainability_result, plot_type="time")

In [None]:
# explainer.plot_attention(explainability_result, plot_type="all")

In [None]:
if MODEL_TYPE == "tft_model":
    explainer.plot_attention(explainability_result, plot_type="heatmap")

In [None]:
if MODEL_TYPE == "tft_model":
    explainability_result.get_encoder_importance().T

In [None]:
if MODEL_TYPE == "tft_model":
    explainability_result.get_decoder_importance().T

In [None]:
if MODEL_TYPE == "tft_model":
    explainability_result.get_static_covariates_importance().T

In [None]:
if MODEL_TYPE == "tft_model":
    attention = explainability_result.get_attention().mean(axis=1)

    time_intersection = test_series.time_index.intersection(attention.time_index)
    
    test_series[time_intersection].plot()
    attention.plot(label="mean_attention", max_nr_components=12)

# Load the model and make prediction

In [None]:
from mlflow import MlflowClient

# use client to get run information from the experiment
client = MlflowClient()

# `get_experiment_by_name` returns None when the experiment does not exist;
# stop here with a clear message rather than failing on `exp.experiment_id`.
exp = client.get_experiment_by_name(experiment_path)
if exp is None:
    raise ValueError(f"no MLflow experiment found at {experiment_path!r}")

# `mae` is logged as a metric (not a param), so order on `metrics.mae`;
# ascending puts the lowest-error runs first.
runs = client.search_runs(
    experiment_ids=[exp.experiment_id],
    order_by=['metrics.mae ASC'],
    )

# runs

# runs

In [None]:
run.to_dictionary()

In [None]:
# Flatten every run into one row: run info, logged metrics and logged params
# side by side, ordered by ascending `mae`.
run_dicts = [r.to_dictionary() for r in runs]
info_df = pd.DataFrame([d['info'] for d in run_dicts])
metrics_df = pd.DataFrame([d['data']['metrics'] for d in run_dicts])
params_df = pd.DataFrame([d['data']['params'] for d in run_dicts])
model_log_df = pd.concat([info_df, metrics_df, params_df], axis=1)
model_log_df.sort_values('mae', ascending=True, inplace=True)
model_log_df.head()

In [None]:
# sort on end_time to get the latest run
model_log_df.sort_values('end_time', ascending=False, inplace=True)
model_log_df.head()

In [None]:
# TODO: need to update so only logs model if it's not already logged
# NOTE(review): despite the name, this takes the first row of `model_log_df`
# in its current order; if the preceding cell sorted by `end_time` descending,
# that is the most recent run rather than the lowest-`mae` run - confirm intent.
best_run_id = model_log_df.run_id.iloc[0]
best_run_id

In [None]:
# logged_model = 'runs:/7642b08bcbe14af2a34b0fdb2eeafade/GlobalForecasting' # v45
# logged_model = 'runs:/6d3c5b0d1b7742a7b2b682b8efe97463/GlobalForecasting' # v47
# logged_model = 'runs:/934ae15de0904b8e9836b2e75cb7ffbf/GlobalForecasting' # v47
logged_model = f'runs:/{best_run_id}/GlobalForecasting'

# Load model as a PyFuncModel
# we can use this to test our custom class
loaded_model = mlflow.pyfunc.load_model(logged_model)

## Test custom end point - get predictions and plot

In [None]:
# all_series[i]#.concatenate(val_series[i])

In [None]:
i=3
test_node_name = list_nodes_name[i]
print(test_node_name)

test_series = all_series[i]
# test_series = lmp_all[i]
# test_series = val_series[i]
# test_end_time = min(test_series.end_time(), pd.Timestamp('2023-10-15T12:00:00'))
# test_end_time = min(test_series.end_time(), pd.Timestamp('2023-09-29T23:00:00'))
# test_end_time = min(test_series.end_time(), pd.Timestamp('2024-04-01T23:00:00'))
# test_end_time = min(test_series.end_time(), pd.Timestamp('2024-04-15T23:00:00'))
test_end_time = min(test_series.end_time(), pd.Timestamp('2024-07-01T23:00:00'))
print(test_end_time)

In [None]:
mtrf_series_end_droped = mtrf_series.drop_after(end_time)
future_covariates = concatenate([mtlf_series, mtrf_series_end_droped], axis=1)
future_covariates.values().shape

# if test_end_time < test_series.end_time():
node_series = test_series.drop_after(test_end_time)
    
log.info(f'test_end_time: {test_end_time}')
log.info(f'node_series.end_time(): {node_series.end_time()}')
past_cov_series = avg_act_series
future_cov_series = future_covariates


data = {
    'series': [node_series.to_json()],
    'past_covariates': [past_cov_series.to_json()],
    'future_covariates': [future_cov_series.to_json()],
    'n': forecast_horizon,
    'num_samples': 200
}
df = pd.DataFrame(data)

plot_cov_df = future_cov_series.pd_dataframe()
plot_cov_df = plot_cov_df.reset_index().rename(columns={'GMTIntervalEnd':'time'})
plot_cov_df

In [None]:
# Predict on a Pandas DataFrame.
df['num_samples'] = 500
pred = loaded_model.predict(df)
preds = TimeSeries.from_json(pred)

In [None]:
# long format: one row per (time, component) pair of the sampled forecast
temp_df = preds.pd_dataframe().reset_index()
temp_df = temp_df.melt(id_vars='time').rename(columns={'component':'node'})

# Split each component label at its last underscore: the head is the node
# name, the tail (with any 's' removed) is the sample number.
label_parts = [label.rpartition('_') for label in temp_df.node]
temp_df['sim_num'] = [int(tail.replace('s', '')) for _, _, tail in label_parts]
temp_df.node = [head for head, _, _ in label_parts]

# wide format: one column per sample number, indexed by (time, node)
temp_df_pivot = temp_df.reset_index().pivot(index=['time', 'node'], columns='sim_num', values='value')
temp_df_pivot

In [None]:
q_df = plotting.get_quantile_df(preds)
q_df

In [None]:
plot_df = plotting.get_mean_df(preds).merge(
    plotting.get_quantile_df(preds),
    left_index=True,
    right_index=True,
)

plot_df = plotting.get_plot_df(
        TimeSeries.from_json(pred),
        plot_cov_df,
        lmp_df.rename(columns={'PNODE_Name':'node', 'GMTIntervalEnd':'time'}),
        test_node_name,
    )
plot_df.rename(columns={'mean':'mean_fcast'}, inplace=True)
plot_df

plotting.plotly_forecast(plot_df, test_node_name)

In [2]:
plotting.get_plot_df?

Object `plotting.get_plot_df` not found.


# log model

In [None]:
logged_model

In [None]:
if False:
#TODO: register model only if it hasn't been logged yet
    catalog = "sandbox_data_science"
    schema = "spp_weis"
    model_name = "GlobalForecasting"
    
    registry_uri = "databricks-uc"
    mlflow.set_registry_uri(registry_uri)
    mlflow.register_model(
        model_uri=logged_model,
        name=f"{catalog}.{schema}.{model_name}"
    )

In [None]:
# transition a model's stage
# https://mlflow.org/docs/latest/model-registry.html#transitioning-an-mlflow-models-stage

In [None]:
# for programmatic deployments see
# https://docs.databricks.com/_extras/notebooks/source/machine-learning/model-serving-endpoint-python.html


# Prediction using API endpoints

In [None]:
# import os
# import requests
# import numpy as np
# import pandas as pd
# import json

# def create_tf_serving_json(data):
#   return {'inputs': {name: data[name].tolist() for name in data.keys()} if isinstance(data, dict) else data.tolist()}

# def score_model(dataset):
#   url = 'https://dbc-beada314-1494.cloud.databricks.com/serving-endpoints/spp_weis/invocations'
#   api_token = os.environ['DATABRICKS_TOKEN']  # never hard-code tokens; the literal token previously committed here should be revoked
#   headers = {'Authorization': f'Bearer {api_token}', 'Content-Type': 'application/json'}
#   ds_dict = {'dataframe_split': dataset.to_dict(orient='split')} if isinstance(dataset, pd.DataFrame) else create_tf_serving_json(dataset)
#   data_json = json.dumps(ds_dict, allow_nan=True)
#   response = requests.request(method='POST', headers=headers, url=url, data=data_json)
#   if response.status_code != 200:
#     raise Exception(f'Request failed with status {response.status_code}, {response.text}')
#   return response.json()

In [None]:
# endpoint_pred = score_model(df)
# endpoint_pred

In [None]:
# preds = TimeSeries.from_json(endpoint_pred['predictions'])
# preds.mean(axis=1).plot()

In [None]:
# from databricks.sdk import WorkspaceClient

In [None]:
# WorkspaceClient?