In [0]:
%%sh
pip install darts
# pip install "u8darts[torch]"
# conda install -c conda-forge -c pytorch u8darts-all
# %pip install "mlflow-skinny[databricks]>=2.4.1"
# dbutils.library.restartPython()

In [0]:
import os
import sys
import numpy as np
import pandas as pd
from scipy import stats
from tqdm import tqdm_notebook as tqdm
import time
from datetime import timedelta

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

from darts import TimeSeries, concatenate
from darts.dataprocessing.transformers import Scaler
from darts.models import TFTModel
from darts.metrics import mape, smape, mae, ope, rmse
from darts.utils.statistics import check_seasonality, plot_acf
from darts.datasets import AirPassengersDataset, IceCreamHeaterDataset
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.utils.likelihood_models import QuantileRegression, GumbelLikelihood, GaussianLikelihood

from darts import TimeSeries
from darts.utils.timeseries_generation import (
    gaussian_timeseries,
    linear_timeseries,
    sine_timeseries,
)
from darts.models import (
    TFTModel,
    LinearRegressionModel,
    LightGBMModel,
    RNNModel,
    TCNModel,
    TransformerModel,
    NBEATSModel,
    BlockRNNModel,
    VARIMA,
)

from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_absolute_error as mae

from torchmetrics import MeanAbsolutePercentageError
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

import warnings
# Suppress every warning category for the rest of the session (this also
# hides deprecation notices raised later by the imported libraries).
warnings.filterwarnings("ignore")

import logging

# Configure root logging at INFO level and create this notebook's logger.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# IPython magics: reload edited modules automatically before each cell runs,
# and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [0]:
import torch
torch.cuda.is_available()

In [0]:
print(torch.version.cuda)

In [0]:
# Move the working directory two levels up so the `src` package imports and
# the relative parquet paths used below resolve — presumably this is the
# repository root (TODO confirm). The path is relative, so re-running this
# cell without restarting the kernel climbs a further two levels.
os.chdir('../..')

In [0]:
# import src.data_engineering.data_engineering as de
# import src.feature_engineering.feature_engineering as fe
from src.data_engineering import data_engineering as de
from src.utils import plotting

# Spark connection

In [0]:
# Create a Databricks Connect session only when no `spark` name exists yet
# (at notebook top level `locals()` is the global namespace) — presumably so
# the cell is a no-op where a session is already provided.
if 'spark' not in locals():
    from databricks.connect import DatabricksSession
    from dotenv import load_dotenv
    # Load variables from a .env file into the process environment.
    load_dotenv()
    
    # Host, token and cluster id are read from the environment; a missing
    # variable raises KeyError.
    spark = DatabricksSession.builder.remote(
      host       = f"https://{os.environ['DATABRICKS_HOST']}",
      token      = os.environ['SPP_WEIS'],
      cluster_id = os.environ['CLUSTER_ID']
    ).getOrCreate()

# get data from spark

In [0]:
# Refresh from Spark unless EVERY cached extract is already on disk.
# Checking only the LMP file would skip the refresh and then fail in the
# load cell whenever the MTLF or MTRF cache is missing.
cached_files = ('lmp_df.parquet', 'mtlf_df.parquet', 'mtrf_df.parquet')
refresh_data = not all(os.path.exists(path) for path in cached_files)
# refresh_data = True
refresh_data

In [0]:
## lmp
# When `refresh_data` is set, pull the hourly LMP table through Spark, cache
# it as a local parquet file and report the elapsed wall-clock time.
if refresh_data:
    t0 = time.time()
    # Earlier variant kept for reference: aggregate the raw LMP table to
    # hourly averages inside the query.
    # query = '''
    # SELECT 
    #   GMTIntervalEnd_HE as GMTIntervalEnd,
    #   PNODE_Name,
    #   avg(LMP) as LMP_HOURLY
    # FROM sandbox_data_science.spp_weis.lmp
    # GROUP BY GMTIntervalEnd_HE, PNODE_Name
    # '''

    query = 'SELECT * FROM sandbox_data_science.spp_weis.lmp_hourly'
    # NOTE(review): rows are collected to the client, turned back into a
    # Spark DataFrame and only then converted to pandas;
    # `spark.sql(query).toPandas()` looks equivalent with fewer transfers —
    # confirm there is no workaround reason before changing.
    res = spark.sql(query).collect()
    df = spark.createDataFrame(res).toPandas()
    df.to_parquet('lmp_df.parquet')
    
    t1 = time.time()
    print(f'elapsed time: {timedelta(seconds=t1-t0)}')
    display(df.head())

# Timings recorded by the author:
# groupby query takes: 0:01:44.339718
# query aggregated table takes: 0:01:15.970348

In [0]:
## mtrf
# When `refresh_data` is set, pull the full `mtrf` table through Spark and
# cache it as 'mtrf_df.parquet'. Rebinds the notebook-global `df`.
if refresh_data:
    t0 = time.time()
    
    query = 'SELECT * FROM sandbox_data_science.spp_weis.mtrf'
    # NOTE(review): collect -> createDataFrame -> toPandas moves the rows
    # more than once; a direct `.toPandas()` may suffice — confirm.
    res = spark.sql(query).collect()
    df = spark.createDataFrame(res).toPandas()
    df.to_parquet('mtrf_df.parquet')

    t1 = time.time()
    print(f'elapsed time: {timedelta(seconds=t1-t0)}')
    display(df.head())

In [0]:
## mtlf
# When `refresh_data` is set, pull the full `mtlf` table through Spark and
# cache it as 'mtlf_df.parquet'. Rebinds the notebook-global `df`.
if refresh_data:
    t0 = time.time()
    
    query = 'SELECT * FROM sandbox_data_science.spp_weis.mtlf'
    # NOTE(review): collect -> createDataFrame -> toPandas moves the rows
    # more than once; a direct `.toPandas()` may suffice — confirm.
    res = spark.sql(query).collect()
    df = spark.createDataFrame(res).toPandas()
    df.to_parquet('mtlf_df.parquet')

    t1 = time.time()
    print(f'elapsed time: {timedelta(seconds=t1-t0)}')
    display(df.head())

# Load dataframes

In [0]:
# Read the three cached extracts from the working directory; the two
# forecast tables are ordered by interval end time, the price table is left
# in file order. The files must already exist on disk.
price_df = pd.read_parquet('lmp_df.parquet')
mtlf_df = pd.read_parquet('mtlf_df.parquet').sort_values('GMTIntervalEnd')
mtrf_df = pd.read_parquet('mtrf_df.parquet').sort_values('GMTIntervalEnd')

# Feature Engineering

In [0]:
# ########################################
# # fill missing values
# ########################################
# def fill_missed_values(df):
#     """
#     """
#     df = df.fillna(method='ffill')
#     df = df.fillna(method='bfill')

#     return df


# ########################################
# # lmp
# ########################################
# def get_psco_price_df(price_df):
#     """
#     """
#     psco_idx = price_df["PNODE_Name"].str.contains("PSCO", case=False)
#     psco_price_df_long = price_df[psco_idx]
#     psco_price_df = psco_price_df_long.pivot_table(
#                             index='GMTIntervalEnd',
#                             columns='PNODE_Name',
#                             values='LMP_HOURLY',
#                             margins=False,
#                         ).reset_index()

#     psco_price_df.columns.name=None

#     ls_nodes_name = list(psco_price_df.columns[1:])

#     # return psco_price_df_long to create an overall scaler for psco lmps
#     return fill_missed_values(psco_price_df), ls_nodes_name, psco_price_df_long


# def create_psco_price_series(psco_price_df, node_name_ls):
#     """
#     """
#     psco_price_series = TimeSeries.from_dataframe(
#             psco_price_df, 
#             time_col='GMTIntervalEnd', 
#             value_cols=node_name_ls, 
#             fill_missing_dates=True, 
#             freq='H', 
#             fillna_value=0, 
#             static_covariates=None, 
#             hierarchy=None
#         ).astype(np.float32)

#     return psco_price_series


# ########################################
# # mtlf
# ########################################
# def create_mtlf_series(mtlf_df):
#     """
#     """
#     mtlf_df = fill_missed_values(mtlf_df)
#     mtlf_series = TimeSeries.from_dataframe(mtlf_df, time_col='GMTIntervalEnd', value_cols='MTLF', fill_missing_dates=True, freq='H', fillna_value=0, static_covariates=None, hierarchy=None).astype(np.float32)
#     avg_act_series = TimeSeries.from_dataframe(mtlf_df, time_col='GMTIntervalEnd', value_cols='Averaged_Actual', fill_missing_dates=True, freq='H', fillna_value=0, static_covariates=None, hierarchy=None).astype(np.float32)
    
#     return mtlf_series, avg_act_series
          

# ########################################
# # mtrf
# ########################################  
# # Add renewable/load ratio feature to mtrf dataframe
# def add_enrgy_ratio_to_mtrf(mtlf_df, mtrf_df):
#     """
#     """
#     mtrf_df = mtrf_df[['GMTIntervalEnd', 'Wind_Forecast_MW', 'Solar_Forecast_MW']].set_index('GMTIntervalEnd').asfreq('H').sort_index()
#     mtrf_df = mtrf_df.join(mtlf_df[['GMTIntervalEnd','MTLF']].set_index('GMTIntervalEnd').asfreq('H').sort_index(), on='GMTIntervalEnd', how='outer').sort_values('GMTIntervalEnd').reset_index(drop=True)
#     mtrf_df['Ratio'] = (mtrf_df['Wind_Forecast_MW'] + mtrf_df['Solar_Forecast_MW']) / mtrf_df['MTLF']
#     mtrf_df.drop('MTLF', axis=1, inplace=True) 

#     return fill_missed_values(mtrf_df)

# def create_mtrf_series(mtrf_ratio_df):
#     """
#     """
#     mtrf_series= TimeSeries.from_dataframe(mtrf_ratio_df, time_col='GMTIntervalEnd', value_cols=['Wind_Forecast_MW','Solar_Forecast_MW', 'Ratio'], fill_missing_dates=True, freq='H', fillna_value=0, static_covariates=None, hierarchy=None).astype(np.float32)
    
#     return mtrf_series     

In [0]:
# lmp
# All helpers below come from src.data_engineering (imported as `de`); their
# bodies are not shown in this notebook.
# The price helper yields three objects, unpacked here as a price frame, the
# list of node names and a long-format price frame.
psco_lmp_df, list_nodes_name, psco_price_df_long = de.get_psco_price_df(price_df)
lmp_series = de.create_psco_price_series(psco_lmp_df, list_nodes_name)

# mtlf series
# Yields two series: the forecast series and the `avg_act` series that is
# later used as the past covariate.
mtlf_series, avg_act_series = de.create_mtlf_series(mtlf_df)

# mtrf series
# The ratio helper takes both forecast tables; its output feeds the MTRF
# series builder.
mtrf_ratio_df = de.add_enrgy_ratio_to_mtrf(mtlf_df, mtrf_df)
mtrf_series = de.create_mtrf_series(mtrf_ratio_df)

# Preprocess series

In [0]:
# def scale_series(series_train, series_val, series_all, global_fit=False):
#     """
#     """
#     # use global fit to do a single scaling
#     # this will allow us to do global forecasting
#     # for lmps
#     if global_fit:
#         scaler = series_train.pd_dataframe().abs().max().mean()
#     else:
#         scaler = series_train.pd_dataframe().abs().max()

#     series_train_transformed = TimeSeries.from_dataframe(series_train.pd_dataframe()/scaler)
#     series_val_transformed = TimeSeries.from_dataframe(series_val.pd_dataframe()/scaler)
#     series_transformed = TimeSeries.from_dataframe(series_all.pd_dataframe()/scaler)

#     return [series_train_transformed, series_val_transformed, series_transformed, scaler]

# def lmp_series_drop_horizon(lmp_series, start_time, forecast_horizon):
#     """
#     """
#     start_time_lmp = start_time
#     end_time_lmp = lmp_series.end_time() - pd.Timedelta(f'{forecast_horizon+1}H')
#     lmp_series = lmp_series.drop_before(start_time_lmp)
#     lmp_series = lmp_series.drop_after(end_time_lmp)
#     return lmp_series

# def get_train_cutoff(lmp_series):
#     """
#     """
#     #keep last 30 days (720 data points + horizon(72 hours)) of target series for validation. 
#     training_cutoff = lmp_series[:-792].end_time()
#     return training_cutoff

# def get_lmp_train_test_series(lmp_series_drop_horizon, training_cutoff, forecast_horizon, input_chunk_length):
#     """
#     """
#     lmp_series_train = lmp_series_drop_horizon.drop_after(training_cutoff - pd.Timedelta(f'{forecast_horizon+1}H')) # for future covariates
#     lmp_series_val = lmp_series_drop_horizon.drop_before(training_cutoff + pd.Timedelta(f'{input_chunk_length+1}H'))
#     lmp_series_all = lmp_series_drop_horizon

#     return [lmp_series_train, lmp_series_val, lmp_series_all]


# ############## mtlf #####################
# def get_mtlf_train_test_series(mtlf_series, start_time, training_cutoff):
#     """
#     """
#     # drop times before the starting time
#     mtlf_series = mtlf_series.drop_before(start_time)

#     # Split
#     mtlf_series_train = mtlf_series.drop_after(training_cutoff)
#     mtlf_series_val = mtlf_series.drop_before(training_cutoff)

#     return [mtlf_series_train, mtlf_series_val, mtlf_series]


# # def scale_mtlf_series(mtlf_series_train, mtlf_series_val, mtlf_series):
# #     """
# #     """
# #     transformer_mtlf = Scaler()
# #     mtlf_series_train_transformed = transformer_mtlf.fit_transform(mtlf_series_train)
# #     mtlf_series_val_transformed = transformer_mtlf.transform(mtlf_series_val)
# #     mtlf_series_transformed = transformer_mtlf.transform(mtlf_series)

# #     return [mtlf_series_train_transformed, mtlf_series_val_transformed, mtlf_series_transformed]

# ############# avg_actual #################
# def get_avg_act_train_test_series(avg_act_series, start_time, training_cutoff):
#     """
#     """
#     avg_act_series = avg_act_series.drop_before(start_time)

#     # Split
#     avg_act_series_train = avg_act_series.drop_after(training_cutoff)
#     avg_act_series_val = avg_act_series.drop_before(training_cutoff)

#     return [avg_act_series_train, avg_act_series_val, avg_act_series]


# ############## mtrf #####################
# def get_mtrf_train_test_series(mtrf_series, start_time, training_cutoff):
#     """
#     """
#     mtrf_series = mtrf_series.drop_before(start_time)

#     # Split
#     mtrf_series_train = mtrf_series.drop_after(training_cutoff)
#     mtrf_series_val = mtrf_series.drop_before(training_cutoff)

#     return [mtrf_series_train, mtrf_series_val, mtrf_series]




In [0]:
# Scalers returned by de.scale_series, keyed by role: 'series' for the LMP
# target and 'pc' for the past covariate are filled in this cell.
scalers = {}

# Data before this timestamp is dropped from every series below.
start_time = pd.Timestamp('2023-04-02 04:00:00')
### TIME CHANGE ########################################################
input_chunk_length = 24*7  # 7 days, assuming hourly steps
forecast_horizon = 24*3  # 3 days, assuming hourly steps
training_cutoff = pd.Timestamp("2023-06-01 06:00:00")
########################################################################
# NOTE(review): `lmp_series` is overwritten with its trimmed version, so
# re-running this cell presumably trims the series a second time — confirm
# against de.lmp_series_drop_horizon.
lmp_series = de.lmp_series_drop_horizon(lmp_series, start_time, forecast_horizon)
# training_cutoff = get_train_cutoff(lmp_series_drop_horizon)
lmp_series_train, lmp_series_val, lmp_series_all = de.get_lmp_train_test_series(lmp_series, training_cutoff, forecast_horizon, input_chunk_length)
# global_fit=True is passed for the LMP target only; the other scale_series
# calls in this notebook use the helper's default.
(lmp_series_train_transformed, 
 lmp_series_val_transformed, 
 lmp_series_transformed,
 lmp_scaler) = de.scale_series(lmp_series_train, lmp_series_val, lmp_series_all, global_fit=True)
scalers['series'] = lmp_scaler

# Print the split boundaries as a sanity check.
print(f'train start: {lmp_series_train.start_time()}')
print(f'train end: {lmp_series_train.end_time()}')
print(f'val start: {lmp_series_val.start_time()}')
print(f'val end: {lmp_series_val.end_time()}')


# MTLF is split here but not scaled on its own (the per-series scaling below
# is commented out).
mtlf_series_train, mtlf_series_val, mtlf_series = de.get_mtlf_train_test_series(mtlf_series, start_time, training_cutoff)
# (mtlf_series_train_transformed, 
#  mtlf_series_val_transformed, 
#  mtlf_series_transformed, 
#  mtlf_scaler) = scale_series(mtlf_series_train, mtlf_series_val, mtlf_series)
# scalers['mtlf'] = mtlf_scaler

# print(f'train start: {mtlf_series_train.start_time()}')
# print(f'train end: {mtlf_series_train.end_time()}')
# print(f'val start: {mtlf_series_val.start_time()}')
# print(f'val end: {mtlf_series_val.end_time()}')


# The avg_act series is split and scaled; its scaler is stored under 'pc'.
avg_act_series_train, avg_act_series_val, avg_act_series = de.get_avg_act_train_test_series(avg_act_series, start_time, training_cutoff)
(avg_act_series_train_transformed, 
 avg_act_series_val_transformed, 
 avg_act_series_transformed,
 past_scaler) = de.scale_series(avg_act_series_train, avg_act_series_val, avg_act_series)
scalers['pc'] = past_scaler

print(f'train start: {avg_act_series_train.start_time()}')
print(f'train end: {avg_act_series_train.end_time()}')
print(f'val start: {avg_act_series_val.start_time()}')
print(f'val end: {avg_act_series_val.end_time()}')


# MTRF is split here but not scaled on its own (scaling is commented out).
mtrf_series_train, mtrf_series_val, mtrf_series = de.get_mtrf_train_test_series(mtrf_series, start_time, training_cutoff)
# mtrf_series_train_transformed, mtrf_series_val_transformed, mtrf_series_transformed = scale_mtrf_series(mtrf_series_train, mtrf_series_val, mtrf_series)

# print(f'train start: {mtrf_series_train.start_time()}')
# print(f'train end: {mtrf_series_train.end_time()}')
# print(f'val start: {mtrf_series_val.start_time()}')
# print(f'val end: {mtrf_series_val.end_time()}')



In [0]:
# The past covariates are the scaled avg_act series (train / validation /
# full range); these names are plain aliases, no copy is made.
past_cov_train = avg_act_series_train_transformed
past_cov_val = avg_act_series_val_transformed
past_cov = avg_act_series_transformed

In [0]:
# Concatenate future training covariates
# axis=1 joins the MTLF and MTRF training series along the second axis
# (components, per darts' time/component/sample layout).
future_covariates_train = concatenate([mtlf_series_train, mtrf_series_train], axis=1)
# Show the value array's shape as a sanity check.
future_covariates_train.values().shape

In [0]:
# Concatenate future validation covariates
# Trim the MTRF validation series relative to the end of the MTLF one before
# concatenating — presumably so both cover the same time span (TODO confirm
# whether drop_after also removes the split point itself, which would
# explain the extra hour).
end_time = mtlf_series_val.end_time() + pd.Timedelta('1H')
mtrf_series_val_end_droped = mtrf_series_val.drop_after(end_time)

future_covariates_val = concatenate([mtlf_series_val, mtrf_series_val_end_droped], axis=1)
# Show the value array's shape as a sanity check.
future_covariates_val.values().shape

In [0]:
# Concatenate the entire covariate series
# Reuses the notebook-global `end_time` (derived from the MTLF validation
# series in an earlier cell) to trim the full MTRF series.
mtrf_series_end_droped = mtrf_series.drop_after(end_time)

future_covariates = concatenate([mtlf_series, mtrf_series_end_droped], axis=1)
# Show the value array's shape as a sanity check.
future_covariates.values().shape

In [0]:
# Scale the concatenated future covariates with the helper's default
# settings and keep the returned scaler under the 'fc' key.
(future_covariates_train_transformed, 
 future_covariates_val_transformed, 
 future_covariates_transformed, 
 future_scaler) = de.scale_series(future_covariates_train, future_covariates_val, future_covariates)
scalers['fc'] = future_scaler

# Print the split boundaries of the unscaled covariates as a sanity check.
print(f'train start: {future_covariates_train.start_time()}')
print(f'train end: {future_covariates_train.end_time()}')
print(f'val start: {future_covariates_val.start_time()}')
print(f'val end: {future_covariates_val.end_time()}')

In [0]:
# One single-component scaled series per node, in `list_nodes_name` order.
lmp_train_all = [lmp_series_train_transformed[node] for node in list_nodes_name]


lmp_val_all = [lmp_series_val_transformed[node] for node in list_nodes_name]

In [0]:
scalers

# Prediction using API endpoints

In [0]:
# Reload both forecast tables and keep only the timestamps present in BOTH.
mtlf_df = pd.read_parquet('mtlf_df.parquet').sort_values('GMTIntervalEnd')
mtrf_df = pd.read_parquet('mtrf_df.parquet').sort_values('GMTIntervalEnd')

# The intersection must be taken across the two tables. Intersecting mtlf
# with itself left mtlf unfiltered, so it could still hold timestamps that
# mtrf lacks.
common_times = np.intersect1d(mtlf_df.GMTIntervalEnd, mtrf_df.GMTIntervalEnd)
# Vectorised membership test instead of a per-row Python `in` scan.
mtlf_idx = mtlf_df.GMTIntervalEnd.isin(common_times)
mtrf_idx = mtrf_df.GMTIntervalEnd.isin(common_times)

mtrf_df = mtrf_df[mtrf_idx]
mtlf_df = mtlf_df[mtlf_idx]

In [0]:
# Reload the LMP extract under the column names the following cells use
# ('time' and 'node').
price_df = (
    pd.read_parquet('lmp_df.parquet')
    .rename(columns={'GMTIntervalEnd':'time', 'PNODE_Name':'node'})
)
price_df

In [0]:
# lmp
# (The multi-node LMP series is not rebuilt here; a single-node series is
# created in a later cell.)
# psco_lmp_df, list_nodes_name, psco_price_df_long = de.get_psco_price_df(price_df)
# lmp_series = de.create_psco_price_series(psco_lmp_df, list_nodes_name)

# mtlf series
# Rebinds `mtlf_series` / `avg_act_series` from the filtered frames; no
# scaling is applied in this cell.
mtlf_series, avg_act_series = de.create_mtlf_series(mtlf_df)

# mtrf series
mtrf_ratio_df = de.add_enrgy_ratio_to_mtrf(mtlf_df, mtrf_df)
mtrf_series = de.create_mtrf_series(mtrf_ratio_df)

# Rebinds `future_covariates` with the unscaled, component-wise
# concatenation of the two forecast series.
future_covariates = concatenate([mtlf_series, mtrf_series], axis=1)
past_covariates = avg_act_series

In [0]:
# Pick one node (position 5 of the node list) and build its price history:
# drop the node column, name the price column after the node, order by time.
node_name = list_nodes_name[5]
lmp_series_df = (
    price_df[price_df.node == node_name]
    .drop(columns='node')
    .rename(columns={'LMP_HOURLY':node_name})
    .sort_values('time')
)
lmp_series_df

In [0]:
FCAST_TIME = pd.Timestamp('2023-07-28 12:00:00')
# FCAST_TIME = pd.Timestamp('2023-06-05 12:00:00')

In [0]:
# Build an hourly float32 series for the selected node. Missing timestamps
# are inserted (fill_missing_dates=True) and gaps are filled with 0, so a
# missing hour appears as a zero price rather than NaN.
lmp_series = TimeSeries.from_dataframe(
            lmp_series_df, 
            time_col='time', 
            value_cols=node_name, 
            fill_missing_dates=True, 
            freq='H', 
            fillna_value=0, 
            static_covariates=None, 
            hierarchy=None
        ).astype(np.float32)

In [0]:
lmp_series = lmp_series.drop_after(FCAST_TIME)

In [0]:
# node_series = lmp_series[list_nodes_name[1]]
past_cov_series = avg_act_series
future_cov_series = future_covariates

In [0]:
# Build the one-row request frame: each series is serialised to a JSON
# string, `n` is the number of steps to forecast and `num_samples` the
# number of samples requested.
# NOTE: `df` is rebound here; the refresh cells earlier use the same name.
data = {
    'series': [lmp_series.to_json()],
    'past_covariates': [past_cov_series.to_json()],
    'future_covariates': [future_cov_series.to_json()],
    'n': forecast_horizon,
    'num_samples': 200
}
df = pd.DataFrame(data)

In [0]:
import os
import requests
import numpy as np
import pandas as pd
import json

def create_tf_serving_json(data):
  """Wrap `data` in an ``{'inputs': ...}`` payload of plain Python lists.

  A dict has each value converted with ``.tolist()`` under its original key;
  any other object is converted with ``.tolist()`` directly.
  """
  if isinstance(data, dict):
    payload = {key: value.tolist() for key, value in data.items()}
  else:
    payload = data.tolist()
  return {'inputs': payload}

def score_model(dataset):
  """POST `dataset` to the `spp_weis` serving endpoint and return the decoded JSON reply.

  A pandas DataFrame is sent in ``dataframe_split`` form; any other input is
  wrapped by ``create_tf_serving_json``.

  The bearer token is read from the environment (``DATABRICKS_TOKEN``, falling
  back to ``SPP_WEIS``, the variable this notebook already uses for its
  Databricks session) instead of being embedded in the notebook.

  Raises:
    RuntimeError: if neither environment variable holds a token.
    Exception: if the endpoint answers with a status other than 200.
  """
  url = 'https://dbc-beada314-1494.cloud.databricks.com/serving-endpoints/spp_weis/invocations'
  # SECURITY: never commit a personal access token. The token that used to be
  # hard-coded here has been exposed in source and should be revoked.
  api_token = os.environ.get('DATABRICKS_TOKEN') or os.environ.get('SPP_WEIS')
  if not api_token:
    raise RuntimeError('Set DATABRICKS_TOKEN (or SPP_WEIS) to a Databricks access token')
  headers = {'Authorization': f'Bearer {api_token}', 'Content-Type': 'application/json'}
  if isinstance(dataset, pd.DataFrame):
    ds_dict = {'dataframe_split': dataset.to_dict(orient='split')}
  else:
    ds_dict = create_tf_serving_json(dataset)
  data_json = json.dumps(ds_dict, allow_nan=True)
  response = requests.request(method='POST', headers=headers, url=url, data=data_json)
  if response.status_code != 200:
    raise Exception(f'Request failed with status {response.status_code}, {response.text}')
  return response.json()

In [0]:
endpoint_pred = score_model(df)
# endpoint_pred

In [0]:
preds = TimeSeries.from_json(endpoint_pred['predictions'])

In [0]:
preds.mean(axis=1).plot()

In [0]:
# preds.pd_dataframe()

In [0]:
# def get_quantile_df(preds):

#     # get dataframe from preds TimeSeries
#     plot_df = (
#         preds.pd_dataframe()
#         .reset_index()
#         .melt(id_vars='time')
#         .rename(columns={'component':'node'})
#     )

#     # remove sample numbers
#     plot_df.node = ['_'.join(n.split('_')[:-1]) for n in plot_df.node]

#     # get quanitles
#     q_df = plot_df.groupby(['time', 'node']).quantile([0.1, 0.5, 0.9])

#     # create columns from quantiles
#     q_pivot = q_df.reset_index().pivot(columns='level_2', index=['time', 'node'])

#     # level from columns after pivot
#     q_pivot.columns = q_pivot.columns.droplevel()

#     # remove index level name
#     q_pivot.columns.name = None
    
#     return q_pivot


In [0]:
q_df = plotting.get_quantile_df(preds)
q_df

In [0]:
# def get_mean_df(preds):
#     plot_df = (
#         preds.pd_dataframe()
#         .reset_index()
#         .melt(id_vars='time')
#         .rename(columns={'component':'node'})
#     )

#     # remove sample numbers
#     plot_df.node = ['_'.join(n.split('_')[:-1]) for n in plot_df.node]

#     # get quanitles
#     mean_df = plot_df.groupby(['time', 'node']).mean()
#     mean_df.rename(columns={'value':'mean_fcast'}, inplace=True)
#     return mean_df


In [0]:
plot_cov_df = future_cov_series.pd_dataframe()
plot_cov_df = plot_cov_df.reset_index().rename(columns={'GMTIntervalEnd':'time'})
plot_cov_df

In [0]:
# def get_plot_df(preds, plot_cov_df, prices_df, node_name):
    
#     fcast_df = get_mean_df(preds).merge(
#         get_quantile_df(preds),
#         left_index=True,
#         right_index=True,
#     )

#     fcast_df.reset_index().drop('node', axis=1)

#     plot_df = fcast_df.reset_index().drop('node', axis=1).merge(
#         plot_cov_df,
#         on=['time'],
#         how='right',
#     ).sort_values('time')

#     plot_df = plot_df.merge(
#         prices_df[prices_df.node == node_name],
#         on=['time'],
#         how='left',
#     ).sort_values('time')

#     return plot_df
    

In [0]:
plot_df = plotting.get_plot_df(preds, plot_cov_df, price_df, node_name)
plot_df

In [0]:
def plot_fcast(plot_df, lookback = '7D', node_name=None):
    """Draw a two-panel matplotlib figure of the forecast and return the plotted rows.

    Top panel: `mean_fcast` and `LMP_HOURLY` against `time`, with the band
    between the `0.1` and `0.9` columns shaded where a forecast exists.
    Bottom panel: the `Ratio` column over the same window.

    The window runs from `lookback` (a pd.Timedelta string) before the first
    row that has a forecast up to the last row that has one. The title shows
    the MAE over rows where both forecast and actual are present, or '-' when
    there are none; `node_name`, if given, is put on a line above it.

    Returns the slice of `plot_df` inside the window.
    """
    fig, (ax1, ax2) = plt.subplots(2)

    # Timestamps of the rows that carry a forecast define the window.
    fcast_times = plot_df.time[~plot_df.mean_fcast.isna()]
    min_fcast_time = fcast_times.min() - pd.Timedelta(lookback)
    max_fcast_time = fcast_times.max()
    plot_idx = (plot_df.time >= min_fcast_time) & (plot_df.time <= max_fcast_time)

    plot_data = plot_df[plot_idx]
    plot_data[['time', 'mean_fcast', 'LMP_HOURLY']].plot(x='time', ax=ax1);
    has_fcast = plot_data['mean_fcast'].notna()

    # Error is measured only where forecast and actual overlap.
    acc_data = plot_data[['mean_fcast', 'LMP_HOURLY']].dropna(axis=0)
    err = '-'
    if acc_data.shape[0] > 0:
        err = np.round(mae(acc_data.LMP_HOURLY, acc_data.mean_fcast), 2)
    title_text = f'MAE forecast error: ${err}'
    if node_name:
        title_text = node_name + '\n' + title_text

    # .values is passed for the x axis; see
    # https://stackoverflow.com/questions/29329725/pandas-and-matplotlib-fill-between-vs-datetime64/29329823#29329823
    ax1.fill_between(
        plot_data.time.values,
        plot_data[0.1],
        plot_data[0.9],
        where=has_fcast,
        alpha=0.3,
    )
    ax1.set_xlabel('')
    ax1.set_ylabel('$')
    ax1.set_title(title_text)

    plot_df.loc[plot_idx, ['time', 'Ratio']].plot(x='time', ax=ax2);
    ax2.set_ylabel('RE gen / load')

    fig.set_size_inches(6, 6)
    plt.tight_layout()

    plt.show()

    return plot_data


In [0]:
plot_data = plot_fcast(plot_df, lookback = '3D', node_name='test')

In [0]:
acc_data = plot_data[['mean_fcast', 'LMP_HOURLY']].dropna(axis=0)
acc_data

In [0]:
np.round(mae(acc_data.LMP_HOURLY, acc_data.mean_fcast), 2)

## plotly

In [0]:
import matplotlib.pyplot as plt
import matplotlib as mpl

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from IPython.display import display, HTML

import plotly.graph_objects as go


In [0]:
plot_df

In [0]:
from plotly.subplots import make_subplots
def plotly_forecast(plot_df, node_name=None, lookback='7D', show_fig=False):
    """Build a two-row Plotly figure: forecast vs. actual price on top, RE/load ratio below.

    Parameters
    ----------
    plot_df : pandas.DataFrame
        Must provide the columns read here: ``time``, ``mean_fcast``,
        ``LMP_HOURLY``, ``Ratio`` and the quantile columns ``0.1`` and ``0.9``.
    node_name : str, optional
        Put on its own line above the error text in the figure title.
    lookback : str
        ``pd.Timedelta`` string; how far before the first forecast row the
        plotted window starts.
    show_fig : bool
        When true, ``fig.show()`` is called before returning.

    Returns
    -------
    The Plotly figure (always returned, whether or not it was shown).
    """
    # Window: from `lookback` before the first forecast row to the last one.
    has_fcast = ~plot_df.mean_fcast.isna()
    start_time = plot_df[has_fcast].time.min() - pd.Timedelta(lookback)
    end_time = plot_df[has_fcast].time.max()
    plotly_idx = (plot_df.time >= start_time) & (plot_df.time <= end_time)
    plotly_data = plot_df[plotly_idx]

    # Score the data passed in. This used to read the notebook-global
    # `plot_data`, so the title could report the error of a different frame
    # (or raise NameError on a fresh kernel).
    acc_data = plotly_data[['mean_fcast', 'LMP_HOURLY']].dropna(axis=0)
    if acc_data.shape[0] > 0:
        err = np.round(mae(acc_data.LMP_HOURLY, acc_data.mean_fcast), 2)
    else:
        err = '-'
    title_text = f'MAE forecast error: ${err}'
    if node_name:
        # Plotly titles break lines on <br>; a '\n' is not rendered as one.
        title_text = node_name + '<br>' + title_text

    x_actual = plotly_data.time
    y_actual = plotly_data.LMP_HOURLY

    x_fcast = plotly_data.time
    y_u_int = plotly_data[0.9]
    y_l_int = plotly_data[0.1]

    y_fcast = plotly_data.mean_fcast
    y_ratio = plotly_data.Ratio

    fig = make_subplots(rows=2, cols=1, shared_xaxes=True)

    # add_trace(row=, col=) replaces the deprecated append_trace.
    # Actual values (history)
    fig.add_trace(
        go.Scatter(
                x=x_actual,
                y=y_actual,
                line=dict(color='rgb(10,10,10)'),
                mode='lines',
                name='Actual'
            ), row=1, col=1)

    # confidence interval: upper bound forwards, then lower bound backwards,
    # so the closed polygon can be filled with 'toself'
    fig.add_trace(
        go.Scatter(
                x=(
                    x_fcast.tolist() +
                    x_fcast[::-1].tolist()
                ),
                y=(
                    y_u_int.tolist() +
                    y_l_int[::-1].tolist()
                ),
                fill='toself',
                fillcolor='rgba(200,40,40,0.2)',
                line=dict(color='rgba(255,255,255,0)'),
                showlegend=False,
                name='forecast_ci'
                ), row=1, col=1)

    # point forecast
    fig.add_trace(
        go.Scatter(
                x=x_fcast,
                y=y_fcast,
                line=dict(color='rgb(200,40,40)'),
                mode='lines',
                name='Forecast'
            ), row=1, col=1)

    # energy ratio
    fig.add_trace(
        go.Scatter(
                x=x_fcast,
                y=y_ratio,
                line=dict(color='rgb(40,40,200)'),
                mode='lines',
                name='energy_ratio',
            # visible='legendonly',
            ), row=2, col=1,
    )

    fig.update_layout(
            title=title_text,
            height=600,
            width=1000,
            yaxis_tickformat = ',',
            plot_bgcolor="rgb(240, 240, 250, 1.0)",
            xaxis=dict(
                # rangeslider=dict(
                #     visible=True
                # ),
                type="date"
            )
        )

    fig.update_yaxes(title_text="$", row=1, col=1)
    fig.update_yaxes(title_text=" RE / Load", row=2, col=1)

    # range slider for subplots (attached to the second x axis)
    # https://community.plotly.com/t/subplot-with-shared-x-axis-and-range-slider/3148
    fig.update_layout(xaxis2_rangeslider_visible=True,
                      xaxis2_rangeslider_thickness=0.1)

    fig.for_each_trace(lambda t: t.update(
        hoveron='points+fills',
        # hovertemplate = 'Hours: %{y:,.0f} - %{x}',
        hovertemplate = '%{y:,.2f} - %{x}',
        ))

    # fig.update_xaxes(matches='x')
    # fig.update_xaxes(row=1, col=1, rangeslider_visible=False)

    if show_fig:
        fig.show()

    return fig

In [0]:
fig = plotly_forecast(plot_df, lookback='7D', show_fig=True)

In [0]:
def plot_td_fcast(df, k, hours: str, fcast_type: str, show_fig=True):
    """Plot actual vs. forecast values of the `hours` column for one group key.

    A 'key' column is built by joining the four mapping columns with ' - '
    and is written onto `df` in place. The `hours`, 'LWR_0_1' and 'UPR_0_9'
    columns are then summed per (key, YR_MTH, HOURS_TYPE), and the rows whose
    key equals `k` are plotted: 'actual' rows as the history line, 'forecast'
    rows as the forecast line with a shaded LWR_0_1..UPR_0_9 band.

    The figure title is ``f"{fcast_type}: {k}"``. With ``show_fig`` true the
    figure is shown and None is returned; otherwise the figure is returned.
    """
    # create groups (keys)
    mapping_cols = ['CMPNY_NAME', 'UTILITY', 'AREA', 'PLANNING_PLANT_GRP']
    df['key'] = df[mapping_cols].apply(' - '.join, axis=1)

    # create new df grouped by key and year-month
    group_cols = ['key', 'YR_MTH', 'HOURS_TYPE']
    select_cols = [hours, 'LWR_0_1', 'UPR_0_9'] + group_cols
    max_year_mon = df['YR_MTH'].max()

    summed_df = (
        df[select_cols]
        .loc[df['YR_MTH'] <= max_year_mon,:]
        .groupby(group_cols)
        .sum()
        .reset_index()
    )

    # datetime format
    summed_df['YR_MTH'] = pd.to_datetime(summed_df['YR_MTH'])

    # Filter once per subset instead of repeating the masks for every series.
    key_df = summed_df[summed_df.key == k]
    actual_df = key_df[key_df.HOURS_TYPE == 'actual']
    fcast_df = key_df[key_df.HOURS_TYPE == 'forecast']

    x_actual = actual_df['YR_MTH']
    y_p_actual = actual_df[hours]
    x_fcast = fcast_df['YR_MTH']
    y_p_fcast = fcast_df[hours]
    y_l_int = fcast_df['LWR_0_1']
    y_u_int = fcast_df['UPR_0_9']

    # Actual values (history)
    actual_trace = go.Scatter(
        x=x_actual,
        y=y_p_actual,
        line=dict(color='rgb(10,10,10)'),
        mode='lines',
        name='Actual'
    )
    # confidence interval: upper bound forwards, lower bound backwards
    band_trace = go.Scatter(
        x=x_fcast.tolist() + x_fcast[::-1].tolist(),
        y=y_u_int.tolist() + y_l_int[::-1].tolist(),
        fill='toself',
        fillcolor='rgba(200,40,40,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        showlegend=False,
        name='forecast_ci'
    )
    # point forecast
    fcast_trace = go.Scatter(
        x=x_fcast,
        y=y_p_fcast,
        line=dict(color='rgb(200,40,40)'),
        mode='lines',
        name='Forecast'
    )

    fig = go.Figure([actual_trace, band_trace, fcast_trace])

    # Range selector buttons (2y / 3y / all) plus a range slider on the x axis.
    range_buttons = [
        dict(count=2, label="2y", step="year", stepmode="backward"),
        dict(count=3, label="3y", step="year", stepmode="backward"),
        dict(step="all"),
    ]
    fig.update_layout(
        title=f"{fcast_type}: {k}",
        yaxis_tickformat = ',',
        plot_bgcolor="rgb(240, 240, 250, 1.0)",
        xaxis=dict(
            rangeselector=dict(buttons=range_buttons),
            rangeslider=dict(visible=True),
            type="date"
        )
    )

    fig.for_each_trace(lambda t: t.update(
        hoveron='points+fills',
        hovertemplate = 'Hours: %{y:,.0f} - %{x}'
    ))

    if not show_fig:
        return fig
    fig.show()
