In [None]:
import pandas as pd
import numpy as np
import datetime
import time
import os
from typing import List

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine

## Connect to database

In [None]:
# Absolute path of the project's source directory. The working directory is
# switched to it so that the relative database path below resolves.
PROJECT_SRC = '/workspace/src'
os.chdir(PROJECT_SRC)
    
# Three slashes = SQLite file path relative to the current working directory.
SQLALCHEMY_DATABASE_URI='sqlite:///../data/database.db'
# echo=False: do not log the SQL statements SQLAlchemy emits.
engine = create_engine(SQLALCHEMY_DATABASE_URI, echo=False)

In [None]:
# Load one row per timestamp that is present in all three tables
# (INNER JOINs on `dttm` drop timestamps missing from any of them).
with engine.connect() as conn:

    join_txt = '''
    SELECT
        g.dttm,
        wind,
        solar,
        load_mwh,
        price
    FROM GENERATION as g
    INNER JOIN
    LOAD as l
    ON g.dttm = l.dttm
    INNER JOIN
    PRICE as p
    ON g.dttm = p.dttm

    '''

    # Query through the connection opened by this `with` block so it is
    # released on exit; the previous code opened `conn` but passed `engine`,
    # leaving `conn` unused.
    data = pd.read_sql(join_txt, conn)

data


In [None]:
# Parse the timestamp column into pandas datetime values.
data['dttm'] = pd.to_datetime(data['dttm'])

In [None]:
# Column dtypes and non-null counts after the datetime conversion.
data.info()

In [None]:
# Expose gaps in the time series: build a complete hourly grid between the
# first and last timestamp and left-join the data onto it, so hours that are
# missing from `data` become rows of NaN.
# `pd.Timedelta(hours=1)` is used instead of the 'H' alias, which pandas
# deprecated in 2.2.
all_hours = pd.DataFrame(
    {'dttm': pd.date_range(data.dttm.min(), data.dttm.max(), freq=pd.Timedelta(hours=1))}
)
data = all_hours.merge(data, on='dttm', how='left')

In [None]:
# Summary statistics of the frame after reindexing onto the hourly grid.
data.describe()

In [None]:
# Missing values per column, i.e. the hours that were absent before the merge.
data.isna().sum()

In [None]:
# Fill each gap with the last observed value. `DataFrame.ffill()` replaces
# `fillna(method='ffill')`, whose `method` argument is deprecated since
# pandas 2.1.
data = data.ffill()

In [None]:
# Re-count missing values to check the result of the forward fill.
data.isna().sum()

In [None]:
# Use the timestamp as the index so rows can be sliced by time with `.loc`.
# Not re-runnable: `dttm` is no longer a column afterwards.
data = data.set_index('dttm')

In [None]:
# Display the time-indexed frame.
data

In [None]:
# Rows with a timestamp strictly before TRAIN_END are the training period.
# The string is compared directly against the datetime index.
TRAIN_END = '2022'
# Per-column mean over the training period only.
scaler_means = data.loc[data.index < TRAIN_END].mean()
scaler_means

In [None]:
# Per-column standard deviation over the training period only.
scaler_sd = data.loc[data.index < TRAIN_END].std()
scaler_sd

In [None]:
def scale_data(data, scaler_means, scaler_sd):
    """Standardise ``data``: subtract ``scaler_means``, then divide by ``scaler_sd``.

    The operands are combined with plain ``-`` and ``/``, so the alignment and
    broadcasting rules of their own types apply.
    """
    centered = data - scaler_means
    return centered / scaler_sd

# Standardise every column with the training-period statistics, then inspect
# the summary of the scaled frame.
data_scaled = scale_data(data=data, scaler_means=scaler_means, scaler_sd=scaler_sd)
data_scaled.describe()

In [None]:
def unscale_data(data_scaled, scaler_means, scaler_sd):
    """Undo the standardisation: multiply by ``scaler_sd``, then add ``scaler_means``."""
    rescaled = data_scaled * scaler_sd
    return rescaled + scaler_means

In [None]:
# Round-trip check: unscaling the scaled frame and summarising it, for
# comparison with the summary of the unscaled data.
unscale_data(data_scaled, scaler_means, scaler_sd).describe()

In [None]:
# Create a dataframe of window boundaries for the data loader.
# Offsets are in hours relative to the window start `t_start`:
#   to    = t_start + LOOKBACK                  (last timestamp of the history)
#   t_end = t_start + LOOKBACK + LOOKFORWARD    (last timestamp of the window)
LOOKBACK = 23
LOOKFORWARD = 24

# Every index timestamp except the last LOOKBACK + LOOKFORWARD ones can start
# a window. The columns are built with vectorised index arithmetic rather than
# one dict per timestamp; an input too short for any window yields an empty
# frame that still has the three columns.
window_starts = data_scaled.index[:-(LOOKBACK + LOOKFORWARD)]
t_idx = pd.DataFrame(
    {
        't_start': window_starts,
        'to': window_starts + pd.Timedelta(hours=LOOKBACK),
        't_end': window_starts + pd.Timedelta(hours=LOOKBACK + LOOKFORWARD),
    }
)

In [None]:
# Display the window boundaries (one row per sample).
t_idx

In [None]:
# Windows whose `to` timestamp lies strictly before TRAIN_END
# (display only; the result is not stored).
t_idx[t_idx.to < TRAIN_END]

In [None]:
# End timestamp of the last window.
t_idx.t_end[-1:]

In [None]:
# Row of `t_idx` used to inspect the slicing logic by hand.
test_idx = 21118
# test_idx = 0
# Columns that are sliced over the whole window, t_start through t_end.
hist_future_cols = ['wind', 'solar', 'load_mwh']
hist_future = data_scaled.loc[
    t_idx.at[test_idx, 't_start']:t_idx.at[test_idx, 't_end'],
    hist_future_cols,
]
hist_future

In [None]:
# The same slice as a plain NumPy array.
hist_future.values

In [None]:
# Column(s) used as the label.
label_cols = ['price']
# Label column(s) over the first part of the window: t_start through `to`
# (both ends included by `.loc` slicing).
hist = data_scaled.loc[
    t_idx.at[test_idx, 't_start']:t_idx.at[test_idx, 'to'],
    label_cols,
]
hist

In [None]:
label_cols = ['price']
# Label column(s) over the rest of the window: one hour after `to` through
# t_end (both ends included by `.loc` slicing).
label = data_scaled.loc[
    (t_idx.at[test_idx, 'to'] + pd.DateOffset(hours=1)):t_idx.at[test_idx, 't_end'],
    label_cols,
]
label

In [None]:
def create_tabular_data(to):
    """One-hot encode the calendar position of the timestamp ``to``.

    Args:
        to: Timestamp exposing ``day``, ``month``, ``dayofweek`` and ``hour``
            attributes (e.g. ``pd.Timestamp``).

    Returns:
        pd.DataFrame: A single row (index ``0``) of 0/1 integers with the
        columns, in this order, ``day_1..day_31``, ``month_1..month_12``,
        ``dayofweek_0..dayofweek_6`` and ``hour_0..hour_23``.
    """
    # Insertion order of the dict fixes the column order of the result.
    features = {}
    features.update({f'day_{i}': int(to.day == i) for i in range(1, 32)})
    features.update({f'month_{i}': int(to.month == i) for i in range(1, 13)})
    # An ISO-week one-hot (week_1..week_53) was tried earlier and is left out.
    features.update({f'dayofweek_{i}': int(to.dayofweek == i) for i in range(7)})
    features.update({f'hour_{i}': int(to.hour == i) for i in range(24)})

    # Built as one frame directly instead of four frames joined with concat.
    return pd.DataFrame(features, index=[0])

In [None]:
# Calendar one-hot features for the `to` timestamp of the inspected window,
# transposed so the single row is displayed as a column.
tabular_data = create_tabular_data(t_idx.at[test_idx, 'to'])
tabular_data.T

In [None]:
class CombineDataset(Dataset):
    """Windowed time-series dataset with calendar features.

    Each item corresponds to one row of ``t_idx`` and is cut out of
    ``data_scaled`` with label-based ``.loc`` slices on its index.
    """

    def __init__(self, t_idx, data_scaled, hist_future_cols, label_cols, transform=None):
        """
        Args:
            t_idx (pd.DataFrame): One row per sample with the columns
                ``t_start``, ``to`` and ``t_end``.
            data_scaled (pd.DataFrame): Frame the windows are sliced from
                using ``.loc[start:end, columns]``.
            hist_future_cols (list of str): Columns returned for the whole
                window ``t_start .. t_end``.
            label_cols (list of str): Columns returned as history
                (``t_start .. to``) and as label (``to + 1 hour .. t_end``).
            transform (callable, optional): If given, it is called with the
                sample tuple and its return value is returned in place of
                the tuple.
        """
        self.t_idx = t_idx
        self.data_scaled = data_scaled
        self.hist_future_cols = hist_future_cols
        self.label_cols = label_cols
        # Previously accepted but silently dropped.
        self.transform = transform

    def __len__(self):
        """Number of samples, i.e. rows of ``t_idx``."""
        return self.t_idx.shape[0]

    def __getitem__(self, idx):
        """Return ``(hist_future, hist, tabular, label)`` as NumPy arrays.

        ``idx`` is a position in ``t_idx`` (0 .. len - 1).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Positional lookup: samplers hand out 0..len-1, which only matches
        # the index labels if t_idx was reset. iloc works in either case.
        t_start = self.t_idx['t_start'].iloc[idx]
        to = self.t_idx['to'].iloc[idx]
        t_end = self.t_idx['t_end'].iloc[idx]

        # Series taken over the whole window (.loc includes both ends).
        hist_future = self.data_scaled.loc[t_start:t_end, self.hist_future_cols]

        # Label columns over the first part of the window.
        hist = self.data_scaled.loc[t_start:to, self.label_cols]

        # Calendar one-hot features for `to`.
        tabular = create_tabular_data(to)

        # Label columns over the rest of the window, starting one hour after `to`.
        label = self.data_scaled.loc[(to + pd.DateOffset(hours=1)):t_end, self.label_cols]

        sample = (hist_future.values, hist.values, tabular.values, label.values)
        if self.transform is not None:
            sample = self.transform(sample)
        return sample

In [None]:
# Training windows: the whole window, labels included, must end before
# TRAIN_END. Splitting on `to <= TRAIN_END` let up to LOOKFORWARD hours of
# training labels fall at or after TRAIN_END, where they overlap the labels of
# the first test windows and lie outside the rows used for the scaler
# statistics.
t_idx_train = t_idx[t_idx.t_end < TRAIN_END].reset_index(drop=True)
t_idx_train

In [None]:
# Test windows: `to` lies after TRAIN_END. The index is reset so the rows are
# numbered from 0.
t_idx_test = t_idx.loc[t_idx['to'] > TRAIN_END].reset_index(drop=True)
t_idx_test

In [None]:
BATCH_SIZE_TRAIN = 64

# Only the first BATCH_SIZE_TRAIN windows are used here, i.e. exactly one batch.
# `.iloc` is end-exclusive. The previous `.loc[:BATCH_SIZE_TRAIN, :]` is
# label-based and end-inclusive, so it selected BATCH_SIZE_TRAIN + 1 rows and
# produced a trailing batch of size 1.
train_set = CombineDataset(
    t_idx_train.iloc[:BATCH_SIZE_TRAIN], data_scaled, hist_future_cols, label_cols
)
loader_train = DataLoader(
    train_set,
    batch_size = BATCH_SIZE_TRAIN,
    shuffle = True,
    num_workers = 0,
    drop_last=False
)

In [None]:
BATCH_SIZE_TEST = 1024

test_set = CombineDataset(t_idx_test, data_scaled, hist_future_cols, label_cols)
# shuffle=False: evaluation batches come out in the row order of t_idx_test,
# so outputs are reproducible and can be matched back to their timestamps.
loader_test = DataLoader(
    test_set,
    batch_size = BATCH_SIZE_TEST,
    shuffle = False,
    num_workers = 0,
    drop_last=False
)

In [None]:
# Smoke-test the training loader: unpack every batch and print its shapes.
# The loop variable is `batch`, not `data`, so the `data` DataFrame that the
# earlier cells depend on is not overwritten.
# `hist_future`, `hist` and `label` are rebound to the batch tensors on
# purpose; the following cells display them.
for batch in loader_train:
    hist_future, hist, tabular, label = batch

    print(f'hist_future.shape: {hist_future.shape}')
    print(f'hist.shape: {hist.shape}')
    print(f'tabular.shape: {tabular.shape}')
    print(f'label.shape: {label.shape}')

In [None]:
# `hist_future` element of the last batch unpacked in the loop above.
hist_future

In [None]:
# `hist` element of the last batch unpacked in the loop above.
hist

In [None]:
# `tabular` element of the last batch unpacked in the loop above.
tabular

In [None]:
# `label` element of the last batch unpacked in the loop above.
label