In [1]:
import pandas as pd
import numpy as np
import datetime
import time
import os
from typing import List

In [2]:
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine
# from sqlalchemy import Column, Date, Integer, String, Float
# from sqlalchemy.ext.declarative import declarative_base
# from sqlalchemy.dialects.sqlite import insert

## Connect to database

In [3]:
# Work from the project's src directory so the relative SQLite path below resolves
# against it.
# NOTE(review): hard-coded absolute path — adjust when running in another environment.
PROJECT_SRC = '/workspace/src'
os.chdir(PROJECT_SRC)
    
# SQLite database file at ../data/database.db, relative to the working directory.
SQLALCHEMY_DATABASE_URI='sqlite:///../data/database.db'
# echo=False: do not log the SQL statements that are emitted.
engine = create_engine(SQLALCHEMY_DATABASE_URI, echo=False)

In [4]:
# Join generation, load and price on their shared timestamp key.
# ORDER BY makes the chronological row order explicit: the forward fill and the
# sliding windows built further down both depend on rows being sorted by dttm,
# and SQL gives no ordering guarantee without it.
join_txt='''
    SELECT 
        g.dttm, 
        wind, 
        solar, 
        load_mwh,
        price
    FROM GENERATION as g
    INNER JOIN
    LOAD as l
    ON g.dttm = l.dttm
    INNER JOIN
    PRICE as p
    ON g.dttm = p.dttm
    ORDER BY g.dttm
    '''

# Read through the connection opened here (the original opened `conn` but then
# handed the engine to pandas, which opened a second connection).
with engine.connect() as conn:
    data = pd.read_sql(join_txt, conn)

data


Unnamed: 0,dttm,wind,solar,load_mwh,price
0,2020-01-01 01:00:00.000000,12443.0,0.0,16368.09,12.52
1,2020-01-01 02:00:00.000000,12338.8,0.0,15948.21,12.35
2,2020-01-01 03:00:00.000000,12075.4,0.0,15586.64,12.06
3,2020-01-01 04:00:00.000000,11957.9,0.0,15336.74,11.97
4,2020-01-01 05:00:00.000000,12012.6,0.0,15288.27,12.37
...,...,...,...,...,...
21163,2022-05-31 20:00:00.000000,13579.5,34.0,18099.66,103.30
21164,2022-05-31 21:00:00.000000,13707.3,2.8,17740.72,4.88
21165,2022-05-31 22:00:00.000000,13718.6,0.2,17338.54,9.59
21166,2022-05-31 23:00:00.000000,13558.5,0.2,16304.94,6.29


In [6]:
data.dttm = pd.to_datetime(data.dttm)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21168 entries, 0 to 21167
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   dttm      21168 non-null  datetime64[ns]
 1   wind      21167 non-null  float64       
 2   solar     21166 non-null  float64       
 3   load_mwh  21168 non-null  float64       
 4   price     21168 non-null  float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 827.0 KB


In [8]:
data.describe()

Unnamed: 0,wind,solar,load_mwh,price
count,21167.0,21166.0,21168.0,21168.0
mean,7475.159168,42.644364,16699.025118,28.970177
std,4114.096624,66.774727,2429.071285,28.649957
min,11.5,0.0,10772.12,-48.27
25%,3901.4,0.0,14939.135,16.02
50%,7328.3,0.5,16513.71,21.87
75%,10993.65,73.275,18210.735,34.58
max,17027.7,405.5,25715.57,888.14


In [9]:
data.isna().sum()

dttm        0
wind        1
solar       2
load_mwh    0
price       0
dtype: int64

In [10]:
# Forward-fill the few missing wind/solar readings with the previous hour's value.
# `fillna(method='ffill')` is deprecated in recent pandas; `ffill()` is the equivalent.
data = data.ffill()

In [11]:
data.isna().sum()

dttm        0
wind        0
solar       0
load_mwh    0
price       0
dtype: int64

In [12]:
data.set_index('dttm', inplace=True)

In [13]:
data

Unnamed: 0_level_0,wind,solar,load_mwh,price
dttm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01 01:00:00,12443.0,0.0,16368.09,12.52
2020-01-01 02:00:00,12338.8,0.0,15948.21,12.35
2020-01-01 03:00:00,12075.4,0.0,15586.64,12.06
2020-01-01 04:00:00,11957.9,0.0,15336.74,11.97
2020-01-01 05:00:00,12012.6,0.0,15288.27,12.37
...,...,...,...,...
2022-05-31 20:00:00,13579.5,34.0,18099.66,103.30
2022-05-31 21:00:00,13707.3,2.8,17740.72,4.88
2022-05-31 22:00:00,13718.6,0.2,17338.54,9.59
2022-05-31 23:00:00,13558.5,0.2,16304.94,6.29


In [14]:
# Scaling statistics are computed only from rows whose timestamp is before
# TRAIN_END, so later rows do not influence the scaler.
# The string is compared against the DatetimeIndex; pandas parses '2022' as the
# start of that year.
TRAIN_END = '2022'
scaler_means = data[data.index < TRAIN_END].mean()
scaler_means

wind         6933.515419
solar          39.701408
load_mwh    16605.660148
price          27.070445
dtype: float64

In [15]:
scaler_sd = data[data.index < TRAIN_END].std()
scaler_sd

wind        3897.052794
solar         62.736109
load_mwh    2486.333050
price         28.527752
dtype: float64

In [18]:
def scale_data(data, scaler_means, scaler_sd):
    """Standardise `data`: subtract `scaler_means`, then divide by `scaler_sd`."""
    centered = data - scaler_means
    return centered / scaler_sd

data_scaled = scale_data(data, scaler_means, scaler_sd)
data_scaled.describe()

Unnamed: 0,wind,solar,load_mwh,price
count,21168.0,21168.0,21168.0,21168.0
mean,0.138969,0.046846,0.037551,0.066592
std,1.055673,1.064345,0.976969,1.004284
min,-1.776218,-0.632832,-2.346242,-2.640953
25%,-0.778028,-0.632832,-0.670274,-0.387358
50%,0.101239,-0.624862,-0.036982,-0.182294
75%,1.041764,0.534359,0.645559,0.263237
max,2.59021,5.83075,3.663994,30.183576


In [20]:
def unscale_data(data_scaled, scaler_means, scaler_sd):
    """Invert `scale_data`: multiply by `scaler_sd`, then add `scaler_means`."""
    rescaled = data_scaled * scaler_sd
    return rescaled + scaler_means

In [21]:
unscale_data(data_scaled, scaler_means, scaler_sd).describe()

Unnamed: 0,wind,solar,load_mwh,price
count,21168.0,21168.0,21168.0,21168.0
mean,7475.085256,42.640349,16699.025118,28.970177
std,4114.013496,66.77285,2429.071285,28.649957
min,11.5,0.0,10772.12,-48.27
25%,3901.5,0.0,14939.135,16.02
50%,7328.05,0.5,16513.71,21.87
75%,10993.325,73.225,18210.735,34.58
max,17027.7,405.5,25715.57,888.14


In [28]:
LOOKBACK = 24
LOOKFORWARD = 25

In [32]:
# One row per window start taken from the scaled data's index:
#   t_start - the window's first timestamp
#   to      - t_start plus LOOKBACK hours
#   t_end   - t_start plus LOOKBACK+LOOKFORWARD hours
# The last LOOKBACK+LOOKFORWARD index entries are not used as window starts.
# NOTE(review): the offsets are clock hours while the exclusion is by row count;
# the two agree only if the index is hourly without gaps — confirm.
t_idx = pd.DataFrame(
    [{'t_start':t_start, 
     'to':t_start+pd.DateOffset(hours=LOOKBACK),
     't_end':t_start+pd.DateOffset(hours=LOOKBACK+LOOKFORWARD)}
     for t_start in data_scaled.index[:-(LOOKBACK+LOOKFORWARD)]
    ]
)

In [33]:
t_idx

Unnamed: 0,t_start,to,t_end
0,2020-01-01 01:00:00,2020-01-02 01:00:00,2020-01-03 02:00:00
1,2020-01-01 02:00:00,2020-01-02 02:00:00,2020-01-03 03:00:00
2,2020-01-01 03:00:00,2020-01-02 03:00:00,2020-01-03 04:00:00
3,2020-01-01 04:00:00,2020-01-02 04:00:00,2020-01-03 05:00:00
4,2020-01-01 05:00:00,2020-01-02 05:00:00,2020-01-03 06:00:00
...,...,...,...
21114,2022-05-29 19:00:00,2022-05-30 19:00:00,2022-05-31 20:00:00
21115,2022-05-29 20:00:00,2022-05-30 20:00:00,2022-05-31 21:00:00
21116,2022-05-29 21:00:00,2022-05-30 21:00:00,2022-05-31 22:00:00
21117,2022-05-29 22:00:00,2022-05-30 22:00:00,2022-05-31 23:00:00


In [34]:
t_idx[t_idx.to < TRAIN_END]

Unnamed: 0,t_start,to,t_end
0,2020-01-01 01:00:00,2020-01-02 01:00:00,2020-01-03 02:00:00
1,2020-01-01 02:00:00,2020-01-02 02:00:00,2020-01-03 03:00:00
2,2020-01-01 03:00:00,2020-01-02 03:00:00,2020-01-03 04:00:00
3,2020-01-01 04:00:00,2020-01-02 04:00:00,2020-01-03 05:00:00
4,2020-01-01 05:00:00,2020-01-02 05:00:00,2020-01-03 06:00:00
...,...,...,...
17514,2021-12-30 19:00:00,2021-12-31 19:00:00,2022-01-01 20:00:00
17515,2021-12-30 20:00:00,2021-12-31 20:00:00,2022-01-01 21:00:00
17516,2021-12-30 21:00:00,2021-12-31 21:00:00,2022-01-01 22:00:00
17517,2021-12-30 22:00:00,2021-12-31 22:00:00,2022-01-01 23:00:00


In [40]:
t_idx.t_end[-1:]

21118   2022-06-01
Name: t_end, dtype: datetime64[ns]

In [46]:
# Worked example on a single window; 21118 is the last row of t_idx.
test_idx = 21118
# test_idx = 0
# These columns are taken over the whole window, from t_start through t_end.
hist_future_cols = ['wind', 'solar', 'load_mwh']
# Label-based .loc slicing includes both endpoints.
hist_future = data_scaled.loc[t_idx.t_start[test_idx]:t_idx.t_end[test_idx], hist_future_cols]
hist_future

Unnamed: 0_level_0,wind,solar,load_mwh
dttm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-05-29 23:00:00,1.525841,-0.632832,0.052624
2022-05-30 00:00:00,1.721348,-0.632832,-0.264965
2022-05-30 01:00:00,1.661636,-0.632832,-0.585384
2022-05-30 02:00:00,1.682062,-0.632832,-0.796197
2022-05-30 03:00:00,1.451401,-0.632832,-0.997308
2022-05-30 04:00:00,1.319403,-0.632832,-1.088249
2022-05-30 05:00:00,1.181504,-0.632832,-1.185891
2022-05-30 06:00:00,1.207062,-0.631238,-1.156358
2022-05-30 07:00:00,1.200057,-0.457494,-1.043026
2022-05-30 08:00:00,1.142475,0.033451,-0.882983


In [50]:
# Price is taken only for the first part of the window: from t_start through
# `to` (label-based .loc slicing includes both endpoints).
hist_cols = ['price']
hist = data_scaled.loc[t_idx.t_start[test_idx]:t_idx.to[test_idx], hist_cols]
hist

Unnamed: 0_level_0,price
dttm,Unnamed: 1_level_1
2022-05-29 23:00:00,-0.001768
2022-05-30 00:00:00,0.011552
2022-05-30 01:00:00,-0.211038
2022-05-30 02:00:00,-0.543346
2022-05-30 03:00:00,-0.793629
2022-05-30 04:00:00,-0.270279
2022-05-30 05:00:00,-0.044884
2022-05-30 06:00:00,-0.711253
2022-05-30 07:00:00,-1.054778
2022-05-30 08:00:00,-0.851818


In [61]:
# hour [0-23]
t_idx.to[test_idx].hour

23

In [58]:
# dayofweek [0-6]
t_idx.to[test_idx].dayofweek

0

In [55]:
# month 1-12
t_idx.to[test_idx].month

5

In [83]:
def create_tabular_data(to):
    """One-hot encode the calendar position of a timestamp.

    Parameters
    ----------
    to : object exposing ``month`` (1-12), ``dayofweek`` (0-6) and ``hour`` (0-23)
        attributes, e.g. a ``pd.Timestamp``.

    Returns
    -------
    pd.DataFrame
        A single row (index ``0``) with integer 0/1 columns in this order:
        ``month_1..month_12``, ``day_0..day_6``, ``hour_0..hour_23``.
    """
    # Insertion order of the dict fixes the column order: month, day of week, hour.
    flags = {f'month_{i}': int(to.month == i) for i in range(1, 13)}
    flags.update({f'day_{i}': int(to.dayofweek == i) for i in range(7)})
    flags.update({f'hour_{i}': int(to.hour == i) for i in range(24)})

    return pd.DataFrame(flags, index=[0])

In [85]:
tabular_data = create_tabular_data(t_idx.to[test_idx])
tabular_data.transpose()

Unnamed: 0,0
month_1,0
month_2,0
month_3,0
month_4,0
month_5,1
month_6,0
month_7,0
month_8,0
month_9,0
month_10,0


## Create database

In [None]:
# create database, tables, and indexes
def create_database():
    """Create the SQLite database, its three tables and their indexes if missing.

    Changes the working directory to the project's src folder, opens an engine on
    ``../data/database.db`` and issues ``CREATE ... IF NOT EXISTS`` statements for
    the GENERATION, LOAD and PRICE tables, each keyed by ``dttm``.

    Returns
    -------
    sqlalchemy Engine
        The engine connected to the database file.
    """
    # NOTE(review): hard-coded absolute path — adjust for other environments.
    PROJECT_SRC = '/workspace/src'
    os.chdir(PROJECT_SRC)

    SQLALCHEMY_DATABASE_URI='sqlite:///../data/database.db'
    engine = create_engine(SQLALCHEMY_DATABASE_URI, echo=False)

    # Executed in this order: each table followed by its index.
    # NOTE(review): SQLite already maintains a unique index for a TEXT PRIMARY KEY,
    # so the explicit unique indexes look redundant; kept to leave the schema unchanged.
    ddl_statements = [
        '''
        CREATE TABLE IF NOT EXISTS GENERATION (
        dttm TEXT PRIMARY KEY, 
        wind REAL,
        solar REAL
        );''',
        '''
        CREATE UNIQUE INDEX IF NOT EXISTS GENERATION_IDX
        ON GENERATION(dttm);
        ''',
        '''
        CREATE TABLE IF NOT EXISTS LOAD (
        dttm TEXT PRIMARY KEY, 
        load_mwh REAL
        );''',
        '''
        CREATE UNIQUE INDEX IF NOT EXISTS LOAD_IDX
        ON LOAD(dttm);
        ''',
        '''
        CREATE TABLE IF NOT EXISTS PRICE (
        dttm TEXT PRIMARY KEY, 
        price REAL
        );''',
        '''
        CREATE UNIQUE INDEX IF NOT EXISTS PRICE_IDX
        ON PRICE(dttm);
        ''',
    ]

    # engine.begin() commits on success and rolls back on error, and text() wraps
    # the raw SQL; together they work on SQLAlchemy 1.x and 2.x, whereas executing
    # plain strings and relying on autocommit does not work on 2.x.
    with engine.begin() as conn:
        for statement in ddl_statements:
            conn.execute(sqlalchemy.text(statement))

    return engine
    
    

In [None]:
engine = create_database()

In [None]:
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector provides the same list of table names.
sqlalchemy.inspect(engine).get_table_names()

In [None]:
# url examples

# hourly forecasted and actual load by region, starts 20200101
# loads stay in standard time
# https://docs.misoenergy.org/marketreports/20210314_rf_al.xls
# https://docs.misoenergy.org/marketreports/20211107_rf_al.xls

# hourly generation mix by region, starts 20200101
# https://docs.misoenergy.org/marketreports/20210214_sr_gfm.xlsx
# hourly region price report, starts 20200101
# https://docs.misoenergy.org/marketreports/20220603_rt_pr.xls



# base url for imports
base_url = 'https://docs.misoenergy.org/marketreports/'
load_file = '_rf_al.xls'
generation_file = '_sr_gfm.xlsx'
price_file = '_rt_pr.xls'

In [None]:
start = datetime.datetime(2020, 1, 1)
end = datetime.datetime(2022, 6, 1)
days = pd.date_range(start, end)
days

In [None]:
# get YYYYMMDD string from date
# Format every date as a YYYYMMDD string (the date part of the report file names).
days_str = days.strftime('%Y%m%d').tolist()
days_str[0:10]

## load data

In [None]:
def get_load_data(
    days_idx: int, 
    base_url: str = base_url, 
    days_str: List[str] = days_str, 
    file: str = load_file,
):
    """Download one day's load report and return the North-region actual load.

    Parameters
    ----------
    days_idx : position in `days_str` of the day to fetch.
    base_url : URL prefix of the report files.
    days_str : YYYYMMDD strings, one per day.
    file : file-name suffix of the load report.

    Returns
    -------
    pd.DataFrame
        Indexed by `dttm` (market day plus hour-ending, in hours) with a single
        float column `load_mwh`.
    """
    # build url and fetch the sheet
    report_url = ''.join([base_url, days_str[days_idx], file])
    sheet = pd.read_excel(report_url, header=1, skiprows=4)

    # keep row positions 1..25 and drop the first column
    load_data = sheet.iloc[1:26, 1:]

    # keep rows that carry an hour-ending value, then the three needed columns
    has_hour_ending = load_data.HourEnding.notna()
    load_data = load_data[has_hour_ending]

    new_names = {
        'Market Day': 'day',
        'HourEnding': 'he',
        'North ActualLoad (MWh)': 'load_mwh',
    }
    load_data = load_data[list(new_names)].rename(columns=new_names)

    # convert data types
    load_data['he'] = load_data['he'].astype(int)
    load_data['load_mwh'] = load_data['load_mwh'].astype(float)
    load_data['day'] = pd.to_datetime(load_data['day'])

    # timestamp = market day + hour-ending expressed as a number of hours
    load_data['dttm'] = load_data['day'] + pd.to_timedelta(load_data['he'], unit='h')

    return load_data.set_index('dttm')[['load_mwh']]

In [None]:
load_data = get_load_data(0)
load_data

In [None]:
load_data.info()

## generation data

In [None]:
def get_generation_data(
    days_idx: int, 
    base_url: str = base_url, 
    days_str: List[str] = days_str, 
    file: str = generation_file,
):
    """Download one day's generation-mix report and return North-region wind and solar.

    Parameters
    ----------
    days_idx : position in `days_str` of the day to fetch.
    base_url : URL prefix of the report files.
    days_str : YYYYMMDD strings, one per day.
    file : file-name suffix of the generation report.

    Returns
    -------
    pd.DataFrame
        Indexed by `dttm` (report day plus hour-ending, in hours) with the
        'Wind' and 'Solar' columns. When the sheet has no 'Solar' column it is
        estimated from 'Other' (see below).
    """
    # build url and fetch data; the sheet has a two-row column header
    url = base_url + days_str[days_idx] + file
    raw = pd.read_excel(url, skiprows=2, header=[1,2])

    # save he data
    # he = hour ending
    he = raw[('Unnamed: 0_level_0', 'Market Hour Ending')]

    # get wind, solar, other generation from north region
    north = raw['North']
    cols = [c for c in north.columns if c in ['Wind', 'Solar', 'Other']]
    # .copy(): the assignments below must write to an independent frame, not to a
    # slice of the parsed sheet (avoids SettingWithCopyWarning and ambiguous writes)
    generation_data = north[cols].copy()

    # set he data and remove non numeric values:
    # rows where isnumeric() is False are dropped; True and missing results
    # (non-string entries yield NaN) are kept
    generation_data['he'] = he
    idx = generation_data.he.str.isnumeric() != False
    generation_data = generation_data[idx].copy()

    # if solar generation was not reported, estimate it from 'Other' by
    # subtracting the night-time level: the larger of the maxima over the
    # first five and the last five rows; negative results are clipped to 0
    if 'Solar' not in generation_data.columns:
        night_level = np.max(
            (generation_data.Other[:5].max(), generation_data.Other[-5:].max())
        )
        generation_data['Solar'] = generation_data.Other - night_level
        generation_data.loc[generation_data.Solar < 0, 'Solar'] = 0

    # 'Other' is only needed for the estimate; tolerate sheets that do not report it
    generation_data = generation_data.drop('Other', axis=1, errors='ignore')

    # create dttm index
    generation_data['day'] = pd.to_datetime(days_str[days_idx])

    generation_data['dttm'] = (
        generation_data.day + 
        pd.to_timedelta(generation_data.he, unit='h')
    )
    generation_data = generation_data.drop(['he', 'day'], axis=1).set_index('dttm')

    return generation_data

In [None]:
generation_data = get_generation_data(0)
generation_data

In [None]:
generation_data.info()

## price data

In [None]:
def get_price_data(
    days_idx: int, 
    base_url: str = base_url, 
    days_str: List[str] = days_str, 
    file: str = price_file,
):
    """Download one day's price report and return the Minnesota Hub prices.

    Parameters
    ----------
    days_idx : position in `days_str` of the day to fetch.
    base_url : URL prefix of the report files.
    days_str : YYYYMMDD strings, one per day.
    file : file-name suffix of the price report.

    Returns
    -------
    pd.DataFrame
        Indexed by `dttm` (report day plus hour-ending, in hours) with a single
        column `price`.
    """
    # build url and fetch data
    report_url = ''.join([base_url, days_str[days_idx], file])
    price_data = pd.read_excel(report_url, skiprows=11, header=0)

    # the first column holds the hour-ending labels; prices come from Minnesota Hub
    renamed = {'Unnamed: 0':'he', 'Minnesota Hub':'price'}
    price_data = price_data.rename(columns=renamed)

    # keep only the rows whose label contains 'Hour'
    is_hour_row = ['Hour' in label for label in price_data.he]
    price_data = price_data.loc[is_hour_row, ['he', 'price']]

    # report day, and the hour number taken from the label's second token
    price_data['day'] = pd.to_datetime(days_str[days_idx])
    price_data['he'] = [int(label.split()[1]) for label in price_data.he]

    # timestamp = day + hour-ending; the helper columns are dropped afterwards
    hour_offset = pd.to_timedelta(price_data.he, unit='h')
    price_data['dttm'] = price_data.day + hour_offset

    return price_data.drop(['he', 'day'], axis=1).set_index('dttm')

In [None]:
price_data = get_price_data(0)
price_data

In [None]:
price_data.info()

## Test upserts

### Generation

In [None]:
def upsert_generation(
    generation_data: pd.core.frame.DataFrame, 
    engine: sqlalchemy.engine.base.Engine,
):
    """Insert or update rows of the GENERATION table from `generation_data`.

    The frame is written to a temporary table GENERATION_TMP, merged into
    GENERATION with an ``ON CONFLICT (dttm) DO UPDATE`` statement, and the
    temporary table is dropped again.

    Parameters
    ----------
    generation_data : frame indexed by `dttm` with 'Wind' and 'Solar' columns.
    engine : engine connected to the database holding GENERATION.
    """
    # Columns are matched by name rather than by position, so the frame's column
    # order cannot swap wind and solar.
    # `WHERE true` is kept from the original statement: it keeps the parser from
    # reading ON as part of a join clause in INSERT ... SELECT.
    upsert = sqlalchemy.text('''INSERT INTO GENERATION (dttm, wind, solar)
        SELECT dttm, Wind, Solar
        FROM GENERATION_TMP WHERE true
        ON CONFLICT (dttm) DO UPDATE SET Wind=excluded.Wind, Solar=excluded.Solar;
        ''')
    drop_tbl = sqlalchemy.text('DROP TABLE GENERATION_TMP')

    # One connection and one transaction for all three steps. engine.begin()
    # commits on success; text() is required for raw SQL on SQLAlchemy 2.x and
    # is also accepted on 1.x.
    with engine.begin() as conn:
        generation_data.to_sql('GENERATION_TMP', conn, if_exists='replace')
        conn.execute(upsert)
        conn.execute(drop_tbl)

In [None]:
upsert_generation(generation_data, engine)
upsert_generation(generation_data, engine)

In [None]:
type(generation_data)

In [None]:
type(engine)

In [None]:
# engine.table_names()

In [None]:
sql_text='''
    SELECT *
    FROM GENERATION ;
    '''

# Read through the connection opened here; the original opened `conn` but gave
# the engine to pandas, which then opened a second connection.
with engine.connect() as conn:
    res = pd.read_sql(sql_text, conn)

res

### Load

In [None]:
def upsert_load(
    load_data: pd.core.frame.DataFrame, 
    engine: sqlalchemy.engine.base.Engine,
):
    """Insert or update rows of the LOAD table from `load_data`.

    The frame is written to a temporary table LOAD_TMP, merged into LOAD with an
    ``ON CONFLICT (dttm) DO UPDATE`` statement, and the temporary table is
    dropped again.

    Parameters
    ----------
    load_data : frame whose index and single column map, in that order, onto
        LOAD's `dttm` and `load_mwh` columns (the SELECT * is positional).
    engine : engine connected to the database holding LOAD.
    """
    # `WHERE true` keeps the parser from reading ON as part of a join clause.
    upsert = sqlalchemy.text('''INSERT INTO LOAD 
        SELECT *
        FROM LOAD_TMP WHERE true
        ON CONFLICT (dttm) DO UPDATE SET load_mwh=excluded.load_mwh;
        ''')
    drop_tbl = sqlalchemy.text('DROP TABLE LOAD_TMP')

    # One connection and one transaction for all three steps. engine.begin()
    # commits on success; text() is required for raw SQL on SQLAlchemy 2.x and
    # is also accepted on 1.x.
    with engine.begin() as conn:
        load_data.to_sql('LOAD_TMP', conn, if_exists='replace')
        conn.execute(upsert)
        conn.execute(drop_tbl)

In [None]:
upsert_load(load_data, engine)
upsert_load(load_data, engine)

In [None]:
# engine.table_names()

In [None]:
sql_text='''
    SELECT *
    FROM LOAD ;
    '''

# Read through the connection opened here; the original opened `conn` but gave
# the engine to pandas, which then opened a second connection.
with engine.connect() as conn:
    res = pd.read_sql(sql_text, conn)

res

### Prices

In [None]:
def upsert_prices(
    price_data: pd.core.frame.DataFrame, 
    engine: sqlalchemy.engine.base.Engine,
):
    """Insert or update rows of the PRICE table from `price_data`.

    The frame is written to a temporary table PRICE_TMP, merged into PRICE with
    an ``ON CONFLICT (dttm) DO UPDATE`` statement, and the temporary table is
    dropped again.

    Parameters
    ----------
    price_data : frame whose index and single column map, in that order, onto
        PRICE's `dttm` and `price` columns (the SELECT * is positional).
    engine : engine connected to the database holding PRICE.
    """
    # `WHERE true` keeps the parser from reading ON as part of a join clause.
    upsert = sqlalchemy.text('''INSERT INTO PRICE 
        SELECT *
        FROM PRICE_TMP WHERE true
        ON CONFLICT (dttm) DO UPDATE SET price=excluded.price;
        ''')
    drop_tbl = sqlalchemy.text('DROP TABLE PRICE_TMP')

    # One connection and one transaction for all three steps. engine.begin()
    # commits on success; text() is required for raw SQL on SQLAlchemy 2.x and
    # is also accepted on 1.x.
    with engine.begin() as conn:
        price_data.to_sql('PRICE_TMP', conn, if_exists='replace')
        conn.execute(upsert)
        conn.execute(drop_tbl)
    

In [None]:
upsert_prices(price_data, engine)
upsert_prices(price_data, engine)

In [None]:
# engine.table_names()

In [None]:
sql_text='''
    SELECT *
    FROM PRICE ;
    '''

# Read through the connection opened here; the original opened `conn` but gave
# the engine to pandas, which then opened a second connection.
with engine.connect() as conn:
    res = pd.read_sql(sql_text, conn)

res

## Combine data

In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter and will also hide deprecation and
# data-quality warnings; consider narrowing it to the specific category that
# is noisy during the ingestion loop.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Fetch load, generation and price reports for the first 10 entries of days_str
# and upsert each into its table.
# NOTE(review): range(10) covers only the first ten days — widen to
# range(len(days_str)) for a full load if that is the intent.
for i in range(10):
    print('\n_____________________________')
    print(f'working on day: {days_str[i]}')
    
    print('\t-getting load...')
    # (earlier CSV-append variant, kept for reference)
    # filename = '../data/load_data.csv'
    # hdr = False  if os.path.isfile(filename) else True
    load_data = get_load_data(i)
    upsert_load(load_data, engine)
    # load_data.to_csv(filename, mode='a', header=hdr)
    print(f'\t\tload_data shape: {load_data.shape}')
    
    print('\t-getting generation...')
    # (earlier CSV-append variant, kept for reference)
    # filename = '../data/generation_data.csv'
    # hdr = False  if os.path.isfile(filename) else True
    generation_data = get_generation_data(i)
    upsert_generation(generation_data, engine)
    # generation_data.to_csv(filename, mode='a', header=hdr)
    print(f'\t\tgeneration_data shape: {generation_data.shape}')
    
    print('\t-getting prices...')
    # (earlier CSV variant, kept for reference)
    # filename = '../data/price_data.csv'
    # hdr = False  if os.path.isfile(filename) else True
    price_data = get_price_data(i)
    upsert_prices(price_data, engine)
    # price_data.to_csv(filename)
    print(f'\t\tprice_data shape: {price_data.shape}')
    
    # pause one second between days before issuing the next downloads
    time.sleep(1)

In [None]:
sql_text='''
    SELECT *
    FROM PRICE ;
    '''

# Read through the connection opened here; the original opened `conn` but gave
# the engine to pandas, which then opened a second connection.
with engine.connect() as conn:
    res = pd.read_sql(sql_text, conn)

res