## Load data

In [2]:
import pandas as pd

In [3]:
df_base = pd.read_csv('../data/general/pjm_pivot.csv', index_col=0, parse_dates=True)
df_base.head()

Unnamed: 0_level_0,AE,AEP,AP,ATSI,BC,CE,DAY,DEOK,DOM,DPL,DUQ,EKPC,JC,ME,PE,PEP,PL,PN,PS,RECO
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2018-07-01 05:00:00,1301,12187,4384,6960,3134,12136,1619,2895,9775,1919,1466,1209,2388,1442,4397,3165,3835,1611,5009,186
2018-07-01 06:00:00,1314,11946,4391,6762,3139,11872,1605,2857,9787,1950,1455,1197,2416,1465,4423,3156,3901,1641,4990,187
2018-07-01 07:00:00,1410,12664,4757,6670,3377,11992,1707,2997,10453,2160,1528,1273,2644,1605,4743,3332,4232,1728,5267,202
2018-07-01 08:00:00,1567,14069,5308,7065,3788,12860,1916,3331,11734,2470,1688,1472,3064,1784,5230,3679,4613,1899,5735,230
2018-07-01 09:00:00,1749,15610,5862,7833,4262,14212,2145,3703,13084,2765,1875,1656,3569,1972,5752,4085,5014,2055,6299,259


## Select region

In [4]:
region = 'AE'

In [5]:
df = df_base.loc[:, [region]]
df

Unnamed: 0_level_0,AE
period,Unnamed: 1_level_1
2018-07-01 05:00:00,1301
2018-07-01 06:00:00,1314
2018-07-01 07:00:00,1410
2018-07-01 08:00:00,1567
2018-07-01 09:00:00,1749
...,...
2023-11-01 00:00:00,1160
2023-11-01 01:00:00,1152
2023-11-01 02:00:00,1113
2023-11-01 03:00:00,1049


## Data Preprocessing

### Train Test Split

In [6]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.3, shuffle=False)
test, prod = train_test_split(test, test_size=0.5, shuffle=False)

### Export Data

In [7]:
import os

path_region = f'../data/regions/{region}'

if not os.path.exists(path_region):
    os.makedirs(path_region)

for df, name in zip([train, test, prod], ['train', 'test', 'prod']):
    df.to_csv(f'{path_region}/{name}.csv')

### Scale Data

In [8]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))

train_norm = scaler.fit_transform(train)
test_norm = scaler.transform(test)

### Create Sequences

In [9]:
import numpy as np

def create_sequences(data, sequence_length):
    X, y = [], []
    for i in range(len(data) - sequence_length):
        X.append(data[i:i+sequence_length])
        y.append(data[i+sequence_length])
    return np.array(X), np.array(y)

sequence_length = 24 # Use 24 hours prior to predict the following hour
X_train, y_train = create_sequences(train_norm, sequence_length)
X_test, y_test = create_sequences(test_norm, sequence_length)

## Modelling

### Design NN Architecture

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


### Train Model

In [12]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=5)

history = model.fit(
    X_train, y_train, epochs=50, batch_size=64, verbose=0,
    validation_data=(X_test, y_test), callbacks=[early_stop])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


### Calculate Predictions

In [13]:
y_pred = model.predict(X_test)
y_pred

array([[0.11617891],
       [0.15235904],
       [0.19788644],
       ...,
       [0.22454396],
       [0.25258493],
       [0.29553095]], dtype=float32)

### Comparison: Real Data & Prediction

#### Descale Data

In [14]:
y_pred_scaled_inverse = scaler.inverse_transform(y_pred)
y_true = scaler.inverse_transform(y_test)

#### Create DataFrame

In [15]:
dic_pred = {
    'y_pred': y_pred.flatten(),
    'y_pred_scaled_inverse': y_pred_scaled_inverse.flatten(),
    'y_true': y_true.flatten(),
}

df_pred = pd.DataFrame(dic_pred)
df_pred

Unnamed: 0,y_pred,y_pred_scaled_inverse,y_true
0,0.116179,769.605652,782.0
1,0.152359,853.254089,889.0
2,0.197886,958.513428,986.0
3,0.229340,1031.235229,1055.0
4,0.246754,1071.495361,1041.0
...,...,...,...
6707,0.201757,967.461060,969.0
6708,0.206564,978.576538,992.0
6709,0.224544,1020.145630,1046.0
6710,0.252585,1084.976318,1136.0


#### Evaluate Model

In [16]:
from sklearn.metrics import mean_squared_error

In [17]:
mean_squared_error(df_pred.y_true, df_pred.y_pred_scaled_inverse, squared=False)

26.697341518137467

In [18]:
model.save(f'../models/{region}.keras')