# Short-term residential load forecasting with Deep Learning

London Households SmartMeter Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

#!pip install -U pip
#!pip install -U setuptools wheel

#!pip install autogluon


Create two load forecasts...
1) half-hourly load forecast for next 24 hours
2) peak half-hour in the next 24 hours

Value...
* For electric network operator, minimize the amount of spinning reserve

Data...
* residential smart meter usage data
* weather data
* weather forecast data

# Load Data

In [None]:
# load half-hourly electric usage data
# https://data.london.gov.uk/dataset/smartmeter-energy-use-data-in-london-households
# parse_dates asks pandas to convert the "DateTime" column to timestamps while reading.
d = pd.read_csv('/kaggle/input/small-lcl-data/LCL-June2015v2_99.csv', parse_dates=["DateTime"])

In [None]:
# load hourly weather data
# https://data.london.gov.uk/dataset/smartmeter-energy-use-data-in-london-households
# NOTE(review): the link above is the same page cited for the usage data, while the file
# is read from the Kaggle "smart-meters-in-london" input - confirm the weather data source.
weatherData = pd.read_csv('/kaggle/input/smart-meters-in-london/weather_hourly_darksky.csv', parse_dates=["time"])

In [None]:
weatherData.describe()

In [None]:
d.info()

In [None]:
# Convert the usage readings (4th column) to floats; entries that cannot be
# parsed become NaN because of errors='coerce'.
# Assign by column label instead of `d.iloc[:, 3] = ...`: on recent pandas a
# positional assignment writes into the existing object column in place, so
# the column would keep its `object` dtype.
usage_col = d.columns[3]
d[usage_col] = pd.to_numeric(d[usage_col], errors='coerce')

# Rename the usage column for easier reference.
d = d.rename(columns={usage_col: 'KWHperHH'})
d.info()

In [None]:
# Preview the data indexed by timestamp. The result is only displayed, not
# assigned, so `d` itself keeps 'DateTime' as a regular column.
d.set_index('DateTime')


In [None]:
# NOTE(review): the pandas_profiling package has been renamed ydata-profiling upstream -
# confirm which one the target environment provides.
from pandas_profiling import ProfileReport

# Build a profiling report for the usage data, ordered by the "DateTime" column,
# write it to an HTML file and also render it inline as the cell output.
profile = ProfileReport(d, tsmode=True, sortby="DateTime")
profile.to_file('profile_report.html')
profile

In [None]:
# Show how often each distinct row occurs, list the repeated rows, then drop them.
all_columns = d.columns.tolist()
print(d.groupby(all_columns, as_index=False).size())

is_repeat = d.duplicated()
dupes = d[is_repeat]
print('dupes', dupes)
print('dupes.index', dupes.index)

# Remove the repeated rows by index label.
d = d.drop(index=dupes.index)

d.info()

In [None]:
# 'DateTime' deliberately stays a regular column: the pivot tables and
# `DateTime.dt...` queries in the following cells address it by column name.
# (An un-assigned `d.set_index('DateTime')` used to sit here and had no effect.)
d.info()

Visualize the smart meter dataset to analyze it for quality, completeness and other insights

In [None]:
import seaborn as sns

In [None]:
# Meter-read coverage and completeness: one heatmap row per household (LCLid),
# one column per timestamp, colour = usage.
coverage_fig, coverage_ax = plt.subplots(figsize=(20,15))
pivot_table = d.pivot_table(values='KWHperHH', index='LCLid', columns='DateTime')
sns.heatmap(pivot_table, ax=coverage_ax)


Observations from Heatmap...
* several houses start producing load part-way through the period
    - eg MAC004221, MAC004248
    
    
* several houses stop producing part-way through the period
    - eg MAC004226, MAC004257
    

* most houses have at least one "gap" in their data (visible as white lines)


* several houses stand out as having significantly higher average load than others
    - eg MAC004225, MAC004249

In [None]:
# Find readings whose timestamp is not exactly on a half-hour boundary
# (minute other than 0/30, or a non-zero seconds component).
stamp = d['DateTime'].dt
off_half_hour = ~stamp.minute.isin([0, 30]) | (stamp.second != 0)
offRecs = d[off_half_hour]
print('Records not exactly on the half-hour:\n ', offRecs)
print(offRecs.info())

# Remove them from the working data.
d = d.drop(offRecs.index)

# Re-run the same check to confirm nothing off-grid is left.
stamp = d['DateTime'].dt
offRecs = d[~stamp.minute.isin([0, 30]) | (stamp.second != 0)]
print('Records not exactly on the half-hour: ', offRecs)

In [None]:
d.info()

In [None]:
# Fill missing half-hourly readings, household by household.

# Step 1: put every household on a regular 30-minute grid. Slots without a
# reading become NaN; several readings in one slot are averaged.
df = d.sort_values(by=['DateTime']).set_index('DateTime')

# Select the usage column before aggregating: taking the mean of non-numeric
# columns is meaningless and can raise a TypeError on pandas >= 2.0.
df_interpol = (
    df.groupby('LCLid')['KWHperHH']
      .resample('30Min')
      .mean()
      .to_frame()
)

# Step 2: interpolate linearly *within* each household, so a gap at the start
# of one meter's series is never filled from the previous meter's values.
df_interpol['KWHperHH'] = (
    df_interpol.groupby(level='LCLid')['KWHperHH']
               .transform(lambda usage: usage.interpolate())
)

# Back to flat columns: LCLid, DateTime, KWHperHH.
df_interpol = df_interpol.reset_index()
df_interpol.info()

In [None]:
# Same household-by-timestamp heatmap, now drawn from the gap-filled data.
df_interpol.info()
plt.subplots(figsize=(20,15))
pivot_table = df_interpol.pivot_table(values='KWHperHH', index='LCLid', columns='DateTime')

sns.heatmap(pivot_table)

In [None]:
# Flag readings that are exactly zero and show where they occur per household.
df_interpol.info()
df_interpol['ZeroKWHperHH'] = df_interpol['KWHperHH'].eq(0)
zero_fig, zero_ax = plt.subplots(figsize=(20,15))
pivot_table = df_interpol.pivot_table(values='ZeroKWHperHH', index='LCLid', columns='DateTime')

sns.heatmap(pivot_table, ax=zero_ax)

Observation: a handful of households account for all the zero-value meter reads: MAC004233, MAC004226, MAC004267

In [None]:
# investigate the meters with zero reads
MAC004233 = df_interpol.query("LCLid == 'MAC004233'")

# Row ranges drawn in each panel: the whole ~2 years, a zoom, a tighter zoom,
# and a zoom into a different part of the series.
panel_rows = [slice(None), slice(11000, 15000), slice(13000, 13500), slice(25000, 25500)]

fig, ax = plt.subplots(len(panel_rows), figsize=(20,9))
for panel, rows in zip(ax, panel_rows):
    window = MAC004233.iloc[rows]
    panel.plot(window.DateTime, window.KWHperHH)
    panel.plot(window.DateTime, window.ZeroKWHperHH)
    # Rotate this panel's own tick labels: plt.tick_params() only affects the
    # current axes, which is the last panel created.
    panel.tick_params(rotation=45)
    panel.grid()

ax[0].set(ylabel='KWH/hh',
       title='Load from one Household MAC004233 with lots of zero values')
for panel in ax[1:]:
    panel.set(xlabel='time (s)', ylabel='KWH/hh')

fig.savefig("MAC004233.png")
plt.show()

# Observation:

The zeros for MAC004233 seem legit - leaving them in

In [None]:
# investigate the meters with zero reads
MAC004267 = df_interpol.query("LCLid == 'MAC004267'")

# Row ranges drawn in each panel: the whole ~2 years, a zoom, a tighter zoom,
# and a zoom into a different part of the series.
panel_rows = [slice(None), slice(17000, 21000), slice(19300, 19800), slice(25000, 25500)]

fig, ax = plt.subplots(len(panel_rows), figsize=(20,9))
for panel, rows in zip(ax, panel_rows):
    window = MAC004267.iloc[rows]
    panel.plot(window.DateTime, window.KWHperHH)
    panel.plot(window.DateTime, window.ZeroKWHperHH)
    # Rotate this panel's own tick labels: plt.tick_params() only affects the
    # current axes, which is the last panel created.
    panel.tick_params(rotation=45)
    panel.grid()

# Title and file name refer to the meter actually plotted here (MAC004267),
# so the figure saved for MAC004233 is not overwritten.
ax[0].set(ylabel='KWH/hh',
       title='Load from one Household MAC004267 with lots of zero values')
for panel in ax[1:]:
    panel.set(xlabel='time (s)', ylabel='KWH/hh')

fig.savefig("MAC004267.png")
plt.show()

# Observation

The zeros for MAC004267 seem legit - leaving them in


In [None]:
# Per-household summary of the gap-filled data (from here on `d` is that data).
d = df_interpol.copy()
per_meter = d.groupby('LCLid')

# Column-wise maximum, minimum and row count for each house, each table
# ordered by its DateTime column.
for summary in (per_meter.max(), per_meter.min(), per_meter.count()):
    print(summary.sort_values('DateTime'))

# The same three statistics side by side.
print(per_meter.agg(['min', 'max', 'count']))


In [None]:
# which house has the highest peak load?

# which house has the highest total aggregate load?

# how variable / predictable is the timing of the peak load

# how accurate is the next 24 hours forecast profile overall?

# how accurate is the peak load forecast in next 24 hours?

# normalize and standardize


In [None]:
# Pull out the rows of a single smart meter for plotting.
sample = d[d['LCLid'] == 'MAC004233']
sample

In [None]:
# Load profile of the sample household: rows 100-4499, with the second column
# on the x axis and the third column on the y axis.
profile_rows = sample.iloc[100:4500]

fig, ax = plt.subplots()
ax.plot(profile_rows.iloc[:, 1], profile_rows.iloc[:, 2])
ax.set(title='Load from one Household, June-September 2012',
       xlabel='time (s)', ylabel='KWH/hh')
ax.grid()
plt.tick_params(rotation=45)

fig.savefig("Load from one Household, June-September 2012.png")
plt.show()

In [None]:
# Preview the sample indexed by timestamp. The result is only displayed, not
# assigned, so `sample` itself is left unchanged.
sample.set_index('DateTime')

# EDA: Visualize daily average load for each meter and all meters...

In [None]:
# Average 24-hour load profile: every meter on its own (grey) plus the
# average over all meters (thick line).
# work with a copy of dataset...
MeterData = df_interpol.copy()
hour_of_day = MeterData['DateTime'].dt.hour
minute_of_hour = MeterData['DateTime'].dt.minute

# mean load per (hour, minute) slot over all meters
avgLoadProfile = pd.DataFrame(MeterData.groupby([hour_of_day, minute_of_hour]).KWHperHH.mean())
avgLoadProfile = avgLoadProfile.reset_index(names=['hour', 'minute'])
avgLoadProfile['labels'] = pd.to_datetime(avgLoadProfile['hour'].astype(str) + ':' + avgLoadProfile['minute'].astype(str), format='%H:%M').dt.time

# mean load per (hour, minute) slot for each meter
avgLoadProfileEachMeter = pd.DataFrame(MeterData.groupby(['LCLid', hour_of_day, minute_of_hour]).agg({'KWHperHH': 'mean'}))
avgLoadProfileEachMeter = avgLoadProfileEachMeter.reset_index(names=['LCLid', 'hour', 'minute'])

# x position = half-hour slot of the day (0..47), derived from hour and minute.
# Deriving it from the row number (`index % 48`) would shift every later meter
# as soon as one meter has no data for some slot.
avg_slot = avgLoadProfile['hour'] * 2 + avgLoadProfile['minute'] // 30
meter_slot = avgLoadProfileEachMeter['hour'] * 2 + avgLoadProfileEachMeter['minute'] // 30

fig, ax = plt.subplots(figsize=(10,7))
ax.set_xticks(avg_slot, avgLoadProfile.labels)
ax.set(xlabel='time (HH:MI)', ylabel='KWH/hh',
       title='Average Household 24 hour load profile')

# plot every meter
for _, meter_rows in avgLoadProfileEachMeter.groupby('LCLid'):
    ax.plot(meter_slot.loc[meter_rows.index], meter_rows.KWHperHH, color='grey')

# plot the average
ax.plot(avg_slot, avgLoadProfile.KWHperHH, linewidth=5)

plt.tick_params(rotation=45)
ax.grid()

fig.savefig("Avg 24hr Load Profile every meter.png")
plt.show()

In [None]:
# Total load of all households at each timestamp, as a two-column frame
# (DateTime, AggregateLoad).
aggLoad = (
    d.groupby('DateTime', as_index=False)['KWHperHH']
     .sum()
     .rename(columns={'KWHperHH': 'AggregateLoad'})
)

print(aggLoad)
print(aggLoad.describe())
print(aggLoad.info())

In [None]:
# Order chronologically and move the timestamp into the index, which is named
# 'DateTimeIndex'.
aggLoad = (
    aggLoad.sort_values(by=['DateTime'])
           .set_index('DateTime')
           .rename_axis('DateTimeIndex')
)
aggLoad.info()

In [None]:
# Keep the timestamp available as an ordinary 'DateTime' column as well as in the index.
aggLoad = aggLoad.assign(DateTime=aggLoad.index)
aggLoad.info()

In [None]:
# List the timestamps at which the aggregate load is exactly zero.
AggZeros = aggLoad[aggLoad['AggregateLoad'] == 0]
AggZeros



Observation: Some of the timestamps are not exactly on the half-hour
Question: How many of the timestamps are not exactly on the half-hour?

In [None]:
# Find aggregate records whose timestamp is not exactly on a half-hour
# boundary (minute other than 0/30, or a non-zero seconds component).
stamp = aggLoad['DateTime'].dt
off_half_hour = ~stamp.minute.isin([0, 30]) | (stamp.second != 0)
offRecs = aggLoad[off_half_hour]
print('Records not exactly on the half-hour: ', offRecs)
print(offRecs.info())

# Remove them from the aggregate series.
aggLoad = aggLoad.drop(offRecs.index)

# Re-run the same check to confirm nothing off-grid is left.
stamp = aggLoad['DateTime'].dt
offRecs = aggLoad[~stamp.minute.isin([0, 30]) | (stamp.second != 0)]
print('Records not exactly on the half-hour: ', offRecs)

In [None]:
# check for missing records in the aggregate load time series
# Build the complete half-hourly grid between the first and last observation
# and report which grid timestamps are absent from aggLoad.
minTimestamp = aggLoad.index.min()
maxTimestamp = aggLoad.index.max()

print('minTimestamp: ', minTimestamp)
print('maxTimestamp: ', maxTimestamp)

date_range = pd.date_range(minTimestamp, maxTimestamp, freq='30Min')
# Only the index of the reference frame is needed, so it carries no columns
# (no random placeholder values, which would differ on every run).
reference_df = pd.DataFrame(index=date_range)

print('reference index length: ', len(reference_df.index))
print('aggLoad index length: ', len(aggLoad.index))

print('reference index: ', reference_df.index)
print('aggLoad index: ', aggLoad.index)

# check for missing datetimeindex values based on reference index (with all values)
missing_dates = reference_df.index[~reference_df.index.isin(aggLoad.index)]

print('missing_dates: ', missing_dates)

In [None]:
# check the regularity of the observations (time between observations)
# print(pd.infer_freq(train_data.DateTime))
aggLoad.index.to_series().diff().value_counts()

In [None]:
# Calculate moving average and stddev over a window of one tenth of the series
window_size = int(len(aggLoad.AggregateLoad) / 10)
print(window_size)

rolling_load = aggLoad.AggregateLoad.rolling(window_size)

# The rolling results are Series, so they are named with .rename();
# assigning to `.columns` on a Series does not rename anything.
aggLoadMovingStdev = rolling_load.std().rename('MovingStdev')
aggLoadMovingAvg = rolling_load.mean().rename('MovingAvg')

# .info() prints its report itself and returns None, so it is not wrapped in print().
print('aggLoadMovingStdev:\n', aggLoadMovingStdev)
aggLoadMovingStdev.info()
print('aggLoadMovingAvg:\n', aggLoadMovingAvg)
aggLoadMovingAvg.info()

# aggLoad['MovingStdev'] = aggLoad.AggregateLoad.rolling(window_size).std()
# aggLoad['MovingAvg'] = aggLoad.AggregateLoad.rolling(window_size).mean()

# print('aggLoad.MovingStdev:\n', aggLoad.MovingStdev)
# print('aggLoad.MovingAvg:\n', aggLoad.MovingAvg)



In [None]:
print(aggLoad)

# Aggregate load for the whole period with its moving average and moving
# standard deviation on top.
fig, ax = plt.subplots(figsize=(20,7))
ax.plot(aggLoad.DateTime, aggLoad.AggregateLoad, label='Aggregate load')
ax.plot(aggLoad.DateTime, aggLoadMovingAvg, linewidth=3, label='Moving average')
ax.plot(aggLoad.DateTime, aggLoadMovingStdev, linewidth=3, label='Moving std dev')

ax.set(xlabel='time (s)', ylabel='KWH/hh',
       title='Aggregate Household load 2012-2014')
plt.tick_params(rotation=45)
ax.grid()
# Three lines share the axes, so label them.
ax.legend()

# Saved under the figure's title: several other cells write "test.png" and
# would overwrite this figure.
fig.savefig("Aggregate Household load 2012-2014.png")
plt.show()

In [None]:
aggLoad.head()


In [None]:
# Zoom: rows 10000-14999 of the aggregate load.
zoom = aggLoad.iloc[10000:15000]

fig, ax = plt.subplots(figsize=(20,7))
ax.plot(zoom.DateTime, zoom.AggregateLoad)
ax.set(title='Aggregate Household load June-August 2012',
       xlabel='time (s)', ylabel='KWH/hh')
ax.grid()
plt.tick_params(rotation=45)

fig.savefig("Aggregate Household load June-August 2012.png")
plt.show()

In [None]:
# Zoom: rows 12000-12999 of the aggregate load.
fig, ax = plt.subplots(figsize=(20,7))
ax.plot(aggLoad.DateTime[12000:13000], aggLoad.AggregateLoad[12000:13000])

ax.set(xlabel='time (s)', ylabel='KWH/hh',
       title='Aggregate Household load')
plt.tick_params(rotation=45)
ax.grid()

# Dedicated file name: other cells also write "test.png", so only the last
# figure written would survive.
fig.savefig("Aggregate Household load rows 12000-13000.png")
plt.show()

In [None]:
# Zoom: rows 12500-12599 of the aggregate load (about two days of half-hours).
fig, ax = plt.subplots(figsize=(20,7))
ax.plot(aggLoad.DateTime[12500:12600], aggLoad.AggregateLoad[12500:12600])

ax.set(xlabel='time (s)', ylabel='KWH/hh',
       title='Aggregate Household load ~two days')
plt.tick_params(rotation=45)
ax.grid()

# Dedicated file name: other cells also write "test.png", so only the last
# figure written would survive.
fig.savefig("Aggregate Household load two days.png")
plt.show()

In [None]:
# Zoom: rows 12500-12549 of the aggregate load (about one day of half-hours).
fig, ax = plt.subplots()
ax.plot(aggLoad.DateTime[12500:12550], aggLoad.AggregateLoad[12500:12550])

ax.set(xlabel='time (s)', ylabel='KWH/hh',
       title='Aggregate Household load (one day)')
plt.tick_params(rotation=45)
ax.grid()

# Dedicated file name: other cells also write "test.png", so only the last
# figure written would survive.
fig.savefig("Aggregate Household load one day.png")
plt.show()

In [None]:
def prediction_plot(testY, test_predict):
    """Plot actual values and predictions against their time-step position.

    Args:
        testY: sequence of actual values; its length defines the x axis.
        test_predict: predicted values, drawn in red over the actuals.

    Draws on a new 20x5 figure and shows it; returns nothing.
    """
    steps = list(range(len(testY)))

    plt.figure(figsize=(20,5))
    plt.plot(steps, testY, marker='.', label="actual")
    plt.plot(steps, test_predict, 'r', label="prediction")

    plt.tight_layout()
    sns.despine(top=True)
    plt.subplots_adjust(left=0.07)
    plt.xlabel('Time step', size=15)
    plt.ylabel('KWH per half hour', size=15)
    plt.legend(fontsize=15)
    plt.show()

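# Use a naive persistence model as a baseline against which to compare more sophisticated models
Use a one-week persistence: the forecast for each half hour is the load observed at the same time one week earlier.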

Georgios Tziolis, Chrysovalantis Spanias, Maria Theodoride, Spyros Theocharides, Javier Lopez-Lorente, Andreas Livera, George Makrides, George E. Georghiou,

Short-term electric net load forecasting for solar-integrated distribution systems based on Bayesian neural networks and statistical post-processing,

Energy,
Volume 271,
2023,
127018,
ISSN 0360-5442,

https://doi.org/10.1016/j.energy.2023.127018.

In [None]:
# Naive 1 week persistence model: the forecast for each half hour is the load
# observed exactly one week (48 half-hours * 7 days) earlier.
OneWeekNPeriods = 48 * 7

# shift() moves the values down by OneWeekNPeriods rows and leaves NaN in the
# first week, where no earlier observation exists. It replaces a Python loop
# that indexed the timestamp-indexed Series with plain integers.
NaiveForecast = aggLoad.AggregateLoad.shift(OneWeekNPeriods)
    


In [None]:
# visualize naive forecast
prediction_plot(aggLoad.AggregateLoad, NaiveForecast)

In [None]:
# calculate error for naive model
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# The first week has no forecast (NaN), so it is excluded from the comparison.
naive_actual = aggLoad.AggregateLoad[OneWeekNPeriods:]
naive_predicted = NaiveForecast[OneWeekNPeriods:]

# scikit-learn returns MAPE as a fraction; scale by 100 so the figure printed
# under "Percentage" really is a percentage.
print('Naive Root Mean Squared Error(RMSE): %.2f; Naive Mean Absolute Error(MAE) : %.2f; Naive Mean Absolute Percentage Error(MAPE) : %.2f%% '
      % (np.sqrt(mean_squared_error(naive_actual, naive_predicted)),
         mean_absolute_error(naive_actual, naive_predicted),
         100 * mean_absolute_percentage_error(naive_actual, naive_predicted)))


In [None]:
# ADF Test function
import statsmodels.tsa.stattools as smt
def adf_test(series):
    """Run statsmodels' `adfuller` on `series` and print the key figures.

    Missing values are dropped before the test. Prints the first two elements
    of the result as the ADF statistic and the p-value, and returns the
    complete result object unchanged.
    """
    result = smt.adfuller(series.dropna())
    adf_statistic, p_value = result[0], result[1]
    print('ADF Statistic: %f' % adf_statistic)
    print('p-value: %f' % p_value)
    return result

In [None]:
adf_test(aggLoad.AggregateLoad)

In [None]:
import statsmodels.tsa.seasonal as sts

# Seasonal decomposition of the aggregate load with a period of 48 half-hours,
# followed by the decomposition's built-in plot.
components = sts.seasonal_decompose(aggLoad.AggregateLoad, period=48) # 48 = one day
components.plot()
plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

# Autocorrelation of the aggregate load for lags up to 96 half-hours (two days).
# plot_acf returns the Figure it draws; binding it and ending the cell with
# plt.show() keeps the notebook from rendering the same figure a second time
# as the cell's output value.
acf_fig = plot_acf(aggLoad.AggregateLoad, lags = 96)
plt.show()


In [None]:
! pip install pmdarima

find best parameters for ARIMA
import pmdarima as pm
model = pm.auto_arima(aggLoad.AggregateLoad, start_p=1, start_q=1,
                      test='adf',       
use adftest to find optimal 'd'
                      max_p=3, max_q=3, 
maximum p=3 and q=3
                      m=17532,              
periodicity of 48 months as the data timeline is in h
                      d=None,           
let the model determine 'd'
                      seasonal=True,   # Seasonality
                      start_P=0, 
                      D=1, 
                      trace=True,
                      error_action='ignore',  
                      suppress_warnings=True, 
                      stepwise=True)
print(model.summary())

In [None]:
# print(model.summary())

In [None]:
# Chronological split of the aggregate load: first 70% for training, next 20%
# for testing, the remainder for validation.
n_rows = len(aggLoad)
train_size = int(n_rows * 0.7)
test_size = int(n_rows * 0.2)
val_size = n_rows - train_size - test_size
test_end = train_size + test_size

train_data = aggLoad.iloc[:train_size]
test_data = aggLoad.iloc[train_size:test_end]
val_data = aggLoad.iloc[test_end:]

print('train_data.head()', train_data.head())
print('test_data.head()', test_data.head())
print('val_data.head()', val_data.head())
print(train_data.info())

Create an ARIMA algorithm baseline to assess other models against 

# Create a Deep Learning time series forecasting model using Keras

In [None]:
import tensorflow as tf
from tensorflow import keras

def convert2matrix(data_arr, look_back):
    """Cut a sequence into sliding windows and their next-step targets.

    For every start position s in range(len(data_arr) - look_back), the window
    is data_arr[s:s + look_back] and the target is data_arr[s + look_back].

    Args:
        data_arr: indexable sequence (e.g. a NumPy array) of observations.
        look_back: number of consecutive observations per window.

    Returns:
        Tuple (X, Y) of NumPy arrays: the stacked windows and the matching
        targets. Both are empty when data_arr has no more than look_back items.
    """
    starts = range(len(data_arr) - look_back)
    windows = [data_arr[s:s + look_back] for s in starts]
    targets = [data_arr[s + look_back] for s in starts]
    return np.array(windows), np.array(targets)

In [None]:
# RNN
# work with a copy of the dataset
df1 = aggLoad.copy()
df1 = df1.drop(columns=['DateTime'])
print(df1.head())

# prepare the data
look_back = 96 # create window size
assert train_size >= look_back, "training split is shorter than one window"
load_values = df1['AggregateLoad'].to_numpy()
train_values = load_values[:train_size]

# Train: nothing precedes the first observation, so the series is left-padded
# with `look_back` copies of it. Every target is then a real observation and
# trainY[i] is the load at row i (trainY keeps length train_size). Padding at
# the end instead would make the last `look_back` targets copies of the final
# value.
train = np.concatenate([np.repeat(train_values[:1], look_back), train_values])
# Test: the first windows are fed with the last `look_back` training
# observations, so testY[i] is the load at row train_size + i (testY keeps
# length test_size).
test = load_values[train_size - look_back:train_size + test_size]

trainX,trainY =convert2matrix(train,look_back)
testX,testY =convert2matrix(test,look_back)
# reshape input to [samples, 1, look_back]: each window is passed to the
# network as a single step carrying look_back values
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))


print('trainX:\n', trainX.shape, trainX.dtype)


In [None]:
# define the RNN model architecture
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from keras.callbacks import EarlyStopping
def model_rnn(look_back):
    """Build and compile the simple RNN forecaster.

    Args:
        look_back: number of values per input window; inputs have shape
            (1, look_back).

    Returns:
        A compiled Sequential model: SimpleRNN(8, relu) -> Dense(4, relu) ->
        Dense(1), trained on mean squared error with Adam and reporting
        'mse' and 'mae'.
    """
    layers = [
        SimpleRNN(units=8, input_shape=(1,look_back), activation="relu"),
        Dense(4, activation='relu'),
        Dense(1),
    ]
    model = Sequential(layers)
    model.compile(loss='mean_squared_error',  optimizer='adam',metrics = ['mse', 'mae'])
    return model

In [None]:
# Train the RNN; training stops early once val_loss has not improved for 10 epochs.
model = model_rnn(look_back)

early_stopping = EarlyStopping(monitor='val_loss', patience=10)
history = model.fit(
    trainX, trainY,
    epochs=100,
    batch_size=30,
    verbose=1,
    validation_data=(testX, testY),
    callbacks=[early_stopping],
    shuffle=False,
)

In [None]:
# function for plotting the train and test loss curves
def model_loss(history):
    """Plot the training and validation loss curves of a fitted model.

    Args:
        history: object whose `.history` mapping holds the per-epoch 'loss'
            and 'val_loss' values (as returned by `model.fit`).
    """
    curves = (('loss', 'Train Loss'), ('val_loss', 'Test Loss'))

    plt.figure(figsize=(8,4))
    for key, label in curves:
        plt.plot(history.history[key], label=label)
    plt.title('model loss')
    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.legend(loc='upper right')
    plt.show()

In [None]:
# Predict on both splits with the RNN and report RMSE / MAE for each.
from sklearn.metrics import mean_squared_error, mean_absolute_error

train_predict = model.predict(trainX)
test_predict = model.predict(testX)

train_rmse = np.sqrt(mean_squared_error(trainY, train_predict))
train_mae = mean_absolute_error(trainY, train_predict[:,0])
test_rmse = np.sqrt(mean_squared_error(testY, test_predict[:,0]))
test_mae = mean_absolute_error(testY, test_predict[:,0])

print('Train Root Mean Squared Error(RMSE): %.2f; Train Mean Absolute Error(MAE) : %.2f '
      % (train_rmse, train_mae))
print('Test Root Mean Squared Error(RMSE): %.2f; Test Mean Absolute Error(MAE) : %.2f ' 
      % (test_rmse, test_mae))

# loss curves of the training run
model_loss(history)

In [None]:
# plot actuals and predictions for the whole test set
# print('testY:\n', testY.shape, testY)
# print('test_predict:\n', test_predict.shape, test_predict)
prediction_plot(testY, test_predict)



In [None]:
# plot actuals and RNN predictions for the first day of the test set
prediction_plot(testY[0:48], test_predict[0:48])


In [None]:
# LSTM
# work with a copy of the dataset
df1 = aggLoad.copy()
df1 = df1.drop(columns=['DateTime'])
print(df1.head())

# prepare the data
look_back = 48 # create window size
assert train_size >= look_back, "training split is shorter than one window"
load_values = df1['AggregateLoad'].to_numpy()
train_values = load_values[:train_size]

# Train: nothing precedes the first observation, so the series is left-padded
# with `look_back` copies of it. Every target is then a real observation and
# trainY[i] is the load at row i (trainY keeps length train_size). Padding at
# the end instead would make the last `look_back` targets copies of the final
# value.
train = np.concatenate([np.repeat(train_values[:1], look_back), train_values])
# Test: the first windows are fed with the last `look_back` training
# observations, so testY[i] is the load at row train_size + i (testY keeps
# length test_size).
test = load_values[train_size - look_back:train_size + test_size]

trainX,trainY =convert2matrix(train,look_back)
testX,testY =convert2matrix(test,look_back)
# reshape input to [samples, 1, look_back]: each window is passed to the
# network as a single step carrying look_back values
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))


print('trainX:\n', trainX.shape, trainX.dtype)

In [None]:
# LSTM
import keras
from keras.layers import LSTM

# Recurrent layer fed with inputs of shape (1, look_back)
lstm_layer = LSTM(units=128, input_shape=(1,look_back))

# Stack: LSTM -> Dense(8) -> Dropout(0.2) -> Dense(1)
lstm_model = keras.Sequential([
    lstm_layer,
    keras.layers.Dense(8),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1),
])

# Same loss, optimizer and metrics as the RNN model
lstm_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse', 'mae'])



In [None]:
# Train the LSTM; training stops early once val_loss has not improved for 10 epochs.
print('Fitting LSTM model...\n')
lstm_early_stopping = EarlyStopping(monitor='val_loss', patience=10)
history = lstm_model.fit(
    trainX, trainY,
    epochs=100,
    batch_size=30,
    verbose=1,
    validation_data=(testX, testY),
    callbacks=[lstm_early_stopping],
    shuffle=False,
)

# Predict on both splits and report RMSE / MAE for each.
print('Predicting train and test data using LSTM model...\n')

lstm_train_predict = lstm_model.predict(trainX)
lstm_test_predict = lstm_model.predict(testX)

lstm_train_rmse = np.sqrt(mean_squared_error(trainY, lstm_train_predict))
lstm_train_mae = mean_absolute_error(trainY, lstm_train_predict[:,0])
lstm_test_rmse = np.sqrt(mean_squared_error(testY, lstm_test_predict[:,0]))
lstm_test_mae = mean_absolute_error(testY, lstm_test_predict[:,0])

print('Train Root Mean Squared Error(RMSE): %.2f; Train Mean Absolute Error(MAE) : %.2f '
      % (lstm_train_rmse, lstm_train_mae))
print('Test Root Mean Squared Error(RMSE): %.2f; Test Mean Absolute Error(MAE) : %.2f ' 
      % (lstm_test_rmse, lstm_test_mae))

# loss curves of the training run
model_loss(history)

In [None]:
# CNN-LSTM

# Summary of errors for naive and RNN


In [None]:
# Recap of the naive-baseline and RNN error metrics.
# NOTE(review): trainY/testY were last assigned by the LSTM data-preparation
# cell (look_back = 48), while train_predict/test_predict come from the RNN
# (look_back = 96) - confirm the targets line up with the RNN predictions
# before relying on the RNN figures below.

# The first week has no naive forecast (NaN), so it is excluded.
naive_actual = aggLoad.AggregateLoad[OneWeekNPeriods:]
naive_predicted = NaiveForecast[OneWeekNPeriods:]

# scikit-learn returns MAPE as a fraction; scale by 100 so the figure printed
# under "Percentage" really is a percentage.
print('Naive Root Mean Squared Error(RMSE): %.2f; Naive Mean Absolute Error(MAE) : %.2f; Naive Mean Absolute Percentage Error(MAPE) : %.2f%% '
      % (np.sqrt(mean_squared_error(naive_actual, naive_predicted)),
         mean_absolute_error(naive_actual, naive_predicted),
         100 * mean_absolute_percentage_error(naive_actual, naive_predicted)))
print('RNN Train Root Mean Squared Error(RMSE): %.2f; Train Mean Absolute Error(MAE) : %.2f '
      % (np.sqrt(mean_squared_error(trainY, train_predict)), mean_absolute_error(trainY, train_predict[:,0])))
print('RNN Test Root Mean Squared Error(RMSE): %.2f; Test Mean Absolute Error(MAE) : %.2f ' 
      % (np.sqrt(mean_squared_error(testY, test_predict[:,0])), mean_absolute_error(testY, test_predict[:,0])))

# to-do: create nice graphic for this

# Plot of naive, RNN and actuals

In [None]:
      # Actuals, RNN predictions and the naive forecast over the whole test window.
      # NOTE(review): assumes testY[i], test_predict[i] and row train_size + i of
      # NaiveForecast all refer to the same half hour - confirm against the windowing cells.
      len_prediction = list(range(len(testY)))
      naive_window = NaiveForecast[train_size:train_size+test_size]

      plt.figure(figsize=(20,5))
      plt.plot(len_prediction, testY, marker='.', label="actual")
      plt.plot(len_prediction, test_predict, 'r', label="RNN prediction")
      plt.plot(len_prediction, naive_window, 'g', label="Naive prediction")

      plt.tight_layout()
      sns.despine(top=True)
      plt.subplots_adjust(left=0.07)
      plt.xlabel('Time step', size=15)
      plt.ylabel('KWH per half hour', size=15)
      plt.legend(fontsize=15)
      plt.show()

In [None]:
      # Actuals, RNN predictions and the naive forecast for the first 48 steps of the test window.
      # NOTE(review): assumes testY[i], test_predict[i] and row train_size + i of
      # NaiveForecast all refer to the same half hour - confirm against the windowing cells.
      first_day_actual = testY[0:48]
      first_day_rnn = test_predict[0:48]
      first_day_naive = NaiveForecast[train_size:train_size+48]
      len_prediction = list(range(len(first_day_actual)))

      plt.figure(figsize=(20,5))
      plt.plot(len_prediction, first_day_actual, marker='.', label="actual")
      plt.plot(len_prediction, first_day_rnn, 'r', label="RNN prediction")
      plt.plot(len_prediction, first_day_naive, 'g', label="Naive prediction")

      plt.tight_layout()
      sns.despine(top=True)
      plt.subplots_adjust(left=0.07)
      plt.xlabel('Time step', size=15)
      plt.ylabel('KWH per half hour', size=15)
      plt.legend(fontsize=15)
      plt.show()

Use AutoGluon AutoML with London dataset

In [None]:
# install AutoGluon AutoML
!pip install autogluon
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [None]:
# AutoGluon specific data preparation
# Each split is copied ("ag_" prefix stands for AutoGluon) and given the
# item id column AutoGluon requires; all rows use the single id 'LoadSum',
# stored as str.
ag_train_data, ag_test_data, ag_val_data = (
    split.copy().assign(item_id='LoadSum').astype({"item_id": str})
    for split in (train_data, test_data, val_data)
)

In [None]:
# take a quick look at the split datasets
print('ag_train_data\n', ag_train_data)
print('ag_test_data\n', ag_test_data)
print('ag_val_data\n', ag_val_data)

In [None]:
# load training data in to required AutoGluon proprietary data frame
# ("_tsdf" suffix = time series data frame); "item_id" identifies the series
# and the "DateTime" column supplies the timestamps.
# .info() prints its report itself and returns None, so this also prints "None".
print(ag_train_data.info())
ag_train_data_tsdf = TimeSeriesDataFrame.from_data_frame(
    ag_train_data,
    id_column="item_id",
    timestamp_column="DateTime"
)
ag_train_data_tsdf

In [None]:
# load test data in to required AutoGluon proprietary data frame, "_tsdf" suffix = time series data frame
# "item_id" identifies the series and the "DateTime" column supplies the timestamps.
ag_test_data_tsdf = TimeSeriesDataFrame.from_data_frame(
    ag_test_data,
    id_column="item_id",
    timestamp_column="DateTime"
)
ag_test_data_tsdf

In [None]:
# at "high_quality" level, training takes about 45 minutes...
# training takes about 15 minutes for DeepAR
# training takes about 21 minutes for TemporalFusionTransformer
# training takes about 4 minutes for PatchTST
# training takes about 4 minutes for PatchTST

In [None]:
# Configure the AutoGluon forecaster for the "AggregateLoad" column.
# prediction_length=48 half-hourly steps corresponds to the next 24 hours;
# models are scored with MASE and saved under the given path.
ag_predictor = TimeSeriesPredictor(
    prediction_length=48,
    path="autogluon-london-half-hourly",
    target="AggregateLoad",
    eval_metric="MASE",
)

# Train on the training split only.
# NOTE(review): time_limit is presumably in seconds (6000 s = 100 min) - confirm.
ag_predictor.fit(
    ag_train_data_tsdf,
    presets="medium_quality",
    time_limit=6000,
)

In [None]:
# The test score is computed using the last
# prediction_length=48 timesteps of each time series in test_data
ag_predictor.leaderboard(ag_test_data_tsdf, silent=True)

In [None]:
# generate predictions
ag_predictions = ag_predictor.predict(ag_train_data_tsdf)
ag_predictions.head()

In [None]:
# Forecast versus reality for the single 'LoadSum' series: the last 100
# training values, the mean forecast with its 0.1-0.9 quantile band, and the
# first 48 values of the test split.
item_id = "LoadSum"
y_past = ag_train_data_tsdf.loc[item_id]["AggregateLoad"]
y_pred = ag_predictions.loc[item_id]
y_test = ag_test_data_tsdf.loc[item_id]["AggregateLoad"]

plt.figure(figsize=(20, 3))
plt.plot(y_past.iloc[-100:], label="Past time series values")
plt.plot(y_pred["mean"], label="Mean forecast")
plt.plot(y_test.iloc[:48], label="Future time series values")

lower_band, upper_band = y_pred["0.1"], y_pred["0.9"]
plt.fill_between(
    y_pred.index, lower_band, upper_band, color="red", alpha=0.1, label="10%-90% confidence interval"
)
plt.legend();