In [None]:
# Recurrent Neural Network

Let's try throwing a neural network at the demand forecasting problem.  We'll give the network the day of the week, time of day, day of the year, and temperature as inputs.  This first version uses a single recurrent cell, with a linear layer at the end.  This could be enhanced by making deeper networks at both the beginning and end, or by using a fancier cell (LSTM, GRU).

This is the old messy version (which at least worked!)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from util.get_weather_data import convert_isd_to_df, convert_state_isd
from util.EBA_util import remove_na, avg_extremes

%matplotlib inline
%load_ext autoreload
%autoreload 2

import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
from tensorflow.contrib.rnn import BasicRNNCell,LSTMCell

In [None]:
#Extend to multiple temperature series
# Load the cached joint demand/temperature frame; rebuild it from the raw
# weather and EBA files only when the cache file does not exist.
try:
    df_joint=pd.read_csv('data/pdx_joint.txt',
        index_col=0, parse_dates=True)
    print('Read in PDX Frame from file')
except FileNotFoundError:
    # Only a missing cache triggers the rebuild.  A bare `except:` would also
    # swallow parse errors, typos and KeyboardInterrupt and silently rebuild.
    print('Creating PDX DataFrame from scratch')
    air_df = pd.read_csv('data/air_code_df.gz')
    #Just get the weather station data for cities in Oregon.
    df_weather=convert_state_isd(air_df,'OR')
    #Select temperature for only Portland
    #msk1=np.array(df_weather['city']=='Portland')
    #select temp for all Oregon stations
    msk2=np.array(df_weather['state']=='OR')
    df_pdx_weath=df_weather.loc[msk2]
    #find the unique station city/state combinations (and how many there are)
    unique_station=df_pdx_weath['city, state'].unique()
    Nstation = len(unique_station)

    #reshape the single temperature column into one column per station.
    temp_df=pd.DataFrame()
    for station in unique_station:
        colname=str('Temp-'+station)
        temp_df[colname]=df_pdx_weath.loc[df_pdx_weath['city, state']==station,'Temp']

    #get electricity data for Portland General Electric
    df_eba=pd.read_csv('data/EBA_time.gz',index_col=0,parse_dates=True)
    msk=df_eba.columns.str.contains('Portland')
    df_pdx=df_eba.loc[:,msk]
    #select out demand data
    msk1 = df_pdx.columns.str.contains('[Dd]emand') 
    dem=df_pdx.loc[:,msk1]
    #Make a combined Portland Dataframe for demand vs weather.
    df_joint=pd.DataFrame(dem)
    df_joint=df_joint.join(temp_df)
    # NOTE(review): assumes the first two matched columns are actual demand
    # and the demand forecast, in that order - confirm against the EBA file.
    df_joint = df_joint.rename(columns={df_joint.columns[0]:'Demand',
             df_joint.columns[1]:'Forecast'})
    df_joint.to_csv('data/pdx_joint.txt')

#Make copies of data from dataframe to avoid overwriting source data.
dem=df_joint['Demand'].copy()
temp=df_joint.loc[:,df_joint.columns.str.contains('Temp')].copy()
fore=df_joint['Forecast'].copy()


In [None]:
#clean up data, remove NA
#remove NA values, and average extreme values down
# NOTE(review): assumes remove_na / avg_extremes return the cleaned series
# (same index as their input) rather than only mutating in place - confirm.

# Temperature is a DataFrame: clean each station column and write it back.
for i in range(temp.shape[1]):
    temp.iloc[:,i] = avg_extremes(remove_na(temp.iloc[:,i]))

# Demand is a single Series.  The result must be bound back to `dem`
# explicitly: assigning to a loop variable only rebinds that local name and
# leaves `dem` itself uncleaned.
dem = avg_extremes(remove_na(dem))

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
def make_temptime_data(temp_mat):
    """make_temptime_data
    Takes input temperature data (for one or more locations) and extends it
    with extra columns for time of day, day of week and day of year.

    Input: temp_mat - pandas DataFrame (one column per location) or Series
           (single location) of temperatures, indexed by a DatetimeIndex.
    Output: out_mat - numpy array of shape (Nt, Nlocations+3).  The columns are
           the raw (unscaled) temperatures, followed by hour of day, day of
           week and day of year, each scaled to [0,1].
    """
    Tind = temp_mat.index
    # Accept a single Series by treating it as a one-column matrix.
    temps = np.asarray(temp_mat.values)
    if temps.ndim == 1:
        temps = temps.reshape(-1, 1)
    hr = Tind.hour.values/(24-1)
    dweek = Tind.dayofweek.values/(7-1)
    #scale day of year to [0,1].  dayofyear is 1-based, so shift it to start
    #at zero; the last day of the year (365, or 366 in a leap year) maps to 1.
    days_in_year = 365+np.asarray(Tind.is_leap_year).astype(int)
    dyear = (Tind.dayofyear.values-1)/(days_in_year-1)
    out_mat = np.column_stack([temps,hr,dweek,dyear])
    return out_mat

In [None]:
# ##OBSOLETE - REMOVING!

# def scale_demand(dem):
#     """scale_demand
#     Scale demand to be on 0,1 scale.
#     Input: demand - series at single location
#     Output: dem_scale - scaled array of values.
#             dem_max, dem_min - the maximum and minimum values.
#     """
#     dem_scale = dem.values
#     dem_max = np.max(dem_scale)
#     dem_min = np.min(dem_scale)
#     dem_scale = (dem_scale-dem_min)/(dem_max-dem_min)
#     return dem_scale, dem_max,dem_min

# #drop data prior to 
# temp_mat,tmax,tmin=make_temptime_data(temp[:Ntest])
# dem_mat,dmax,dmin=scale_demand(dem)

# temp_train = temp_mat[0:Ntest,:]
# temp_test = temp_mat[Ntest:,:]
# dem_train = dem_mat[0:Ntest]
# dem_test = dem_mat[Ntest:]

In [None]:
# Assemble the network input matrix: station temperatures followed by the
# time-of-day / day-of-week / day-of-year columns.
temp_mat = make_temptime_data(temp_mat=temp)

In [None]:
#Use Sklearn MinMaxScaler to scale all data between 0,1.
#Only fit the scaling on training data.

Nt=len(dem)
# Despite the name, Ntest is the number of leading samples used for TRAINING;
# the remaining samples form the test set.
Ntest = Nt//2
assert temp_mat.shape[0] == Nt, "temperature and demand series differ in length"

Tscaler=MinMaxScaler()
Dscaler=MinMaxScaler()
Tscaler.fit(temp_mat[:Ntest])
Dscaler.fit(dem[:Ntest].values.reshape((Ntest,1)))

# Apply the fitted scalers.  Fitting alone changes nothing, and the cells
# below read `temp_mat`, `dem_mat`, `temp_train` and `dem_train`, so those
# names have to be defined here for a fresh-kernel run to work.
# `temp_mat` is rebound to its scaled version because the later cells feed
# `temp_mat` itself to the network.
temp_mat = Tscaler.transform(temp_mat)
dem_mat = Dscaler.transform(dem.values.reshape((Nt,1))).reshape(-1)

temp_train, temp_test = temp_mat[:Ntest], temp_mat[Ntest:]
dem_train, dem_test = dem_mat[:Ntest], dem_mat[Ntest:]

# Training-set demand range, used later to map network output back to
# physical units.
dmin = Dscaler.data_min_[0]
dmax = Dscaler.data_max_[0]

In [None]:
def get_random_batch(X,y,n_batch,seq_len):
    """get_random_batch(X,y,n_batch,seq_len)
    Draws n_batch random contiguous windows of length seq_len from the data.
    The same window is taken from X and from y, so inputs and targets stay
    aligned.  Start positions come from numpy's global random state.

    X - matrix of inputs, (Nt, Ninputs)
    y - vector of desired outputs (Nt)
    n_batch - number of windows to draw
    seq_len - length of sequence to extract in each window

    Outputs:
    X_batch - random subset of inputs shape (Nbatch,seq_len,Ninputs) 
    y_batch - corresponding subset of outputs (Nbatch,seq_len,1)

    Raises ValueError if n_batch < 1, if seq_len is not in 1..Nt, or if X and
    y have different lengths.
    """
    Nt = X.shape[0]
    # Fail early with a clear message instead of an obscure stack/reshape
    # error (or silently misaligned inputs and targets) further down.
    if len(y) != Nt:
        raise ValueError("X and y must have the same length, got {} and {}".format(Nt,len(y)))
    if n_batch < 1:
        raise ValueError("n_batch must be at least 1, got {}".format(n_batch))
    if seq_len < 1 or seq_len > Nt:
        raise ValueError("seq_len must be between 1 and {}, got {}".format(Nt,seq_len))
    x_list=[]
    y_list=[]
    for _ in range(n_batch):
        # One scalar draw per window, kept in this exact form so that seeded
        # runs reproduce the same batches.  Note the last two valid start
        # positions (Nt-seq_len-1 and Nt-seq_len) are never drawn.
        n0=int(np.random.random()*(Nt-seq_len-1))
        x_list.append(X[n0:n0+seq_len])
        y_list.append(y[n0:n0+seq_len])
    x_batch=np.stack(x_list,axis=0)
    y_batch=np.stack(y_list,axis=0)
    # add a trailing output axis so y matches the network's output shape
    y_batch=y_batch.reshape([n_batch,seq_len,-1])
    return x_batch,y_batch

# Smoke test of the sampler: 1000 windows of 24 steps from the full record.
Xb, yb = get_random_batch(temp_mat, dem_mat, n_batch=1000, seq_len=24)


In [None]:
# Network hyper-parameters.
n_steps = 24                   # time steps in each training sequence
n_inputs = temp.shape[1] + 3   # one per temperature column, plus the 3 time columns
n_neurons = 120                # hidden units in each recurrent layer
n_layers = 3                   # number of stacked recurrent layers
n_outputs = 1                  # number of stations to predict at that time.
lr = 0.01                      # learning rate passed to the optimizer
np.random.seed(3453)

In [None]:
def make_RNN_cell(n_neurons,fn=tf.nn.relu):
    """Return a new BasicRNNCell with n_neurons units and activation fn."""
    return BasicRNNCell(num_units=n_neurons, activation=fn)

In [None]:
#Initial test with code liberally borrowed from ch14 of Geron's 
#"Practical Machine Learning with scikit-learn and Tensorflow"

#Builds a stack of n_layers basic RNN cells, followed by a fully connected
#output layer (with no activation on the output).

print('setting up graphs:Multi-layer RNN')
tf.reset_default_graph()
#inputs:  Nobs, with n_steps, and n_inputs per step
X = tf.placeholder(tf.float32,[None,n_steps,n_inputs],name='X')
#Outputs: n_outputs we want to predict in the future.
y = tf.placeholder(tf.float32,[None,n_steps,n_outputs],name='y')

#define neural network shape
#Make a list of independent cells to pass along.
#Note that using [cell]*n_layers did not work since that copies the memory location, rather than making
#a number of independent copies.
cell_list=[make_RNN_cell(n_neurons,tf.nn.relu) for _ in range(n_layers)]

multi_cell=tf.contrib.rnn.MultiRNNCell(cell_list,state_is_tuple=True)
rnn_outputs,states=tf.nn.dynamic_rnn(multi_cell,X,dtype=tf.float32)
#this maps the number of hidden units to fewer outputs:
#flatten (batch, step) together, apply one shared linear layer, then unflatten.
stacked_rnn_outputs = tf.reshape(rnn_outputs,[-1,n_neurons])
stacked_outputs = fully_connected(stacked_rnn_outputs,n_outputs,activation_fn=None)
outputs=tf.reshape(stacked_outputs,[-1,n_steps,n_outputs])

#define loss (mean-square-error)
loss = tf.reduce_mean(tf.square(outputs-y))
#define optimization function.
optimizer=tf.train.AdamOptimizer(learning_rate=lr)
training_op=optimizer.minimize(loss)
init=tf.global_variables_initializer()

saver = tf.train.Saver()
#Add everything by name to a collection so it can be looked up again after
#the graph is restored from the saved meta-graph.
tf.add_to_collection('X',X)
tf.add_to_collection('y',y)
tf.add_to_collection('loss',loss)
tf.add_to_collection('pred',outputs)
tf.add_to_collection('train',training_op)

#training settings
print('Loading data')
n_iter=1000
n_batch=100
run_network=True

if run_network:
    print('Running this thang')
    with tf.Session() as sess:
        init.run()
        for iteration in range(n_iter):
            #select random starting point. 
            X_batch,y_batch=get_random_batch(
                            temp_train, dem_train, n_batch, n_steps)

            sess.run(training_op, feed_dict={X: X_batch, y:y_batch})
            if iteration%50 ==0:
                #MSE on the batch that was just trained on (not a held-out set)
                mse =loss.eval(feed_dict={X:X_batch,y:y_batch})
                print("MSE on batch ",iteration,':\t',mse)
                #save model
                saver.save(sess, "./models/pdx_RNN_model",
                           write_meta_graph=True)
        #Save once more after the last step.  The periodic checkpoint above
        #is last written at iteration n_iter-50 (for n_iter a multiple of 50),
        #so without this the final updates would never reach disk.
        saver.save(sess, "./models/pdx_RNN_model",
                   write_meta_graph=True)


In [None]:
So multiple tanh layers perform badly.  A couple of ReLU layers seem to work well, but do lead to negative predictions.  Note that, when making comparisons, the early 2015 data is pretty flaky (e.g. the forecasts are zero, and I had to fix multiple issues in the demand data).

In [None]:
def model_predict_whole(Xin,path_str="pdx_RNN_model"):
    """model_predict_whole(Xin,path_str)
    Retrieve the outputs of the saved network for all rows of the inputs.

    Xin - numpy array of inputs, (Nt, Ninputs)
    path_str - name of the saved model under ./models/

    Returns ytot, an (Nt,1) array of network outputs.  The input is cut into
    consecutive windows of n_steps rows (n_steps is read from the notebook's
    global scope).  Any leftover rows at the end are taken from one extra
    window covering the last n_steps rows.  If Nt < n_steps the result is all
    zeros.
    """
    Nt,Nin=Xin.shape
    nmax = Nt//n_steps
    n_full = nmax*n_steps
    ytot = np.zeros((Nt,1))
    #reset graph, and reload saved graph structure
    tf.reset_default_graph()
    model_path = "./models/"+path_str    
    saver = tf.train.import_meta_graph(model_path+".meta")
    #Only the input placeholder and the prediction tensor are needed here.
    #(The training op is stored under 'train'; looking up 'train_op' returns
    #an empty list and indexing it raises IndexError.)
    X=tf.get_collection('X')[0]
    outputs=tf.get_collection('pred')[0]

    with tf.Session() as sess:
        #restore variables
        saver.restore(sess,model_path)
        if nmax > 0:
            #run every full window as one batch; windows are independent
            x_win = Xin[:n_full].reshape(nmax,n_steps,Nin)
            y_win = sess.run(outputs,feed_dict={X:x_win})
            ytot[:n_full] = y_win.reshape(n_full,1)
            n_left = Nt-n_full
            if n_left > 0:
                #cover the remainder with a window ending at the last row,
                #keeping only the rows not already predicted above
                x_tail = Xin[Nt-n_steps:].reshape(1,n_steps,Nin)
                y_tail = sess.run(outputs,feed_dict={X:x_tail}).reshape(n_steps,1)
                ytot[n_full:] = y_tail[n_steps-n_left:]
    return ytot

In [None]:
def plot_whole_sample_fit(X,y,ntest,n_steps,path_str="pdx_RNN_model"):
    """plot_whole_sample_fit

    Plot the predictions of the saved model over the whole input record,
    together with the first input column and the target series.

    X - matrix of network inputs, (Nt, Ninputs)
    y - target series aligned with X, (Nt)
    ntest - index splitting the record; rows before it are labelled
            'Training', rows from it onwards 'Test'
    n_steps - not used here (model_predict_whole reads the global n_steps);
            kept so existing calls still work
    path_str - name of the saved model under ./models/

    Returns ytot, the (Nt,1) array of predictions.
    """
    #pull in the inputs, and predictions
    Nt = X.shape[0]
    ytot=model_predict_whole(X,path_str)
    plt.figure()
    #first input column, coloured by training/test portion
    plt.plot(np.arange(0,ntest),X[:ntest,0],'b',label='Training')
    plt.plot(np.arange(ntest,Nt), X[ntest:,0],'g',label='Test')
    plt.plot(np.arange(Nt),ytot,'r',label='Predicted')
    #plot the targets that were passed in, not a notebook-level global
    plt.plot(np.arange(Nt),y,label='Real')
    plt.legend(loc='right')
    plt.show()
    return ytot

In [None]:
# Run the saved model over the whole record and plot it against the demand.
ytot = plot_whole_sample_fit(
    temp_mat, dem_mat, Ntest, n_steps, path_str='pdx_RNN_model')

In [None]:
#convert the RNN output to a pandas time-series
#Undo the demand scaling with the scaler that was fit on the training demand.
#This is the same linear map as (dmax-dmin)*ytot+dmin but does not depend on
#separately maintained dmax/dmin variables.
pred=pd.Series(Dscaler.inverse_transform(ytot).reshape(-1),index=dem.index)

In [None]:
def rmse(x,y):
    """Root-mean-square difference of x and y: sqrt(sum((x-y)^2)/len(x))."""
    resid = x-y
    mean_sq = np.sum(resid*resid)/len(x)
    return np.sqrt(mean_sq)

def mape(x,y):
    """Mean of |1 - x/y|: the average absolute error of x relative to y."""
    rel_err = 1-x/y
    return np.mean(np.abs(rel_err))

# Scatter of prediction against actual demand, from November 2015 onwards.
actual_late = dem['2015-11':]
pred_late = pred['2015-11':]
plt.plot(actual_late, pred_late, '.')
plt.xlabel('Actual Demand')
plt.ylabel('RNN Prediction')
plt.show()

In [None]:
# Index separating the first (training) half of the record from the second.
nt = len(ytot) // 2

# RMSE of the supplied forecast and of the RNN prediction against demand,
# on each half.
fore_train_rmse = rmse(fore[:nt], dem[:nt])
pred_train_rmse = rmse(pred[:nt], dem[:nt])
fore_test_rmse = rmse(fore[nt:], dem[nt:])
pred_test_rmse = rmse(pred[nt:], dem[nt:])

print("Forecast RMSE in training/test      : {}, {}".format(
    fore_train_rmse, fore_test_rmse))
print("RNN Prediction RMSE in training/test: {}, {}".format(
    pred_train_rmse, pred_test_rmse))

In [None]:
# Relative error of the supplied forecast against demand on each half.
fore_train_mape = mape(fore[:nt], dem[:nt])
fore_test_mape = mape(fore[nt:], dem[nt:])

# Persistence baseline: the value 24 steps earlier is used as the prediction.
pers_train_mape = mape(dem[:nt-24].values, dem[24:nt].values)
pers_test_mape = mape(dem[nt:-24].values, dem[nt+24:].values)

# Relative error of the RNN prediction against demand on each half.
pred_train_mape = mape(pred[:nt], dem[:nt])
pred_test_mape = mape(pred[nt:], dem[nt:])

print("Forecast MAPE in training/test      : {}, {}".format(
    fore_train_mape, fore_test_mape))
print("Persistence MAPE in training/test   : {}, {}".format(
    pers_train_mape, pers_test_mape))
print("RNN Prediction MAPE in training/test: {}, {}".format(
    pred_train_mape, pred_test_mape))

In [None]:
So this simple RNN does worse than the actual forecast, but does outperform persistence.  Well, that's at least something.
Obviously, this can be greatly improved.  The above is a simple toy model: one input station and one output series over the same set of times.
We can play with other architectures and activations, and use more data.

In [None]:
# Prediction, demand and forecast over the 2016-12-20 to 2017-01-02 window.
date_slice = slice('2016-12-20', '2017-01-02')
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(pred[date_slice], label='pred')
ax.plot(dem[date_slice], label='demand')
ax.plot(fore[date_slice], label='fore')
ax.legend(loc='right')
plt.show()

In [None]:
# Relative error of the RNN prediction and of the supplied forecast against
# demand, over the 2017-06-01 to 2017-08-01 window.
plt.figure(figsize=(10,6))
date_slice=slice('2017-06-01','2017-08-01')
# The ratio minus one is a fraction; multiply by 100 so the plotted values
# match the 'Percentage Error' axis label.
plt.plot(100*(pred[date_slice]/dem[date_slice]-1),label='pred err')
plt.plot(100*(fore[date_slice]/dem[date_slice]-1),label='fore err')
plt.ylabel('Percentage Error')
plt.legend(loc='right')
plt.show()

In [None]:
So looking at the percentage errors, this model (which currently lacks knowledge of holidays) is messing up on Thanksgiving.  Also, the model seems to make errors in the opposite direction to the forecast model.  It's probably worth checking the distribution of errors.  Eyeballing the curves shows that the errors are lowest early in the morning, and highest at midday.  The error signal probably has a significant daily frequency component.

Right now this is a 3-layer RNN.  We can extend it to include different cell types, fiddle with the network size, and maybe a different layout.
I'm going to retry this in a more modular approach (and for a more general set of code), with multiple inputs, differing sizes, dropout, more efficient loading.