# Understanding RNNs

# 1. Understanding the forward pass

The first part of this lab is based on Andrew Ng's introduction to Machine Learning course. We start by importing all necessary libraries.

In [None]:
import numpy as np
import torch.nn.utils.rnn as rnn_utils
from scipy.special import softmax
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import math
import matplotlib.pyplot as plt

The goal of this part is to understand the forward pass in an RNN. We will build a function that implements a single forward step of an RNN cell, where the input is made up of the input data and the hidden state output by the previous time step.

In [None]:
def rnn_cell_forward(xt, ht, parameters):
    """
    Implements a single forward step of an RNN-cell

    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    ht -- Hidden state output at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- dictionary containing:
                        Wx -- Weight matrix for the input, numpy array of shape (n_a, n_x)
                        Wh -- Weight matrix for the hidden state, numpy array of shape (n_a, n_a)
                        Wy -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        ba -- Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)

    m is the batch dimension: it lets several sequences be processed in
    parallel, one per column, during training or inference.

    Returns:
    h_next -- next hidden state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m);
               each column is a softmax distribution over the n_y outputs
    cache -- tuple of values needed for the backward pass, contains (h_next, ht, xt, parameters)
    """

    # Retrieve parameters from "parameters"
    Wx = parameters["Wx"]
    Wh = parameters["Wh"]
    Wy = parameters["Wy"]
    ba = parameters["ba"]
    by = parameters["by"]

    # Compute the next hidden state; ba has shape (n_a, 1) and broadcasts over the m columns
    h_next = np.tanh(np.dot(Wh, ht) + np.dot(Wx, xt) + ba)

    # Compute the output of the current cell.
    # The softmax must normalise over the output axis (axis 0) so that every
    # one of the m columns sums to 1. scipy's default (axis=None) would instead
    # normalise over the entire (n_y, m) array and mix the sequences together.
    yt_pred = softmax(np.dot(Wy, h_next) + by, axis=0)

    # Store values needed for backward propagation in cache
    cache = (h_next, ht, xt, parameters)

    return h_next, yt_pred, cache

Test the function by providing an input.

In [None]:
# Initialise the inputs and parameters randomly to test a single forward step.
# The seed is fixed so the printed values are reproducible; the arrays are
# drawn in the same order as they are listed (xt, ht, Wh, Wx, Wy, ba, by).
np.random.seed(1)
xt, ht = np.random.randn(3, 10), np.random.randn(5, 10)
Wh, Wx, Wy = np.random.randn(5, 5), np.random.randn(5, 3), np.random.randn(2, 5)
ba, by = np.random.randn(5, 1), np.random.randn(2, 1)
parameters = dict(Wh=Wh, Wx=Wx, Wy=Wy, ba=ba, by=by)

In [None]:
# Run one time step of the RNN cell on the random inputs and inspect
# the last hidden unit, the second output row and the result shapes.
h_next, yt_pred, cache = rnn_cell_forward(xt=xt, ht=ht, parameters=parameters)

print("h_next[4] = ", h_next[4])
print("h_next.shape = ", h_next.shape)

print("yt_pred[1] =", yt_pred[1])
print("yt_pred.shape = ", yt_pred.shape)

The RNN uses this forward pass repeatedly for the entirety of the sequence of inputs, starting from the first in the sequence at timestep 1 and continuing through the sequence.

We will create a vector *h* to store all hidden states (that are provided as input to the next time-step), a vector *y* to store all predictions and a vector *caches* containing the list of caches.

In [None]:
def rnn_forward(x, h0, parameters):
    """
    Run the RNN cell over every time-step of the input, feeding each hidden
    state into the next step.

    Arguments:
    x -- Input data for every time-step, of shape (n_x, m, T_x).
    h0 -- Initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wh -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wx -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Wy -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        ba -- Bias numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)

    Returns:
    h -- Hidden states for every time-step, numpy array of shape (n_a, m, T_x)
    y_pred -- Predictions for every time-step, numpy array of shape (n_y, m, T_x)
    caches -- tuple of values needed for the backward pass, contains (list of per-step caches, x)
    """
    # Read the dimensions off the input and the output weight matrix "Wy"
    _, m, T_x = x.shape
    n_y, n_a = parameters["Wy"].shape

    # Pre-allocate the stacked hidden states and predictions
    hidden_states = np.zeros((n_a, m, T_x))
    predictions = np.zeros((n_y, m, T_x))
    step_caches = []

    # The hidden state carried from one step to the next starts at h0
    h_t = h0
    for t in range(T_x):
        # One cell step on the t-th slice of the input
        h_t, y_t, step_cache = rnn_cell_forward(x[..., t], h_t, parameters)
        hidden_states[..., t] = h_t
        predictions[..., t] = y_t
        step_caches.append(step_cache)

    # Bundle the per-step caches with the input for backward propagation
    return hidden_states, predictions, (step_caches, x)

Test a forward pass of the model by providing an input.

In [None]:
# Random input of 4 time steps plus random parameters, with a fixed seed so
# the printed values are reproducible. Arrays are drawn in the order listed.
np.random.seed(1)
x, h0 = np.random.randn(3, 10, 4), np.random.randn(5, 10)
Wh, Wx, Wy = np.random.randn(5, 5), np.random.randn(5, 3), np.random.randn(2, 5)
ba, by = np.random.randn(5, 1), np.random.randn(2, 1)
parameters = dict(Wh=Wh, Wx=Wx, Wy=Wy, ba=ba, by=by)

# Full forward pass over all time steps
h, y_pred, caches = rnn_forward(x=x, h0=h0, parameters=parameters)

print("h[4][1] = ", h[4][1])
print("h.shape = ", h.shape)

print("y_pred[1][3] =", y_pred[1][3])
print("y_pred.shape = ", y_pred.shape)

# caches is the pair (list of per-step caches, x), so caches[1] is the input x
print("caches[1][1][3] =", caches[1][1][3])
print("len(caches) = ", len(caches))

Optional extension: create a function to calculate the gradient after a forward pass. Then, you can create a function to train your from-scratch RNN.

---



# 2. RNN training

This part of the lab is based on the tutorial at https://machinelearningmastery.com/understanding-simple-recurrent-neural-networks-in-keras/ . Now we will use a library to build our RNN.

In [None]:
def create_RNN(hidden_units, dense_units, input_shape, activation):
    """
    Build and compile a two-layer model: a SimpleRNN layer followed by a Dense layer.

    Arguments:
    hidden_units -- number of units in the SimpleRNN layer
    dense_units -- number of units in the Dense output layer
    input_shape -- input shape handed to the SimpleRNN layer
    activation -- indexable pair of activations: activation[0] is used by the
                  SimpleRNN layer, activation[1] by the Dense layer

    Returns:
    model -- Sequential model compiled with mean-squared-error loss and the adam optimizer
    """
    recurrent_layer = SimpleRNN(
        hidden_units,
        input_shape=input_shape,
        activation=activation[0],
    )
    output_layer = Dense(units=dense_units, activation=activation[1])

    model = Sequential([recurrent_layer, output_layer])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Small demonstration model: 2 recurrent units, 1 output unit, inputs of
# 3 time steps with 1 feature each, linear activations in both layers.
demo_model = create_RNN(
    hidden_units=2,
    dense_units=1,
    input_shape=(3, 1),
    activation=['linear', 'linear'],
)

We will use a dataset containing the Monthly Mean Total Sunspot Number, from 1749/01/01 to 2017/08/31. This is a sequential dataset, containing the evolution of the number of sunspots in time.

We load the dataset, split it into test and train and scale it.

In [None]:
def get_train_test(url, split_percent=0.8):
    """
    Load a single-column series from a CSV file, scale it to [0, 1] and split
    it in order into a training part and a test part.

    Arguments:
    url -- path or URL of the CSV file; only the column at index 1 is read
    split_percent -- ratio of training examples, a number in [0, 1]; the first
                     int(n*split_percent) values form the training set

    Returns:
    train_data -- 1-D array with the first int(n*split_percent) scaled values
    test_data -- 1-D array with the remaining scaled values
    data -- 1-D array with the full scaled series (train_data and test_data
            are slices of this array)

    Raises:
    ValueError -- if split_percent is outside [0, 1]
    """
    # Fail early: with slicing, an out-of-range ratio would otherwise be
    # silently accepted and produce a wrong split.
    if not 0 <= split_percent <= 1:
        raise ValueError(
            "split_percent must be between 0 and 1, got %r" % (split_percent,))

    df = read_csv(url, usecols=[1], engine='python')
    data = df.values.astype('float32')

    # NOTE: the scaler is fit on the whole series, i.e. on the training and
    # test values together, before the split is made.
    scaler = MinMaxScaler(feature_range=(0, 1))
    data = scaler.fit_transform(data).flatten()
    n = len(data)

    # Point for splitting data into train and test
    split = int(n*split_percent)
    train_data = data[:split]
    test_data = data[split:]
    return train_data, test_data, data

# Download the sunspot series and split it with the default training ratio
sunspots_url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/monthly-sunspots.csv'
train_data, test_data, data = get_train_test(url=sunspots_url)

The next step is to prepare the data for Keras model training. The input array should be shaped as: total_samples x time_steps x features.

Timesteps indicates the number of timesteps that we will consider in each training sequence. In order to get these sequences for training, we’ll create input rows with non-overlapping time steps.

In [None]:
def get_XY(dat, time_steps):
    """
    Prepare the input X and target Y by cutting a series into non-overlapping windows.

    Window i holds dat[i*time_steps : (i+1)*time_steps] and its target is the
    value that immediately follows it, dat[(i+1)*time_steps]. Trailing values
    that do not form a complete window plus target are dropped.

    Arguments:
    dat -- the series, a numpy array or any array-like (e.g. a list)
    time_steps -- number of consecutive values in each input window

    Returns:
    X -- input windows, numpy array of shape (rows, time_steps, 1)
    Y -- one target per window, numpy array of length rows
    """
    # Accept lists and other array-likes, not only numpy arrays
    dat = np.asarray(dat)

    # Indices of target array: every time_steps-th value, starting after the first window
    Y_ind = np.arange(time_steps, len(dat), time_steps)
    Y = dat[Y_ind]

    # Prepare X: the leading rows_x complete windows. The copy keeps X
    # independent of dat, as it was with the original index-based selection.
    rows_x = len(Y)
    X = dat[:time_steps*rows_x].reshape(rows_x, time_steps, 1).copy()
    return X, Y

# Each training sequence holds 12 consecutive values of the series
time_steps = 12

# Build the windows and targets for the training and the test split
trainX, trainY = get_XY(dat=train_data, time_steps=time_steps)
testX, testY = get_XY(dat=test_data, time_steps=time_steps)

We can now train.

In [None]:
# Build the network: 3 recurrent units, 1 output unit, tanh in both layers
model = create_RNN(
    hidden_units=3,
    dense_units=1,
    input_shape=(time_steps, 1),
    activation=['tanh', 'tanh'],
)

# Train for 20 epochs, updating the weights after every single sequence
model.fit(x=trainX, y=trainY, batch_size=1, epochs=20, verbose=2)

Note that for every weight update (here one per training sequence, since batch_size=1), 12 derivatives (one for each time step) are added to form the updating gradient.

In [None]:
def print_error(trainY, testY, train_predict, test_predict):
    """
    Print the root-mean-squared error (RMSE) of the train and test predictions.

    Arguments:
    trainY -- true targets of the training set
    testY -- true targets of the test set
    train_predict -- model predictions for the training set
    test_predict -- model predictions for the test set
    """
    def rmse(actual, predicted):
        # RMSE is the square root of the mean squared error
        return math.sqrt(mean_squared_error(actual, predicted))

    # Compute both errors first, then report them
    train_rmse, test_rmse = rmse(trainY, train_predict), rmse(testY, test_predict)

    print('Train RMSE: %.3f ' % (train_rmse))
    print('Test RMSE: %.3f ' % (test_rmse))

# Predict on both splits with the trained model
train_predict = model.predict(x=trainX)
test_predict = model.predict(x=testX)

# Report the root-mean-squared error on the training and test sets
print_error(
    trainY=trainY,
    testY=testY,
    train_predict=train_predict,
    test_predict=test_predict,
)

Exercise: To obtain the prediction of sunspots, reverse the scaling, for example with the inverse_transform function. Check your predictions against the dataset. You can also make predictions across many years, using your own predictions as inputs. You can compare the predictions with an updated list of sunspots here: https://www.sidc.be/SILSO/datafiles . For what length of time does the model maintain accuracy?