In [54]:
import os

import pandas as pd
import numpy as np
import torch

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

import torch.nn as nn
import torch.nn.functional as F

# RNN from scratch

In [55]:
data = pd.read_csv("../data/clean_weather.csv", index_col=0)
data = data.ffill()
data.head()

Unnamed: 0,tmax,tmin,rain,tmax_tomorrow
1970-01-01,60.0,35.0,0.0,52.0
1970-01-02,52.0,39.0,0.0,52.0
1970-01-03,52.0,35.0,0.0,53.0
1970-01-04,53.0,36.0,0.0,52.0
1970-01-05,52.0,35.0,0.0,50.0


In [56]:
np.random.seed(0)

class rnn(nn.Module):
    def __init__(self):
        super().__init__()
        self.weight_input = np.random.rand(1, 5) / 5 - 0.1

        self.weight_hidden = np.random.rand(5, 5) / 5 - 0.1
        self.bias_hidden = np.random.rand(1, 5) / 5 - 0.1

        self.weight_output = np.random.rand(5, 1) * 50
        self.bias_output = np.random.rand(1, 1)
        self.hidden_step = None

    def forward(self, x):
        x_output = []
        hiddens = np.zeros((10, 5))

        for i in range(len(x)):
            # Reshapping input
            x_input = x[i].reshape(1, 1)
        
            # Input Layer
            x_input = x_input @ self.weight_input

            # Hidden Layer
            if i != 0:
                x_hidden = np.tanh(x_input + (self.hidden_step @ self.weight_hidden))
            else:
                x_hidden = np.tanh(x_input)

            # Output Layer
            output = (x_hidden @ self.weight_output).item()

            # Saving hidden states and output values
            self.hidden_step = x_hidden
            hiddens[i,:] = x_hidden
            x_output.append(output)

        return x_output, hiddens

In [57]:
rnn_model_for_weather = rnn()

In [58]:
train = data.head(10).tmax.values
print(f"\033[95m↓ train ↓\033[0m\n{train}")

prediction, hiddens = rnn_model_for_weather(train)
print(f"\033[95m↓ prediction ↓\033[0m\n{prediction}")
print(f"\033[95m↓ hiddens ↓\033[0m\n{hiddens}")

[95m↓ train ↓[0m
[60. 52. 52. 53. 52. 50. 52. 56. 54. 57.]
[95m↓ prediction ↓[0m
[71.0742190277846, 73.91953306750138, 73.97074609609383, 74.50992244242337, 74.04836420420375, 72.93682904415054, 73.89303982479758, 75.98268476297383, 75.20618121079521, 76.5658351071108]
[95m↓ hiddens ↓[0m
[[ 0.52684078  0.98863364  0.84349363  0.49192612 -0.72406738]
 [ 0.4913896   0.97746488  0.80455755  0.58573775 -0.69308416]
 [ 0.49997043  0.9777974   0.80332419  0.58507687 -0.69762274]
 [ 0.50739162  0.9796219   0.81056156  0.59140405 -0.70547389]
 [ 0.50018991  0.97781757  0.8035146   0.58656475 -0.69779462]
 [ 0.48542095  0.97370097  0.78833187  0.5736915  -0.6817688 ]
 [ 0.50005263  0.977801    0.80314708  0.58349426 -0.69762897]
 [ 0.52871827  0.98422052  0.83068071  0.60857924 -0.72769806]
 [ 0.51487162  0.98130927  0.81792654  0.60100623 -0.71329606]
 [ 0.53618209  0.98552925  0.83714233  0.61631894 -0.73508814]]


In [59]:
def mse(actual, predicted):
    return np.mean((actual-predicted)**2)

# The gradient of mse wrt the network outputs, which is the variable 'predicted'
# It's actually -2/n * (actual-predicted), but we can drop the coefficient 2/n.
def mse_grad(actual, predicted):
    return (predicted - actual)

In [60]:
loss_grad = mse_grad(train, prediction)
print(f"\033[95m↓ loss ↓\033[0m\n{loss_grad}")

[95m↓ loss ↓[0m
[11.07421903 21.91953307 21.9707461  21.50992244 22.0483642  22.93682904
 21.89303982 19.98268476 21.20618121 19.56583511]


## Calculate Gradient of the last sequence

We have 10 input variables. To calculate the gradient of the parameters, we first need to determine the value of the final input variable.

In [61]:
# Output_weight_grad
# This is the gradient of loss wrt output weight
# Output = X_hidden @ W_output + B_ouput
# output_weight_grad = loss_grad @  (X_hidden @ W_output + B_output)/derivative wrt W_output =  loss_grad @ X_hidden
# We have X_hiddens in the array 'hiddens'
output_weight_grad = hiddens[9][:,np.newaxis] @ loss_grad[9].reshape(1,1)
print(f"\033[95m↓ output_weight_grad ↓\033[0m\n{output_weight_grad}")

[95m↓ output_weight_grad ↓[0m
[[ 10.49085036]
 [ 19.28270284]
 [ 16.37938883]
 [ 12.05879483]
 [-14.38261327]]


In [62]:
# This is the gradient of loss wrt output bias
# Output bias gradient is just summation of all loss
# Output = X_hidden @ W_output + B_ouput
# Output bias gradient = loss_grad @ (X_hidden @ W_output + B_output)/derivative wrt B_output =  loss_grad @ 1
output_bias_grad = loss_grad[9].reshape(1,1)
print(f"\033[95m↓ output_bias_grad ↓\033[0m\n{output_bias_grad}")


[95m↓ output_bias_grad ↓[0m
[[19.56583511]]


In [63]:
# hidden_output_gradient
# Output = X_hidden @ W_output + B_ouput
# hidden_output_gradient = loss_grad @  (X_hidden @ W_output + B_output)/derivative wrt X_hidden =  loss_grad @ W_output
hidden_output_gradient9 = loss_grad[9].reshape(1,1) @ rnn_model_for_weather.weight_output.T
print(f"\033[95m↓ hidden_output_gradient ↓\033[0m\n{hidden_output_gradient9}")

[95m↓ hidden_output_gradient ↓[0m
[[604.2277146  598.80819903 603.54144274 923.26096434 667.01917725]]


In [64]:
# activation_grad

activation_grad = 1 - hiddens[9][np.newaxis, :] ** 2
print(f"\033[95m↓ activation_grad ↓\033[0m\n{activation_grad}")
hidden_output_gradient9 = np.multiply(hidden_output_gradient9, activation_grad)
print(f"\033[95m↓ hidden_output_gradient ↓\033[0m\n{hidden_output_gradient9}")

[95m↓ activation_grad ↓[0m
[[0.71250877 0.02873209 0.29919272 0.62015096 0.45964543]]
[95m↓ hidden_output_gradient ↓[0m
[[430.51754363  17.20501289 180.57520357 572.5611729  306.59231761]]


In [65]:

hidden_weight_gradient = hidden_output_gradient9 @ hiddens[9][:, np.newaxis]
print(f"\033[95m↓ hidden_weight_gradient ↓\033[0m\n{hidden_weight_gradient}")

[95m↓ hidden_weight_gradient ↓[0m
[[526.46690868]]


In [66]:
hidden_bias_gradient = np.mean(hidden_output_gradient9)
print(f"\033[95m↓ hidden_bias_gradient ↓\033[0m\n{hidden_bias_gradient}")

[95m↓ hidden_bias_gradient ↓[0m
301.4902501193734


In [67]:
input_weight_gradient = train[9].reshape(1,1) @ hidden_output_gradient9
print(f"\033[95m↓ input_weight_gradient ↓\033[0m\n{input_weight_gradient}")

[95m↓ input_weight_gradient ↓[0m
[[24539.49998718   980.68573447 10292.78660353 32635.98685506
  17475.76210378]]


### Now we are going to calculate sequence 8's gradient

In [68]:
loss8_grad = loss_grad[8].reshape(1,1)
print(f"\033[95m↓ loss8_grad ↓\033[0m\n{loss8_grad}")

[95m↓ loss8_grad ↓[0m
[[21.20618121]]


In [69]:

print(f"\033[95m↓ loss8_grad.shape ↓\033[0m\n{loss8_grad.shape}")
print(f"\033[95m↓ hiddens[8].reshape(-1, 1).shape ↓\033[0m\n{hiddens[8].reshape(-1, 1).T.shape}")
print(f"\033[95m↓ output_weight_grad ↓\033[0m\n{output_weight_grad}")
output_weight_grad += hiddens[8].reshape(-1, 1) @ loss8_grad


[95m↓ loss8_grad.shape ↓[0m
(1, 1)
[95m↓ hiddens[8].reshape(-1, 1).shape ↓[0m
(1, 5)
[95m↓ output_weight_grad ↓[0m
[[ 10.49085036]
 [ 19.28270284]
 [ 16.37938883]
 [ 12.05879483]
 [-14.38261327]]


In [70]:
output_bias_grad += loss8_grad
print(f"\033[95m↓ output_bias_grad ↓\033[0m\n{output_bias_grad}")

[95m↓ output_bias_grad ↓[0m
[[40.77201632]]


In [71]:
# Gradient rwt hidden output 
hidden_output_gradient8 = loss8_grad @ rnn_model_for_weather.weight_output.T

# Add the previous hidden output gradient9
# Multiply hidden_output_gradient9 by the weight to pull it back to the current sequence position
hidden_output_gradient8 += hidden_output_gradient9 @ rnn_model_for_weather.weight_hidden.T

tanh_derivative = 1 - hiddens[8].reshape(1,-5) **2
hidden_output_gradient8 = np.multiply(tanh_derivative, hidden_output_gradient8)
print(f"\033[95m↓ hidden_output_gradient8 ↓\033[0m\n{hidden_output_gradient8}")


[95m↓ hidden_output_gradient8 ↓[0m
[[534.51897159  25.88984607 226.23379412 670.87851134 366.97666118]]


In [72]:
print(f"\033[95m↓ hidden_weight_gradient BEFORE↓\033[0m\n{hidden_weight_gradient}")
hidden_weight_gradient +=  hidden_output_gradient8 @ hiddens[8][:, np.newaxis] 
print(f"\033[95m↓ hidden_weight_gradient ↓\033[0m\n{hidden_weight_gradient}")

[95m↓ hidden_weight_gradient ↓[0m
[[526.46690868]]
[95m↓ hidden_weight_gradient ↓[0m
[[1153.56328621]]


In [76]:
print(f"\033[95m↓ hidden_bias_gradient.shape BEFORE↓\033[0m\n{hidden_bias_gradient}")
hidden_bias_gradient += np.mean(hidden_output_gradient8)
print(f"\033[95m↓ hidden_bias_gradient.shape ↓\033[0m\n{hidden_bias_gradient}")

[95m↓ hidden_bias_gradient.shape ↓[0m
301.4902501193734
[95m↓ hidden_bias_gradient.shape ↓[0m
666.3898069787671


In [77]:
input_weight_gradient += train[8].reshape(1,1) @ hidden_output_gradient8
print(f"\033[95m↓ input_weight_gradient ↓\033[0m\n{input_weight_gradient}")

[95m↓ input_weight_gradient ↓[0m
[[53403.52445306  2378.73742202 22509.41148597 68863.42646755
  37292.50180745]]


### The final sequence gradient

In the final step, there's no need to accumulate gradients for the hidden weights and biases across timesteps. It's because, in the final step, hidden state does not affect any previous hidden state, thus we don't need to add hidden_weight_grad and hidden_bias_grad. 

In [86]:
loss0_grad = loss_grad[0].reshape(1,1)
print(f"\033[95m↓ loss0_grad ↓\033[0m\n{loss0_grad}")

[95m↓ loss0_grad ↓[0m
[[11.07421903]]


In [91]:
output_weight_grad += hiddens[0].reshape(-1, 1) @ loss0_grad
print(f"\033[95m↓ output_weight_grad ↓\033[0m\n{output_weight_grad}")

[95m↓ output_weight_grad ↓[0m
[[ 27.24366154]
 [ 51.04087046]
 [ 43.06552034]
 [ 30.25153944]
 [-37.52737952]]


In [None]:
output_bias_grad += np.mean(loss0_grad)

In [93]:
hidden_output_gradient0 = loss0_grad @ rnn_model_for_weather.weight_output.T
print(f"\033[95m↓ hidden_output_gradient0 ↓\033[0m\n{hidden_output_gradient0}")

[95m↓ hidden_output_gradient0 ↓[0m
[[341.9915387  338.92410497 341.60311035 522.56364643 377.53136649]]


## Make a class 