## VANILLA RNN WITH ARBITRARY LAYERS

In [1]:
## implementation & testing --> v1

import torch
import torch.nn as nn
import numpy as np
from torch.nn import functional as F
torch.manual_seed(1337)

print('IMPORTS DONE')

IMPORTS DONE


In [2]:
## start with data
data = open('/Users/joesasson/Desktop/open-source/numpy-RNN/data/input.txt', 'r').read() # should be simple plain text file

chars = list(set(data))
data_size, vocab_size = len(data), len(chars)

print('data has {} characters, {} unique.'.format(data_size, vocab_size))

char_to_idx = { ch:i for i,ch in enumerate(chars) }
idx_to_char = { i:ch for i,ch in enumerate(chars) }

data has 1115394 characters, 65 unique.


In [38]:
from typing import Any
np.random.seed(99)

class RNN:
    def __init__(self, hidden_size, vocab_size, seq_length, num_layers):
        self.name = 'RNN'
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers

        # model parameters
        self.Wxh = [np.random.randn(hidden_size, vocab_size)*0.01 for _ in range(num_layers)] # input to hidden
        self.Whh = [np.random.randn(hidden_size, hidden_size)*0.01 for _ in range(num_layers)] # hidden to hidden
        self.Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
        self.bh = [np.zeros((hidden_size, 1)) for _ in range(num_layers)] # hidden bias
        self.by = np.zeros((vocab_size, 1)) # output bias

        # memory variables for training (ada grad from karpathy's github)
        self.iteration, self.pointer = 0, 0
        self.mWxh = [np.zeros_like(w) for w in self.Wxh]
        self.mWhh = [np.zeros_like(w) for w in self.Whh] 
        self.mWhy = np.zeros_like(self.Why)
        self.mbh, self.mby = [np.zeros_like(b) for b in self.bh], np.zeros_like(self.by)
        self.loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0

        self.running_loss = []

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        """RNN Forward Pass"""

        x, y, hprev = kwds['inputs'], kwds['targets'], kwds['hprev']

        loss = 0
        xs, hs, ys, ps = {}, {}, {}, {} # inputs, hidden state, output, probabilities
        hs[-1] = np.copy(hprev)

        # forward pass
        for t in range(len(x)):
            xs[t] = np.zeros((self.vocab_size,1)) # encode in 1-of-k representation
            xs[t][x[t]] = 1
            hs[t] = np.copy(hprev)

            for l in range(self.num_layers):
                hs[t][l] = np.tanh(np.dot(self.Wxh[l], xs[t]) + np.dot(self.Whh[l], hs[t-1][l]) + self.bh[l]) # hidden state
            
            ys[t] = np.dot(self.Why, hs[t][-1]) + self.by # unnormalized log probabilities for next chars
            ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
            loss += -np.log(ps[t][y[t],0]) # softmax (cross-entropy loss)

        self.running_loss.append(loss)

        return loss, hs[len(x)-1], {'xs':xs, 'hs':hs, 'ps':ps}

    def backward(self, targets, cache):
        """RNN Backward Pass"""

        xs, hs, ps = cache['xs'], cache['hs'], cache['ps']
        dWxh, dWhh, dWhy = [np.zeros_like(w) for w in self.Wxh], [np.zeros_like(w) for w in self.Whh], np.zeros_like(self.Why)
        dbh, dby = [np.zeros_like(b) for b in self.bh], np.zeros_like(self.by)
        dhnext = [np.zeros_like(h) for h in hs[0]]

        for t in reversed(range(len(xs))):

            dy = np.copy(ps[t])
            dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
            dWhy += np.dot(dy, hs[t][-1].T)
            dby += dy

            for l in reversed(range(self.num_layers)):
                dh = np.dot(self.Why.T, dy) + dhnext[l]
                dhraw = (1 - hs[t][l] * hs[t][l]) * dh # backprop through tanh nonlinearity
                dbh[l] += dhraw
                dWxh[l] += np.dot(dhraw, xs[t].T)
                dWhh[l] += np.dot(dhraw, hs[t-1][l].T)
                dhnext[l] = np.dot(self.Whh[l].T, dhraw)

        return {'dWxh':dWxh, 'dWhh':dWhh, 'dWhy':dWhy, 'dbh':dbh, 'dby':dby}

    def update(self, grads, lr):
        """Perform Parameter Update w/ Adagrad"""

        # unpack grads
        dWxh, dWhh, dWhy = grads['dWxh'], grads['dWhh'], grads['dWhy']
        dbh, dby = grads['dbh'], grads['dby']

        # loop through each layer
        for i in range(self.num_layers):
            # clip gradients to mitigate exploding gradients
            np.clip(dWxh[i], -5, 5, out=dWxh[i])
            np.clip(dWhh[i], -5, 5, out=dWhh[i])
            np.clip(dbh[i], -5, 5, out=dbh[i])

            # perform parameter update with Adagrad
            self.mWxh[i] += dWxh[i] * dWxh[i]
            self.Wxh[i] -= lr * dWxh[i] / np.sqrt(self.mWxh[i] + 1e-8)
            self.mWhh[i] += dWhh[i] * dWhh[i]
            self.Whh[i] -= lr * dWhh[i] / np.sqrt(self.mWhh[i] + 1e-8)
            self.mbh[i] += dbh[i] * dbh[i]
            self.bh[i] -= lr * dbh[i] / np.sqrt(self.mbh[i] + 1e-8)
        
        # clip gradients for Why and by
        np.clip(dWhy, -5, 5, out=dWhy)
        np.clip(dby, -5, 5, out=dby)

        # perform parameter update with Adagrad
        self.mWhy += dWhy * dWhy
        self.Why -= lr * dWhy / np.sqrt(self.mWhy + 1e-8)
        self.mby += dby * dby
        self.by -= lr * dby / np.sqrt(self.mby + 1e-8)


# Initialize RNN
num_layers = 3
hidden_size = 100
seq_length = 8

rnn = RNN(hidden_size=hidden_size, vocab_size=vocab_size, seq_length=seq_length, num_layers=num_layers)

x = [char_to_idx[ch] for ch in data[rnn.pointer:rnn.pointer+seq_length]]

y = [char_to_idx[ch] for ch in data[rnn.pointer+1:rnn.pointer+seq_length+1]]

# initialize hidden state with zeros
hprev = [np.zeros((hidden_size, 1)) for _ in range(num_layers)] 

## Call RNN
loss, hprev, cache = rnn(inputs=x, targets=y, hprev=hprev)
grads = rnn.backward(targets=y, cache=cache)
rnn.update(grads=grads, lr=1e-1)

In [44]:
# Initialize RNN
num_layers = 2
hidden_size = 128
seq_length = 8

rnn = RNN(hidden_size=hidden_size, vocab_size=vocab_size, seq_length=seq_length, num_layers=num_layers)

def train(rnn, epochs, data, lr=1e-1):

    for _ in range(epochs):

        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if rnn.pointer+seq_length+1 >= len(data) or rnn.iteration == 0: 
            hprev = [np.zeros((hidden_size, 1)) for _ in range(num_layers)]  # reset RNN memory
            rnn.pointer = 0 # go from start of data

        x = [char_to_idx[ch] for ch in data[rnn.pointer:rnn.pointer+seq_length]]
        y = [char_to_idx[ch] for ch in data[rnn.pointer+1:rnn.pointer+seq_length+1]]

        ## Call RNN
        loss, hprev, cache = rnn(inputs=x, targets=y, hprev=hprev)
        grads = rnn.backward(targets=y, cache=cache)
        rnn.update(grads=grads, lr=1e-1)

        # update loss
        rnn.loss = rnn.loss * 0.999 + loss * 0.001

        ## show progress now and then
        if rnn.iteration % 1000 == 0: 
            print('iter {}, loss: {}'.format(rnn.iteration, rnn.loss))

        rnn.pointer += seq_length # move data pointer
        rnn.iteration += 1 # iteration counter 

train(rnn=rnn, epochs=50000, data=data)

iter 0, loss: 33.39509760284121
iter 1000, loss: 31.39037657797319
iter 2000, loss: 28.553121730864426
iter 3000, loss: 27.046728077514302
iter 4000, loss: 25.76771024952786
iter 5000, loss: 25.2216656533259
iter 6000, loss: 24.408158911256773
iter 7000, loss: 23.640915823683265
iter 8000, loss: 22.896141777596206
iter 9000, loss: 22.282290000362327
iter 10000, loss: 21.745090366115605
iter 11000, loss: 21.162839060502293
iter 12000, loss: 21.014126612832126
iter 13000, loss: 20.82522160771053
iter 14000, loss: 20.81880043403413
iter 15000, loss: 20.61903499232407
iter 16000, loss: 20.42781923676823
iter 17000, loss: 20.39400362110716
iter 18000, loss: 20.311679733742412
iter 19000, loss: 20.152124084168616
iter 20000, loss: 20.641160180193395
iter 21000, loss: 20.55199233807862
iter 22000, loss: 20.418200024120882
iter 23000, loss: 20.345876883553704
iter 24000, loss: 20.15374894631003
iter 25000, loss: 19.911754722691615
iter 26000, loss: 19.909126867981186
iter 27000, loss: 20.11073