In [1]:
from __future__ import division

import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
#nltk.download('book')

# Upper bound on the vocabulary, counting the slot reserved for the
# unknown-word placeholder.
vocabulary_size = 8000

# Sentinel tokens: sentence boundaries and the stand-in for rare words.
start_token = "START_TOKEN"
end_token = "END_TOKEN"
unknown_token = "UNKNOWN_TOKEN"

### Split comments into tokenized_sentences

In [3]:
# Load the Reddit comment dump; only its "body" column is used below.
comments = pd.read_csv("data/reddit-comments-2015-08.csv")

# Lower-case every comment, split it into sentences, and flatten the
# per-comment sentence lists into one corpus-wide list.
sentences = list(
    itertools.chain.from_iterable(
        nltk.sent_tokenize(body.lower()) for body in comments["body"].values
    )
)
print(f"There are total {len(sentences)} sentences in this corpus")

# Wrap each sentence in the start/end sentinels.
sentences = [" ".join((start_token, text, end_token)) for text in sentences]
print(f'"{sentences[0]}"')

# Break every wrapped sentence into word-level tokens.
tokenized_sentences = list(map(nltk.word_tokenize, sentences))
print(tokenized_sentences[0])

There are total 79170 sentences in this corpus
"START_TOKEN i joined a new league this year and they have different scoring rules than i'm used to. END_TOKEN"
['START_TOKEN', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'END_TOKEN']


### Create vocabulary

In [4]:
# Count how often each token occurs across the whole corpus.
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))

# Size of the raw (untruncated) vocabulary.
print(f"Number of unique words is {len(word_freq)}")

# Keep the most frequent tokens, leaving one slot free so the final
# vocabulary still has room for one more entry.
vocab = word_freq.most_common(vocabulary_size - 1)

print(f"Using vocabulary size of {vocabulary_size}")
print(f'The least frequent word is "{vocab[-1][0]}" which appeared {vocab[-1][1]} times')

Number of unique words is 65408
Using vocabulary size of 8000
The least frequent word is "documentary" which appeared 10 times


### Replace all words not in vocabulary with UNKNOWN_TOKEN

In [5]:
# index -> token: the kept tokens in frequency order, with the
# unknown-word placeholder appended as the last entry.
index_to_word = [word for word, _count in vocab]
index_to_word.append(unknown_token)

# token -> index (reverse lookup of the list above).
word_to_index = {word: index for index, word in enumerate(index_to_word)}

# Replace every out-of-vocabulary token with the placeholder.
# Membership is checked against the dict (hash lookup) instead of the list,
# which would otherwise be scanned linearly for every token in the corpus.
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

### Create X train & Y train

In [6]:
# Inputs are each sentence without its last token; targets are the same
# sentence shifted left by one (i.e. without its first token).
# Sentences differ in length, so the nested lists are ragged: dtype=object is
# required for NumPy to store them as a 1-D array of lists (NumPy >= 1.24
# raises ValueError for ragged input otherwise).
X_train = np.asarray(
    [[word_to_index[word] for word in sent[:-1]] for sent in tokenized_sentences],
    dtype=object,
)
Y_train = np.asarray(
    [[word_to_index[word] for word in sent[1:]] for sent in tokenized_sentences],
    dtype=object,
)

In [7]:
# Show the second training pair twice each: as vocabulary indices and
# decoded back into tokens.
print(
    f"x : {X_train[1]}",
    f"x : {list(map(index_to_word.__getitem__, X_train[1]))}",
    f"y : {Y_train[1]}",
    f"y : {list(map(index_to_word.__getitem__, Y_train[1]))}",
    sep="\n",
)

x : [0, 11, 17, 7, 3094, 5974, 7999, 7999, 5974, 2]
x : ['START_TOKEN', 'it', "'s", 'a', 'slight', 'ppr', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'ppr', '.']
y : [11, 17, 7, 3094, 5974, 7999, 7999, 5974, 2, 1]
y : ['it', "'s", 'a', 'slight', 'ppr', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'ppr', '.', 'END_TOKEN']


### Define RNN class using pure Numpy

![](http://www.wildml.com/wp-content/uploads/2015/09/rnn.jpg)

Let's recap the equations for the RNN from the first part of the tutorial:

$$
\begin{aligned}
s_t &= \tanh(Ux_t + Ws_{t-1}) \\
o_t &= \mathrm{softmax}(Vs_t)
\end{aligned}
$$

I always find it useful to write down the dimensions of the matrices and vectors. Let's assume we pick a vocabulary size $C = 8000$ and a hidden layer size $H = 100$. You can think of the hidden layer size as the "memory" of our network. Making it bigger allows us to learn more complex patterns, but also results in additional computation. Then we have:

$$
\begin{aligned}
x_t & \in \mathbb{R}^{8000} \\
o_t & \in \mathbb{R}^{8000} \\
s_t & \in \mathbb{R}^{100} \\
U & \in \mathbb{R}^{100 \times 8000} \\
V & \in \mathbb{R}^{8000 \times 100} \\
W & \in \mathbb{R}^{100 \times 100}
\end{aligned}
$$

In [8]:
def softmax(zs):
    """Return the softmax of a score vector as a NumPy float array.

    Parameters
    ----------
    zs : array-like of numbers
        Unnormalised scores. Normalisation runs over all elements.

    Returns
    -------
    numpy.ndarray
        Non-negative values with the same shape as ``zs`` that sum to 1.
        An empty input yields an empty array.
    """
    scores = np.asarray(zs, dtype=float)
    if scores.size == 0:
        return scores
    # Subtracting the maximum does not change the result mathematically but
    # keeps np.exp from overflowing to inf (which would produce NaNs).
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / np.sum(exp_scores)

In [24]:
from  pdb import set_trace
class RNNNumpy:
    """Single-layer recurrent network for next-token prediction, in plain NumPy.

    The forward pass computes ``s_t = tanh(U[:, x_t] + W s_{t-1})`` and
    ``o_t = softmax(V s_t)``, where ``x_t`` is a token index.

    Attributes
    ----------
    U : ndarray, shape (n_hiddens, n_inputs)
        Input-to-hidden weights.
    W : ndarray, shape (n_hiddens, n_hiddens)
        Hidden-to-hidden weights.
    V : ndarray, shape (n_inputs, n_hiddens)
        Hidden-to-output weights.
    """

    def __init__(self, n_inputs, n_hiddens=100, bptt_truncate=4, seed=None):
        """Create the network with uniformly initialised weights.

        Parameters
        ----------
        n_inputs : int
            Number of distinct token indices (also the output size).
        n_hiddens : int
            Size of the hidden state.
        bptt_truncate : int
            Maximum number of steps gradients are propagated back in time.
        seed : int or None
            When given, weights are drawn from a private ``RandomState(seed)``
            so the initialisation is reproducible. ``None`` draws from NumPy's
            global generator.
        """
        self.n_inputs = n_inputs
        self.n_hiddens = n_hiddens
        self.bptt_truncate = bptt_truncate

        rng = np.random if seed is None else np.random.RandomState(seed)
        # Each matrix is drawn from +/- 1/sqrt(number of incoming connections).
        in_bound = np.sqrt(1.0 / n_inputs)
        hid_bound = np.sqrt(1.0 / n_hiddens)
        self.U = rng.uniform(-in_bound, in_bound, (n_hiddens, n_inputs))
        self.W = rng.uniform(-hid_bound, hid_bound, (n_hiddens, n_hiddens))
        self.V = rng.uniform(-hid_bound, hid_bound, (n_inputs, n_hiddens))

    def forward_propagate(self, sentent):
        """Run the network over one sentence of token indices.

        Returns ``[o, s]``: ``o`` has one row of output probabilities per time
        step, ``s`` has one hidden-state row per time step plus a final
        all-zero row.
        """
        n_steps = len(sentent)
        # The extra last row is never written, so s[-1] is the zero initial
        # state that step 0 reads through s[t - 1].
        s = np.zeros((n_steps + 1, self.n_hiddens))
        o = np.zeros((n_steps, self.n_inputs))

        for t in range(n_steps):
            # Selecting column sentent[t] of U equals U times a one-hot input.
            s[t] = np.tanh(self.U[:, sentent[t]] + self.W.dot(s[t - 1]))
            o[t] = softmax(self.V.dot(s[t]))
        return [o, s]

    def predict(self, x):
        """Return the highest-probability token index for every position of ``x``."""
        o, _ = self.forward_propagate(x)
        return np.argmax(o, axis=1)

    def loss(self, xs, ys):
        """Return the summed cross-entropy of the targets ``ys`` over all sentences."""
        total = 0
        for x, y in zip(xs[:len(xs)], ys):
            o, _ = self.forward_propagate(x)
            # Probability the model assigned to the correct token at each step.
            target_probs = o[np.arange(len(o)), y]
            total += -1 * np.sum(np.log(target_probs))
        return total

    def total_loss(self, xs, ys):
        """Return ``loss(xs, ys)`` divided by the total number of input tokens."""
        n_tokens = np.sum([len(x) for x in xs])
        return self.loss(xs, ys) / n_tokens

    def bptt(self, x, y):
        """Compute gradients for one sentence by truncated backprop through time.

        Returns ``[dLdU, dLdV, dLdW]``, each shaped like the matching weight
        matrix. The recurrent part looks back at most ``bptt_truncate`` steps
        from each output.
        """
        n_steps = len(y)
        o, s = self.forward_propagate(x)

        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)

        # Output error: predicted probabilities minus the one-hot target.
        # This reuses (and overwrites) the local array o.
        delta_o = o
        delta_o[np.arange(len(y)), y] -= 1.

        for t in reversed(range(n_steps)):
            dLdV += np.outer(delta_o[t], s[t].T)
            # Error arriving at the hidden state of step t.
            delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
            # Walk back from t to at most bptt_truncate steps earlier.
            earliest = max(0, t - self.bptt_truncate)
            for bptt_step in range(t, earliest - 1, -1):
                dLdW += np.outer(delta_t, s[bptt_step - 1])
                dLdU[:, x[bptt_step]] += delta_t
                # Push the error one step further back in time.
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step - 1] ** 2)
        return [dLdU, dLdV, dLdW]

    def update_weights(self, x, y, learning_rate):
        """Apply one gradient-descent step computed from the sentence ``(x, y)``."""
        dLdU, dLdV, dLdW = self.bptt(x, y)
        self.U -= learning_rate * dLdU
        self.V -= learning_rate * dLdV
        self.W -= learning_rate * dLdW

    def train_with_gd(self, X, y, learning_rate=0.005, n_epochs=100):
        """Train one sentence at a time, printing the loss every 10th epoch."""
        for epoch in range(n_epochs):
            if epoch % 10 == 0:
                loss = self.total_loss(X, y)
                print(f"Epoch : {epoch}, loss = {loss}")

            for i in range(len(y)):
                self.update_weights(X[i], y[i], learning_rate)

In [None]:
# The weights are drawn from NumPy's global RNG; seed it so the initial
# weights (and therefore the loss trace) are the same on every run.
np.random.seed(10)

# Size the network from the vocabulary constant rather than repeating 8000.
model = RNNNumpy(vocabulary_size)

# Short run on the first 20 sentences only.
model.train_with_gd(X_train[:20], Y_train[:20])

Epoch : 0, loss = 8.987186970698227


In [23]:
# Most likely next-token index at each position of the first training sentence.
model.predict(X_train[0])

AttributeError: 'RNNNumpy' object has no attribute 'predict'