In [1]:
%matplotlib inline
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt

In [2]:
# Fetch the NLTK "book" collection of data packages. The call returns True when
# finished; packages that are already present are reported as up-to-date.
nltk.download("book") # do this only once

[nltk_data] Downloading collection u'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/harangju/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/harangju/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /Users/harangju/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/harangju/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /Users/harangju/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /Users/harangju/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_dat

True

In [3]:
# Size of the vocabulary: the (vocabulary_size - 1) most frequent words plus
# one slot for the unknown token.
vocabulary_size = 8000
# Placeholder that replaces every word that is not part of the vocabulary.
unknown_token = "UNKNOWN_TOKEN"
# Markers put before and after every sentence prior to tokenization.
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [4]:
print("Reading CSV file...")
with open('../data/reddit/reddit-comments-2015-08.csv', 'rb') as f:
    reader = csv.reader(f, skipinitialspace=True)
    # Discard the first row (presumably the column header)
    next(reader)
    sentences = []
    for row in reader:
        # Only the first column is used; decode it and lowercase the text
        comment = row[0].decode('utf-8').lower()
        # Split the full comment into sentences and wrap each one in
        # SENTENCE_START / SENTENCE_END
        for sentence in nltk.sent_tokenize(comment):
            sentences.append("%s %s %s" % (sentence_start_token, sentence, sentence_end_token))
print("Parsed %d sentences." % (len(sentences)))
print("Example sentence:")
print("\"%s\"" % sentences[0])

Reading CSV file...
Parsed 79170 sentences.
Example sentence:
"SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END"


In [5]:
# Break every sentence string into a list of word tokens
tokenized_sentences = list(map(nltk.word_tokenize, sentences))
print("Example sentence:")
print(tokenized_sentences[0])

Example sentence:
[u'SENTENCE_START', u'i', u'joined', u'a', u'new', u'league', u'this', u'year', u'and', u'they', u'have', u'different', u'scoring', u'rules', u'than', u'i', u"'m", u'used', u'to', u'.', u'SENTENCE_END']


In [6]:
# Frequency of every token across all tokenized sentences
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq))

Found 65751 unique words tokens.


In [7]:
# Keep the most frequent words (leaving one slot for the unknown token) and
# build the two lookup structures: index -> word and word -> index
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [word for word, _count in vocab] + [unknown_token]
word_to_index = {word: index for index, word in enumerate(index_to_word)}
print("Using vocabulary size %d." % vocabulary_size)
# vocab[-1] is a (word, count) pair, which matches the two format fields
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % vocab[-1])
print("Example vocab:")
print(vocab[3])
print("Example index_to_word[3]: %s" % index_to_word[3])
print("Example word_to_index[\"the\"]: %d" % word_to_index["the"])

Using vocabulary size 8000.
The least frequent word in our vocabulary is 'devoted' and appeared 10 times.
Example vocab:
(u'the', 52338)
Example index_to_word[3]: the
Example word_to_index["the"]: 3


In [8]:
# Swap every out-of-vocabulary word for the unknown token. The slice assignment
# updates the existing list object in place.
tokenized_sentences[:] = [
    [w if w in word_to_index else unknown_token for w in sent]
    for sent in tokenized_sentences
]

print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])


Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '[u'SENTENCE_START', u'i', u'joined', u'a', u'new', u'league', u'this', u'year', u'and', u'they', u'have', u'different', u'scoring', u'rules', u'than', u'i', u"'m", u'used', u'to', u'.', u'SENTENCE_END']'


In [9]:
# Create the training data: the input is every token but the last one, the
# target is the same sentence shifted left by one token.
X_words = [sent[:-1] for sent in tokenized_sentences]
y_words = [sent[1:] for sent in tokenized_sentences]
X_train = np.asarray([[word_to_index[w] for w in words] for words in X_words])
y_train = np.asarray([[word_to_index[w] for w in words] for words in y_words])

# Print a training data example
x_example, y_example = X_train[17], y_train[17]
print("x predicts y\n")
print("x:\n%s\n%s" % (" ".join(index_to_word[idx] for idx in x_example), x_example))
print("\ny:\n%s\n%s" % (" ".join(index_to_word[idx] for idx in y_example), y_example))

x predicts y

x:
SENTENCE_START what are n't you understanding about this ? !
[0, 51, 27, 16, 10, 856, 53, 25, 34, 69]

y:
what are n't you understanding about this ? ! SENTENCE_END
[51, 27, 16, 10, 856, 53, 25, 34, 69, 1]


#### Definitions

![](http://www.wildml.com/wp-content/uploads/2015/09/rnn.jpg)

$x_t$ = input at time step $t$  
$s_t$ = $\begin{aligned} \tanh(Ux_t + Ws_{t-1}) \end{aligned}$  = hidden state at time step $t$  
$o_t$ = $\begin{aligned} \mathrm{softmax}(Vs_t) \end{aligned}$ = output at step $t$  

$
\begin{aligned}
x_t & \in \mathbb{R}^{8000} \\
s_t & \in \mathbb{R}^{100} \\
o_t & \in \mathbb{R}^{8000} \\
U & \in \mathbb{R}^{100 \times 8000} \\
V & \in \mathbb{R}^{8000 \times 100} \\
W & \in \mathbb{R}^{100 \times 100} \\
\end{aligned}
$



In [10]:
class RNNNumpy:
    """Recurrent network whose parameters are plain NumPy arrays.

    Attributes set by the constructor:
      U -- (hidden_dim, word_dim) input-to-hidden weights
      W -- (hidden_dim, hidden_dim) hidden-to-hidden weights
      V -- (word_dim, hidden_dim) hidden-to-output weights
    """

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        """Store the hyper-parameters and randomly initialise U, W and V.

        word_dim      -- size of the input/output dimension (vocabulary size)
        hidden_dim    -- size of the hidden state
        bptt_truncate -- how many steps back to backpropagate through time
        """
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Every matrix is drawn uniformly from [-1/sqrt(n), 1/sqrt(n)], where n
        # is its number of columns. The draw order (U, W, V) is kept so that a
        # fixed random seed yields the same parameters.
        input_bound = np.sqrt(1./word_dim)
        hidden_bound = np.sqrt(1./hidden_dim)
        self.U = np.random.uniform(-input_bound, input_bound, (hidden_dim, word_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))

In [40]:
def forward_propagation(self, x):
    """Run the network forward over one sequence of word indices.

    x -- sequence of integer word indices, one per time step.

    Returns [o, s]:
      o -- array of shape (T, word_dim); row t is softmax(V s_t)
      s -- array of shape (T + 1, hidden_dim); rows 0..T-1 hold the hidden
           states, the extra last row is the all-zero initial state, which is
           what s[t-1] refers to when t == 0 (i.e. s[-1]).
    """
    # The total number of time steps
    T = len(x)
    # During forward propagation we save all hidden states in s because we need them later.
    # One additional row is added for the initial hidden state; np.zeros already sets it to 0.
    s = np.zeros((T + 1, self.hidden_dim))
    # The outputs at each time step. Again, we save them for later.
    o = np.zeros((T, self.word_dim))
    # For each time step...
    for t in np.arange(T):
        # Indexing U by x[t] selects one column. This is the same as multiplying U
        # with a one-hot vector, without having to build that vector.
        s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t-1]))
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]

# Attach the function above to RNNNumpy so it can be called as a method
RNNNumpy.forward_propagation = forward_propagation

In [33]:
# Scratch experiment: what does `*` do between U and a one-hot vector?
word_dim = 8000
hidden_dim = 100
# One-hot vector with a 1 at index 2
z = np.zeros(word_dim, dtype=np.float64)
z[2] = 1
print(z)
print(z.shape)
bound = np.sqrt(1./word_dim)
U = np.random.uniform(-bound, bound, (hidden_dim, word_dim))
print(U.shape)
# `*` multiplies element-wise with broadcasting, so both results keep U's shape
# (it is not a matrix-vector product); only column 2 of U survives.
r = U * z
t = z * U
print(r.shape)
print(t.shape)
print(r.T[2])
print(t)

[ 0.  0.  1. ...,  0.  0.  0.]
(8000,)
(100, 8000)
(100, 8000)
(100, 8000)
[ -3.68703032e-03  -1.30457333e-04   5.96817490e-03  -1.42813825e-04
   1.55395252e-03  -1.04018576e-02   6.17905785e-03  -2.88878852e-04
  -1.14293446e-03   1.21975690e-03   9.19626739e-03  -3.51550577e-03
  -1.07099069e-02  -8.97907841e-03   2.93163918e-03   1.21507509e-03
  -3.77185133e-03   8.33852680e-04  -3.34446128e-03   8.86750201e-03
  -6.66720354e-03   9.41316272e-03   3.13209563e-03   7.45407620e-03
   1.78734621e-03   3.82765536e-03  -2.84050701e-03   8.07125156e-03
   8.00731938e-03   7.06717658e-03   1.32620666e-03  -5.62240250e-03
  -1.51205534e-03  -1.11798931e-02   1.08785048e-02   3.54668669e-03
  -9.50256452e-03   2.85706675e-03   5.93152179e-04  -1.06049189e-02
  -7.03046354e-03   8.95851617e-03   5.36883071e-03   4.66321963e-03
   6.66649835e-03   1.05950637e-02   8.94560323e-03  -1.11493434e-02
  -7.57155043e-03   3.08398097e-03  -5.87497927e-03   3.25928209e-03
  -3.43545550e-03   9.301651

In [34]:
# Forward propagation example on one training sentence, with a fixed seed so
# the randomly initialised model is reproducible
np.random.seed(1)
model = RNNNumpy(vocabulary_size)
example_input = X_train[10]
print(example_input)
o, s = model.forward_propagation(example_input)

[0, 72, 63, 13, 124, 5, 26, 1126, 208, 5, 324, 3, 333, 4, 112, 32, 75, 7, 4921, 4, 8, 84, 52, 9, 7, 3217, 1016, 490, 7928, 8, 133, 49, 3141, 4, 10, 95, 51, 4, 128, 17, 37, 314, 576, 2, 40]
(100, 8000)
(8000, 1)
one-hot vector: (index 0) [[ 1.]
 [ 0.]
 [ 0.]
 ..., 
 [ 0.]
 [ 0.]
 [ 0.]]
to get [ -1.85544438e-03  -3.02621201e-03   6.11294469e-03  -2.76274419e-03
   4.06295775e-03   8.46856521e-03   1.43494470e-04   7.11024478e-03
  -2.29029981e-03  -1.40127741e-03  -6.96337225e-03   3.08668190e-03
  -7.89800458e-03  -8.87170314e-03  -1.09250332e-02  -9.14453166e-03
  -6.68026999e-04  -1.01180217e-02   3.15333981e-03  -9.19428307e-03
   3.22285441e-04   1.23482138e-03   1.06709322e-02   1.77314279e-03
   1.06173452e-02   2.72002513e-03  -3.62515004e-05  -9.59401187e-04
  -7.03603939e-03  -5.39672910e-03   4.01039591e-03  -4.74864217e-03
   5.32033768e-04  -8.39045006e-03   1.70657227e-03   4.39808347e-03
   1.76545569e-03  -9.61550537e-03   7.60751155e-03   1.66490141e-03
   6.77226501e-0

In [35]:
def predict(self, x):
    """Return, for every time step of x, the index with the highest output score.

    x -- sequence of word indices passed on to self.forward_propagation.
    """
    # Only the per-step outputs are needed; the hidden states are ignored
    output_scores, _hidden_states = self.forward_propagation(x)
    best_indices = np.argmax(output_scores, axis=1)
    return best_indices

# Attach the function above to RNNNumpy so it can be called as a method
RNNNumpy.predict = predict

In [36]:
# Show one input sentence as words and as indices
X_words_example = X_words[1]
X_train_example = X_train[1]
print("%s\n\t%s\n" % (X_words_example, str(X_train_example)))

# Predictions of the current `model` for that sentence
predictions = model.predict(X_train_example)
predicted_words = " ".join(index_to_word[idx] for idx in predictions)
print("Predictions without training: \"%s\"" % predicted_words)
print("%s %s" % (str(predictions.shape), predictions))

[u'SENTENCE_START', u'it', u"'s", u'a', u'slight', u'ppr', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', u'ppr', u'.']
	[0, 11, 17, 7, 3114, 6036, 7999, 7999, 6036, 2]

(100, 8000)
(8000, 1)
one-hot vector: (index 0) [[ 1.]
 [ 0.]
 [ 0.]
 ..., 
 [ 0.]
 [ 0.]
 [ 0.]]
to get [ -1.85544438e-03  -3.02621201e-03   6.11294469e-03  -2.76274419e-03
   4.06295775e-03   8.46856521e-03   1.43494470e-04   7.11024478e-03
  -2.29029981e-03  -1.40127741e-03  -6.96337225e-03   3.08668190e-03
  -7.89800458e-03  -8.87170314e-03  -1.09250332e-02  -9.14453166e-03
  -6.68026999e-04  -1.01180217e-02   3.15333981e-03  -9.19428307e-03
   3.22285441e-04   1.23482138e-03   1.06709322e-02   1.77314279e-03
   1.06173452e-02   2.72002513e-03  -3.62515004e-05  -9.59401187e-04
  -7.03603939e-03  -5.39672910e-03   4.01039591e-03  -4.74864217e-03
   5.32033768e-04  -8.39045006e-03   1.70657227e-03   4.39808347e-03
   1.76545569e-03  -9.61550537e-03   7.60751155e-03   1.66490141e-03
   6.77226501e-03  -7.73474487e-03  -9.58446272e

Cross entropy loss function

$
\begin{aligned}
L(y,o) = - \frac{1}{N} \sum_{n=1}^{N} y_{n} \log o_{n}
\end{aligned}
$

In [38]:
def calculate_total_loss(self, x, y):
    """Sum the cross-entropy loss over every word of every sentence.

    x -- sequence of input sentences (each a sequence of word indices)
    y -- sequence of target sentences, aligned with x

    Returns the summed negative log of the output the model assigns to each
    target word (0 when y is empty).
    """
    L = 0
    # For each sentence...
    for i in np.arange(len(y)):
        o, s = self.forward_propagation(x[i])
        # We only care about our prediction of the "correct" words:
        # pick, at each time step, the output entry of the target word
        correct_word_predictions = o[np.arange(len(y[i])), y[i]]
        # Add to the loss based on how off we were
        L += -1 * np.sum(np.log(correct_word_predictions))
    return L

def calculate_loss(self, x, y):
    """Return the average loss per target word.

    x, y -- input and target sentences, passed on to self.calculate_total_loss.

    Raises ZeroDivisionError when y contains no words.
    """
    # Total number of target words across all sentences. The built-in sum is
    # used because passing a generator to np.sum is deprecated.
    N = sum(len(y_i) for y_i in y)
    # float() keeps the division a true division even for an integer total
    return self.calculate_total_loss(x, y) / float(N)

# Attach the two loss functions above to RNNNumpy so they can be called as methods
RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [39]:
# A model that predicts every word with equal probability has a per-word loss
# of log(vocabulary_size); compare that with the current model.
expected_loss = np.log(vocabulary_size)
print("Expected Loss for random predictions: %f" % expected_loss)
# Limit to 100 examples to save time
actual_loss = model.calculate_loss(X_train[:100], y_train[:100])
print("Actual loss: %f" % actual_loss)

Expected Loss for random predictions: 8.987197
(100, 8000)
(8000, 1)
one-hot vector: (index 0) [[ 1.]
 [ 0.]
 [ 0.]
 ..., 
 [ 0.]
 [ 0.]
 [ 0.]]
to get [ -1.85544438e-03  -3.02621201e-03   6.11294469e-03  -2.76274419e-03
   4.06295775e-03   8.46856521e-03   1.43494470e-04   7.11024478e-03
  -2.29029981e-03  -1.40127741e-03  -6.96337225e-03   3.08668190e-03
  -7.89800458e-03  -8.87170314e-03  -1.09250332e-02  -9.14453166e-03
  -6.68026999e-04  -1.01180217e-02   3.15333981e-03  -9.19428307e-03
   3.22285441e-04   1.23482138e-03   1.06709322e-02   1.77314279e-03
   1.06173452e-02   2.72002513e-03  -3.62515004e-05  -9.59401187e-04
  -7.03603939e-03  -5.39672910e-03   4.01039591e-03  -4.74864217e-03
   5.32033768e-04  -8.39045006e-03   1.70657227e-03   4.39808347e-03
   1.76545569e-03  -9.61550537e-03   7.60751155e-03   1.66490141e-03
   6.77226501e-03  -7.73474487e-03  -9.58446272e-03  -8.10953219e-03
   7.73638216e-03   4.41892492e-03   5.45463102e-04   9.26393813e-03
   5.20064891e-03  -

In [125]:
def bptt(self, x, y):
    """Backpropagation through time for one training sequence.

    x -- input word indices, y -- target word indices (same length).

    Returns [dLdU, dLdV, dLdW], arrays with the shapes of self.U, self.V and
    self.W that accumulate the gradient contributions of every time step.
    Each step is propagated back at most self.bptt_truncate further steps.
    """
    num_steps = len(y)
    # Forward pass: per-step outputs and hidden states (s[-1] is the initial state)
    o, s = self.forward_propagation(x)
    # Gradient accumulators
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    # Output error: the outputs minus 1 at the target word (done in place on o)
    output_error = o
    output_error[np.arange(num_steps), y] -= 1.
    # Walk over the outputs from the last time step to the first
    for t in reversed(range(num_steps)):
        dLdV += np.outer(output_error[t], s[t])
        # Error at the hidden layer of step t; (1 - s^2) is the tanh derivative
        hidden_error = self.V.T.dot(output_error[t]) * (1 - (s[t] ** 2))
        # Go back in time from step t down to the truncation limit
        earliest_step = max(0, t - self.bptt_truncate)
        for step in range(t, earliest_step - 1, -1):
            dLdW += np.outer(hidden_error, s[step - 1])
            dLdU[:, x[step]] += hidden_error
            # Push the error one step further back
            hidden_error = self.W.T.dot(hidden_error) * (1 - s[step - 1] ** 2)
    return [dLdU, dLdV, dLdW]

# Attach the function above to RNNNumpy so it can be called as a method
RNNNumpy.bptt = bptt

In [126]:
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    """Compare the gradients from self.bptt with numerical estimates.

    x, y            -- one training sequence (inputs and targets)
    h               -- step used for the central difference (f(p+h) - f(p-h)) / (2h)
    error_threshold -- maximum accepted relative error |a - b| / (|a| + |b|)

    Prints the progress per parameter. At the first element whose relative
    error exceeds the threshold it prints the details and stops. Every
    perturbed element is reset to its original value. Returns None.

    NOTE(review): self.bptt truncates backpropagation at self.bptt_truncate
    steps while the numerical estimate does not, so a small bptt_truncate can
    presumably make the check fail on long sequences -- confirm.
    """
    # Calculate the gradients using backpropagation. We want to check if these are correct.
    bptt_gradients = self.bptt(x, y)
    # List of all parameters we want to check.
    model_parameters = ['U', 'V', 'W']
    # Gradient check for each parameter
    for pidx, pname in enumerate(model_parameters):
        # Get the actual parameter array from the model, e.g. self.W
        parameter = getattr(self, pname)
        print("Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape)))
        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            # Save the original value so we can reset it later
            original_value = parameter[ix]
            # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
            parameter[ix] = original_value + h
            gradplus = self.calculate_total_loss([x], [y])
            parameter[ix] = original_value - h
            gradminus = self.calculate_total_loss([x], [y])
            estimated_gradient = (gradplus - gradminus)/(2*h)
            # Reset parameter to original value
            parameter[ix] = original_value
            # The gradient for this parameter calculated using backpropagation
            backprop_gradient = bptt_gradients[pidx][ix]
            # Calculate the relative error: (|x - y|/(|x| + |y|)).
            # Two gradients that are both exactly zero agree, so avoid 0/0 there.
            denominator = np.abs(backprop_gradient) + np.abs(estimated_gradient)
            if denominator == 0:
                relative_error = 0.0
            else:
                relative_error = np.abs(backprop_gradient - estimated_gradient)/denominator
            # If the error is too large fail the gradient check
            if relative_error > error_threshold:
                print("Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix))
                print("+h Loss: %f" % gradplus)
                print("-h Loss: %f" % gradminus)
                print("Estimated_gradient: %f" % estimated_gradient)
                print("Backpropagation gradient: %f" % backprop_gradient)
                print("Relative Error: %f" % relative_error)
                return
            it.iternext()
        print("Gradient check for parameter %s passed." % (pname))

# Attach the function above to RNNNumpy so it can be called as a method
RNNNumpy.gradient_check = gradient_check

In [128]:
def numpy_sdg_step(self, x, y, learning_rate):
    """Perform one step of stochastic gradient descent on a single sequence.

    x, y          -- input and target word indices of one training example
    learning_rate -- factor applied to each gradient before subtracting it

    Updates self.U, self.V and self.W in place; returns None.
    """
    # Gradients of the loss for this example, one per parameter matrix
    grad_U, grad_V, grad_W = self.bptt(x, y)
    # Move every parameter against its gradient
    self.U -= grad_U * learning_rate
    self.V -= grad_V * learning_rate
    self.W -= grad_W * learning_rate

# Attach the function above to RNNNumpy under the method name `sgd_step`
# (note that the function itself is spelled numpy_sdg_step)
RNNNumpy.sgd_step = numpy_sdg_step

In [129]:
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    """Outer SGD loop.

    model               -- the RNN model instance (needs calculate_loss and sgd_step)
    X_train             -- the training data set
    y_train             -- the training data labels
    learning_rate       -- initial learning rate for SGD
    nepoch              -- number of times to iterate through the complete dataset
    evaluate_loss_after -- evaluate the loss after this many epochs

    The learning rate is halved whenever an evaluated loss is higher than the
    previously evaluated one.

    Returns the list of (num_examples_seen, loss) pairs recorded at every
    evaluation, so they can be plotted later.
    """
    # We keep track of the losses so we can plot them later
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        # Optionally evaluate the loss
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (timestamp, num_examples_seen, epoch, loss))
            # Adjust the learning rate if loss increases
            if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                learning_rate = learning_rate * 0.5
                print("Setting learning rate to %f" % learning_rate)
            sys.stdout.flush()
        # For each training example...
        for i in range(len(y_train)):
            # One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1
    return losses

In [130]:
# Time a single SGD step of a freshly initialised NumPy model on one training
# sentence; the seed makes the initial parameters reproducible.
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

1 loops, best of 3: 308 ms per loop


In [132]:
# Train on a small subset of the data to see what happens
batch_size = 100
# Seed before constructing the model so the initial parameters are reproducible
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(model, X_train[:batch_size], y_train[:batch_size], nepoch=2, evaluate_loss_after=1)

2016-02-28 22:48:07: Loss after num_examples_seen=0 epoch=0: 8.987374
2016-02-28 22:48:23: Loss after num_examples_seen=100 epoch=1: 8.976118


## Theano!

In [1]:
import theano as theano
import theano.tensor as T
import operator

In [2]:
class RNNTheano:
    """Recurrent network with the same three parameter matrices as the NumPy
    version (U, V, W), stored as Theano shared variables.

    The constructor compiles the Theano functions forward_propagation,
    predict, ce_error, bptt and sgd_step and stores them on the instance.
    """
    
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        """Store the hyper-parameters, initialise U, V and W and build the graph.

        word_dim      -- size of the input/output dimension
        hidden_dim    -- size of the hidden state
        bptt_truncate -- value handed to theano.scan as truncate_gradient
        """
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters
        # U: (hidden_dim, word_dim), V: (word_dim, hidden_dim), W: (hidden_dim, hidden_dim)
        U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
        # Theano: Created shared variables, cast to Theano's configured float type
        self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
        self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
        self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))      
        # We store the Theano graph here
        # NOTE(review): nothing in this class writes to or reads from this dict
        self.theano = {}
        self.__theano_build__()
    
    def __theano_build__(self):
        """Build the symbolic graph and compile the functions used by the model.

        self.bptt_truncate is read here, when scan is called; the compiled
        functions are created from the graph built at that moment.
        """
        U, V, W = self.U, self.V, self.W
        # Symbolic integer vectors: input word indices and target word indices
        x = T.ivector('x')
        y = T.ivector('y')
        def forward_prop_step(x_t, s_t_prev, U, V, W):
            # One time step: new hidden state from the column of U selected by
            # the current word and from the previous hidden state
            s_t = T.tanh(U[:,x_t] + W.dot(s_t_prev))
            o_t = T.nnet.softmax(V.dot(s_t))
            # presumably softmax returns a one-row matrix here, hence o_t[0] -- TODO confirm
            return [o_t[0], s_t]
        # Apply the step to every element of x. The hidden state starts at zero
        # and is fed back into the next step; the output o is not fed back.
        [o,s], updates = theano.scan(
            forward_prop_step,
            sequences=x,
            outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
            non_sequences=[U, V, W],
            truncate_gradient=self.bptt_truncate,
            strict=True)
        
        # Most probable word per time step, and the summed cross-entropy of o against y
        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
        
        # Gradients
        dU = T.grad(o_error, U)
        dV = T.grad(o_error, V)
        dW = T.grad(o_error, W)
        
        # Assign functions
        self.forward_propagation = theano.function([x], o)
        self.predict = theano.function([x], prediction)
        self.ce_error = theano.function([x, y], o_error)
        self.bptt = theano.function([x, y], [dU, dV, dW])
        
        # SGD
        # sgd_step returns nothing; it updates the three shared variables
        learning_rate = T.scalar('learning_rate')
        self.sgd_step = theano.function([x,y,learning_rate], [], 
                      updates=[(self.U, self.U - learning_rate * dU),
                              (self.V, self.V - learning_rate * dV),
                              (self.W, self.W - learning_rate * dW)])
    
    def calculate_total_loss(self, X, Y):
        """Sum self.ce_error over all (x, y) sequence pairs of X and Y."""
        return np.sum([self.ce_error(x,y) for x,y in zip(X,Y)])
    
    def calculate_loss(self, X, Y):
        """Return the total loss divided by the total number of target words."""
        # Divide calculate_loss by the number of words
        num_words = np.sum([len(y) for y in Y])
        return self.calculate_total_loss(X,Y)/float(num_words)   

In [3]:
def gradient_check_theano(model, x, y, h=0.001, error_threshold=0.01):
    """Compare the gradients from model.bptt with numerical estimates.

    model           -- model exposing bptt, calculate_total_loss, bptt_truncate,
                       __theano_build__ and the shared variables U, V, W
    x, y            -- one training sequence (inputs and targets)
    h               -- step used for the central difference (f(p+h) - f(p-h)) / (2h)
    error_threshold -- maximum accepted relative error |a - b| / (|a| + |b|)

    Prints the progress per parameter. At the first element whose relative
    error exceeds the threshold it prints the details and stops. Returns None.

    The model is rebuilt with bptt_truncate = 1000 for the duration of the
    check and rebuilt with its original bptt_truncate afterwards, so the check
    costs two graph compilations.
    """
    # We need to backpropagate all the way to get the correct gradient. The
    # truncation is read when the graph is built, so changing the attribute
    # alone would not affect the already compiled model.bptt: rebuild the graph.
    original_truncate = model.bptt_truncate
    try:
        model.bptt_truncate = 1000
        model.__theano_build__()
        # Calculate the gradients using backprop
        bptt_gradients = model.bptt(x, y)
        # List of all parameters we want to check.
        model_parameters = ['U', 'V', 'W']
        # Gradient check for each parameter
        for pidx, pname in enumerate(model_parameters):
            # Get the shared variable from the model, e.g. model.W, and a NumPy copy of its value
            parameter_T = getattr(model, pname)
            parameter = parameter_T.get_value()
            print("Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape)))
            # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
            it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
            while not it.finished:
                ix = it.multi_index
                # Save the original value so we can reset it later
                original_value = parameter[ix]
                # Estimate the gradient using (f(x+h) - f(x-h))/(2*h).
                # Every change is pushed into the shared variable with set_value.
                parameter[ix] = original_value + h
                parameter_T.set_value(parameter)
                gradplus = model.calculate_total_loss([x], [y])
                parameter[ix] = original_value - h
                parameter_T.set_value(parameter)
                gradminus = model.calculate_total_loss([x], [y])
                estimated_gradient = (gradplus - gradminus)/(2*h)
                # Reset parameter to original value
                parameter[ix] = original_value
                parameter_T.set_value(parameter)
                # The gradient for this parameter calculated using backpropagation
                backprop_gradient = bptt_gradients[pidx][ix]
                # Calculate the relative error: (|x - y|/(|x| + |y|)).
                # Two gradients that are both exactly zero agree, so avoid 0/0 there.
                denominator = np.abs(backprop_gradient) + np.abs(estimated_gradient)
                if denominator == 0:
                    relative_error = 0.0
                else:
                    relative_error = np.abs(backprop_gradient - estimated_gradient)/denominator
                # If the error is too large fail the gradient check
                if relative_error > error_threshold:
                    print("Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix))
                    print("+h Loss: %f" % gradplus)
                    print("-h Loss: %f" % gradminus)
                    print("Estimated_gradient: %f" % estimated_gradient)
                    print("Backpropagation gradient: %f" % backprop_gradient)
                    print("Relative Error: %f" % relative_error)
                    return
                it.iternext()
            print("Gradient check for parameter %s passed." % (pname))
    finally:
        # Put the model back into the configuration it had before the check
        model.bptt_truncate = original_truncate
        model.__theano_build__()

In [8]:
# Time a single SGD step of a freshly built Theano model. X_train and y_train
# come from the preprocessing cells and must exist in the running kernel.
np.random.seed(10)
model = RNNTheano(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

  from scan_perform.scan_perform import *


NameError: global name 'X_train' is not defined

In [140]:
def generate_sentence(model):
    """Sample one sentence from the model, word by word.

    model -- object whose forward_propagation(indices) returns one row of word
             probabilities per time step; the last row is used to sample the
             next word. (A forward_propagation that returns an [o, s] pair
             would have its hidden states used here instead, so it does not fit.)

    Sampling starts from the sentence start token and continues until the
    sentence end token is drawn; the unknown token is never accepted.
    Returns the list of sampled words without the start and end tokens.
    """
    unknown_index = word_to_index[unknown_token]
    end_index = word_to_index[sentence_end_token]
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == end_index:
        next_word_probs = model.forward_propagation(new_sentence)
        # np.random.multinomial rejects probabilities whose sum exceeds 1, which
        # rounding in lower-precision (e.g. float32) outputs can cause. Work in
        # float64 and renormalise so the probabilities sum to 1.
        probs = np.asarray(next_word_probs[-1], dtype=np.float64)
        probs = probs / probs.sum()
        sampled_word = unknown_index
        # We don't want to sample unknown words
        while sampled_word == unknown_index:
            samples = np.random.multinomial(1, probs)
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str

In [155]:
from utils import load_model_parameters_theano, save_model_parameters_theano

# Build a model with hidden_dim=50 and load previously saved parameters into it
# instead of training. The two commented lines show how to train and save.
# NOTE(review): the commented save path ('./data/...') differs from the path
# loaded below ('../data/reddit/...') -- confirm which one is intended.
model = RNNTheano(vocabulary_size, hidden_dim=50)
# losses = train_with_sgd(model, X_train, y_train, nepoch=50)
# save_model_parameters_theano('./data/trained-model-theano.npz', model)
load_model_parameters_theano('../data/reddit/trained-model-theano.npz', model)

Loaded model parameters from ../data/reddit/trained-model-theano.npz. hidden_dim=50 word_dim=8000


In [156]:
# Number of sentences to print and the minimum number of words in each
num_sentences = 10
senten_min_length = 7

for i in range(num_sentences):
    sent = []
    # Resample until the sentence is long enough; one- or two-word
    # sentences are not interesting
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print(" ".join(sent))

same and workers arguing is okay .
nice call this little risk by typing
that 's attention to her day about city .
i have giving an active looks def .
oh what 's the dust was social developed .
they 're *not* saying the case .
still made him out the visually of popcorn .
placing ... what is your rare ) .
we have access of problems with this .
a lot of your garage code of words .
