## RNN Language Model

The diagram below shows the RNN computation that we will implement. We feed characters into the RNN using a 1-hot encoding and train it to predict the next character. In the diagram's example the training data is the string "hello", so there are 4 letters in the vocabulary: [h,e,l,o].

<img src="rnnlm.jpeg">

In [1]:
import numpy as np
np.random.seed(1337)

In [2]:
# data I/O
data = open('warpeace.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print 'data has %d characters, %d unique.' % (data_size, vocab_size)

data has 3291648 characters, 93 unique.


In [3]:
# Two lookup tables built from the same enumeration of `chars`:
# index -> character, and its inverse character -> index.
ix_to_char = dict(enumerate(chars))
char_to_ix = dict((ch, ix) for ix, ch in enumerate(chars))

In [4]:
# Index assigned to the character 'a'. The value depends on the arbitrary
# ordering of list(set(data)) that was used to build `chars`.
char_to_ix['a']

86

In [5]:
# let's sample a batch of data
seq_length = 25 # number of characters in the batch
p = 220000 # point in the book to sample from (character offset into `data`)
# NOTE: the name `p` is rebound to the softmax probabilities further down,
# so this cell has to be re-run before the slicing cells are executed again.
print data[p:p+seq_length] # print a chunk of data

at the twitching cheeks o


In [6]:
inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
print inputs
print targets

[86, 22, 0, 22, 18, 87, 0, 22, 45, 88, 22, 40, 18, 88, 64, 41, 0, 40, 18, 87, 87, 42, 44, 0, 43]
[22, 0, 22, 18, 87, 0, 22, 45, 88, 22, 40, 18, 88, 64, 41, 0, 40, 18, 87, 87, 42, 44, 0, 43, 62]


In [7]:
# lets plug the first character into the RNN
ix_input = inputs[0]
ix_target = targets[0]
# encode the input character with a 1-hot representation
x = np.zeros((vocab_size,1))
x[ix_input] = 1
print x.ravel()

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
  0.  0.  0.]


In [8]:
# create random starting parameters
# NOTE: the three randn calls draw from the generator seeded at the top of the
# notebook, so their order determines the values each matrix receives.
# Weights are small Gaussian values (scaled by 0.01); biases start at zero.
hidden_size = 10
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden, shape (hidden_size, vocab_size)
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden, shape (hidden_size, hidden_size)
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output, shape (vocab_size, hidden_size)
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

In [9]:
# compute the hidden state
h_prev = np.zeros((hidden_size, 1))
h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h_prev + bh))
print h.ravel()

[ -2.19597294e-03  -1.11958403e-02  -7.96573514e-03  -8.29025315e-03
  -1.69046852e-02   7.16480643e-03  -1.08628224e-05   1.38136549e-03
   8.91231078e-03   1.22455580e-02]


In [10]:
# compute the scores for next character
y = np.dot(Why, h) + by
print y.ravel()

[ -1.86941995e-04   5.75478992e-05   5.45725372e-05  -4.12423671e-04
   3.06201649e-04  -5.06931560e-04   3.19806236e-04  -3.51163814e-05
  -3.38972137e-04  -2.39857442e-04  -8.01432457e-05  -1.04391992e-04
  -3.15399258e-04   1.48454913e-05   4.55998798e-05   1.28931375e-04
   4.04072882e-04  -5.75504844e-04   1.57533734e-04  -4.08258042e-04
  -7.41755605e-05  -7.76690551e-05   1.87837950e-04  -2.79910204e-04
  -7.96190232e-04  -5.79876645e-05   1.26730230e-04   3.95894081e-04
   2.76955496e-04   2.59637379e-05  -4.61015050e-04  -5.14636280e-04
  -2.56714611e-04   4.94377196e-04  -3.64149466e-04  -4.26364575e-04
  -1.25515821e-04   9.29132184e-06   4.88944584e-05  -6.31681837e-04
  -1.75121523e-05   9.51597096e-06   3.08227238e-04   1.29694758e-04
   1.72591168e-04  -5.87989566e-05   2.74250548e-04   5.90320270e-05
   1.64825617e-04   3.11431559e-04   3.48582321e-04  -4.19733486e-05
  -2.77062543e-04  -1.10111861e-05  -1.33642742e-05  -5.99994639e-05
   6.07084027e-04   3.51784289e-04

In [11]:
# the scores are unnormalized log probabilities. compute the probabilities
p = np.exp(y) / np.sum(np.exp(y))
print p.ravel()
print 'probabilities sum to ', p.sum()

[ 0.01075121  0.01075383  0.0107538   0.01074878  0.01075651  0.01074777
  0.01075666  0.01075284  0.01074957  0.01075064  0.01075235  0.01075209
  0.01074982  0.01075338  0.01075371  0.0107546   0.01075756  0.01074703
  0.01075491  0.01074883  0.01075242  0.01075238  0.01075524  0.01075021
  0.01074466  0.01075259  0.01075458  0.01075747  0.01075619  0.01075349
  0.01074826  0.01074768  0.01075046  0.01075853  0.0107493   0.01074863
  0.01075187  0.01075332  0.01075374  0.01074643  0.01075303  0.01075332
  0.01075653  0.01075461  0.01075507  0.01075258  0.01075617  0.01075385
  0.01075499  0.01075656  0.01075696  0.01075276  0.01075024  0.0107531
  0.01075307  0.01075257  0.01075975  0.010757    0.0107528   0.0107531
  0.01075638  0.01075405  0.01075015  0.0107578   0.01074948  0.01074819
  0.01075248  0.0107532   0.01075389  0.01075024  0.01075641  0.01075342
  0.01075643  0.01075036  0.01074721  0.01074991  0.01075725  0.01075834
  0.01075003  0.01074992  0.01075335  0.01075168  0.0

In [12]:
# p is a (vocab_size, 1) column, so [ix_target, 0] picks out the scalar
# probability the model currently gives to the true next character.
print 'probability assigned to the correct next character is right now: ', p[ix_target,0]

probability assigned to the correct next character is right now:  0.0107552356218


In [13]:
loss = -np.log(p[ix_target,0])
print 'the cross-entropy (softmax) loss is ', loss

the cross-entropy (softmax) loss is  4.53236260838


In [14]:
# compute the gradient on y
dy = np.copy(p)
dy[ix_target] -= 1
print dy.ravel()
print 'sum of dy is ', dy.sum()
print 'the gradient for the correct character (%s) is: %s' % (ix_to_char[ix_target], dy[ix_target,0])
print 'the gradient for the character (a) is: ', dy[char_to_ix['a'],0]

[ 0.01075121  0.01075383  0.0107538   0.01074878  0.01075651  0.01074777
  0.01075666  0.01075284  0.01074957  0.01075064  0.01075235  0.01075209
  0.01074982  0.01075338  0.01075371  0.0107546   0.01075756  0.01074703
  0.01075491  0.01074883  0.01075242  0.01075238 -0.98924476  0.01075021
  0.01074466  0.01075259  0.01075458  0.01075747  0.01075619  0.01075349
  0.01074826  0.01074768  0.01075046  0.01075853  0.0107493   0.01074863
  0.01075187  0.01075332  0.01075374  0.01074643  0.01075303  0.01075332
  0.01075653  0.01075461  0.01075507  0.01075258  0.01075617  0.01075385
  0.01075499  0.01075656  0.01075696  0.01075276  0.01075024  0.0107531
  0.01075307  0.01075257  0.01075975  0.010757    0.0107528   0.0107531
  0.01075638  0.01075405  0.01075015  0.0107578   0.01074948  0.01074819
  0.01075248  0.0107532   0.01075389  0.01075024  0.01075641  0.01075342
  0.01075643  0.01075036  0.01074721  0.01074991  0.01075725  0.01075834
  0.01075003  0.01074992  0.01075335  0.01075168  0.0

In [15]:
# we computed [y = np.dot(Why, h) + by]; Backpropagate to Why, h, and by
dWhy = np.dot(dy, h.T)
dh = np.dot(Why.T, dy)
dby = np.copy(dy)
print 'the hidden vector activations were:'
print h.ravel()
print 'the gradients are:'
print dh.ravel()
print 'the gradients dWhy have size: ', dWhy.shape
print 'a small sample is:'
print dWhy[:4,:4]

the hidden vector activations were:
[ -2.19597294e-03  -1.11958403e-02  -7.96573514e-03  -8.29025315e-03
  -1.69046852e-02   7.16480643e-03  -1.08628224e-05   1.38136549e-03
   8.91231078e-03   1.22455580e-02]
the gradients are:
[ 0.010636   -0.00556332  0.01572234 -0.01013355  0.01024101  0.00893933
  0.01156068 -0.0035879  -0.00304811 -0.00761244]
the gradients dWhy have size:  (93, 10)
a small sample is:
[[ -2.36093564e-05  -1.20368781e-04  -8.56412557e-05  -8.91302155e-05]
 [ -2.36151293e-05  -1.20398213e-04  -8.56621967e-05  -8.91520096e-05]
 [ -2.36150591e-05  -1.20397855e-04  -8.56619418e-05  -8.91517444e-05]
 [ -2.36040335e-05  -1.20341643e-04  -8.56219473e-05  -8.91101206e-05]]


In [16]:
# we computed [h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h_prev + bh))]; 
# Backprop into Wxh, x, Whh, h_prev, bh:
dh_before_tanh = (1-h**2)*dh
dbh = np.copy(dh_before_tanh)
dWxh = np.dot(dh_before_tanh, x.T)
dWhh = np.dot(dh_before_tanh, h.T)
dh_prev = np.dot(Whh.T, dh_before_tanh)
print 'small sample of Whh:'
print Whh[:4,:4]

small sample of Whh:
[[-0.0181644  -0.00730641  0.00068086  0.00043998]
 [-0.00706002 -0.00657655  0.0100175   0.00198405]
 [ 0.0175388   0.01643369 -0.00161146 -0.01413518]
 [ 0.00812143 -0.00105367  0.00944295  0.01148334]]


In [17]:
# we now have the gradients for all parameters! (Wxh, Whh, Why, bh, by)
# let's do a parameter update: one plain gradient-descent step, written into
# new names so the original parameters stay available for comparison
learning_rate = 0.1
Wxh2, Whh2, Why2, bh2, by2 = (
    param - learning_rate * grad
    for param, grad in zip((Wxh, Whh, Why, bh, by),
                           (dWxh, dWhh, dWhy, dbh, dby))
)

In [18]:
# these parameters should be much better! lets try it out:
h2 = np.tanh(np.dot(Wxh2, x) + np.dot(Whh2, h_prev + bh2))
y2 = np.dot(Why2, h2) + by2
p2 = np.exp(y2) / np.sum(np.exp(y2))
print 'probability assigned to the correct next character was: ', p[ix_target,0]
print 'probability assigned to the correct next character is now: ', p2[ix_target,0]
loss2 = -np.log(p2[ix_target,0])
print 'the cross-entropy (softmax) loss was ', loss
print 'the loss is now ', loss2

probability assigned to the correct next character was:  0.0107552356218
probability assigned to the correct next character is now:  0.0118750046895
the cross-entropy (softmax) loss was  4.53236260838
the loss is now  4.43331953415


In [19]:
# note: the probability for the correct character went up! (and the loss went down)

In [20]:
# putting it together with loops
def lossFun(inputs, targets, hprev):
    """
    Run the RNN forward over one sequence and backpropagate through time.

    inputs,targets are both list of integers (same length); targets[t] is the
    index of the character expected after inputs[t].
    hprev is Hx1 array of initial hidden state (it is copied, not modified).

    The model parameters Wxh, Whh, Why, bh, by and vocab_size are read from
    the enclosing (module) scope.

    returns the loss, gradients on model parameters, and last hidden state:
    (loss, dWxh, dWhh, dWhy, dbh, dby, h_last). The loss is the cross-entropy
    summed over all time steps; every gradient is clipped to [-5, 5].
    For an empty sequence the loss is 0, the gradients are all zero and
    h_last is a copy of hprev.
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0

    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
        ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
        # Shift the scores by their maximum before exponentiating: the softmax
        # is unchanged mathematically, but exp() can no longer overflow.
        shifted = ys[t] - np.max(ys[t])
        exp_scores = np.exp(shifted)
        exp_total = np.sum(exp_scores)
        ps[t] = exp_scores / exp_total # probabilities for next chars
        # softmax (cross-entropy loss), computed as log-sum-exp minus the
        # target score so that a vanishing probability never reaches log(0)
        loss += np.log(exp_total) - shifted[targets[t],0]

    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # hs[-1] always exists (unlike hs[0]), so this also works for empty input
    dhnext = np.zeros_like(hs[-1])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw) # gradient handed to the previous time step

    # clip to mitigate exploding gradients
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)

    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [21]:
# Run the full forward/backward pass over the sampled sequence, starting from
# the all-zero hidden state h_prev.
# NOTE: this rebinds loss, dWxh, dWhh, dWhy, dbh and dby, replacing the
# single-step values computed in the earlier cells.
loss, dWxh, dWhh, dWhy, dbh, dby, hnew = lossFun(inputs, targets, h_prev)
print loss

113.314839864


In [22]:
# sampling code: generate characters from the current model
def sample(h, seed_ix, n):
    """ 
    sample a sequence of integers from the model 
    h is initial memory state, seed_ix is seed letter for first time step
    n is the number of time steps to sample for

    The model parameters Wxh, Whh, Why, bh, by and vocab_size are read from
    the enclosing (module) scope. Each step feeds the previously sampled
    character back in as the next input. Returns a list of n integer
    character indices (empty when n is 0); the caller's h is not modified.
    Draws from numpy's global random generator.
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = [] # sampled indices
    for t in range(n):
        # one RNN step, same update rule as the forward pass in lossFun
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        # softmax with the maximum subtracted first so exp() cannot overflow
        exp_scores = np.exp(y - np.max(y))
        probs = exp_scores / np.sum(exp_scores)
        # draw the next character index according to the predicted distribution
        ix = int(np.random.choice(vocab_size, p=probs.ravel()))
        # the sampled character becomes the 1-hot input of the next step
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes


In [23]:
# TODO: write the optimization loop
# Loop over the dataset from beginning to end, sampling batches of characters seq_length long
# Call the loss function and get the gradients
# Perform a parameter update
# Sample some examples from the model