In [1]:
import numpy as np

In [3]:
# data I/O
data = open('warpeace.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print 'data has %d characters, %d unique.' % (data_size, vocab_size)

data has 3291648 characters, 93 unique.


In [4]:
# lookup tables between characters and their integer ids, in both directions
char_to_ix = dict((ch, i) for i, ch in enumerate(chars))
ix_to_char = dict(enumerate(chars))

In [10]:
# integer id that the vocabulary assigned to the character 'a'
char_to_ix["a"]

86

In [57]:
# lets sample a batch of data
seq_length = 25 # number of characters in the batch
p = 220000 # point in the book to sample from
print data[p:p+seq_length] # print a chunk of data

at the twitching cheeks o


In [58]:
inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
print inputs
print targets

[86, 22, 0, 22, 18, 87, 0, 22, 45, 88, 22, 40, 18, 88, 64, 41, 0, 40, 18, 87, 87, 42, 44, 0, 43]
[22, 0, 22, 18, 87, 0, 22, 45, 88, 22, 40, 18, 88, 64, 41, 0, 40, 18, 87, 87, 42, 44, 0, 43, 62]


In [59]:
# lets plug the first character into the RNN
ix_input = inputs[0]
ix_target = targets[0]
# encode the input character with a 1-hot representation
x = np.zeros((vocab_size,1))
x[ix_input] = 1
print x.ravel()

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
  0.  0.  0.]


In [60]:
# create random starting parameters
# Seed the global NumPy RNG first so the initial weights, and every number
# derived from them further down, come out the same on each fresh run.
np.random.seed(0)
hidden_size = 10
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

In [61]:
# compute the hidden state
h_prev = np.zeros((hidden_size, 1))
h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h_prev + bh))
print h.ravel()

[ 0.01971421 -0.00533177  0.00142983 -0.00985955  0.00248728 -0.01736378
  0.01687631 -0.01756024  0.00207365 -0.00083882]


In [62]:
# compute the scores for next character
y = np.dot(Why, h) + by
print y.ravel()

[  7.60384659e-04   4.52720278e-04  -6.57143068e-04   4.59711552e-04
   2.16313976e-04  -2.73453571e-04   6.45337766e-04   6.58329123e-06
   4.52167436e-04  -6.46317115e-04   4.02199638e-04   1.99170349e-04
   4.25310887e-04   3.50158091e-04  -6.04819130e-04   2.26877413e-04
  -1.94723297e-04   5.50639249e-04  -1.71656881e-04  -2.99860880e-04
   6.07620521e-04   5.70905998e-06   4.26038734e-04   1.01454914e-04
  -3.65278849e-04   5.90686240e-04   1.59204750e-04   4.08708316e-04
   2.56014525e-04   1.39087410e-06  -6.90950824e-04   7.18269467e-04
   2.61719276e-04   4.58528196e-04   3.86335255e-04   2.70955381e-04
  -1.48929417e-04  -1.83764339e-04  -6.36673024e-04   4.00478015e-04
  -8.69493168e-05   3.73431457e-04   9.14721284e-05   4.38316766e-05
  -6.83760980e-05   1.52588617e-05   1.74420973e-04  -1.47075009e-04
  -3.62079011e-04   5.05874063e-05   6.20919471e-04   7.19441792e-05
  -1.88333333e-04  -2.98096664e-04   4.43592043e-04  -3.76377143e-04
   2.10335682e-04  -6.41359408e-04

In [63]:
# the scores are unnormalized log probabilities. compute the probabilities
p = np.exp(y) / np.sum(np.exp(y))
print p.ravel()
print 'probabilities sum to ', p.sum()

[ 0.01076018  0.01075687  0.01074493  0.01075694  0.01075432  0.01074906
  0.01075894  0.01075207  0.01075686  0.01074505  0.01075632  0.01075414
  0.01075657  0.01075576  0.0107455   0.01075444  0.0107499   0.01075792
  0.01075015  0.01074877  0.01075853  0.01075206  0.01075658  0.01075309
  0.01074807  0.01075835  0.01075371  0.01075639  0.01075475  0.01075201
  0.01074457  0.01075972  0.01075481  0.01075693  0.01075615  0.01075491
  0.0107504   0.01075002  0.01074515  0.0107563   0.01075106  0.01075601
  0.01075298  0.01075247  0.01075126  0.01075216  0.01075387  0.01075042
  0.0107481   0.01075254  0.01075867  0.01075277  0.01074997  0.01074879
  0.01075677  0.01074795  0.01075426  0.0107451   0.01075151  0.01075118
  0.01075259  0.01074901  0.01075222  0.01075429  0.01074654  0.01075337
  0.01075656  0.0107497   0.01074978  0.01075045  0.01075684  0.01075495
  0.01075341  0.01075554  0.01075047  0.01075115  0.01075168  0.01074785
  0.010755    0.01075934  0.01074979  0.01074742  0

In [64]:
print 'probability assigned to the correct next character is right now: ', p[ix_target,0]

probability assigned to the correct next character is right now:  0.0107565780746


In [72]:
loss = -np.log(p[ix_target,0])
print 'the cross-entropy (softmax) loss is ', loss

the cross-entropy (softmax) loss is  4.53223779763


In [74]:
# compute the gradient on y
dy = np.copy(p)
dy[ix_target] -= 1
print dy.ravel()
print 'sum of dy is ', dy.sum()
print 'the gradient for the correct character (%s) is: %s' % (ix_to_char[ix_target], dy[ix_target,0])
print 'the gradient for the character (a) is: ', dy[char_to_ix['a'],0]

[ 0.01076018  0.01075687  0.01074493  0.01075694  0.01075432  0.01074906
  0.01075894  0.01075207  0.01075686  0.01074505  0.01075632  0.01075414
  0.01075657  0.01075576  0.0107455   0.01075444  0.0107499   0.01075792
  0.01075015  0.01074877  0.01075853  0.01075206 -0.98924342  0.01075309
  0.01074807  0.01075835  0.01075371  0.01075639  0.01075475  0.01075201
  0.01074457  0.01075972  0.01075481  0.01075693  0.01075615  0.01075491
  0.0107504   0.01075002  0.01074515  0.0107563   0.01075106  0.01075601
  0.01075298  0.01075247  0.01075126  0.01075216  0.01075387  0.01075042
  0.0107481   0.01075254  0.01075867  0.01075277  0.01074997  0.01074879
  0.01075677  0.01074795  0.01075426  0.0107451   0.01075151  0.01075118
  0.01075259  0.01074901  0.01075222  0.01075429  0.01074654  0.01075337
  0.01075656  0.0107497   0.01074978  0.01075045  0.01075684  0.01075495
  0.01075341  0.01075554  0.01075047  0.01075115  0.01075168  0.01074785
  0.010755    0.01075934  0.01074979  0.01074742  0

In [88]:
# we computed [y = np.dot(Why, h) + by]; Backpropagate to Why, h, and by
dWhy = np.dot(dy, h.T)
dh = np.dot(Why.T, dy)
dby = np.copy(dy)
print 'the hidden vector activations were:'
print h.ravel()
print 'the gradients are:'
print dh.ravel()
print 'the gradients dWhy have size: ', dWhy.shape
print 'a small sample is:'
print dWhy[:4,:4]

the hidden vector activations were:
[ 0.01971421 -0.00533177  0.00142983 -0.00985955  0.00248728 -0.01736378
  0.01687631 -0.01756024  0.00207365 -0.00083882]
the gradients are:
[ 0.01411266 -0.00086765 -0.00501521  0.00024313  0.00070486  0.02431426
 -0.00599582  0.00532303 -0.01018747 -0.00155986]
the gradients dWhy have size:  (93, 10)
a small sample is:
[[  2.12128361e-04  -5.73708201e-05   1.53852052e-05  -1.06090457e-04]
 [  2.12063107e-04  -5.73531718e-05   1.53804724e-05  -1.06057822e-04]
 [  2.11827876e-04  -5.72895529e-05   1.53634117e-05  -1.05940178e-04]
 [  2.12064589e-04  -5.73535728e-05   1.53805799e-05  -1.06058563e-04]]


In [90]:
# we computed [h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h_prev + bh))]; 
# Backprop into Wxh, x, Whh, h_prev, bh:
dh_before_tanh = (1-h**2)*dh
dbh = np.copy(dh_before_tanh)
dWxh = np.dot(dh_before_tanh, x.T)
dWhh = np.dot(dh_before_tanh, h.T)
dh_prev = np.dot(Whh.T, dh_before_tanh)
print 'small sample of Whh:'
print Whh[:4,:4]

small sample of Whh:
[[ 0.00013373 -0.01329709  0.01799412  0.01353991]
 [-0.00427217 -0.01052258  0.00820703  0.00452433]
 [-0.00511801 -0.0094128   0.00029513  0.0058532 ]
 [-0.00132356  0.00330526  0.00737467 -0.00426905]]


In [95]:
# every parameter (Wxh, Whh, Why, bh, by) now has a gradient, so take one
# plain gradient-descent step; the results go into new *2 names and the
# starting parameters are left untouched
learning_rate = 0.1
Wxh2, Whh2, Why2, bh2, by2 = (
    param - learning_rate * grad
    for param, grad in (
        (Wxh, dWxh),
        (Whh, dWhh),
        (Why, dWhy),
        (bh, dbh),
        (by, dby),
    )
)

In [96]:
# these parameters should be much better! lets try it out:
h2 = np.tanh(np.dot(Wxh2, x) + np.dot(Whh2, h_prev + bh2))
y2 = np.dot(Why2, h2) + by2
p2 = np.exp(y2) / np.sum(np.exp(y2))
print 'probability assigned to the correct next character was: ', p[ix_target,0]
print 'probability assigned to the correct next character is now: ', p2[ix_target,0]
loss2 = -np.log(p2[ix_target,0])
print 'the cross-entropy (softmax) loss was ', loss
print 'the loss is now ', loss2

probability assigned to the correct next character was:  0.0107565780746
probability assigned to the correct next character is now:  0.0118772830163
the cross-entropy (softmax) loss was  4.53223779763
the loss is now  4.43312769353


In [98]:
# note: the probability for the correct character went up! (and the loss went down)

In [101]:
# putting it together
def lossFun(inputs, targets, hprev):
    """
    Run the RNN forward over one sequence, then backpropagate through time.

    inputs,targets are both list of integers (character ids); targets[t] is
    the id the model should predict after reading inputs[t].
    hprev is Hx1 array of initial hidden state.
    The parameters Wxh, Whh, Why, bh, by and vocab_size are read from the
    enclosing notebook scope.

    returns the loss, gradients on model parameters, and last hidden state.
    The gradients are returned unclipped. For empty inputs the loss is 0,
    all gradients are zero and the returned hidden state is a copy of hprev.
    """
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
        y = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
        # Softmax is unchanged by subtracting a constant from every score;
        # shifting by the max keeps np.exp from overflowing on large scores.
        e = np.exp(y - np.max(y))
        ps[t] = e / np.sum(e) # probabilities for next chars
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)

    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[-1]) # hs[-1] always exists, even for empty inputs
    for t in reversed(range(len(inputs))):
        # gradient on the scores: probabilities minus 1 at the correct class
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1
        # output layer y = Why.h + by
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        # h[t] feeds both the output at step t and the hidden state at t+1
        dh = np.dot(Why.T, dy) + dhnext
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T) # Whh multiplied the previous hidden state
        dhnext = np.dot(Whh.T, dhraw) # gradient flowing into h[t-1]

    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [104]:
loss, dWxh, dWhh, dWhy, dbh, dby, hnew = lossFun(inputs, targets, h_prev)
print loss

113.313778376
