In [1]:
data = open('abc.txt', 'r').read().lower()
characters = list(set(data))
len(data), len(characters)

(18741, 58)

In [2]:
char_to_index = {ch:i for i,ch in enumerate(characters)}
index_to_char = {i:ch for i,ch in enumerate(characters)}

char_to_index, index_to_char

({'\t': 51,
  '\n': 32,
  ' ': 38,
  '!': 18,
  '"': 44,
  '#': 55,
  '$': 34,
  '%': 40,
  "'": 2,
  '(': 1,
  ')': 15,
  '*': 45,
  ',': 4,
  '-': 11,
  '.': 10,
  '/': 56,
  '0': 54,
  '1': 37,
  '2': 33,
  '3': 20,
  '4': 6,
  '5': 28,
  '6': 57,
  '7': 24,
  '8': 30,
  '9': 14,
  ':': 53,
  ';': 42,
  '=': 12,
  '>': 22,
  '?': 39,
  '@': 21,
  'a': 48,
  'b': 19,
  'c': 36,
  'd': 3,
  'e': 31,
  'f': 26,
  'g': 35,
  'h': 7,
  'i': 25,
  'j': 5,
  'k': 17,
  'l': 47,
  'm': 49,
  'n': 8,
  'o': 9,
  'p': 41,
  'q': 29,
  'r': 16,
  's': 50,
  't': 52,
  'u': 13,
  'v': 27,
  'w': 43,
  'x': 0,
  'y': 23,
  'z': 46},
 {0: 'x',
  1: '(',
  2: "'",
  3: 'd',
  4: ',',
  5: 'j',
  6: '4',
  7: 'h',
  8: 'n',
  9: 'o',
  10: '.',
  11: '-',
  12: '=',
  13: 'u',
  14: '9',
  15: ')',
  16: 'r',
  17: 'k',
  18: '!',
  19: 'b',
  20: '3',
  21: '@',
  22: '>',
  23: 'y',
  24: '7',
  25: 'i',
  26: 'f',
  27: 'v',
  28: '5',
  29: 'q',
  30: '8',
  31: 'e',
  32: '\n',
  33: '2',
  34

In [3]:
hidden_units = 100
learning_rate = 0.1
length_seq = 20
vocab_size = len(characters)

In [4]:
import numpy as np
Wxh = np.random.randn(hidden_units, vocab_size)* 0.01
Whh = np.random.randn(hidden_units, hidden_units)* 0.01
Why = np.random.randn(vocab_size, hidden_units)* 0.01
bh = np.zeros((hidden_units, 1))
by = np.zeros((vocab_size, 1))

In [5]:
inp = np.zeros((vocab_size, 1))
inp[char_to_index['a']] = 1

In [6]:
def next_char(current_char, h_prev):
    current_input = np.zeros((vocab_size, 1))
    current_input[char_to_index[current_char]] = 1
    hidden_output = np.tanh(bh + np.dot(Wxh, current_input) + np.dot(Whh, h_prev))
    output = np.dot(Why, hidden_output) + by
    prob = np.exp(output)/np.sum(np.exp(output))
    max_index = np.argmax(prob)
    output_char = index_to_char[max_index]
    return output_char, hidden_output
                  


In [7]:
def train(inputs, targets, h_prev):
    #h_prev = np.zeros((hidden_units, 1))
    xs, hs, os, ps = {},{},{},{}
    hs[-1] = np.copy(h_prev)
    loss = 0
    for i in range(len(inputs)):
        xs[i] = np.zeros((vocab_size, 1))
        xs[i][char_to_index[inputs[i]]] = 1
        hs[i] = np.tanh(bh + np.dot(Wxh, xs[i]) + np.dot(Whh, hs[i - 1]))
        os[i] = np.dot(Why, hs[i]) + by
        ps[i] = np.exp(os[i])/np.sum(np.exp(os[i]))
        loss += -np.log(ps[i][char_to_index[targets[i]], 0]) # TODO
    # backward pass: compute gradients going backwards    
    #initalize vectors for gradient values for each set of weights 
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[char_to_index[targets[t]]] -= 1 # backprop into y  
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h                                                                                                                                         
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity                                                                                                                     
        dbh += dhraw #derivative of hidden bias
        dWxh += np.dot(dhraw, xs[t].T) #derivative of input to hidden layer weight
        dWhh += np.dot(dhraw, hs[t-1].T) #derivative of hidden layer to hidden layer weight
        dhnext = np.dot(Whh.T, dhraw) 
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients                                                                                                                 
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [8]:
def generate_sentence(seed, n, h_prev):
    character = seed
    sentence = "" + seed
    for i in range(n):
        character, h_prev = next_char(character, h_prev)
        sentence += character
    print(sentence)

In [9]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad                                                                                                                
smooth_loss = -np.log(1.0/vocab_size)*length_seq # loss at iteration 0           
while n<=1000*40:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    # check "How to feed the loss function to see how this part works
    if p+length_seq+1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_units,1)) # reset RNN memory                                                                                                                                      
        p = 0 # go from start of data                                                                                                                                                             
    
    inputs = [ch for ch in data[p:p+length_seq]]
    targets = [ch for ch in data[p+1:p+length_seq+1]]

    # forward seq_length characters through the net and fetch gradient                                                                                                                          
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = train(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    # sample from the model now and then                                                                                                                                                        
    if n % 1000 == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
        #sample(hprev, inputs[0], 200)

    # perform parameter update with Adagrad                                                                                                                                                     
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        current_learning = learning_rate/np.sqrt(mem + 1e-8)
        param += -current_learning * dparam  # adagrad update                                                                                                                   

    p += length_seq # move data pointer                                                                                                                                                         
    n += 1 # iteration counter

iter 0, loss: 81.208856
iter 1000, loss: 66.875676
iter 2000, loss: 56.856521
iter 3000, loss: 52.874632
iter 4000, loss: 50.350039
iter 5000, loss: 48.650209
iter 6000, loss: 47.440236
iter 7000, loss: 46.236625
iter 8000, loss: 45.408042
iter 9000, loss: 45.642357
iter 10000, loss: 44.894843
iter 11000, loss: 44.180518
iter 12000, loss: 43.395132
iter 13000, loss: 42.638883
iter 14000, loss: 41.980876
iter 15000, loss: 41.417475
iter 16000, loss: 40.908004
iter 17000, loss: 40.651144
iter 18000, loss: 40.291458
iter 19000, loss: 39.975696
iter 20000, loss: 39.823343
iter 21000, loss: 39.472068
iter 22000, loss: 39.113851
iter 23000, loss: 38.788883
iter 24000, loss: 38.693271
iter 25000, loss: 38.332453
iter 26000, loss: 38.019409
iter 27000, loss: 37.697628
iter 28000, loss: 37.453395
iter 29000, loss: 37.187348
iter 30000, loss: 36.979780
iter 31000, loss: 36.844359
iter 32000, loss: 36.761309
iter 33000, loss: 36.642241
iter 34000, loss: 36.604001
iter 35000, loss: 36.583973
iter 

In [10]:
generate_sentence('a', 50, hprev)

an was sin the ing the fill the tion we his simes a
