# RNN : Predict Next Word

## Test example

In [55]:
import numpy as np

def softmax(x_):
    x = np.atleast_2d(x_)
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

word_vects = {}
word_vects['yankees'] = np.array([[0.,0.,0.]])
word_vects['bears'] = np.array([[0.,0.,0.]])
word_vects['braves'] = np.array([[0.,0.,0.]])
word_vects['red'] = np.array([[0.,0.,0.]])
word_vects['sox'] = np.array([[0.,0.,0.]])
word_vects['lose'] = np.array([[0.,0.,0.]])
word_vects['defeat'] = np.array([[0.,0.,0.]])
word_vects['beat'] = np.array([[0.,0.,0.]])
word_vects['tie'] = np.array([[0.,0.,0.]])

sent2output = np.random.rand(3,len(word_vects))
identity = np.eye(3)

sent2output

array([[0.30024834, 0.14300583, 0.90130844, 0.54155938, 0.97474037,
        0.6366044 , 0.99391302, 0.5460708 , 0.52642593],
       [0.1354279 , 0.35570517, 0.02621857, 0.16039518, 0.74563719,
        0.03039969, 0.3665431 , 0.86234625, 0.69267772],
       [0.69094214, 0.1886368 , 0.44190428, 0.58157741, 0.98975171,
        0.20390623, 0.2477329 , 0.26217308, 0.75017241]])

Given the sentences: "red sox defeat" need to predict the next word 'yankees'

In [56]:
layer_0 = word_vects['red']
layer_1 = layer_0.dot(identity) + word_vects['sox']
layer_2 = layer_1.dot(identity) + word_vects['defeat']
print(layer_2.dot(sent2output))
pred = softmax(layer_2.dot(sent2output))
print(pred)

[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.11111111]]


In [57]:
y = np.array([1,0,0,0,0,0,0,0,0])
pred_delta = pred - y
print(f'pred_delta = {pred_delta}')

layer_2_delta = pred_delta.dot(sent2output.T)
print(f'layer_2_delta = {layer_2_delta}')

defeat_delta = layer_2_delta * 1
layer_1_delta = layer_2_delta.dot(identity.T)
sox_delta = layer_1_delta * 1
layer_0_delta = layer_1_delta.dot(identity.T)
alpha = 0.01
word_vects['red'] -= layer_0_delta * alpha
word_vects['sox'] -= sox_delta * alpha
word_vects['defeat'] -= defeat_delta * alpha
identity -= np.outer(layer_0,layer_1_delta) * alpha
identity -= np.outer(layer_1,layer_2_delta) * alpha
sent2output -= np.outer(layer_2,pred_delta) * alpha

pred_delta = [[-0.88888889  0.11111111  0.11111111  0.11111111  0.11111111  0.11111111
   0.11111111  0.11111111  0.11111111]]
layer_2_delta = [[ 0.31796016  0.23961107 -0.20685359]]


## Real example

In [58]:
import sys

file = open('en/qa1_single-supporting-fact_train.txt','r')
raw = file.readlines()
file.close()

raw

['1 Mary moved to the bathroom.\n',
 '2 John went to the hallway.\n',
 '3 Where is Mary? \tbathroom\t1\n',
 '4 Daniel went back to the hallway.\n',
 '5 Sandra moved to the garden.\n',
 '6 Where is Daniel? \thallway\t4\n',
 '7 John moved to the office.\n',
 '8 Sandra journeyed to the bathroom.\n',
 '9 Where is Daniel? \thallway\t4\n',
 '10 Mary moved to the hallway.\n',
 '11 Daniel travelled to the office.\n',
 '12 Where is Daniel? \toffice\t11\n',
 '13 John went back to the garden.\n',
 '14 John moved to the bedroom.\n',
 '15 Where is Sandra? \tbathroom\t8\n',
 '1 Sandra travelled to the office.\n',
 '2 Sandra went to the bathroom.\n',
 '3 Where is Sandra? \tbathroom\t2\n',
 '4 Mary went to the bedroom.\n',
 '5 Daniel moved to the hallway.\n',
 '6 Where is Sandra? \tbathroom\t2\n',
 '7 John went to the garden.\n',
 '8 John travelled to the office.\n',
 '9 Where is Sandra? \tbathroom\t2\n',
 '10 Daniel journeyed to the bedroom.\n',
 '11 Daniel travelled to the hallway.\n',
 '12 Where is

In [59]:
tokens = list()

for line in raw[:10]:
    tokens.append(line.lower().replace("\n","").replace("\t","").replace("?","").split(" ")[1:])

tokens

[['mary', 'moved', 'to', 'the', 'bathroom.'],
 ['john', 'went', 'to', 'the', 'hallway.'],
 ['where', 'is', 'mary', 'bathroom1'],
 ['daniel', 'went', 'back', 'to', 'the', 'hallway.'],
 ['sandra', 'moved', 'to', 'the', 'garden.'],
 ['where', 'is', 'daniel', 'hallway4'],
 ['john', 'moved', 'to', 'the', 'office.'],
 ['sandra', 'journeyed', 'to', 'the', 'bathroom.'],
 ['where', 'is', 'daniel', 'hallway4'],
 ['mary', 'moved', 'to', 'the', 'hallway.']]

In [60]:
vocab = set()
for sent in tokens:
    for word in sent:
        vocab.add(word)

vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

def words2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])
        return idx

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)


In [61]:
import numpy as np
np.random.seed(1)
embed_size = 10
embed = (np.random.rand(len(vocab),embed_size) - 0.5) * 0.1

recurrent = np.eye(embed_size)
start = np.zeros(embed_size)
decoder = (np.random.rand(embed_size, len(vocab)) - 0.5) * 0.1
one_hot = np.eye(len(vocab))
one_hot.shape

(18, 18)

In [62]:
def predict(sent):
    layers = list()
    layer = {}
    layer['hidden'] = start
    
    layers.append(layer)

    loss = 0
    preds = list()
    for target_i in range(len(sent)):
        layer = {}
        layer['pred'] = softmax(layers[-1]['hidden'].dot(decoder))
        loss += -np.log(layer['pred'][sent[target_i]])
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embed[sent[target_i]]
        layers.append(layer)

    return layers, loss

In [63]:
#  sent = words2indices(tokens[iter%len(tokens)][1:])
len(tokens)

10

In [64]:
for iter in range(30000):
    alpha = 0.001
    sent = words2indices(tokens[iter%len(tokens)][1:])
    layers,loss = predict(sent)
    for layer_idx in reversed(range(len(layers))):
        layer = layers[layer_idx]
        target = sent[layer_idx-1]

        if(layer_idx > 0):
            layer['output_delta'] = layer['pred'] - one_hot[target]
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())

        if(layer_idx == len(layers)-1):
            layer['hidden_delta'] = new_hidden_delta
        else:
            layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())
    
    start -= layers[0]['hidden_delta'] * alpha / float(len(sent))
    for layer_idx,layer in enumerate(layers[1:]):
        
        decoder -= np.outer(layers[layer_idx]['hidden'],layer['output_delta']) * alpha / float(len(sent))

        embed_idx = sent[layer_idx]
        embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * alpha / float(len(sent))

        recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']) * alpha / float(len(sent))

        if(iter % 1000 == 0):
            print("Perplexity:" + str(np.exp(loss/len(sent))))

Perplexity:18.000000000000004


Perplexity:17.8827020220545
Perplexity:17.420274874808523
Perplexity:15.585621719807442
Perplexity:10.172836771584652
Perplexity:3.855983806950532
Perplexity:2.5886737266250273
Perplexity:2.5381896839964826
Perplexity:2.534116961471557
Perplexity:2.529970840006578
Perplexity:2.525637062291749
Perplexity:2.5210823974774565
Perplexity:2.517058362965171
Perplexity:2.5138110847218837
Perplexity:2.5112523214219737
Perplexity:2.509225587619763
Perplexity:2.5075967758236892
Perplexity:2.506266467020901
Perplexity:2.5051632618737023
Perplexity:2.504235738745952
Perplexity:2.503446357900584
Perplexity:2.5027672625577253
Perplexity:2.5021774397015477
Perplexity:2.501660790606871
Perplexity:2.5012048026190614
Perplexity:2.500799620745197
Perplexity:2.500437388462531
Perplexity:2.5001117724964073
Perplexity:2.4998176152351297
Perplexity:2.4995506770254323


In [65]:
sent_index = 5
l,_ = predict(words2indices(tokens[sent_index]))
print(tokens[sent_index])

idx = list()
for word in tokens[sent_index]:
    idx.append(word2index[word])

print(idx)

['where', 'is', 'daniel', 'hallway4']
[11, 7, 1, 6]


In [66]:
for i,each_layer in enumerate(l[1:-1]):
    input = tokens[sent_index][i]
    true = tokens[sent_index][i+1]
    pred = vocab[each_layer['pred'].argmax()]
    print("Prev Input:" + input + (' ' * (12 - len(input))) + "True:" + true + (" " * (15 - len(true))) + "Pred:" + pred)