In [1]:
import numpy as np

def softmax(x_):
    x = np.atleast_2d(x_)
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

word_vects = {}
word_vects['yankees'] = np.array([[0.,0.,0.]])
word_vects['bears'] = np.array([[0.,0.,0.]])
word_vects['braves'] = np.array([[0.,0.,0.]])
word_vects['red'] = np.array([[0.,0.,0.]])
word_vects['sox'] = np.array([[0.,0.,0.]])
word_vects['lose'] = np.array([[0.,0.,0.]])
word_vects['defeat'] = np.array([[0.,0.,0.]])
word_vects['beat'] = np.array([[0.,0.,0.]])
word_vects['tie'] = np.array([[0.,0.,0.]])

sent2output = np.random.rand(3,len(word_vects))
identity = np.eye(3)

sent2output

array([[0.26490035, 0.69912039, 0.48802165, 0.26454558, 0.12385591,
        0.04050989, 0.86721806, 0.24550038, 0.9223188 ],
       [0.27573219, 0.66564167, 0.27173791, 0.53797238, 0.13969189,
        0.37318559, 0.62779409, 0.36648545, 0.09926367],
       [0.73009871, 0.38656715, 0.46874977, 0.11378947, 0.60768867,
        0.44412683, 0.6609999 , 0.12365298, 0.31978151]])

Given the sentences: "red sox defeat" need to predict the next word 'yankees'

In [9]:
layer_0 = word_vects['red']
layer_1 = layer_0.dot(identity) + word_vects['sox']
layer_2 = layer_1.dot(identity) + word_vects['defeat']
print(layer_2.dot(sent2output))
pred = softmax(layer_2.dot(sent2output))
print(pred)

[[ 0.00445079 -0.00201443  0.0009575  -0.00189164  0.00446021  0.00272357
  -0.00027825 -0.00120441 -0.00210499]]
[[0.11154318 0.11082435 0.1111542  0.11083796 0.11154423 0.11135068
  0.11101693 0.11091416 0.11081431]]


In [3]:
y = np.array([1,0,0,0,0,0,0,0,0])
pred_delta = pred - y
print(f'pred_delta = {pred_delta}')

layer_2_delta = pred_delta.dot(sent2output.T)
print(f'layer_2_delta = {layer_2_delta}')

defeat_delta = layer_2_delta * 1
layer_1_delta = layer_2_delta.dot(identity.T)
sox_delta = layer_1_delta * 1
layer_0_delta = layer_1_delta.dot(identity.T)
alpha = 0.01
word_vects['red'] -= layer_0_delta * alpha
word_vects['sox'] -= sox_delta * alpha
word_vects['defeat'] -= defeat_delta * alpha
identity -= np.outer(layer_0,layer_1_delta) * alpha
identity -= np.outer(layer_1,layer_2_delta) * alpha
sent2output -= np.outer(layer_2,pred_delta) * alpha

pred_delta = [[-0.88888889  0.11111111  0.11111111  0.11111111  0.11111111  0.11111111
   0.11111111  0.11111111  0.11111111]]
layer_2_delta = [[ 0.17020977  0.0973239  -0.30171482]]


In [5]:
import sys

file = open('en/qa1_single-supporting-fact_train.txt','r')
raw = file.readlines()
file.close()

raw

['1 Mary moved to the bathroom.\n',
 '2 John went to the hallway.\n',
 '3 Where is Mary? \tbathroom\t1\n',
 '4 Daniel went back to the hallway.\n',
 '5 Sandra moved to the garden.\n',
 '6 Where is Daniel? \thallway\t4\n',
 '7 John moved to the office.\n',
 '8 Sandra journeyed to the bathroom.\n',
 '9 Where is Daniel? \thallway\t4\n',
 '10 Mary moved to the hallway.\n',
 '11 Daniel travelled to the office.\n',
 '12 Where is Daniel? \toffice\t11\n',
 '13 John went back to the garden.\n',
 '14 John moved to the bedroom.\n',
 '15 Where is Sandra? \tbathroom\t8\n',
 '1 Sandra travelled to the office.\n',
 '2 Sandra went to the bathroom.\n',
 '3 Where is Sandra? \tbathroom\t2\n',
 '4 Mary went to the bedroom.\n',
 '5 Daniel moved to the hallway.\n',
 '6 Where is Sandra? \tbathroom\t2\n',
 '7 John went to the garden.\n',
 '8 John travelled to the office.\n',
 '9 Where is Sandra? \tbathroom\t2\n',
 '10 Daniel journeyed to the bedroom.\n',
 '11 Daniel travelled to the hallway.\n',
 '12 Where is

In [12]:
tokens = list()

for line in raw[:10]:
    tokens.append(line.lower().replace("\n","").replace("\t","").replace("?","").split(" ")[1:])

tokens

[['mary', 'moved', 'to', 'the', 'bathroom.'],
 ['john', 'went', 'to', 'the', 'hallway.'],
 ['where', 'is', 'mary', 'bathroom1'],
 ['daniel', 'went', 'back', 'to', 'the', 'hallway.'],
 ['sandra', 'moved', 'to', 'the', 'garden.'],
 ['where', 'is', 'daniel', 'hallway4'],
 ['john', 'moved', 'to', 'the', 'office.'],
 ['sandra', 'journeyed', 'to', 'the', 'bathroom.'],
 ['where', 'is', 'daniel', 'hallway4'],
 ['mary', 'moved', 'to', 'the', 'hallway.']]

In [54]:
vocab = set()
for sent in tokens:
    for word in sent:
        vocab.add(word)

vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

def words2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])
        return idx

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)


In [63]:
import numpy as np
np.random.seed(1)
embed_size = 10
embed = (np.random.rand(len(vocab),embed_size) - 0.5) * 0.1

recurrent = np.eye(embed_size)
start = np.zeros(embed_size)
decoder = (np.random.rand(embed_size, len(vocab)) - 0.5) * 0.1
one_hot = np.eye(len(vocab))
one_hot.shape

(18, 18)

In [64]:
def predict(sent):
    layers = list()
    layer = {}
    layer['hidden'] = start
    
    layers.append(layer)

    loss = 0
    preds = list()
    for target_i in range(len(sent)):
        layer = {}
        layer['pred'] = softmax(layers[-1]['hidden'].dot(decoder))
        loss += -np.log(layer['pred'][sent[target_i]])
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embed[sent[target_i]]
        layers.append(layer)

    return layers, loss

In [73]:
#  sent = words2indices(tokens[iter%len(tokens)][1:])
len(tokens)

10

In [61]:
for iter in range(30000):
    alpha = 0.001
    sent = words2indices(tokens[iter%len(tokens)][1:])
    layers,loss = predict(sent)
    for layer_idx in reversed(range(len(layers))):
        layer = layers[layer_idx]
        target = sent[layer_idx-1]

        if(layer_idx > 0):
            layer['output_delta'] = layer['pred'] - one_hot[target]
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())

        if(layer_idx == len(layers)-1):
            layer['hidden_delta'] = new_hidden_delta
        else:
            layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())

    start -= layers[0]['hidden_delta'] * alpha / float(len(sent))
    for layer_idx,layer in enumerate(layers[1:]):
        decoder -= np.outer(layers[layer_idx]['hidden'],layer['output_delta']) * alpha / float(len(sent))

        embed_idx = sent[layer_idx]
        embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * alpha / float(len(sent))

        recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']) * alpha / float(len(sent))

        if(iter % 1000 == 0):
            print("Perplexity:" + str(np.exp(loss/len(sent))))

Perplexity:2.4993761491997253
Perplexity:2.4991475520264617
Perplexity:2.4989379688422546
Perplexity:2.4987451946982513
Perplexity:2.4985673523308733
Perplexity:2.4984028338194997
Perplexity:2.498250254194898
Perplexity:2.4981084142559427
Perplexity:2.497976270543961
Perplexity:2.4978529109267256
Perplexity:2.49773753461319
Perplexity:2.4976294356933573
Perplexity:2.497527989502166
Perplexity:2.4974326412605397
Perplexity:2.4973428965638553
Perplexity:2.497258313377983
Perplexity:2.497178495272474
Perplexity:2.497103085674083
Perplexity:2.497031762966265
Perplexity:2.496964236293189
Perplexity:2.496900241953293
Perplexity:2.4968395402881707
Perplexity:2.496781912989427
Perplexity:2.4967271607595163
Perplexity:2.4966751012736563
Perplexity:2.4966255673985334
Perplexity:2.4965784056309657
Perplexity:2.4965334747254904
Perplexity:2.4964906444847776
Perplexity:2.4964497946907636


In [74]:
sent_index = 4
l,_ = predict(words2indices(tokens[sent_index]))
print(tokens[sent_index])
for i,each_layer in enumerate(l[1:-1]):
    input = tokens[sent_index][i]
    true = tokens[sent_index][i+1]
    pred = vocab[each_layer['pred'].argmax()]
    print("Prev Input:" + input + (' ' * (12 - len(input))) + "True:" + true + (" " * (15 - len(true))) + "Pred:" + pred)

['sandra', 'moved', 'to', 'the', 'garden.']
