In [1]:
import keras
from __future__ import print_function
from functools import reduce
import re
import tarfile
import numpy as np
from keras.utils.data_utils import get_file
from keras.utils import to_categorical
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import *
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ReduceLROnPlateau
print(keras.__version__)

def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]

Using TensorFlow backend.


2.0.5


In [2]:
def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format
    If only_supporting is true,
    only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        #print(line)
        #line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        #print([nid,line])
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [ x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data


def get_stories(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file, retrieve the stories,
    and then convert the sentences into a single story.
    If max_length is supplied,
    any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    #print(data[0])
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [(flatten(story), q, answer) for story, q, answer in data if not max_length or len(flatten(story)) < max_length]
    #print(data)
    return data


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    xs = []
    xqsi = []
    xqso = []
    ys = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        x += [word_idx['\n']]
        xq = [word_idx[w] for w in query]
        y=word_idx[answer]
        xqi =[y] + xq
        xqo = xq  +[word_idx['\n']]
        xs.append(x)
        xqsi.append(xqi)
        xqso.append(xqo)
        ys.append(y)
    return pad_sequences(xs, maxlen=story_maxlen,padding='pre'), pad_sequences(xqsi, maxlen=query_maxlen,padding='post'),\
pad_sequences(xqso, maxlen=query_maxlen,padding='post'), np.array(ys)
BATCH_SIZE = 32
EPOCHS = 40

trainFile = open('./qa5_three-arg-relations_train.txt','r')
train = get_stories(trainFile)
test = get_stories(open('./qa5_three-arg-relations_test.txt'))

  return _compile(pattern, flags).split(string, maxsplit)


In [3]:
test[0]

(['Fred',
  'picked',
  'up',
  'the',
  'football',
  'there',
  '.',
  'Fred',
  'gave',
  'the',
  'football',
  'to',
  'Jeff',
  '.'],
 ['What', 'did', 'Fred', 'give', 'to', 'Jeff', '?'],
 'football')

In [4]:
vocab = set()
vocab |=set(('\t','\n'))
for story, q, answer in train + test:
    vocab |= set(story + q + [answer])
vocab = sorted(vocab)
print(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
inv_map = {v: k for k, v in word_idx.items()}
story_maxlen = max(map(len, (x for x, _, _ in train + test)))
query_maxlen = max(map(len, (x for _, x, _ in train + test)))+1
longestStory = np.argmax(list(map(len, (x for x, _, _ in train + test))))
print(longestStory)
x, xqi,xqoMid, y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
tx, txqi,txqoMid, ty = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)


['\t', '\n', '.', '?', 'Bill', 'Fred', 'Jeff', 'Mary', 'What', 'Who', 'apple', 'back', 'bathroom', 'bedroom', 'did', 'discarded', 'down', 'dropped', 'football', 'garden', 'gave', 'give', 'got', 'grabbed', 'hallway', 'handed', 'journeyed', 'kitchen', 'left', 'milk', 'moved', 'office', 'passed', 'picked', 'put', 'received', 'the', 'there', 'to', 'took', 'travelled', 'up', 'went']
364


In [5]:
print(xqoMid[0])
def token2OHE(xqo):
    temp = np.zeros((xqo.shape[0], 9, 44),dtype='float32')
    for i,idx in enumerate(xqo):
        #print(idx.shape)
        for u,v in enumerate(idx):
            temp [i,u, v] = 1.
    return temp

xqo=token2OHE(xqoMid)
txqo=token2OHE(txqoMid)
print(xqo[0])

[ 9 15  5 22 39  6  4  2  0]
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0

In [None]:
inv_map[0]='_'
checkIdx = 50
print([inv_map[i] for i in x[checkIdx] ])
print([inv_map[i] for i in xqi[checkIdx]])
print([inv_map[i] for i in xqoMid[checkIdx]])
print(xqo[checkIdx])
print(inv_map[y[checkIdx]] )
print(inv_map[np.argmax(xqo[checkIdx][0])] )

['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_',

In [None]:
print('vocab = {}'.format(vocab))
print('x.shape = {}'.format(x.shape))
print('xq.shape = {}'.format(xqi.shape))
print('xq.shape = {}'.format(xqo.shape))
print('y.shape = {}'.format(y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))
LSTM_SZ = 49

embeddingLayer =  Embedding(vocab_size, 80)
encoder_inputs = Input(shape=( story_maxlen ,)) 
print(encoder_inputs)
eInp =embeddingLayer(encoder_inputs)
print(eInp)
encoder = LSTM(LSTM_SZ ,return_state=True,unroll=False)
encoder_outputs, state_h, state_c = encoder(eInp)
encoder_states = [state_h, state_c]
decoder_inputs = Input(shape=(query_maxlen,),name='decoder_inputs')
dInp = embeddingLayer(decoder_inputs)
decoder_lstm = LSTM(LSTM_SZ , return_sequences=True, return_state=True,name='LSTM_01',unroll=True) 
decoder_outputs, _, _ = decoder_lstm(dInp , initial_state=encoder_states)
print(decoder_outputs)
decoder_dense =Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
print(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
reduce_LR = ReduceLROnPlateau(monitor='val_loss',factor = 0.9, patience=3,cooldown=2, min_lr = 0.00001)
model.fit([x, xqi], xqo,
          batch_size=256,
          epochs=10,
          callbacks=[reduce_LR],
          validation_split=0.1)


vocab = ['\t', '\n', '.', '?', 'Bill', 'Fred', 'Jeff', 'Mary', 'What', 'Who', 'apple', 'back', 'bathroom', 'bedroom', 'did', 'discarded', 'down', 'dropped', 'football', 'garden', 'gave', 'give', 'got', 'grabbed', 'hallway', 'handed', 'journeyed', 'kitchen', 'left', 'milk', 'moved', 'office', 'passed', 'picked', 'put', 'received', 'the', 'there', 'to', 'took', 'travelled', 'up', 'went']
x.shape = (1000, 627)
xq.shape = (1000, 9)
xq.shape = (1000, 9, 44)
y.shape = (1000,)
story_maxlen, query_maxlen = 627, 9
Tensor("input_1:0", shape=(?, 627), dtype=float32)
Tensor("embedding_1/Gather:0", shape=(?, 627, 80), dtype=float32)
Tensor("LSTM_01/transpose_1:0", shape=(?, 9, 49), dtype=float32)
Tensor("dense_1/truediv:0", shape=(?, 9, 44), dtype=float32)
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
decoder_inputs (InputLayer)      (None, 9)  

In [None]:
model.fit([x, xqi], xqo,
          batch_size=256,
          epochs=1,
          callbacks=[reduce_LR],
          validation_split=0)
model.evaluate([tx, txqi], txqo,
          batch_size=49)

In [None]:
encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(LSTM_SZ ,))
decoder_state_input_c = Input(shape=(LSTM_SZ ,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(dInp, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [None]:
inv_map[y[0]]

In [None]:
def decode_sequence(input_seq,yIn,inv_map):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros(( 1, query_maxlen))
    target_seq[ 0, 0] = yIn
    stop_condition = False #our stop condition will be '/n' which acts as <EOS>
    decoded_sentence = ''
    ct =0
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        print([inv_map[i] for i in target_seq[0] ])
        target_seq[0, ct+1] = np.argmax(output_tokens[0,ct, :])
        sampled_char = inv_map[target_seq[0, ct+1]]
        decoded_sentence += ' '+sampled_char
        if (ct>6):
            stop_condition = True
        ct+=1
        states_value = [h, c]
    return decoded_sentence,target_seq
testIdx =10
input_seq = np.expand_dims(tx[testIdx],axis=0)
yIn = ty[testIdx]
decoded_sentence,target_seq = decode_sequence(input_seq,yIn,inv_map)
print(decoded_sentence)
print([inv_map[i] for i in tx[testIdx] ])
print([inv_map[ty[testIdx]]])
print([inv_map[i] for i in txqi[testIdx] ])
print([inv_map[i] for i in target_seq[0] ])

In [None]:
inv_map[21]

In [None]:
inv_map[0]='_'

In [None]:
input_seq = np.expand_dims(x[3],axis=0)
states_value = encoder_model.predict(input_seq)
target_seq = np.zeros(( 1, query_maxlen))
target_seq[ 0, 0] = word_idx['\t']
print(target_seq.shape)
stop_condition = False
decoded_sentence = ' '
ct =0
while not stop_condition:
    print(ct)
    print( len(decoded_sentence))
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    sampled_token_index = np.argmax(output_tokens[0,ct, :])
    sampled_char = inv_map[sampled_token_index]
    print(sampled_char)
    decoded_sentence +=  sampled_char
    if (  len(decoded_sentence)+1 > query_maxlen):
        stop_condition = True
        print('stop' + str(len(decoded_sentence)))
    ct+=1
    # Update the target sequence (of length 1).
    target_seq[0, ct] =sampled_token_index

    # Update states
    states_value = [h, c]
print(target_seq)

In [None]:
print(decoded_sentence)
print(inv_map[1])

In [None]:
print(inv_map[00])

In [None]:


print(input_seq)
print(decoded_sentence)