In [1]:
import keras
from __future__ import print_function
from functools import reduce
import re
import tarfile
import numpy as np
from keras.utils.data_utils import get_file
from keras.utils import to_categorical
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import *
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ReduceLROnPlateau


def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.
    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]

Using TensorFlow backend.


In [2]:
def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format
    If only_supporting is true,
    only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        #print(line)
        #line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        #print([nid,line])
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [ x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data


def get_stories(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file, retrieve the stories,
    and then convert the sentences into a single story.
    If max_length is supplied,
    any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    #print(data[0])
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [(flatten(story), q, answer) for story, q, answer in data if not max_length or len(flatten(story)) < max_length]
    #print(data)
    return data


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    xs = []
    xqsi = []
    xqso = []
    ys = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        x += [word_idx['\n']]
        xqi = [word_idx['\t']]
        xqi += [word_idx[w] for w in query]
        xqo = xqi +[word_idx['\n']]
        y=word_idx[answer]
        xs.append(x)
        xqsi.append(xqi)
        xqso.append(xqo)
        ys.append(y)
    return pad_sequences(xs, maxlen=story_maxlen), pad_sequences(xqsi, maxlen=query_maxlen),pad_sequences(xqso, maxlen=query_maxlen), np.array(ys)

RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 40
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           SENT_HIDDEN_SIZE,
                                                           QUERY_HIDDEN_SIZE))
trainFile = open('./qa5_three-arg-relations_train.txt','r')
#print(trainFile.readlines())
train = get_stories(trainFile)
test = get_stories(open('./qa5_three-arg-relations_test.txt'))

RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100


  return _compile(pattern, flags).split(string, maxsplit)


In [3]:
test[0]

(['Fred',
  'picked',
  'up',
  'the',
  'football',
  'there',
  '.',
  'Fred',
  'gave',
  'the',
  'football',
  'to',
  'Jeff',
  '.'],
 ['What', 'did', 'Fred', 'give', 'to', 'Jeff', '?'],
 'football')

In [21]:
vocab = set()
vocab |=set(('\t','\n'))
for story, q, answer in train + test:
    vocab |= set(story + q + [answer])
vocab = sorted(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
inv_map = {v: k for k, v in word_idx.items()}
story_maxlen = max(map(len, (x for x, _, _ in train + test)))
query_maxlen = max(map(len, (x for _, x, _ in train + test)))

x, xqi,xqo, y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
tx, txqi,txqo, ty = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)
temp = np.zeros((x.shape[0], query_maxlen, vocab_size),dtype='float32')
print(temp.shape)
for i,idx in enumerate(xqi):
    for u,v in enumerate(idx):
        temp [i,u, v] = 1.
print(temp[0])

(1000, 8, 44)
[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

In [6]:
print([inv_map[i] for i in x[1] if i !=0])
print([inv_map[i] for i in xqi[1] if i !=0])
print(inv_map[y[1]] )

['Bill', 'travelled', 'to', 'the', 'office', '.', 'Bill', 'picked', 'up', 'the', 'football', 'there', '.', 'Bill', 'went', 'to', 'the', 'bedroom', '.', 'Bill', 'gave', 'the', 'football', 'to', 'Fred', '.', 'Fred', 'handed', 'the', 'football', 'to', 'Bill', '.', 'Jeff', 'went', 'back', 'to', 'the', 'office', '.', '\n']
['\t', 'Who', 'received', 'the', 'football', '?']
Bill


In [25]:
print('vocab = {}'.format(vocab))
print('x.shape = {}'.format(x.shape))
print('xq.shape = {}'.format(xqi.shape))
print('xq.shape = {}'.format(xqo.shape))
print('y.shape = {}'.format(y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))

embeddingLayer =  Embedding(vocab_size, 80)

encoder_inputs = Input(shape=( story_maxlen ,)) 
print(encoder_inputs)
eInp = Embedding(vocab_size, 80)(encoder_inputs)
print(eInp)
encoder = LSTM(50,return_state=True)
encoder_outputs, state_h, state_c = encoder(eInp)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(query_maxlen,),name='decoder_inputs')
dInp = Embedding(vocab_size, 80)(decoder_inputs)
decoder_lstm = LSTM(50, return_sequences=True, return_state=True,name='LSTM_01') 
decoder_outputs, _, _ = decoder_lstm(dInp , initial_state=encoder_states)
print(decoder_outputs)
decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)
print(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
reduce_LR = ReduceLROnPlateau(monitor='val_loss',factor = 0.9, patience=3,cooldown=2, min_lr = 0.00001)
model.fit([x, xqi], temp,
          batch_size=49,
          epochs=100,
          callbacks=[reduce_LR],
          validation_split=0.2)

vocab = ['\t', '\n', '.', '?', 'Bill', 'Fred', 'Jeff', 'Mary', 'What', 'Who', 'apple', 'back', 'bathroom', 'bedroom', 'did', 'discarded', 'down', 'dropped', 'football', 'garden', 'gave', 'give', 'got', 'grabbed', 'hallway', 'handed', 'journeyed', 'kitchen', 'left', 'milk', 'moved', 'office', 'passed', 'picked', 'put', 'received', 'the', 'there', 'to', 'took', 'travelled', 'up', 'went']
x.shape = (1000, 627)
xq.shape = (1000, 8)
xq.shape = (1000, 8)
y.shape = (1000,)
story_maxlen, query_maxlen = 627, 8
Tensor("input_9:0", shape=(?, 627), dtype=float32)
Tensor("embedding_26/Gather:0", shape=(?, 627, 80), dtype=float32)
Tensor("LSTM_01_8/transpose_1:0", shape=(?, ?, 50), dtype=float32)
Tensor("time_distributed_8/Reshape_1:0", shape=(?, 8, 44), dtype=float32)
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_9 (InputLayer)            

KeyboardInterrupt: 