In [None]:
import keras
from __future__ import print_function
from functools import reduce
import re
import tarfile
import numpy as np
from keras.utils.data_utils import get_file
from keras.utils import to_categorical
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import *
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ReduceLROnPlateau
print(keras.__version__)

def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    The sentence is split on runs of non-word characters. The separators are
    kept in the result (so punctuation survives), every piece is stripped of
    surrounding whitespace, and pieces that end up empty are dropped.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    # The capture group makes re.split return the separators as well.
    # The pattern must not be able to match the empty string: since
    # Python 3.7 re.split also splits on empty matches, which would cut the
    # sentence between every character and yield None for the unmatched group.
    pieces = (piece.strip() for piece in re.split(r'(\W+)', sent))
    return [piece for piece in pieces if piece]

In [None]:
def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format.

    Every non-blank line is "<id> <text>". An id of 1 starts a new story.
    If the text contains tab characters it is a question line of the form
    "question<TAB>answer<TAB>supporting ids"; otherwise it is a statement.

    Args:
        lines: iterable of lines, either str or utf-8 encoded bytes.
        only_supporting: if true, only the sentences that support the
            answer are kept; otherwise every statement seen so far in the
            current story is kept.

    Returns:
        A list of (substory, question_tokens, answer) tuples, one per
        question line. substory is a list of tokenized sentences and
        answer is the raw answer string.
    '''
    data = []
    story = []
    for line in lines:
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        # Drop only the line terminator; skip blank lines instead of
        # failing on the "<id> <text>" unpacking below.
        line = line.rstrip('\r\n')
        if not line.strip():
            continue
        nid, line = line.split(' ', 1)
        if int(nid) == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            if only_supporting:
                # Supporting ids are 1-based line ids within the story.
                substory = [story[int(i) - 1] for i in supporting.split()]
            else:
                # Every statement so far; question placeholders are skipped.
                substory = [sent for sent in story if sent]
            data.append((substory, q, a))
            # Placeholder so story[i - 1] keeps lining up with line ids.
            story.append('')
        else:
            story.append(tokenize(line))
    return data


def get_stories(f, only_supporting=False, max_length=None):
    '''Read stories from an open file and flatten each one.

    Args:
        f: an open file-like object; its readlines() output is handed to
            parse_stories.
        only_supporting: forwarded to parse_stories.
        max_length: if supplied (and truthy), stories whose flattened
            length is not strictly below max_length tokens are discarded.

    Returns:
        A list of (story_tokens, question_tokens, answer) tuples, where
        story_tokens is the concatenation of the story's tokenized
        sentences.
    '''
    stories = parse_stories(f.readlines(), only_supporting=only_supporting)
    data = []
    for story, q, answer in stories:
        # Flatten once per story; unlike reduce() without an initializer
        # this also works for a story that has no sentences.
        flat_story = [token for sentence in story for token in sentence]
        if not max_length or len(flat_story) < max_length:
            data.append((flat_story, q, answer))
    return data


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    '''Convert (story, query, answer) token triples into padded index arrays.

    Args:
        data: iterable of (story_tokens, query_tokens, answer) triples.
        word_idx: mapping from token to integer index; must contain the
            '\\t' and '\\n' delimiter tokens.
        story_maxlen: maxlen passed to pad_sequences for the stories.
        query_maxlen: maxlen passed to pad_sequences for both query arrays.

    Returns:
        A 4-tuple (xs, xqsi, xqso, ys):
          * xs   - story indices followed by the '\\n' index, post-padded.
          * xqsi - '\\t' index followed by the query indices, post-padded.
          * xqso - the xqsi sequence followed by the '\\n' index, post-padded.
          * ys   - array of answer indices.
    '''
    # NOTE(review): xqso begins with the same '\t' index as xqsi, so the two
    # sequences are not shifted relative to each other - confirm intended.
    xs, xqsi, xqso, ys = [], [], [], []
    newline_idx = word_idx['\n']
    tab_idx = word_idx['\t']
    for story, query, answer in data:
        xqi = [tab_idx] + [word_idx[w] for w in query]
        xs.append([word_idx[w] for w in story] + [newline_idx])
        xqsi.append(xqi)
        xqso.append(xqi + [newline_idx])
        ys.append(word_idx[answer])
    # All three sequence arrays are post-padded so that position t of xqso
    # lines up with position t of xqsi (pad_sequences pads at the front by
    # default).
    return (pad_sequences(xs, maxlen=story_maxlen, padding='post'),
            pad_sequences(xqsi, maxlen=query_maxlen, padding='post'),
            pad_sequences(xqso, maxlen=query_maxlen, padding='post'),
            np.array(ys))

# Hyper-parameter constants.
# NOTE(review): BATCH_SIZE and EPOCHS are not referenced by the fit calls
# visible in this notebook, which pass literal values - confirm.
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
SENT_HIDDEN_SIZE = 100
QUERY_HIDDEN_SIZE = 100
BATCH_SIZE = 32
EPOCHS = 40
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           SENT_HIDDEN_SIZE,
                                                           QUERY_HIDDEN_SIZE))

# Load the train and test stories. The context managers close both files
# once they have been read.
with open('./qa5_three-arg-relations_train.txt', 'r') as trainFile:
    train = get_stories(trainFile)
with open('./qa5_three-arg-relations_test.txt') as testFile:
    test = get_stories(testFile)

In [None]:
# Display the first entry of the parsed test set as the cell's output.
test[0]

In [None]:
# Collect every token that occurs in the train or test examples, plus the
# '\t' and '\n' delimiter tokens that vectorize_stories inserts.
all_examples = train + test
vocab = {'\t', '\n'}
for story, q, answer in all_examples:
    vocab.update(story, q, [answer])
vocab = sorted(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = {c: i + 1 for i, c in enumerate(vocab)}
inv_map = {v: k for k, v in word_idx.items()}

# vectorize_stories appends '\n' to every story and wraps every query as
# '\t' ... '\n'. The maximum lengths must include those extra tokens,
# otherwise pad_sequences truncates the longest sequences (from the front
# by default) and drops their leading tokens.
story_maxlen = max(len(s) for s, _, _ in all_examples) + 1
query_maxlen = max(len(qs) for _, qs, _ in all_examples) + 2

x, xqi, xqo, y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
tx, txqi, txqo, ty = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)

# Zero-initialised float32 array of shape
# (number of training examples, query_maxlen, vocab_size).
temp = np.zeros((x.shape[0], query_maxlen, vocab_size), dtype='float32')

def token2OHE(xqo, num_classes=None):
    '''One-hot encode an array of integer token indices.

    Args:
        xqo: array-like of integer indices, e.g. shape (examples, length).
        num_classes: size of the one-hot axis. Defaults to the module-level
            vocab_size when omitted.

    Returns:
        A new float32 array of shape xqo.shape + (num_classes,) that is 1.0
        at [..., index] and 0.0 elsewhere. Index 0 (padding) is encoded like
        any other index.

    Raises:
        IndexError: if an index is out of range for num_classes.
    '''
    if num_classes is None:
        num_classes = vocab_size
    tokens = np.asarray(xqo, dtype=int)
    # A fresh buffer is allocated on every call. Writing into one shared
    # array would make the results of successive calls alias each other
    # and accumulate each other's ones.
    one_hot = np.zeros(tokens.shape + (num_classes,), dtype='float32')
    np.put_along_axis(one_hot, tokens[..., np.newaxis], 1.0, axis=-1)
    return one_hot

# Replace the integer target sequences with what token2OHE returns for them
# (training targets first, then test targets).
xqo=token2OHE(xqo)
txqo=token2OHE(txqo)


In [None]:
# Sanity check on the second training example: map the story and query
# indices back to tokens (index 0 is skipped), then show the answer token.
for ids in (x[1], xqi[1]):
    print([inv_map[tok] for tok in ids if tok != 0])
print(inv_map[y[1]])

In [None]:
# Report the vocabulary and the shapes of the vectorised training arrays.
print('vocab = {}'.format(vocab))
print('x.shape = {}'.format(x.shape))
# NOTE(review): the next two lines both use the label 'xq.shape' - the first
# prints xqi.shape, the second xqo.shape.
print('xq.shape = {}'.format(xqi.shape))
print('xq.shape = {}'.format(xqo.shape))
print('y.shape = {}'.format(y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))

# NOTE(review): embeddingLayer is not referenced again in the visible code;
# the encoder and decoder below each create their own Embedding layer.
embeddingLayer =  Embedding(vocab_size, 80)

# Encoder: story token ids -> Embedding -> LSTM. Only the final hidden and
# cell states are kept (encoder_outputs is not used further below).
encoder_inputs = Input(shape=( story_maxlen ,)) 
print(encoder_inputs)
eInp = Embedding(vocab_size, 80)(encoder_inputs)
print(eInp)
encoder = LSTM(50,return_state=True)
encoder_outputs, state_h, state_c = encoder(eInp)
encoder_states = [state_h, state_c]

# Decoder: query token ids -> Embedding -> LSTM started from the encoder
# states; the per-timestep outputs go through a softmax over vocab_size.
decoder_inputs = Input(shape=(query_maxlen,),name='decoder_inputs')
dInp = Embedding(vocab_size, 80)(decoder_inputs)
decoder_lstm = LSTM(50, return_sequences=True, return_state=True,name='LSTM_01') 
decoder_outputs, _, _ = decoder_lstm(dInp , initial_state=encoder_states)
print(decoder_outputs)
decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)
print(decoder_outputs)

# Training model: inputs are [story ids, decoder input ids]; the output is
# the per-timestep softmax, trained against the one-hot xqo targets.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
# Multiply the learning rate by 0.9 when val_loss has not improved for
# 3 epochs, with a 2-epoch cooldown and a floor of 1e-5.
reduce_LR = ReduceLROnPlateau(monitor='val_loss',factor = 0.9, patience=3,cooldown=2, min_lr = 0.00001)
# validation_split=0.2 is what provides the val_loss monitored above.
model.fit([x, xqi], xqo,
          batch_size=128,
          epochs=50,
          callbacks=[reduce_LR],
          validation_split=0.2)


In [None]:
# Train for one additional epoch with a batch size of 49, then evaluate the
# model on the test arrays using the same batch size.
model.fit(
    x=[x, xqi],
    y=xqo,
    batch_size=49,
    epochs=1,
    callbacks=[reduce_LR],
    validation_split=0.2,
)
model.evaluate(x=[tx, txqi], y=txqo, batch_size=49)

In [None]:
# Inference-time models, assembled from the layers and tensors created when
# the training model was built.

# Encoder model: story token ids -> [state_h, state_c] of the encoder LSTM.
encoder_model = Model(encoder_inputs, encoder_states)
# Inputs through which the decoder LSTM's initial state is supplied at
# inference time; 50 matches the unit count the LSTM layers were created with.
decoder_state_input_h = Input(shape=(50,))
decoder_state_input_c = Input(shape=(50,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Call the existing decoder_lstm layer again on the embedded decoder input,
# this time starting from the externally supplied states.
# Note: this rebinds decoder_outputs, state_h and state_c.
decoder_outputs, state_h, state_c = decoder_lstm(dInp, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Decoder model: [decoder token ids, h, c] -> [per-timestep softmax, h, c].
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [None]:
def decode_sequence(input_seq):
    '''Greedily decode an output token sequence for one encoded story.

    Relies on the module-level encoder_model, decoder_model, query_maxlen,
    word_idx and inv_map.

    Args:
        input_seq: batch of size 1 with the story token ids, as accepted by
            encoder_model.predict.

    Returns:
        A string in which every decoded token is preceded by "-". Decoding
        stops after the '\\n' token (which is included in the result), when
        the padding index 0 is predicted, or after query_maxlen tokens.
    '''
    # Encode the story once. These states seed the decoder on every step.
    states_value = encoder_model.predict(input_seq)

    # Full-length decoder input: all padding except the '\t' start token.
    target_seq = np.zeros((1, query_maxlen))
    target_seq[0, 0] = word_idx['\t']

    decoded_sentence = ''
    for ct in range(query_maxlen):
        # The decoder re-reads the whole prefix from position 0 on each
        # call, so it must always start from the encoder states. Feeding
        # back the states it returns would apply the prefix twice.
        output_tokens, _, _ = decoder_model.predict([target_seq] + states_value)

        # Greedy choice for the current position.
        sampled_token_index = int(np.argmax(output_tokens[0, ct, :]))
        if sampled_token_index == 0:
            # Index 0 is reserved for padding and has no entry in inv_map.
            break
        sampled_char = inv_map[sampled_token_index]
        decoded_sentence += "-" + sampled_char

        # '\n' acts as the end-of-sequence token.
        if sampled_char == '\n':
            break

        # Feed the sampled token back as the next decoder input, if any
        # position is left.
        if ct + 1 < query_maxlen:
            target_seq[0, ct + 1] = sampled_token_index

    return decoded_sentence

# Decode the first training story. A leading batch axis of size 1 is added
# before the sequence is handed to decode_sequence.
input_seq = x[0][np.newaxis, ...]
print(input_seq.shape)
decoded_sentence = decode_sequence(input_seq)

In [None]:
 states_value = encoder_model.predict(input_seq)
target_seq = np.zeros(( 1, query_maxlen))
target_seq[ 0, 0] = word_idx['\t']
print(target_seq.shape)
stop_condition = False
decoded_sentence = ''
ct =0
while not stop_condition:
    print(ct)
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    sampled_token_index = np.argmax(output_tokens[0,ct, :])
    sampled_char = inv_map[sampled_token_index]
    print(sampled_char)
    decoded_sentence +=  sampled_char
    if (  len(decoded_sentence) > query_maxlen):
        stop_condition = True
        print( len(decoded_sentence))
    ct+=1
    # Update the target sequence (of length 1).
    target_seq[0, ct] =sampled_token_index

    # Update states
    states_value = [h, c]


In [None]:
# Print the current value of the module-level sampled_char, i.e. the last
# token string assigned by the cell-level decoding loop.
print(sampled_char)

In [None]:


# Show the encoder input next to the decoded string, one per line.
print(input_seq, decoded_sentence, sep='\n')