## Question Answering System

## Imports

In [7]:
import os
import tensorflow
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input, Activation, Dense, Permute, Dropout
from tensorflow.keras.layers import add, dot, concatenate
from tensorflow.keras.layers import LSTM, GRU
from tensorflow.keras.utils import get_file
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K

from functools import reduce
import tarfile
import numpy as np
import re

import IPython
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [8]:
class PreprocessData():
    '''Read bAbI-format task files and turn them into model-ready arrays.

    After ``get_data`` has run, the instance exposes ``vocab`` (sorted list
    of every token and answer seen), ``story_maxlen`` and ``query_maxlen``
    (longest story / question, measured in tokens).
    '''

    def tokenize(self, sent):
        '''Return the tokens of a sentence, including punctuation.

        Runs of non-word characters are kept as their own tokens once
        surrounding whitespace is stripped; whitespace-only pieces are
        dropped, e.g. 'Where is the apple?' gives
        ['Where', 'is', 'the', 'apple', '?'].
        '''
        # The separator must consume at least one character. The previous
        # optional pattern '(\W+)?' could match the empty string, which
        # Python 3.7+ also splits on (yielding None for the unmatched group).
        pieces = re.split(r'(\W+)', sent)
        return [piece.strip() for piece in pieces if piece.strip()]

    def parse_data(self, lines, only_supporting=False):
        '''Parse stories provided in the bAbi tasks format.

        Every line starts with a numeric id; an id of 1 starts a new story.
        A line containing tab characters is a question made of three
        tab-separated fields: question text, answer, supporting line ids.
        Any other line is a statement that is appended to the current story.

        If only_supporting is true, only the sentences that support the
        answer are kept; otherwise every statement seen so far in the
        story is provided.

        Returns a list of (substory, question_tokens, answer) tuples where
        substory is a list of token lists.
        '''
        data = []
        story = []
        for line in lines:
            line = line.strip()
            if not line:
                # Blank lines carry no id and would break the split below.
                continue
            nid, line = line.split(' ', 1)
            if int(nid) == 1:
                story = []
            if '\t' in line:
                q, a, supporting = line.split('\t')
                q = self.tokenize(q)
                if only_supporting:
                    # Supporting ids are 1-based line numbers in the story.
                    substory = [story[int(i) - 1] for i in supporting.split()]
                else:
                    # Skip the '' placeholders left behind by questions.
                    substory = [x for x in story if x]
                data.append((substory, q, a))
                # Placeholder keeps list positions aligned with line ids.
                story.append('')
            else:
                story.append(self.tokenize(line))
        return data

    def make_data(self, lines, only_supporting=False, max_length=None):
        '''Parse lines and flatten every story into a single token list.

        When max_length is truthy, samples whose flattened story has
        max_length tokens or more are dropped.

        Returns a list of (story_tokens, question_tokens, answer) tuples.
        '''
        samples = []
        for story, q, answer in self.parse_data(lines, only_supporting=only_supporting):
            # Flatten once per sample; an empty story simply flattens to [].
            flat_story = [token for sentence in story for token in sentence]
            if not max_length or len(flat_story) < max_length:
                samples.append((flat_story, q, answer))
        return samples

    def vectorize_stories(self, data, word_idx, story_maxlen, query_maxlen):
        '''Convert (story, query, answer) samples to padded index arrays.

        word_idx maps each token to its integer index. Stories and queries
        are padded with pad_sequences to story_maxlen / query_maxlen, and
        each answer becomes a one-hot vector of length len(word_idx) + 1.

        Returns a tuple (stories, queries, answers).
        Raises KeyError if a token or answer is missing from word_idx.
        '''
        X = []
        Xq = []
        Y = []
        for story, query, answer in data:
            X.append([word_idx[w] for w in story])
            Xq.append([word_idx[w] for w in query])
            # let's not forget that index 0 is reserved
            y = np.zeros(len(word_idx) + 1)
            y[word_idx[answer]] = 1
            Y.append(y)
        return (pad_sequences(X, maxlen=story_maxlen),
                pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))

    def get_data(self, path, type="qa1_single-supporting-fact"):
        '''Load the train and test files of one task from a directory.

        Reads <path>/<type>_train.txt and <path>/<type>_test.txt, then
        stores vocab, story_maxlen and query_maxlen on the instance.
        (The parameter name ``type`` shadows the builtin; it is kept so
        existing keyword callers continue to work.)

        Returns a tuple (train_data, test_data) as produced by make_data.
        Raises ValueError if the two files together contain no questions.
        '''
        train_path = os.path.join(path, type + '_train.txt')
        test_path = os.path.join(path, type + '_test.txt')
        with open(train_path, 'r') as fp:
            train_data = self.make_data(fp.read().splitlines())
        with open(test_path, 'r') as fp:
            test_data = self.make_data(fp.read().splitlines())

        # Build the combined sample list once instead of per statistic.
        all_data = train_data + test_data

        vocab = set()
        for story, q, answer in all_data:
            vocab |= set(story + q + [answer])
        self.vocab = sorted(vocab)

        self.story_maxlen = max(len(story) for story, _, _ in all_data)
        self.query_maxlen = max(len(q) for _, q, _ in all_data)
        return train_data, test_data



In [9]:
# Load both splits of the default task and derive the vocabulary lookups.
prp_obj = PreprocessData()
train_data, test_data = prp_obj.get_data("data/en-10k")
vocab = prp_obj.vocab

# Word indices start at 1, so index 0 never refers to a vocabulary word
# and the vocabulary size is one larger than the number of words.
word2idx = {word: position for position, word in enumerate(vocab, start=1)}
idx2word = {position: word for position, word in enumerate(vocab, start=1)}
vocab_size = len(vocab) + 1

# Longest story and question (in tokens) across train and test data.
story_maxlen = prp_obj.story_maxlen
query_maxlen = prp_obj.query_maxlen


  return _compile(pattern, flags).split(string, maxsplit)


In [10]:
# Turn the tokenized samples into padded index arrays plus one-hot answers,
# using the same vocabulary mapping and lengths for both splits.
inputs_train, queries_train, answers_train = prp_obj.vectorize_stories(
    train_data, word2idx, story_maxlen, query_maxlen)
inputs_test, queries_test, answers_test = prp_obj.vectorize_stories(
    test_data, word2idx, story_maxlen, query_maxlen)

In [13]:
# Training hyperparameters: number of epochs, mini-batch size and the
# number of units in the LSTM layer.
train_epochs, batch_size, lstm_size = 100, 32, 64

In [19]:
class CreateModel():
    '''Build, train and save the two-input (story, question) answer model.

    The methods read these module-level names: story_maxlen, query_maxlen,
    vocab_size, lstm_size, batch_size, train_epochs and the vectorized
    inputs_*/queries_*/answers_* arrays. Call build_model() before train().
    '''

    def build_model(self):
        '''Assemble and compile the Keras model, storing it in self.model.

        The intermediate tensors and the model summary are printed as a
        debugging aid.
        '''
        # placeholders
        input_sequence = Input((story_maxlen,))
        question = Input((query_maxlen,))

        print('Input sequence:', input_sequence)
        print('Question:', question)

        # encoders
        # embed the input sequence into a sequence of vectors
        input_encoder_m = Sequential()
        input_encoder_m.add(Embedding(input_dim=vocab_size,
                                      output_dim=64))
        input_encoder_m.add(Dropout(0.3))
        # output: (samples, story_maxlen, embedding_dim)

        # embed the input into a sequence of vectors of size query_maxlen
        # so it can later be added element-wise to the match matrix
        input_encoder_c = Sequential()
        input_encoder_c.add(Embedding(input_dim=vocab_size,
                                      output_dim=query_maxlen))
        input_encoder_c.add(Dropout(0.3))
        # output: (samples, story_maxlen, query_maxlen)

        # embed the question into a sequence of vectors
        question_encoder = Sequential()
        question_encoder.add(Embedding(input_dim=vocab_size,
                                       output_dim=64,
                                       input_length=query_maxlen))
        question_encoder.add(Dropout(0.3))
        # output: (samples, query_maxlen, embedding_dim)

        # encode input sequence and questions (which are indices)
        # to sequences of dense vectors
        input_encoded_m = input_encoder_m(input_sequence)
        print('Input encoded m', input_encoded_m)
        input_encoded_c = input_encoder_c(input_sequence)
        print('Input encoded c', input_encoded_c)
        question_encoded = question_encoder(question)
        print('Question encoded', question_encoded)

        # compute a 'match' between the first input vector sequence
        # and the question vector sequence
        # shape: (samples, story_maxlen, query_maxlen)
        match = dot([input_encoded_m, question_encoded], axes=(2, 2))
        print(match.shape)
        match = Activation('softmax')(match)
        print('Match shape', match)

        # add the match matrix with the second input vector sequence
        response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
        response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)
        print('Response shape', response)

        # concatenate the response vector with the question vector sequence
        answer = concatenate([response, question_encoded])
        print('Answer shape', answer)

        # reduce the sequence to a single vector of size lstm_size
        answer = LSTM(lstm_size)(answer)  # (samples, lstm_size)
        answer = Dropout(0.3)(answer)
        answer = Dense(vocab_size)(answer)  # (samples, vocab_size)
        # we output a probability distribution over the vocabulary
        answer = Activation('softmax')(answer)

        # build the final model
        self.model = Model([input_sequence, question], answer)
        self.model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                           metrics=['acc'])
        # summary() prints the table itself and returns None, so wrapping it
        # in print() would emit a stray "None" line.
        self.model.summary()

    def train(self):
        '''Fit self.model on the training arrays and save it to model.h5.

        The test arrays are used as validation data. Raises AttributeError
        if build_model() has not been called first.
        '''
        self.model.fit([inputs_train, queries_train], answers_train,
                       batch_size=batch_size,
                       epochs=train_epochs,
                       callbacks=None,
                       validation_data=([inputs_test, queries_test], answers_test))

        self.model.save('model.h5')

In [20]:
model_obj = CreateModel()
model_obj.build_model()
model_obj.train()

Input sequence: Tensor("input_9:0", shape=(None, 68), dtype=float32)
Question: Tensor("input_10:0", shape=(None, 4), dtype=float32)
Input encoded m Tensor("sequential_12/Identity:0", shape=(None, 68, 64), dtype=float32)
Input encoded c Tensor("sequential_13/Identity:0", shape=(None, 68, 4), dtype=float32)
Question encoded Tensor("sequential_14/Identity:0", shape=(None, 4, 64), dtype=float32)
(None, 68, 4)
Match shape Tensor("activation_7/Identity:0", shape=(None, 68, 4), dtype=float32)
Response shape Tensor("permute_4/Identity:0", shape=(None, 4, 68), dtype=float32)
Answer shape Tensor("concatenate_4/Identity:0", shape=(None, 4, 132), dtype=float32)
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 68)]         0                                            
____________________________

KeyboardInterrupt: 