## Question Answering System

## Imports

In [1]:
import os
import tensorflow
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input, Activation, Dense, Permute, Dropout
from tensorflow.keras.layers import add, dot, concatenate
from tensorflow.keras.layers import LSTM, GRU
from tensorflow.keras.utils import get_file
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K

from functools import reduce
import tarfile
import numpy as np
import re

import IPython
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [14]:
class PreprocessData():
    def tokenize(self, sent):
        return [ x.strip() for x in re.split('(\W+)?', sent) if x.strip()]
    
    def parse_data(self, lines, only_supporting=False):
        
        '''Parse stories provided in the bAbi tasks format
        If only_supporting is true, only the sentences
        that support the answer are kept.
        '''
        data = []
        story = []
        for line in lines:
            line = line.strip()
            nid, line = line.split(' ', 1)
            nid = int(nid)
            if nid == 1:
                story = []
            if '\t' in line:
                q, a, supporting = line.split('\t')
                q = self.tokenize(q)
                substory = None
                if only_supporting:
                    # Only select the related substory
                    supporting = map(int, supporting.split())
                    substory = [story[i - 1] for i in supporting]
                else:
                    # Provide all the substories
                    substory = [x for x in story if x]
                data.append((substory, q, a))
                story.append('')
            else:
                sent = self.tokenize(line)
                story.append(sent)
        return data
    
    def make_data(self, lines, only_supporting=False, max_length=None):
        data = self.parse_data(lines, only_supporting=only_supporting)
        flatten = lambda data: reduce(lambda x, y: x + y, data)
        data = [(flatten(story), q, answer) for story, q, answer in data if not max_length or len(flatten(story)) < max_length]
        return data
    
    
    def vectorize_stories(self, data, word_idx, story_maxlen, query_maxlen):
        X = []
        Xq = []
        Y = []
        for story, query, answer in data:
            x = [word_idx[w] for w in story]
            xq = [word_idx[w] for w in query]
            # let's not forget that index 0 is reserved
            y = np.zeros(len(word_idx) + 1)
            y[word_idx[answer]] = 1
            X.append(x)
            Xq.append(xq)
            Y.append(y)
        return (pad_sequences(X, maxlen=story_maxlen),
                pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))
    
    def get_data(self,path,type = "qa1_single-supporting-fact"):
        train_path = os.path.join(path,type+'_train.txt')
        test_path = os.path.join(path,type+'_test.txt')
        with open(train_path,'r')as fp:
            data = fp.read().splitlines()
            train_data = self.make_data(data)
        with open(test_path,'r') as fp:
            data = fp.read().splitlines()
            test_data = self.make_data(data)
            
        vocab = set()
        for story, q, answer in train_stories + test_stories:
            vocab |= set(story + q + [answer])
        vocab = sorted(vocab)

        # Reserve 0 for masking via pad_sequences
        self.vocab_size = len(vocab) + 1
        self.story_maxlen = max(map(len, (x for x, _, _ in train_data + test_data)))
        self.query_maxlen = max(map(len, (x for _, x, _ in train_data + test_data)))
        return train_data,test_data



In [15]:
prp_obj = PreprocessData()
train_data,test_data = prp_obj.get_data("data/en-10k")
vocab_size = prp_obj.vocab_size
story_maxlen = prp_obj.story_maxlen
query_maxlen = prp_obj.query_maxlen

  return _compile(pattern, flags).split(string, maxsplit)


NameError: name 'train_stories' is not defined