In [27]:
import json
import os
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import variable_scope as vs
from tqdm import tqdm

In [28]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\glodp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Preprocessing Data

In [2]:
def load_json(filename):
    with open(filename) as data_file:
        data = json.load(data_file)
    return data

In [3]:
def tokenize_data(string):
    tokens = [token.replace("''", '" ').replace("``", '" ').lower() for token in nltk.word_tokenize(string)]
    return tokens

In [4]:
def get_Word_Index(context, context_tokens):
    result = ''
    current_word_index = 0
    wordMap = dict()

    for char_index, char in enumerate(context):
        if char !='\n' and char !=' ':
            result += char
            context_token = context_tokens[current_word_index]
            if result == context_token:
                start = char_index - len(result) + 1
                for char_position in range(start, char_index+1):
                    wordMap[char_position] = (result, current_word_index)
                result = ''
                current_word_index += 1
                
    if current_word_index != len(context_tokens):
        return None
    else:
        return wordMap

In [5]:
def preprocess(dataset, datatype, jsonDir):
    num_map_problem = 0
    num_token_problem = 0
    num_align_problem = 0
    num_examples = 0
    examples = []
    
    for eventID in tqdm(range(len(dataset['data']))):
        event_Paragraphs = dataset['data'][eventID]['paragraphs']
        for paragraphID in range(len(event_Paragraphs)):
            # Context Data
            context = event_Paragraphs[paragraphID]['context']
            context = context.replace("''", '" ').replace("``", '" ').lower()
            context_tokens = tokenize_data(context)
            
            wordIndex = get_Word_Index(context, context_tokens)
            if wordIndex is None:
                num_map_problem += len(event_Paragraphs[paragraphID]['qas'])
                continue
            
            # Question and Answer Data
            qaSet = event_Paragraphs[paragraphID]['qas']
            for qID in qaSet:
                question = qID['question']
                question = question.replace("''", '" ').replace("``", '" ').lower()
                question_tokens = tokenize_data(question)
                
                ans_text = qID['answers'][0]['text']
                ans_text = ans_text.lower()
                ans_start_index = qID['answers'][0]['answer_start']
                ans_end_index = ans_start_index + len(ans_text)
                
                if context[ans_start_index:ans_end_index] != ans_text:
                    num_align_problem += 1
                    continue
                
                ans_start_word = wordIndex[ans_start_index][1]
                ans_end_word = wordIndex[ans_end_index-1][1]
                ans_tokens = context_tokens[ans_start_word:ans_end_word+1]
                if("".join(ans_tokens) != "".join(ans_text.split())):
                    num_token_problem += 1
                    continue
                
                examples.append((" ".join(context_tokens), " ".join(question_tokens), " ".join(ans_tokens), " ".join([str(ans_start_word), str(ans_end_word)])))
                num_examples += 1
    
    # Creating files for context, questions, answers, and answer span indexes
    index = list(range(len(examples)))
    np.random.shuffle(index)
    with open(os.path.join(jsonDir, datatype +'.context'), 'w', encoding="utf-8") as context_file,  \
         open(os.path.join(jsonDir, datatype +'.question'), 'w', encoding="utf-8") as question_file,\
         open(os.path.join(jsonDir, datatype +'.answer'), 'w', encoding="utf-8") as answer_file, \
         open(os.path.join(jsonDir, datatype +'.span'), 'w', encoding="utf-8") as span_file:
        
        for i in index:
            (context, question, answer, span_index) = examples[i]
            context_file.write(context + '\n')
            question_file.write(question + '\n')
            answer_file.write(answer + '\n')
            span_file.write(span_index + '\n')
    
    # Returning results
    print ("Number of triples ignored due to token mapping problems: ", num_map_problem)
    print ("Number of triples ignored due to unalignment with tokenization problems: ", num_token_problem)
    print ("Number of triples ignored due to span alignment problems: ", num_align_problem)
    print ("Processed examples: %i out of %i" % (num_examples, num_examples+num_map_problem+num_token_problem+num_align_problem))

In [6]:
# Read data
jsonDir = "./dataset/"
dev_data = load_json(os.path.join(jsonDir,"dev-v1.1.json"))
train_data = load_json(os.path.join(jsonDir,"train-v1.1.json"))

In [7]:
preprocess(dev_data, "dev", jsonDir)

100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:04<00:00, 11.73it/s]


Number of triples ignored due to token mapping problems:  3212
Number of triples ignored due to unalignment with tokenization problems:  240
Number of triples ignored due to span alignment problems:  0
Processed examples: 7118 out of 10570


In [8]:
preprocess(train_data, "train", jsonDir)

100%|████████████████████████████████████████████████████████████████████████████████| 442/442 [00:35<00:00, 11.43it/s]


Number of triples ignored due to token mapping problems:  28669
Number of triples ignored due to unalignment with tokenization problems:  1760
Number of triples ignored due to span alignment problems:  7
Processed examples: 57163 out of 87599


## Network Layers

### Embedding Layer

### Highway Layer

In [9]:
def highway_Layer(x, size, activation, carry_bias=-1.0):
    W_T = tf.Variable(tf.truncated_normal([size, size], stddev=0.1), name="weight_transform")
    b_T = tf.Variable(tf.constant(carry_bias, shape=[size]), name="bias_transform")
    
    W = tf.Variable(tf.truncated_normal([size, size], stddev=0.1), name="weight")
    b = tf.Variable(tf.constant(0.1, shape=[size]), name="bias")

    T = tf.sigmoid(tf.matmul(x, W_T) + b_T, name="transform_gate")
    H = activation(tf.matmul(x, W) + b, name="activation")
    C = tf.sub(1.0, T, name="carry_gate")
    
    y = tf.add(tf.mul(H, T), tf.mul(x, C), "y")
    return y
## This uses tensor flow...

## Model

In [38]:
class mrcModel(object):
    def __init__(self, id2word, word2id, embed_matrix):
        self.id2word = id2word
        self.word2id = word2id
        with tf.variable_scope("QAModel", initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, uniform=True)):
            self.add_placeholders()
            self.add_embed_layer(embed_matrix)
            self.add_loss()
        
    def add_placeholders(self):
        # Add placeholders for the inputs
        self.context_ids = tf.placeholder(tf.int32, shape=[None, context_len])
        self.context_mask = tf.placeholder(tf.int32, shape=[None, context_len])
        self.question_ids = tf.placeholder(tf.int32, shape=[None, question_len])
        self.question_mask = tf.placeholder(tf.int32, shape=[None, question_len])
        self.answer_span = tf.placeholder(tf.int32, shape=[None, 2]) # The start and end index

        # Add a placeholder to feed in the keep probability (for dropout)
        self.keep_prob = tf.placeholder_with_default(1.0, shape=())
    
    def add_embed_layer(self, embed_matrix):
#         with vs.variable_scope("embedding"):
        with tf.variable_scope("embedding"):
            embedding_matrix = tf.constant(embed_matrix, dtype=tf.float32, name="embed_matrix")
            
            self.context_embed = embedding_ops.embedding_lookup(embedding_matrix, self.context_ids) # shape (batch_size, context_len, embedding_size)
            self.question_embed = embedding_ops.embedding_lookup(embedding_matrix, self.question_ids)
    
    def add_loss(self):
#         with vs.variable_scope("loss"):
        with tf.variable_scope("loss"):
            # Loss for start prediction
            loss_start = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_start, labels=self.answer_span[:, 0])
            self.loss_start = tf.reduce_mean(loss_start) # Average across batch

            # Loss for end prediction
            loss_end = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_end, labels=self.answer_span[:, 1])
            self.loss_end = tf.reduce_mean(loss_end) #Average across batch

            # Total loss
            self.loss = self.loss_start + self.loss_end

## Main Program

In [39]:
## Static variables
data_dir = "./dataset/"

# Hyperparameters
learning_rate = 0.001
batch_size = 60
hidden_size_encoder = 150
context_len = 300
question_len = 30
embed_size = 100

In [40]:
## Getting train and dev data
train_context = os.path.join(data_dir, "train.context")
train_questions = os.path.join(data_dir, "train.question")
train_ans_span = os.path.join(data_dir, "train.span")
dev_context = os.path.join(data_dir, "dev.context")
dev_questions = os.path.join(data_dir, "dev.question")
dev_ans_span = os.path.join(data_dir, "dev.span")

In [41]:
## Initalize Model
id2word, word2id, embed_matrix = None, None, []
mrcModel = mrcModel(id2word, word2id, embed_matrix)

AttributeError: 'mrcModel' object has no attribute 'logits_start'