In [None]:
#@title Download nltk data
import nltk
# Fetch the 'punkt' tokenizer models up front; presumably these are what the
# nltk.word_tokenize calls in tokenize() further down rely on -- TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
#@title Necessary Imports
import argparse
import linecache
import numpy as np
import tensorflow as tf
import pandas as pd
import json
import nltk
import random
from tqdm import tqdm
from collections import Counter
from six.moves.urllib.request import urlretrieve
import imp
import os
import sys
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gzip
import re
import tarfile
import argparse
from os.path import join as pjoin
from six.moves import urllib
import tensorflow.compat.v1.gfile as gfile
from tqdm import tqdm
import zipfile
import argparse

# Data Preparation 

In [None]:
def reporthook(t):
  """Build a progress callback that forwards download progress to ``t``.

  Adapted from the tqdm examples: https://github.com/tqdm/tqdm

  Args:
    t: progress-bar-like object exposing ``update(n)`` and a writable
      ``total`` attribute.

  Returns:
    A callable ``inner(b=1, bsize=1, tsize=None)`` where ``b`` is the number
    of blocks transferred so far, ``bsize`` the size of one block and
    ``tsize`` the total size (or None when unknown).
  """
  blocks_so_far = 0

  def inner(b=1, bsize=1, tsize=None):
    nonlocal blocks_so_far
    # Propagate the total as soon as the caller knows it.
    if tsize is not None:
      t.total = tsize
    # Only the blocks received since the previous call are new progress.
    t.update((b - blocks_so_far) * bsize)
    blocks_so_far = b

  return inner

def maybe_download(url, filename, prefix, num_bytes=None):
  """Download ``url + filename`` into ``prefix`` unless it is already there.

  Args:
    url: Base URL; ``filename`` is appended to it verbatim.
    filename: Name of the remote file, also used for the local copy.
    prefix: Local directory that receives the file.
    num_bytes: Optional expected size of the file in bytes. A mismatch is
      reported with a warning only, so a changed upstream file does not
      abort the notebook.

  Returns:
    The local path of the file, whether it was downloaded by this call or
    was already present (previously ``None`` was returned in the latter case).
  """
  local_filename = os.path.join(prefix, filename)

  if not os.path.exists(local_filename):
    print("Downloading file {}...".format(url + filename))
    with tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t:
      local_filename, _ = urlretrieve(url + filename, local_filename, reporthook=reporthook(t))

  if num_bytes is not None:
    actual_bytes = os.path.getsize(local_filename)
    if actual_bytes != num_bytes:
      # A different size usually means a truncated or updated download.
      print("Warning: {} is {} bytes but {} bytes were expected".format(
          local_filename, actual_bytes, num_bytes))

  return local_filename

In [None]:
def data_from_json(filename):
  """Parse the JSON document stored at ``filename`` and return the result."""
  with open(filename) as json_handle:
    return json.load(json_handle)

def tokenize(sequence):
  """Tokenize ``sequence`` with nltk and normalise quote markers.

  Every `` or '' found inside a token returned by nltk.word_tokenize is
  replaced by a plain double quote ("). A new list of tokens is returned.
  """
  normalised = []
  for raw_token in nltk.word_tokenize(sequence):
    normalised.append(raw_token.replace("``", '"').replace("''", '"'))
  return normalised

def token_idx_map(context, context_tokens):
  """Map character offsets in ``context`` to the tokens that start there.

  Non-space characters of ``context`` are accumulated until they spell the
  next entry of ``context_tokens``; the offset of the first accumulated
  character is then recorded for that token.

  Args:
    context: The original text.
    context_tokens: Tokens of ``context`` in order of appearance.

  Returns:
    Dict ``{start_offset: [token_text, token_index]}``. Tokens that can never
    be matched (and everything after them) are simply absent from the map.
  """
  acc = ''
  current_token_idx = 0
  num_tokens = len(context_tokens)
  token_map = dict()
  for char_idx, char in enumerate(context):
    if current_token_idx >= num_tokens:
      # All tokens are placed; trailing characters used to trigger an
      # IndexError on context_tokens[current_token_idx].
      break
    if char != ' ':
      acc += char
      context_token = str(context_tokens[current_token_idx])
      if acc == context_token:
        # char_idx is the last character of the token, so step back to its start.
        syn_start = char_idx - len(acc) + 1
        token_map[syn_start] = [acc, current_token_idx]
        acc = ''
        current_token_idx += 1
  return token_map

def read_write_dataset(dataset, tier, prefix):
    """Flatten a SQuAD-style dataset into four line-aligned text files.

    For every question, the first listed answer is located in the tokenized
    context and one line is appended to each of ``<tier>.context``,
    ``<tier>.question``, ``<tier>.answer`` and ``<tier>.span`` inside
    ``prefix``. The span line holds the token indices of the first and last
    answer token. Answers whose character offsets do not coincide with a
    token boundary are counted as skipped and produce no output lines.

    Args:
        dataset: Parsed JSON with the layout
            ``dataset['data'][i]['paragraphs'][j]`` -> ``{'context', 'qas'}``
            and ``qas[k]`` -> ``{'question', 'answers': [{'text', 'answer_start'}]}``.
        tier: Name used as the stem of the output files.
        prefix: Existing directory that receives the output files.

    Returns:
        Tuple ``(number_of_questions, number_of_answers_considered)``.
    """
    qn, an = 0, 0
    skipped = 0

    # UTF-8 is written explicitly because the tokenizer applied to these
    # files later in the notebook decodes them as UTF-8.
    with open(os.path.join(prefix, tier + '.context'), 'w', encoding='utf-8') as context_file, \
         open(os.path.join(prefix, tier + '.question'), 'w', encoding='utf-8') as question_file, \
         open(os.path.join(prefix, tier + '.answer'), 'w', encoding='utf-8') as text_file, \
         open(os.path.join(prefix, tier + '.span'), 'w', encoding='utf-8') as span_file:

        for article in tqdm(dataset['data'], desc="Preprocessing {}".format(tier)):
            for paragraph in article['paragraphs']:
                context = paragraph['context']

                # Both replacements keep the string length unchanged, so the
                # answer_start offsets from the dataset stay valid.
                context = context.replace("''", '" ')
                context = context.replace("``", '" ')

                context_tokens = tokenize(context)
                answer_map = token_idx_map(context, context_tokens)
                context_line = ' '.join(context_tokens) + '\n'

                for qa in paragraph['qas']:
                    question_tokens = tokenize(qa['question'])
                    qn += 1

                    # Only the first answer of each question is used.
                    for answer in qa['answers'][:1]:
                        an += 1

                        text = answer['text']
                        answer_start = answer['answer_start']
                        text_tokens = tokenize(text)

                        if not text_tokens:
                            # An empty answer has no last token to anchor the span on.
                            skipped += 1
                            continue

                        answer_end = answer_start + len(text)
                        # Length of the final answer token; subtracting it from
                        # answer_end gives the offset at which that token starts.
                        last_word_answer = len(text_tokens[-1])

                        try:
                            a_start_idx = answer_map[answer_start][1]
                            a_end_idx = answer_map[answer_end - last_word_answer][1]
                        except KeyError:
                            # The answer does not start/end on a token boundary.
                            skipped += 1
                            continue

                        # Writes are outside the try block so that an I/O or
                        # encoding error surfaces instead of silently leaving
                        # the four files misaligned.
                        context_file.write(context_line)
                        question_file.write(' '.join(question_tokens) + '\n')
                        text_file.write(' '.join(text_tokens) + '\n')
                        span_file.write(' '.join([str(a_start_idx), str(a_end_idx)]) + '\n')

    print("Skipped {} question/answer pairs in {}".format(skipped, tier))
    return qn, an

def save_files(prefix, tier, indices):
  """Copy selected examples of the 'train' files into the ``tier`` files.

  For each of the four parallel files (``.context``, ``.question``,
  ``.answer``, ``.span``) the lines of ``train<suffix>`` selected by
  ``indices`` are written, in the given order, to ``<tier><suffix>``.

  Args:
    prefix: Directory holding the ``train.*`` files; outputs go there too.
    tier: Stem of the output files. May be ``'train'``: each source file is
      read completely before its target is opened for writing.
    indices: Iterable of 0-based line numbers (as produced by
      ``range(num_lines)``). The previous linecache-based lookup was 1-based,
      which wrote nothing for index 0 and never selected the last line.

  Raises:
    IndexError: If an index falls outside the source file.
  """
  indices = list(indices)  # reused for all four files

  for suffix in ('.context', '.question', '.answer', '.span'):
    source_path = os.path.join(prefix, 'train' + suffix)
    # Load the source first: opening the target with 'w' truncates it, and
    # for tier == 'train' source and target are the same file.
    with open(source_path, encoding='utf-8') as source_file:
      source_lines = source_file.readlines()

    selected = []
    for i in indices:
      if not 0 <= i < len(source_lines):
        raise IndexError("line index {} out of range for {} ({} lines)".format(
            i, source_path, len(source_lines)))
      line = source_lines[i]
      # A final line without a newline would otherwise merge with the next one.
      selected.append(line if line.endswith('\n') else line + '\n')

    with open(os.path.join(prefix, tier + suffix), 'w', encoding='utf-8') as target_file:
      target_file.writelines(selected)


def split_tier(prefix, train_percentage = 0.9, shuffle=False):
  """Split the 'train' files in ``prefix`` into 'train' and 'val' parts.

  The number of lines of ``train.context`` determines the index range
  ``0 .. num_lines - 1``. The first ``int(num_lines * train_percentage)``
  indices are handed to ``save_files`` for the 'train' tier and the rest for
  the 'val' tier; the 'val' tier is saved first. With ``shuffle=True`` each
  index list is shuffled in place with ``np.random.shuffle`` before saving.
  """
  context_filename = os.path.join(prefix, 'train' + '.context')

  # Count the lines without keeping them in memory.
  num_lines = 0
  with open(context_filename) as current_file:
    for _ in current_file:
      num_lines += 1

  boundary = int(num_lines * train_percentage)
  all_indices = list(range(num_lines))
  indices_dev = all_indices[boundary:]
  indices_train = all_indices[:boundary]

  if shuffle:
    np.random.shuffle(indices_dev)
    print("Shuffling...")
  save_files(prefix, 'val', indices_dev)

  if shuffle:
    np.random.shuffle(indices_train)
  save_files(prefix, 'train', indices_train)


In [None]:
# Seed both random number generators so the shuffles below are repeatable.
random.seed(42)
np.random.seed(42)

squad_base_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"

if __name__ == "__main__":

  # Raw downloads and preprocessed files live in separate directory trees.
  download_prefix = os.path.join("download", "squad")
  data_prefix = os.path.join("data", "squad")

  print("Downloading datasets into {}".format(download_prefix))
  print("Preprocessing datasets into {}".format(data_prefix))

  for required_dir in (download_prefix, data_prefix):
    if not os.path.exists(required_dir):
      os.makedirs(required_dir)

  train_filename = "train-v1.1.json"
  dev_filename = "dev-v1.1.json"

  # --- train: download, flatten to text files, then carve out a validation set
  maybe_download(squad_base_url, train_filename, download_prefix, 30288272)

  train_json_path = os.path.join(download_prefix, train_filename)
  train_data = data_from_json(train_json_path)

  train_num_questions, train_num_answers = read_write_dataset(
      train_data, 'train', data_prefix)

  # split_tier uses its default 90/10 train/validation ratio and shuffles
  # both parts.
  print("Splitting the dataset into train and validation")
  split_tier(data_prefix, shuffle=True)

  print("Processed {} questions and {} answers in train".format(train_num_questions, train_num_answers))

  # --- dev: download and flatten, no further splitting
  print("Downloading {}".format(dev_filename))
  dev_dataset = maybe_download(squad_base_url, dev_filename, download_prefix, 4854279)

  # The dev file lists several answers per question, but read_write_dataset
  # only uses the first one, so the two printed counts are equal.
  dev_json_path = os.path.join(download_prefix, dev_filename)
  dev_data = data_from_json(dev_json_path)
  # list_topics(dev_data)
  dev_num_questions, dev_num_answers = read_write_dataset(
      dev_data, 'dev', data_prefix)
  print("Processed {} questions and {} answers in dev".format(dev_num_questions, dev_num_answers))

train-v1.1.json: 0.00B [00:00, ?B/s]

Downloading datasets into download/squad
Preprocessing datasets into data/squad
Downloading file https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json...


train-v1.1.json: 30.3MB [00:01, 28.7MB/s]                           
Preprocessing train: 100%|██████████| 442/442 [00:48<00:00,  9.17it/s]


Skipped 763 question/answer pairs in train
Splitting the dataset into train and validation
Shuffling...


dev-v1.1.json: 0.00B [00:00, ?B/s]

Processed 87599 questions and 87599 answers in train
Downloading dev-v1.1.json
Downloading file https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json...


dev-v1.1.json: 4.86MB [00:00, 9.18MB/s]                            
Preprocessing dev: 100%|██████████| 48/48 [00:05<00:00,  8.49it/s]

Skipped 177 question/answer pairs in dev
Processed 10570 questions and 10570 answers in dev





In [None]:
# Move the preprocessed data directory into the "Deep Data" folder under
# /content/drive/My Drive (presumably a mounted Google Drive -- the mount
# itself is not shown in this notebook).
!mv /content/data /content/drive/My\ Drive/Deep\ Data

In [None]:
# Move the raw download directory into the same "Deep Data" folder.
# NOTE(review): after this, /content/download no longer exists, so later cells
# that use the relative path "download/..." start from an empty tree (assuming
# the working directory is /content -- confirm).
!mv /content/download /content/drive/My\ Drive/Deep\ Data

In [None]:

if __name__ == '__main__':

  glove_base_url = "http://nlp.stanford.edu/data/"
  glove_filename = "glove.6B.zip"

  prefix = os.path.join("download", "dwr")

  print("Storing datasets in {}".format(prefix))

  if not os.path.exists(prefix):
    os.makedirs(prefix)

  # The download must not depend on whether the directory had to be created:
  # maybe_download already skips the transfer when the archive is present,
  # and an existing but empty directory would otherwise leave nothing to unzip.
  glove_zip = maybe_download(glove_base_url, glove_filename, prefix, 862182613)

  # The context manager closes the archive even if extraction fails.
  with zipfile.ZipFile(os.path.join(prefix, glove_filename), 'r') as glove_zip_ref:
    glove_zip_ref.extractall(prefix)

Storing datasets in download/dwr


In [None]:
# Move the downloaded and extracted GloVe files ("dwr") next to the SQuAD
# downloads that were moved to the "Deep Data" folder earlier.
!mv /content/download/dwr /content/drive/My\ Drive/Deep\ Data/download

# Creating id file using nltk tokenizer

In [None]:

# Special tokens placed at the head of the vocabulary file by
# create_vocabulary(); presumably padding, start-of-sequence and
# unknown-word markers.
_PAD = "<pad>"
_SOS = "<sos>"
_UNK = "<unk>"
_START_VOCAB = [_PAD, _SOS, _UNK]

# Ids of the special tokens. They equal the positions in _START_VOCAB because
# initialize_vocabulary() numbers the vocabulary lines in file order.
PAD_ID = 0
SOS_ID = 1
UNK_ID = 2

def tokenize(sequence):
  """Tokenize ``sequence`` with nltk, turning `` and '' into plain quotes.

  NOTE(review): this repeats the tokenize() already defined in the data
  preparation cell; the later definition shadows the earlier one.
  """
  def _plain_quotes(token):
    # nltk-style opening/closing quote markers become a double quote (").
    return token.replace("``", '"').replace("''", '"')

  return list(map(_plain_quotes, nltk.word_tokenize(sequence)))

def invert_map(answer_map):
  """Re-key an offset map by its second value element.

  Given ``{key: [first, second, ...]}`` this returns
  ``{second: [first, key]}``. For maps produced by token_idx_map that is
  ``{token_index: [token_text, start_offset]}``.
  """
  inverted = {}
  for start_offset, entry in answer_map.items():
    inverted[entry[1]] = [entry[0], start_offset]
  return inverted

def basic_tokenizer(sentence):
  """Split an already-tokenized line on whitespace and return text tokens.

  Args:
    sentence: One line of UTF-8 encoded ``bytes`` (as read from a file opened
      in binary mode) or a ``str``. Previously a ``str`` raised
      AttributeError because of the unconditional ``.decode``.

  Returns:
    List of non-empty ``str`` tokens.
  """
  if isinstance(sentence, str):
    # Go through bytes so that str and bytes inputs are split identically
    # (bytes.split only breaks on ASCII whitespace).
    sentence = sentence.encode('utf-8')
  # The fragments contain no spaces after split(), so the former
  # re.split(" ", ...) on each fragment was a no-op and is dropped.
  return [fragment.decode('utf-8') for fragment in sentence.strip().split()]

def initialize_vocabulary(vocabulary_path):
  """Load a vocabulary file that holds one token per line.

  Returns:
    Tuple ``(vocab, rev_vocab)``: ``rev_vocab`` is the list of lines with
    newline characters stripped from both ends, and ``vocab`` maps each
    token to its position in that list.
  """
  with gfile.GFile(vocabulary_path, mode="r") as f:
    raw_lines = f.readlines()

  rev_vocab = []
  for raw_line in raw_lines:
    rev_vocab.append(raw_line.strip('\n'))

  vocab = {token: token_id for token_id, token in enumerate(rev_vocab)}
  return vocab, rev_vocab
    

def process_glove(glove_dir, glove_dim, vocab_list, save_path, size=4e5, random_init=True):
    """Build an embedding matrix for ``vocab_list`` from GloVe and save it.

    Nothing happens when ``save_path + ".npz"`` already exists. Otherwise
    ``glove.6B.<glove_dim>d.txt`` is read from ``glove_dir``; each GloVe word
    is looked up in the vocabulary as-is, then capitalized, lower-cased and
    upper-cased, and the first variant found receives the vector. The matrix
    is stored with ``np.savez_compressed`` under the key ``glove``.

    Args:
        glove_dir: Directory containing the extracted GloVe text files.
        glove_dim: Embedding dimension; selects the file and the matrix width.
        vocab_list: List of vocabulary tokens; row ``i`` belongs to token ``i``.
        save_path: Output path without the ``.npz`` extension.
        size: Expected number of lines in the GloVe file (progress bar only).
        random_init: Start from ``np.random.randn`` values if true, zeros
            otherwise; rows without a GloVe match keep that initial value.
    """
    if not gfile.Exists(save_path + ".npz"):

        glove_path = os.path.join(glove_dir, "glove.6B.{}d.txt".format(glove_dim))

        if random_init:
            glove = np.random.randn(len(vocab_list), glove_dim)
        else:
            glove = np.zeros((len(vocab_list), glove_dim))

        # Token -> first position, matching what vocab_list.index() returned.
        # The dict replaces a linear list scan per GloVe line.
        index_of = {}
        for idx, token in enumerate(vocab_list):
            index_of.setdefault(token, idx)

        filled_rows = set()
        total_lines = int(size) if size is not None else None
        with open(glove_path, 'r', encoding='utf8') as fh:
            for line in tqdm(fh, total=total_lines):
                array = line.strip().split(" ")
                word = array[0]
                # Same precedence as before: exact, Capitalized, lower, UPPER.
                for candidate in (word, word.capitalize(), word.lower(), word.upper()):
                    idx = index_of.get(candidate)
                    if idx is not None:
                        # Parse the floats only for words that are actually used.
                        glove[idx, :] = list(map(float, array[1:]))
                        filled_rows.add(idx)
                        break

        # Count distinct vocabulary rows, not GloVe lines: several GloVe words
        # can land on the same row through the case variants.
        found = len(filled_rows)

        print("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab_list), glove_path))
        np.savez_compressed(save_path, glove=glove)
        print("saved trimmed glove matrix at: {}".format(save_path))


def create_vocabulary(vocabulary_path, data_paths, tokenizer):
    """Write a frequency-sorted vocabulary file unless it already exists.

    Every line of every file in ``data_paths`` (opened in binary mode) is
    passed to ``tokenizer`` and the resulting tokens are counted. The output
    file lists the ``_START_VOCAB`` entries first, then all tokens from most
    to least frequent, one per line.
    """
    if gfile.Exists(vocabulary_path):
        return

    print("Creating vocabulary %s from data %s" % (vocabulary_path, str(data_paths)))
    vocab = {}
    for path in data_paths:
        with open(path, mode="rb") as f:
            # Line numbers restart for each file.
            for counter, line in enumerate(f, start=1):
                if counter % 100000 == 0:
                    print("processing line %d" % counter)
                for w in tokenizer(line):
                    vocab[w] = vocab.get(w, 0) + 1

    by_frequency = sorted(vocab, key=vocab.get, reverse=True)
    vocab_list = _START_VOCAB + by_frequency
    print("Vocabulary size: %d" % len(vocab_list))

    with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
            vocab_file.write(w + "\n")


def sentence_to_token_ids(sentence, vocabulary, tokenizer):
  """Tokenize ``sentence`` and map each token to its vocabulary id.

  Tokens missing from ``vocabulary`` are mapped to ``UNK_ID``.
  """
  token_ids = []
  for word in tokenizer(sentence):
    token_ids.append(vocabulary.get(word, UNK_ID))
  return token_ids


def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer):
    """Convert a text file into a file of space-separated token ids.

    Does nothing when ``target_path`` already exists. Otherwise each line of
    ``data_path`` is converted with ``sentence_to_token_ids`` using the
    vocabulary loaded from ``vocabulary_path`` and written as one line of ids.
    """
    if gfile.Exists(target_path):
        return

    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)

    with gfile.GFile(data_path, mode="rb") as data_file, \
         gfile.GFile(target_path, mode="w") as tokens_file:
        for counter, line in enumerate(data_file, start=1):
            if counter % 5000 == 0:
                print("tokenizing line %d" % counter)
            token_ids = sentence_to_token_ids(line, vocab, tokenizer)
            id_strings = [str(tok) for tok in token_ids]
            tokens_file.write(" ".join(id_strings) + "\n")


if __name__ == '__main__':
    # args = setup_args()
    vocab_path = pjoin("/content/drive/My Drive/Deep Data/data/squad", "vocab.dat")

    # Path stems; the file suffixes (".context", ".question", ".ids.*") are
    # appended below.
    train_path = pjoin("/content/drive/My Drive/Deep Data/data/squad/train", "train")
    valid_path = pjoin("/content/drive/My Drive/Deep Data/data/squad/val", "val")
    # NOTE(review): dev_path points into the "val" directory and is not used
    # in this cell -- confirm whether that is intended.
    dev_path = pjoin("/content/drive/My Drive/Deep Data/data/squad/val", "dev")

    # The vocabulary is built from the contexts and questions of the train
    # and validation splits.
    vocab_sources = [train_path + ".context",
                     train_path + ".question",
                     valid_path + ".context",
                     valid_path + ".question"]
    create_vocabulary(vocab_path, vocab_sources, basic_tokenizer)
    vocab, rev_vocab = initialize_vocabulary(vocab_path)

    # Trim GloVe down to the vocabulary.
    glove_dim = 100
    source_dir = "/content/drive/My Drive/Deep Data/data"
    glove_dir = "/content/drive/My Drive/Deep Data/download/dwr"
    trimmed_glove_path = source_dir + "/glove.trimmed.{}".format(glove_dim)
    process_glove(glove_dir, glove_dim, rev_vocab, trimmed_glove_path,
                  random_init=True)

    # Creating Dataset
    x_train_dis_path = train_path + ".ids.context"
    y_train_ids_path = train_path + ".ids.question"
    x_dis_path = valid_path + ".ids.context"
    y_ids_path = valid_path + ".ids.question"

    conversions = [(train_path + ".context", x_train_dis_path),
                   (train_path + ".question", y_train_ids_path),
                   (valid_path + ".context", x_dis_path),
                   (valid_path + ".question", y_ids_path)]
    for text_path, ids_path in conversions:
        data_to_token_ids(text_path, ids_path, vocab_path, basic_tokenizer)

Creating vocabulary /content/drive/My Drive/Deep Data/data/squad/vocab.dat from data ['/content/drive/My Drive/Deep Data/data/squad/train/train.context', '/content/drive/My Drive/Deep Data/data/squad/train/train.question', '/content/drive/My Drive/Deep Data/data/squad/val/val.context', '/content/drive/My Drive/Deep Data/data/squad/val/val.question']
Vocabulary size: 115373


100%|██████████| 400000/400000.0 [31:46<00:00, 209.78it/s]


71733/115373 of word vocab have corresponding vectors in /content/drive/My Drive/Deep Data/download/dwr/glove.6B.100d.txt
saved trimmed glove matrix at: /content/drive/My Drive/Deep Data/data/glove.trimmed.100
Tokenizing data in /content/drive/My Drive/Deep Data/data/squad/train/train.context
tokenizing line 5000
tokenizing line 10000
tokenizing line 15000
tokenizing line 20000
tokenizing line 25000
tokenizing line 30000
tokenizing line 35000
tokenizing line 40000
tokenizing line 45000
tokenizing line 50000
tokenizing line 55000
tokenizing line 60000
tokenizing line 65000
tokenizing line 70000
tokenizing line 75000
Tokenizing data in /content/drive/My Drive/Deep Data/data/squad/train/train.question
tokenizing line 5000
tokenizing line 10000
tokenizing line 15000
tokenizing line 20000
tokenizing line 25000
tokenizing line 30000
tokenizing line 35000
tokenizing line 40000
tokenizing line 45000
tokenizing line 50000
tokenizing line 55000
tokenizing line 60000
tokenizing line 65000
tokeniz