In [1]:
import os
import sys
import gzip
import json_lines
import logging
import collections
import json
import pickle
import multiprocessing
import argparse
import math
from tqdm import tqdm

import numpy as np

In [6]:
class SquadExample(object):
    """
    A single training/test example for the Squad dataset.
    For examples without an answer, the start and end position are -1.
    (Note: the constructor itself defaults both positions to None.)

    Attributes:
        qas_id: Identifier of the question this example belongs to.
        question_text: The question string.
        doc_tokens: Sequence of string tokens making up the context; they are
            joined with single spaces when the example is rendered as text.
        orig_answer_text: The answer text as given, or None.
        start_position: Start of the answer span, or None if not provided.
        end_position: End of the answer span, or None if not provided.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position

    def __str__(self):
        """Return the same text as ``__repr__``."""
        return self.__repr__()

    def __repr__(self):
        """Render the example as a single comma-separated line.

        The start/end positions are included whenever they are not None.
        """
        parts = [
            "qas_id: %s" % (self.qas_id),
            "question_text: %s" % (self.question_text),
            "doc_tokens: [%s]" % (" ".join(self.doc_tokens)),
        ]
        # Compare against None explicitly: position 0 is a valid index but is
        # falsy, so a plain truthiness test would silently hide it.
        if self.start_position is not None:
            parts.append("start_position: %d" % (self.start_position))
        if self.end_position is not None:
            parts.append("end_position: %d" % (self.end_position))
        return ", ".join(parts)

In [11]:
def read_squad_examples(input_file, debug=False):
    """Load a gzipped JSON-lines file and turn it into ``SquadExample`` objects.

    The first record of the file is treated as a header and skipped. Every
    remaining record is expected to provide:

    * ``item['context_tokens']`` - an iterable of entries whose element ``[0]``
      is the token text;
    * ``item['qas']`` - an iterable of dicts with the keys ``'qid'``,
      ``'question'`` and ``'detected_answers'``; each detected answer has
      ``'text'`` and ``'token_spans'``.

    One example is produced per distinct answer text of each question, using
    only the first entry of that answer's ``'token_spans'`` as
    ``(start_position, end_position)``.

    Args:
        input_file: Path of the ``.jsonl.gz`` file to read.
        debug: When true, only the first 100 data records (after the header)
            are read and converted.

    Returns:
        A list of ``SquadExample``. All examples built from the same record
        share one ``doc_tokens`` list object.

    Raises:
        KeyError: If a record lacks one of the keys listed above.
        IndexError: If a detected answer has an empty ``'token_spans'``.
    """
    # Function-scope import so this cell does not depend on the import cell
    # being edited.
    import itertools

    # The separator markers are all 5 characters long, the same as '[SEP]',
    # so swapping them does not change any character offsets.
    separator_tokens = ('[TLE]', '[PAR]', '[DOC]')
    debug_limit = 100

    # Skip the header (index 0). In debug mode stop decoding as soon as
    # ``debug_limit`` data records were read instead of loading the whole file.
    stop = 1 + debug_limit if debug else None
    with gzip.open(input_file, 'rt', encoding='utf-8') as f:  # gzip stream opened in text mode
        unproc_data = list(itertools.islice(json_lines.reader(f), 1, stop))

    ###################### Make Examples ######################
    examples = []
    for item in unproc_data:
        # 1. Get Context
        # BERT has only [SEP] in its word piece vocabulary, so every
        # title/paragraph/document separator is mapped to [SEP]. The parsed
        # record itself is left unmodified.
        doc_tokens = [
            '[SEP]' if token[0] in separator_tokens else token[0]
            for token in item['context_tokens']
        ]

        # 2. qas
        for qa in item['qas']:
            qas_id = qa['qid']
            question_text = qa['question']

            # Answer texts already emitted for this question; the data holds
            # many duplicated detected answers that must be dropped.
            answer_lst = []
            for answer in qa['detected_answers']:
                orig_answer_text = answer['text']
                if orig_answer_text in answer_lst:
                    continue
                answer_lst.append(orig_answer_text)

                # Only take the first span
                first_span = answer['token_spans'][0]
                start_position = first_span[0]
                end_position = first_span[1]

                examples.append(SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position))
    return examples

In [None]:
# Try the loader on the debug subset of the SQuAD training file.
read_squad_examples(input_file="data/train/SQuAD.jsonl.gz", debug=True)