In [16]:
import json
import tokenizers

In [83]:
def load_dataset(path, max_articles=1, max_paragraphs=5):
    """Load a SQuAD-style JSON file and flatten it into parallel lists.

    Args:
        path: Path to a JSON file whose top-level 'data' key holds a list of
            articles. Each article has a 'paragraphs' list; each paragraph
            has a 'context' string and a 'qas' list of question dicts with
            'id', 'question' and (optionally) 'answers'.
        max_articles: How many leading articles to keep. Defaults to 1, the
            value that used to be hard-coded. Pass None to keep every article.
        max_paragraphs: How many leading paragraphs to read from each kept
            article. Defaults to 5, the value that used to be hard-coded.
            Pass None to read every paragraph.

    Returns:
        A ``(data, output)`` tuple. ``data`` is the list of kept raw articles.
        ``output`` is a dict of lists:
            'contexts' - one entry per paragraph read
            'qids'     - one entry per question
            'questions'- one entry per question
            'qid2cid'  - for each question, the index of its paragraph in
                         'contexts'
            'answers'  - the 'answers' value of each question that has one
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)['data']

    # Slicing with None keeps everything, so None means "no limit".
    data = data[:max_articles]
    output = {'qids': [], 'questions': [], 'answers': [], 'contexts': [], 'qid2cid': []}

    for article in data:
        # Slice instead of indexing 0..4 so that articles with fewer
        # paragraphs than the limit no longer raise IndexError.
        for paragraph in article['paragraphs'][:max_paragraphs]:
            output['contexts'].append(paragraph['context'])
            context_id = len(output['contexts']) - 1
            for qa in paragraph['qas']:
                output['qids'].append(qa['id'])
                output['questions'].append(qa['question'])
                output['qid2cid'].append(context_id)
                # NOTE: questions without an 'answers' key add nothing here,
                # so 'answers' can be shorter than 'qids' for such files.
                if 'answers' in qa:
                    output['answers'].append(qa['answers'])
    return data, output


In [84]:
# Read train-v2.0.json: `data` is the list of raw articles kept by
# load_dataset, `output` the flattened question/context lists it built.
data, output = load_dataset("train-v2.0.json")

In [85]:
# Number of questions collected: load_dataset appends one 'qid2cid' entry
# per question it reads.
len(output['qid2cid'])

63

In [86]:
from tokenizers import CoreNLPTokenizer
_TOKENIZER = None  # created on first use and shared by every tokenize() call


def tokenize(text):
    """Tokenize ``text`` and return its annotations as a plain dict.

    A single CoreNLPTokenizer is created lazily and reused, instead of
    building a fresh tokenizer for every question and context.
    NOTE(review): this assumes a CoreNLPTokenizer instance is safe to reuse
    across calls and is comparatively expensive to construct - confirm
    against the `tokenizers` module.

    Args:
        text: The string to tokenize.

    Returns:
        A dict with the results of the token object's accessors:
            'words'   - tokens.words()
            'offsets' - tokens.offsets()
            'pos'     - tokens.pos()
            'lemma'   - tokens.lemmas()
            'ner'     - tokens.entities()
    """
    global _TOKENIZER
    if _TOKENIZER is None:
        _TOKENIZER = CoreNLPTokenizer()
    tokens = _TOKENIZER.tokenize(text)
    output = {
        'words': tokens.words(),
        'offsets': tokens.offsets(),
        'pos': tokens.pos(),
        'lemma': tokens.lemmas(),
        'ner': tokens.entities(),
    }
    return output

In [91]:
def find_answer(offsets, begin_offset, end_offset):
    """Map an answer's character span onto token indices.

    Looks for the token whose first offset equals ``begin_offset`` and the
    token whose second offset equals ``end_offset``.

    Returns:
        ``(start_index, end_index)`` when exactly one token matches each
        boundary, otherwise None.

    Raises:
        AssertionError: if more than one token matches either boundary.
    """
    def _matching(field, target):
        # Indices of all tokens whose offset component `field` equals target.
        return [pos for pos, span in enumerate(offsets) if span[field] == target]

    begin_hits = _matching(0, begin_offset)
    end_hits = _matching(1, end_offset)
    assert len(begin_hits) <= 1
    assert len(end_hits) <= 1
    if len(begin_hits) != 1 or len(end_hits) != 1:
        return None
    return begin_hits[0], end_hits[0]


def process_dataset(data):
    """Yield one feature dict per question in ``data``.

    ``data`` is a dict with 'qids', 'questions', 'contexts', 'qid2cid' and
    'answers' lists. Every question and then every context is tokenized
    with ``tokenize`` before the first example is produced. Each yielded
    dict pairs a question with the tokens of the context that 'qid2cid'
    points it to, plus the token spans of the answers that ``find_answer``
    could align ('answers' is empty when ``data['answers']`` is empty or
    nothing aligned).
    """
    question_toks = [tokenize(text) for text in data['questions']]
    context_toks = [tokenize(text) for text in data['contexts']]

    for idx in range(len(data['qids'])):
        q_record = question_toks[idx]
        c_record = context_toks[data['qid2cid'][idx]]
        token_offsets = c_record['offsets']

        # Only look up gold answers when the dataset carries any at all.
        gold_answers = data['answers'][idx] if len(data['answers']) > 0 else ()
        candidate_spans = (
            find_answer(token_offsets,
                        gold['answer_start'],
                        gold['answer_start'] + len(gold['text']))
            for gold in gold_answers
        )
        # Drop answers whose character span did not line up with tokens.
        aligned_spans = [span for span in candidate_spans if span]

        yield {
            'id': data['qids'][idx],
            'question': q_record['words'],
            'document': c_record['words'],
            'offsets': token_offsets,
            'answers': aligned_spans,
            'qlemma': q_record['lemma'],
            'lemma': c_record['lemma'],
            'pos': c_record['pos'],
            'ner': c_record['ner'],
        }


In [92]:
# Serialize every processed example as one JSON object per line.
with open('out_file.txt', 'w') as sink:
    sink.writelines(json.dumps(example) + '\n' for example in process_dataset(output))