In [1]:
import json
import numpy as np
import os
from glob import glob
from nltk import word_tokenize
import matplotlib.pyplot as plt
import re
from tqdm import tqdm
import random

In [2]:
# Whether "No Answer Present." examples may be kept (paired with a randomly
# chosen passage) by data_preprocess; also selects the output file prefix and
# output directory used further down in this notebook.
include_nap = True

## preprocess

In [3]:
# Matches ONE leading character that must not start a cleaned text:
# '.', '-', '_', ')' or any whitespace.  Written as a raw string so `\-` and
# `\s` reach the regex engine instead of being invalid Python string escapes.
p = re.compile(r'^[.\-_\s)]')

# Same character class, but consumes the whole leading run in a single pass.
_leading_noise = re.compile(r'^[.\-_\s)]+')


def text_preprocess(text):
    """Remove leading noise characters and surrounding whitespace.

    Every leading '.', '-', '_', ')' or whitespace character is dropped, then
    the remainder is stripped of leading/trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # One anchored substitution replaces the former "drop one character and
    # re-test" loop, which re-sliced the string once per removed character.
    return _leading_noise.sub('', text).strip()

def load_data(fpath):
    """Read the JSON document stored at ``fpath`` and return the parsed object.

    The file is decoded explicitly as UTF-8 instead of the platform's locale
    encoding: the JSON files in this notebook are written with
    ``ensure_ascii=False`` and therefore contain raw non-ASCII characters.

    Args:
        fpath: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(fpath, 'r', encoding='utf-8') as f:
        return json.load(f)


def data_preprocess(fpath, n_max=100000, include_nap=False, 
                    allow_answer_len = (4, 50), allow_passage_len = (24, 100),
                    seed=None):
    """Filter a QA dump down to (passage, query, answer) records.

    The file at ``fpath`` is loaded with ``load_data`` and is expected to hold
    three mappings keyed by example id: ``'answers'`` (list of answer
    strings), ``'passages'`` (list of dicts with ``'passage_text'`` and
    ``'is_selected'``) and ``'query'`` (string).

    An example is kept when all of the following hold:

    * it has exactly one answer, and that answer is not "No Answer Present."
      (case-insensitive) -- unless ``include_nap`` is true, in which case an
      unanswered example is kept with probability ~0.5 and paired with one
      passage drawn at random from its passage list;
    * exactly one passage is selected (or was drawn as described above);
    * the tokenized answer length lies within ``allow_answer_len`` and the
      tokenized passage length within ``allow_passage_len`` (both inclusive).

    Texts are lower-cased, tokenized with ``word_tokenize``, re-joined with
    single spaces and cleaned with ``text_preprocess``.

    Args:
        fpath: Path of the JSON file to filter.
        n_max: Stop as soon as this many records have been collected.
        include_nap: Whether unanswered examples may be sampled in.
        allow_answer_len: ``(min, max)`` token count allowed for the answer.
        allow_passage_len: ``(min, max)`` token count allowed for the passage.
        seed: Optional seed for the unanswered-example sampling. When ``None``
            (default) the module-level ``random`` state is used, as before.

    Returns:
        A list of dicts with keys ``'id'``, ``'passage'``, ``'query'`` and
        ``'answer'``, in the iteration order of the input ids.
    """
    no_answer = 'No Answer Present.'.lower()
    # A private generator is created only when a seed is requested, so callers
    # that seed the global `random` module keep getting the same draws.
    rng = random if seed is None else random.Random(seed)

    data = load_data(fpath)
    all_ids = data['answers'].keys()
    print("fpath:", fpath)
    print("n_data:", len(all_ids))

    new_data = []
    for _id in tqdm(all_ids):
        if len(new_data) == n_max:
            break

        answers = data['answers'][_id]
        if not answers:
            # Nothing to inspect; indexing answers[0] would raise IndexError.
            continue
        passages = data['passages'][_id]

        raw_answer = answers[0].lower()
        answer_exist = raw_answer != no_answer
        selected_passages = [item['passage_text'] for item in passages if item['is_selected']]

        # Unanswered example: optionally keep it, paired with a random passage.
        # The `passages` guard avoids drawing from an empty list.
        if not answer_exist and include_nap and passages and rng.random() > 0.5:
            answer_exist = True
            selected_passages = [rng.choice(passages)['passage_text']]

        # Cheap structural filters first, so rejected examples are never tokenized.
        if not (answer_exist and len(answers) == 1 and len(selected_passages) == 1):
            continue

        answer = word_tokenize(raw_answer)
        if not (allow_answer_len[0] <= len(answer) <= allow_answer_len[1]):
            continue

        passage = word_tokenize(selected_passages[0].lower())
        if not (allow_passage_len[0] <= len(passage) <= allow_passage_len[1]):
            continue

        query = word_tokenize(data['query'][_id].lower())

        new_data.append({
            "id": _id,
            "passage": text_preprocess(' '.join(passage)),
            'query': text_preprocess(' '.join(query)),
            'answer': text_preprocess(' '.join(answer)),
        })

    print("preprocessed n_data:", len(new_data))
    return new_data

In [5]:
# Load the raw train/dev JSON dumps in full; in this notebook they are only
# used to count examples in the next cell.  Both names are rebound to the
# tokenized splits further down.
train_data = load_data('../sti_ddp2/qa/data/train_v2.1.json')
valid_data = load_data('../sti_ddp2/qa/data/dev_v2.1.json')

In [6]:
# Number of examples (answer ids) in the raw train and dev dumps.
len(train_data['answers']), len(valid_data['answers'])

(808731, 101093)

In [8]:
fpath = '../sti_ddp2/qa/data/train_v2.1.json'
data = data_preprocess(fpath, include_nap=include_nap)

# The output sits next to the input and is named after it, tagged by whether
# "No Answer Present." examples were allowed in.
# NOTE(review): the non-NAP prefix has no trailing underscore, so that file is
# written as 'filteredtrain_v2.1.json'.  Kept as-is in case existing files or
# other scripts rely on the name -- confirm before changing.
fname = os.path.basename(fpath)
prefix = 'filtered_nap_' if include_nap else 'filtered'
newfpath = os.path.join(os.path.dirname(fpath), prefix + fname)

# Context manager so the handle is flushed and closed deterministically;
# explicit UTF-8 because ensure_ascii=False emits raw non-ASCII characters.
with open(newfpath, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii = False)

  0%|          | 239/808731 [00:00<05:38, 2388.81it/s]

fpath: ../sti_ddp2/qa/data/train_v2.1.json
n_data: 808731


  9%|▉         | 72418/808731 [00:25<04:15, 2876.90it/s]


KeyboardInterrupt: 

In [18]:
fpath = '../sti_ddp2/qa/data/dev_v2.1.json'
data = data_preprocess(fpath, include_nap=include_nap)

# The output sits next to the input and is named after it, tagged by whether
# "No Answer Present." examples were allowed in.
# NOTE(review): the non-NAP prefix has no trailing underscore, so that file is
# written as 'filtereddev_v2.1.json'.  Kept as-is in case existing files or
# other scripts rely on the name -- confirm before changing.
fname = os.path.basename(fpath)
prefix = 'filtered_nap_' if include_nap else 'filtered'
newfpath = os.path.join(os.path.dirname(fpath), prefix + fname)

# Context manager so the handle is flushed and closed deterministically;
# explicit UTF-8 because ensure_ascii=False emits raw non-ASCII characters.
with open(newfpath, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii = False)

## tokenizing & splitting

In [22]:
# NOTE(review): this import lives mid-notebook; everything below it depends on
# the `transformers` package being installed.
from transformers import AutoTokenizer
# Load the 'gpt2-medium' tokenizer.  local_files_only=True presumably means the
# files must already be available locally (no download) -- confirm.
tokenizer = AutoTokenizer.from_pretrained('gpt2-medium', local_files_only=True)
# Register '<|sep|>' as the tokenizer's separator token.  The call is the last
# expression of the cell, so its return value is displayed as the cell output.
special_tokens = {'sep_token':'<|sep|>'}
tokenizer.add_special_tokens(special_tokens)

def tokenize(data, ids):
    """Encode the records of ``data`` picked out by ``ids``.

    For every index in ``ids`` (in the given order) the record's passage,
    query and answer have zero-width spaces (U+200B) removed and are encoded
    with the module-level ``tokenizer``.

    Args:
        data: Indexable collection of dicts with keys ``'id'``, ``'passage'``,
            ``'query'`` and ``'answer'``.
        ids: Iterable of indices into ``data``.

    Returns:
        A list of dicts holding ``'id'``, the encoded ``'passage'``,
        ``'query'`` and ``'answer'``, and the cleaned strings under
        ``'passage_text'``, ``'query_text'`` and ``'answer_text'``.
    """
    fields = ('passage', 'query', 'answer')

    def _clean(text):
        # Drop zero-width spaces before encoding.
        return text.replace('\u200b', '')

    encoded = []
    for idx in ids:
        record = data[idx]
        texts = {field: _clean(record[field]) for field in fields}

        entry = {'id': record['id']}
        # Token ids first, then the cleaned source strings, preserving key order.
        entry.update((field, tokenizer.encode(text)) for field, text in texts.items())
        entry.update((field + '_text', text) for field, text in texts.items())
        encoded.append(entry)

    return encoded

In [23]:
# Split sizes: train and valid examples are both drawn from the filtered train
# file; test examples are drawn from the filtered dev file.
n_train = 100
n_valid = 500
n_test = 20000
# Seed for NumPy's global RNG, which drives the shuffles that pick the splits.
seed = 1

In [26]:
np.random.seed(seed)

# Both the output directory and the filtered input file follow `include_nap`,
# so the "wo_nap" outputs are never built from the NAP-augmented file.
if include_nap:
    output_dir = 'qa/data/nap'
    filtered_prefix = 'filtered_nap_'
else:
    output_dir = 'qa/data/wo_nap'
    # No trailing underscore: this is the prefix the filtering cells write with.
    filtered_prefix = 'filtered'

os.makedirs(output_dir, exist_ok=True)
data = load_data(f'../sti_ddp2/qa/data/{filtered_prefix}train_v2.1.json')

# Shuffle the indices once, then carve disjoint train / valid slices out of them.
ids = np.arange(len(data))
np.random.shuffle(ids)

train_ids = ids[:n_train]
valid_ids = ids[n_train:n_train+n_valid]

In [27]:
# Read the filtered dev file that matches `include_nap`; the non-NAP prefix has
# no trailing underscore because that is how the filtering cells name the file.
dev_prefix = 'filtered_nap_' if include_nap else 'filtered'
test_data = load_data(f'../sti_ddp2/qa/data/{dev_prefix}dev_v2.1.json')

# This shuffle continues NumPy's global RNG stream from the train/valid split,
# so the selected test ids depend on the cells being run in order.
ids = np.arange(len(test_data))
np.random.shuffle(ids)

test_ids = ids[:n_test]

In [28]:
# Tokenize each split.  train/valid index into the filtered train records
# (`data`); test indexes into the filtered dev records.
# NOTE(review): `test_data` is rebound from the filtered records to their
# tokenized form, so this cell is presumably not safe to re-run without first
# re-running the cell that loads `test_data` -- confirm.
train_data = tokenize(data, train_ids)
valid_data = tokenize(data, valid_ids)
test_data = tokenize(test_data, test_ids)

In [29]:
# Write every tokenized split to <output_dir>/<split>.json.  Each file is opened
# in a context manager so it is flushed and closed before the next one starts,
# and encoded as UTF-8 because ensure_ascii=False emits raw non-ASCII text.
for split_name, split_data in (('train', train_data),
                               ('valid', valid_data),
                               ('test', test_data)):
    with open(os.path.join(output_dir, f'{split_name}.json'), 'w', encoding='utf-8') as f:
        json.dump(split_data, f, ensure_ascii = False)