In [23]:
%matplotlib inline

from glob import glob
from tqdm.notebook import tqdm

import os
import json
import random
import pandas as pd
import numpy as np

# Seed both the stdlib and NumPy global generators so the random sampling
# in the cells below is repeatable when the notebook is run top to bottom.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Root of the data tree; the cells below join "squad/..." sub-paths onto it.
# This is a relative path, so it resolves against the kernel's working directory.
DATA_DIR = "../data"

In [3]:
class SquadDocuments(object):
    """Indexable collection of the paragraph contexts in a SQuAD-style JSON file.

    Every paragraph is stored under the key
    ``"{title}__{article_index}__{paragraph_index}"``, where both indices are
    zero-based positions within the file. Items are returned in file order.

    Attributes:
        ds: dict mapping paragraph key -> context string.
        keys: list of paragraph keys in file order (defines integer indexing).
    """

    def __init__(self, filename):
        """Read ``filename`` and index every paragraph context it contains.

        Args:
            filename: path to a JSON file whose top-level ``"data"`` list holds
                items with a ``"title"`` and a ``"paragraphs"`` list, each
                paragraph carrying a ``"context"`` string.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
            KeyError: if an expected field is missing.
        """
        self.ds = {}
        self.keys = []
        # Decode explicitly as UTF-8 instead of the locale default, so that
        # non-ASCII text loads identically on every platform (this matches
        # how get_features_from_doc opens its files).
        with open(filename, encoding="utf-8") as fp:
            data = json.load(fp)
        for idx, item in enumerate(data["data"]):
            title = item["title"]
            for idy, para in enumerate(item["paragraphs"]):
                # The article/paragraph positions keep keys unique even when
                # two articles share a title.
                key = f"{title}__{idx}__{idy}"
                self.ds[key] = para["context"]
                self.keys.append(key)

    def __len__(self):
        """Return the number of indexed paragraphs."""
        return len(self.ds)

    def __getitem__(self, idx):
        """Return ``{"id": key, "text": context}`` for the ``idx``-th paragraph.

        Raises:
            IndexError: if ``idx`` is out of range; this is also what ends a
                plain ``for doc in docs`` loop over the collection.
        """
        key = self.keys[idx]
        return {"id": key, "text": self.ds[key]}

In [20]:
def get_features_from_doc(features_dir, doc_id):
    """Load the parsed JSON stored at ``<features_dir>/<doc_id>.json``.

    Args:
        features_dir: directory holding one JSON file per document.
        doc_id: document id string; ``".json"`` is appended to form the file name.

    Returns:
        Whatever object the JSON file decodes to.
    """
    json_path = os.path.join(features_dir, doc_id + ".json")
    with open(json_path, encoding="utf-8") as handle:
        return json.load(handle)

In [24]:
# Draw random (doc, word, instance) triplets from the training documents.
# NOTE(review): the dev sampling cell repeats this loop with different paths;
# a shared helper function would remove the duplication.
select_ratio = .3  # draws per document = int(number of words * select_ratio)

train_docs = SquadDocuments(os.path.join(DATA_DIR, "squad/train-v2.0.json"))
train_features_dir = os.path.join(DATA_DIR, "squad/v0_train/out_features/")
train_triplets = []  # each entry: (doc id, word, instance index)

for doc in tqdm(train_docs):
    doc_id = doc["id"]
    # presumably a mapping word -> list of instances; verify against the feature files
    features = get_features_from_doc(train_features_dir, doc_id)
    vocab = list(features)
    vocab_size = len(vocab)
    # Draws are independent, so the same word/instance can be selected repeatedly.
    # NOTE(review): random.randrange(n) is the idiomatic integer draw, but switching
    # would change the seeded sample, so random.uniform is kept here.
    for _ in range(int(vocab_size * select_ratio)):
        picked_word = vocab[int(random.uniform(0, vocab_size))]
        picked_instance = int(random.uniform(0, len(features[picked_word])))
        train_triplets.append((doc_id, picked_word, picked_instance))

HBox(children=(FloatProgress(value=0.0, max=19035.0), HTML(value='')))




In [25]:
# Draw random (doc, word, instance) triplets from the dev documents.
# NOTE(review): same procedure as the training sampling cell; the RNG is not
# reseeded here, so the result depends on the draws made before this cell runs.
select_ratio = .3  # draws per document = int(number of words * select_ratio)

dev_docs = SquadDocuments(os.path.join(DATA_DIR, "squad/dev-v2.0.json"))
dev_features_dir = os.path.join(DATA_DIR, "squad/v0_dev/out_features/")
dev_triplets = []  # each entry: (doc id, word, instance index)

for doc in tqdm(dev_docs):
    doc_id = doc["id"]
    # presumably a mapping word -> list of instances; verify against the feature files
    features = get_features_from_doc(dev_features_dir, doc_id)
    vocab = list(features)
    vocab_size = len(vocab)
    # Draws are independent, so the same word/instance can be selected repeatedly.
    for _ in range(int(vocab_size * select_ratio)):
        picked_word = vocab[int(random.uniform(0, vocab_size))]
        picked_instance = int(random.uniform(0, len(features[picked_word])))
        dev_triplets.append((doc_id, picked_word, picked_instance))

HBox(children=(FloatProgress(value=0.0, max=1204.0), HTML(value='')))




In [57]:
# Write the sampled training triplets to CSV, one row per draw, without the index column.
train_sample_csv = os.path.join(DATA_DIR, "squad/v0_train/sample.csv")
df = pd.DataFrame(data=train_triplets, columns=["doc", "word", "instance"])
df.to_csv(train_sample_csv, index=False)

In [58]:
# Write the sampled dev triplets to CSV, one row per draw, without the index column.
dev_sample_csv = os.path.join(DATA_DIR, "squad/v0_dev/sample.csv")
df = pd.DataFrame(data=dev_triplets, columns=["doc", "word", "instance"])
df.to_csv(dev_sample_csv, index=False)