In [1]:
import os
import json
import numpy as np
import pickle
import re
import string 

from scipy.sparse import lil_matrix
from tqdm import tqdm

# JSON text is UTF-8 by specification; pass the encoding explicitly so that
# loading does not depend on the platform's default locale encoding.
with open("dev-v1.1.json", encoding="utf-8") as f:
    test = json.load(f)
with open("train-v1.1.json", encoding="utf-8") as f:
    train = json.load(f)

In [2]:
def split_into_sentences(text):
    """Split ``text`` into lower-cased sentences.

    A sentence boundary is a whitespace character that follows '.' or '?',
    unless the preceding characters look like a dotted abbreviation or a
    single upper-case initial followed by a dot.

    Args:
        text: the raw (original-case) text to split.

    Returns:
        A tuple ``(sentences, count)`` where ``sentences`` is the list of
        lower-cased sentences and ``count`` is ``len(sentences)``.
    """
    # Split on the ORIGINAL casing: the `(?<![A-Z]\.)` lookbehind protects
    # initials such as "J. Smith" and can never match once the text has been
    # lower-cased, so lower-casing has to happen after the split.
    pieces = re.split(r'(?<!\w\.\w.)(?<![A-Z]\.)(?<=\.|\?)\s', text)
    sentences = [piece.lower() for piece in pieces]
    return sentences, len(sentences)

In [3]:
def split_clqa(topics):
    """
    Flatten topics into parallel context / label / question / answer lists.

    For each topic, walk its paragraphs; for each paragraph, split the context
    into sentences and emit one sample per (question, answer) pair.

    Args:
        topics: list of topics. Each topic has a 'paragraphs' list; each
            paragraph has a 'context' string and a 'qas' list; each qa has a
            'question' string and an 'answers' list of dicts with a 'text' key.

    Returns:
        context: list of contexts (each a list of tokenized sentence strings;
            the same list object is shared by all samples of a paragraph)
        label: list of labels (each a descending list of sentence indices,
            ``length - 1`` down to ``0``)
        question: list of lower-cased questions
        answer: list of lower-cased answers
    """
    result_context = []
    result_label = []
    result_question = []
    result_answer = []

    # Translation table: every punctuation character except '$' (code point 36)
    # is padded with spaces so it becomes its own whitespace-separated token.
    # '$' keeps the None mapping produced by maketrans and is therefore deleted.
    table = str.maketrans("", "", string.punctuation)
    for code_point in list(table):
        if code_point != 36:
            table[code_point] = " " + chr(code_point) + " "

    for topic in topics:
        for paragraph in topic['paragraphs']:
            temp_context = paragraph['context']
            temp_qas = paragraph['qas']

            # remove bracketed alphabetic markers, e.g. [i], [k]
            temp_context = re.sub(r'\[[a-zA-Z]+\]', '', temp_context)
            sentences, length = split_into_sentences(temp_context)

            # The table is built once above; it is already keyed by code point,
            # so it can be handed to str.translate directly.
            context = [sentence.translate(table) for sentence in sentences]
            label = list(reversed(range(length)))

            for temp_qa in temp_qas:
                # NOTE(review): questions are lower-cased but NOT passed through
                # the punctuation table, so they are tokenized differently from
                # the contexts -- confirm this is intended.
                question = temp_qa['question'].lower()

                for temp_answer in temp_qa['answers']:
                    answer = temp_answer['text'].lower()

                    result_context.append(context)
                    result_label.append(label)
                    result_question.append(question)
                    result_answer.append(answer)
    # check
    if (len(result_context) == len(result_label) == len(result_question) == len(result_answer)):
        print("Data is well prepared!")
        print("total: {}".format(len(result_context)))
    else:
        print("Something is missing! check again")
        print("the number of questions: {}".format(len(result_question)))
        print("the number of answers: {}".format(len(result_answer)))
        print("the number of contexts: {}".format(len(result_context)))
        print("the number of labels: {}".format(len(result_label)))

    return result_context, result_label, result_question, result_answer

In [4]:
# The first 400 topics of the training file are used for training and the rest
# for validation; the dev file serves as the test set.
# (Author's noted topic counts: 400 / 42 / 48.)
all_train_topics = train['data']
train_topics, val_topics = all_train_topics[:400], all_train_topics[400:]
test_topics = test['data']

(train_context, train_label,
 train_question, train_answer) = split_clqa(train_topics)
(val_context, val_label,
 val_question, val_answer) = split_clqa(val_topics)
(test_context, test_label,
 test_question, test_answer) = split_clqa(test_topics)

Data is well prepared!
total: 79315
Data is well prepared!
total: 8284
Data is well prepared!
total: 34726


In [5]:
# Drop the references to the raw JSON structures; they are not used below.
del train, test

In [6]:
# convert to index
# word set
list_of_context = [train_context, val_context, test_context]
list_of_question = [train_question, val_question, test_question]

# Vocabulary of whitespace-separated tokens seen in any context sentence or
# question, across all three splits.
cq_word_set = set()

for contexts in list_of_context:
    for paragraph in contexts:
        for sentence in paragraph:
            cq_word_set.update(sentence.split())

for questions in list_of_question:
    for question_text in questions:
        cq_word_set.update(question_text.split())

In [7]:
list_of_answer = [train_answer, val_answer, test_answer]

# Set of distinct answer strings across all splits. Each whole answer string
# is a single entry; answers are not split into words here.
answer_word_set = set()
for answers in list_of_answer:
    answer_word_set.update(answers)

In [8]:
# Report the sizes of the two vocabularies built above.
cq_vocab_size = len(cq_word_set)
answer_vocab_size = len(answer_word_set)
print("context and question words: {}".format(cq_vocab_size))
print("answer words: {}".format(answer_vocab_size))

context and question words: 119388
answer words: 76945


In [9]:
# Drop the references to the per-split topic lists; only the flattened
# context/label/question/answer lists are used from here on.
del train_topics, val_topics, test_topics

In [10]:
# Map every context/question token to an integer id. Sorting first makes the
# ids reproducible: iterating a set of strings directly yields an order that
# can change between interpreter runs because of string hash randomization.
cq_word_index = {word: i for i, word in enumerate(sorted(cq_word_set))}

# Map every distinct answer string to a one-hot row of width
# len(answer_word_set). Each row is stored sparsely (one non-zero entry)
# rather than as a dense float vector, which made total memory quadratic in
# the number of answers. The rows have shape (1, num_answers) and dtype int8;
# wrapping them with lil_matrix(row, dtype=np.int8) yields the same matrix as
# wrapping the equivalent dense vector.
num_answers = len(answer_word_set)
answer_word_index = {}
for i, word in enumerate(sorted(answer_word_set)):
    answer_one_hot = lil_matrix((1, num_answers), dtype=np.int8)
    answer_one_hot[0, i] = 1
    answer_word_index[word] = answer_one_hot

In [11]:
def _index_contexts(contexts):
    """Encode paragraphs as arrays of per-sentence word-id lists.

    Args:
        contexts: list of paragraphs, each a list of sentence strings whose
            whitespace-separated tokens are all keys of ``cq_word_index``.

    Returns:
        A list with one NumPy array per paragraph. A paragraph whose sentences
        differ in length becomes a 1-D object array of lists; otherwise it
        becomes a regular array built by ``np.array``.
    """
    indexed_contexts = []
    for para in contexts:
        indexed_para = [
            [cq_word_index[word] for word in sent.split()]
            for sent in para
        ]
        # Sentences usually differ in length. NumPy >= 1.24 raises on such
        # ragged input unless dtype=object is requested (1.19-1.23 only warned).
        # Ask for dtype=object only when the paragraph really is ragged, so the
        # produced arrays are the same ones older NumPy built implicitly.
        is_ragged = len({len(indexed_sent) for indexed_sent in indexed_para}) > 1
        indexed_contexts.append(
            np.array(indexed_para, dtype=object if is_ragged else None)
        )
    return indexed_contexts


train_context_index = _index_contexts(train_context)
val_context_index = _index_contexts(val_context)
test_context_index = _index_contexts(test_context)

In [12]:
# Sanity check: one encoded entry per source context across all splits.
num_encoded_contexts = (
    len(train_context_index) + len(val_context_index) + len(test_context_index)
)
num_source_contexts = len(train_context) + len(test_context) + len(val_context)

if num_encoded_contexts != num_source_contexts:
    print("Something is missed! Check again")
else:
    print("context encoding is completed!")

context encoding is completed!


In [13]:
# Encode every question as a 1-D array of word ids, looking up each
# whitespace-separated token in cq_word_index.
train_question_index = [
    np.array([cq_word_index[token] for token in question_text.split()])
    for question_text in train_question
]

val_question_index = [
    np.array([cq_word_index[token] for token in question_text.split()])
    for question_text in val_question
]

test_question_index = [
    np.array([cq_word_index[token] for token in question_text.split()])
    for question_text in test_question
]

In [14]:
# Sanity check: one encoded entry per source question across all splits.
num_encoded_questions = (
    len(train_question_index) + len(val_question_index) + len(test_question_index)
)
num_source_questions = len(train_question) + len(test_question) + len(val_question)

if num_encoded_questions != num_source_questions:
    print("Something is missing! Check again")
else:
    print("question encoding is completed!")

question encoding is completed!


In [16]:
# Look up each answer's one-hot encoding and store it as an int8 sparse
# (LIL) matrix; tqdm shows one progress bar per split.
train_answer_index = [
    lil_matrix(answer_word_index[answer_text], dtype = np.int8)
    for answer_text in tqdm(train_answer)
]

val_answer_index = [
    lil_matrix(answer_word_index[answer_text], dtype = np.int8)
    for answer_text in tqdm(val_answer)
]

test_answer_index = [
    lil_matrix(answer_word_index[answer_text], dtype = np.int8)
    for answer_text in tqdm(test_answer)
]

100%|██████████| 79315/79315 [00:58<00:00, 1357.36it/s]
100%|██████████| 8284/8284 [00:06<00:00, 1371.46it/s]
100%|██████████| 34726/34726 [00:25<00:00, 1364.36it/s]


In [17]:
# Sanity check: one encoded entry per source answer across all splits.
num_encoded_answers = (
    len(train_answer_index) + len(val_answer_index) + len(test_answer_index)
)
num_source_answers = len(train_answer) + len(test_answer) + len(val_answer)

if num_encoded_answers != num_source_answers:
    print("Something is missing! Check again")
else:
    print("answer encoding is completed!")

answer encoding is completed!


In [18]:
# Largest number of sentences in any paragraph, over all splits.
# Each label is a descending list of sentence indices, so label[0] is the
# largest index and the paragraph has label[0] + 1 sentences. Compare lengths
# with lengths: comparing the index label[0] against the stored length would
# skip a paragraph that is exactly one sentence longer than the current
# maximum, leaving max_sent_len too small for indexing np.eye(max_sent_len).
max_sent_len = 0
for labels in (train_label, val_label, test_label):
    for label in labels:
        if label and label[0] + 1 > max_sent_len:
            max_sent_len = label[0] + 1

In [19]:
# One-hot encode the sentence indices of every sample. The identity matrix
# depends only on max_sent_len, so it is built once; indexing it with a list
# of indices returns a new array each time, so the samples share no data.
sentence_one_hot = np.eye(max_sent_len)

train_label_index = [
    lil_matrix(sentence_one_hot[label]) for label in tqdm(train_label)
]

val_label_index = [
    lil_matrix(sentence_one_hot[label]) for label in tqdm(val_label)
]

test_label_index = [
    lil_matrix(sentence_one_hot[label]) for label in tqdm(test_label)
]

100%|██████████| 79315/79315 [00:21<00:00, 3692.67it/s]
100%|██████████| 8284/8284 [00:02<00:00, 3946.63it/s]
100%|██████████| 34726/34726 [00:09<00:00, 3797.92it/s]


In [20]:
# Sanity check: one encoded entry per source label across all splits.
num_encoded_labels = (
    len(train_label_index) + len(val_label_index) + len(test_label_index)
)
num_source_labels = len(train_label) + len(test_label) + len(val_label)

if num_encoded_labels != num_source_labels:
    print("Something is missing! Check again")
else:
    print("label encoding is completed!")

label encoding is completed!


In [21]:
# Bundle each split as a 4-tuple in the order
# (questions, answers, contexts, labels).
train_dataset = (
    train_question_index,
    train_answer_index,
    train_context_index,
    train_label_index,
)
val_dataset = (
    val_question_index,
    val_answer_index,
    val_context_index,
    val_label_index,
)
test_dataset = (
    test_question_index,
    test_answer_index,
    test_context_index,
    test_label_index,
)

In [22]:
# open(..., 'wb') does not create missing directories, so make sure the
# output directory exists before writing the training split.
os.makedirs('squad_preprocessd', exist_ok=True)
with open('squad_preprocessd/train_dataset.pkl', 'wb') as f:
    pickle.dump(train_dataset, f)

In [23]:
# open(..., 'wb') does not create missing directories, so make sure the
# output directory exists before writing the validation split.
os.makedirs('squad_preprocessd', exist_ok=True)
with open('squad_preprocessd/val_dataset.pkl', 'wb') as f:
    pickle.dump(val_dataset, f)

In [24]:
# open(..., 'wb') does not create missing directories, so make sure the
# output directory exists before writing the test split.
os.makedirs('squad_preprocessd', exist_ok=True)
with open('squad_preprocessd/test_dataset.pkl', 'wb') as f:
    pickle.dump(test_dataset, f)