In [None]:
!pip install transformers

In [None]:
import json
import random
import os
import torch
import numpy as np
import pandas as pd

In [None]:
def seed_everything(seed: int = 42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), exports ``PYTHONHASHSEED`` and configures cuDNN for
    reproducible runs.

    Args:
        seed: Value applied to all generators.

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Exported for child processes; the hash seed of the already-running
    # interpreter cannot be changed after start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one; a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different convolution
    # algorithms from run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

In [None]:
# Fix all RNG seeds once, before any data shuffling happens below.
seed_everything(seed=42)

In [None]:
# Directory prefixes of the two datasets (placeholders: replace with real
# locations). Keep a trailing path separator, because the JSON file names are
# appended to these strings further down.
DATA_PATH = "Your Path"  # directory holding train.json
DATA_PATH_2 = "Your Path"  # directory holding ko_nia_normal_squad_all.json

In [None]:
# Load the raw training JSON. os.path.join yields the same path when DATA_PATH
# ends with a separator and still produces a valid one when it does not.
with open(os.path.join(DATA_PATH, "train.json"), 'rb') as f:
    data_dict = json.load(f)

In [None]:
# Load the raw AI Hub JSON. os.path.join yields the same path when DATA_PATH_2
# ends with a separator and still produces a valid one when it does not.
with open(os.path.join(DATA_PATH_2, "ko_nia_normal_squad_all.json"), 'rb') as f:
    aihub_dict = json.load(f)

In [None]:
def read_data(path):
    """Flatten a SQuAD-style JSON file into parallel lists.

    The file is expected to hold ``data -> paragraphs -> qas -> answers``.
    One entry is emitted per answer, so a question with several answers
    contributes several rows sharing the same context and question.

    Args:
        path: Location of the JSON file.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists.
        Each answer is the dict object taken from the file, not a copy.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If one of the expected keys is missing.
    """
    # The progress bar is cosmetic: the notebook never imported tqdm, so import
    # it here and degrade to plain iteration when it is not installed.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    with open(path, 'rb') as f:
        data_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    # "news_category" is deliberately not read: it was collected but never
    # returned, so requiring it only made files without that key fail.
    for group in tqdm(data_dict['data']):
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

In [None]:
def add_end_idx(answers, contexts, max_shift=2):
    """Add ``answer_end`` to every answer dict, repairing small start offsets.

    For each answer the text is looked up in its context at ``answer_start``
    and, if it is not there, at up to ``max_shift`` characters earlier. On a
    match, ``answer_start`` is corrected and ``answer_end`` is set to the
    exclusive end index of the span.

    If no offset matches, ``answer_start`` is left untouched and ``answer_end``
    is set to ``answer_start + len(text)``, so the key always exists for
    downstream code. A single warning reports how many answers were affected.

    Args:
        answers: List of dicts with ``'text'`` and ``'answer_start'``;
            modified in place.
        contexts: Context strings, parallel to ``answers``.
        max_shift: Largest backward correction (in characters) to try.

    Returns:
        None. ``answers`` is updated in place.
    """
    import warnings

    unaligned = 0
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        span_len = len(gold_text)

        aligned_start = None
        for shift in range(max_shift + 1):
            candidate = start_idx - shift
            if candidate < 0:
                # A negative start would wrap around to the end of the string.
                break
            if context[candidate:candidate + span_len] == gold_text:
                aligned_start = candidate
                break

        if aligned_start is None:
            unaligned += 1
            answer['answer_end'] = start_idx + span_len
        else:
            answer['answer_start'] = aligned_start
            answer['answer_end'] = aligned_start + span_len

    if unaligned:
        warnings.warn(
            f"{unaligned} answer(s) could not be aligned with their context "
            f"within {max_shift} character(s); 'answer_end' was set from the "
            "unverified 'answer_start'."
        )

In [None]:
# Flatten the MRC training file and attach answer end offsets in place.
# os.path.join keeps the path valid whether or not DATA_PATH ends with a separator.
mrc_contexts, mrc_questions, mrc_answers = read_data(os.path.join(DATA_PATH, "train.json"))
add_end_idx(mrc_answers, mrc_contexts)

In [None]:
# Flatten the AI Hub file and attach answer end offsets in place.
# os.path.join keeps the path valid whether or not DATA_PATH_2 ends with a separator.
aihub_contexts, aihub_questions, aihub_answers = read_data(
    os.path.join(DATA_PATH_2, "ko_nia_normal_squad_all.json")
)
add_end_idx(aihub_answers, aihub_contexts)

In [None]:
# Positions of AI Hub examples whose context has fewer than 300
# whitespace-separated tokens.
short_indices = [
    position
    for position, context_text in enumerate(aihub_contexts)
    if len(context_text.split()) < 300
]

# Apply the same selection to all three parallel lists.
aihub_contexts_short = [aihub_contexts[position] for position in short_indices]
aihub_questions_short = [aihub_questions[position] for position in short_indices]
aihub_answers_short = [aihub_answers[position] for position in short_indices]

In [None]:
# Take the first (up to) 20000 short AI Hub examples and shuffle their order.
# The count is capped at the number of available examples so a smaller dataset
# no longer raises IndexError. The "_10000" suffix is kept because later cells
# refer to these names, even though up to 20000 examples are selected.
sample_size = min(20000, len(aihub_contexts_short))
indices = list(range(sample_size))
np.random.shuffle(indices)

aihub_contexts_10000 = [aihub_contexts_short[i] for i in indices]
aihub_questions_10000 = [aihub_questions_short[i] for i in indices]
aihub_answers_10000 = [aihub_answers_short[i] for i in indices]

In [None]:
# Final training lists: MRC examples first, followed by the AI Hub subset.
train_contexts = [*mrc_contexts, *aihub_contexts_10000]
train_questions = [*mrc_questions, *aihub_questions_10000]
train_answers = [*mrc_answers, *aihub_answers_10000]

# Inputs for CustomedDataset