# KorQuAD 데이터 다운로드 및 저장

In [None]:
!mkdir ./data
!wget https://korquad.github.io/dataset/KorQuAD_v1.0_train.json -P ./data
!wget https://korquad.github.io/dataset/KorQuAD_v1.0_dev.json -P ./data

--2023-01-11 13:00:54--  https://korquad.github.io/dataset/KorQuAD_v1.0_train.json
Resolving korquad.github.io (korquad.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to korquad.github.io (korquad.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 38527475 (37M) [application/json]
Saving to: ‘./data/KorQuAD_v1.0_train.json’


2023-01-11 13:00:55 (187 MB/s) - ‘./data/KorQuAD_v1.0_train.json’ saved [38527475/38527475]

--2023-01-11 13:00:55--  https://korquad.github.io/dataset/KorQuAD_v1.0_dev.json
Resolving korquad.github.io (korquad.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to korquad.github.io (korquad.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3881058 (3.7M) [application/json]
Saving to: ‘./data/KorQuAD_v1.0_dev.json’


2023-01-11 13:00:55 (53.1 MB/s) - ‘./data/KorQuAD_v1.0_dev.json’ saved [3881058/3881058]


# 트랜스포머 라이브러리 설치

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m93.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


# 패키지, 라이브러리, 유틸리티 함수

In [None]:
import argparse
import os
import logging
import glob
import json
import math
import numpy as np
import random
import re
import string
import sys
import timeit

from collections import defaultdict, Counter, namedtuple, OrderedDict
from functools import partial
from multiprocessing import Pool
from tqdm.notebook import tqdm, trange
from typing import List, Optional, Union, Tuple

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

from transformers import (
    AdamW,
    BertConfig,
    BertModel,
    BertPreTrainedModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
    WEIGHTS_NAME
)

from transformers.models.bert import BasicTokenizer

# Seed 설정
def set_seed(args):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        args: object exposing ``seed`` (the seed value) and ``n_gpu``.
            The CUDA generators are seeded only when ``args.n_gpu > 0``.
    """
    seed_value = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed_value)

# Pytorch Tensor를 List로 변환
def to_list(tensor: torch.Tensor) -> List:
    """Convert a PyTorch tensor into plain Python data.

    The tensor is detached from the autograd graph and moved to the CPU
    before ``tolist()`` is called, so tensors that require grad or live on
    another device are accepted as well.

    Args:
        tensor: the tensor to convert. (The annotation is ``torch.Tensor``,
            the tensor type; ``torch.tensor`` is only the factory function.)

    Returns:
        Whatever ``Tensor.tolist()`` returns for the detached CPU copy.
    """
    return tensor.detach().cpu().tolist()

# 데이터 타입 및 구조 확인

In [None]:
# Load the KorQuAD v1.0 training split downloaded into ./data.
with open(os.path.join('./data', 'KorQuAD_v1.0_train.json'), 'r', encoding='utf-8') as fin:
    train_data = json.load(fin)

# [Exercise]
# Inspect the type and the structure of the dataset.
#################### Exercise code ####################
# Top level: a dict holding 'version' and 'data'.
print(f"The type of the dataset: {type(train_data)}\n")
print(f"Keys of the train_data: {train_data.keys()}\n")

print(f"The type of the train_data['version']: {type(train_data['version'])}")
print(f"The type of the train_data['data']: {type(train_data['data'])}\n")

print(f"The value of the train_data['version']: {train_data['version']}\n")

print(f"The number of train_data['data']: {len(train_data['data'])}")
print("A value of the train_data['data']:")
# First entry of 'data'; it carries a 'title' and a list of 'paragraphs'.
first_article = train_data['data'][0]
print(f"{first_article}\n")

print(f"Keys of the train_data['data']: {first_article.keys()}\n")

print(f"The type of the train_data['data'][i]['title']: {type(first_article['title'])}")
print(f"The value of the train_data['data'][i]['title']: {first_article['title']}\n")

print(f"The type of the train_data['data'][i]['paragraphs']: {type(first_article['paragraphs'])}\n")
print(f"The number of train_data['data'][i]['paragraphs']: {len(first_article['paragraphs'])}")
print("A value of the train_data['data'][i]['paragraphs'][i]:")
# First paragraph of that entry; it carries a 'context' and its 'qas'.
first_paragraph = first_article['paragraphs'][0]
print(f"{first_paragraph}\n")

print(f"Keys of the train_data['data'][i]['paragraphs][i]: {first_paragraph.keys()}\n")
print(f"The type of the train_data['data'][0]['paragraphs'][0]['context']: {type(first_paragraph['context'])}")
print(f"The value of the train_data['data'][0]['paragraphs'][0]['context']:\n{first_paragraph['context']}\n")

print(f"The type of the train_data['data'][0]['paragraphs'][0]['qas']: {type(first_paragraph['qas'])}")
print(f"A value of the train_data['data'][i]['paragraphs'][i]['qas']:\n{first_paragraph['qas'][0]}\n")

# Print the first two contexts of the first entry one after the other.
print()
print(first_paragraph['context'])
print(first_article['paragraphs'][1]['context'])
###################################################

The type of the dataset: <class 'dict'>

Keys of the train_data: dict_keys(['version', 'data'])

The type of the train_data['version']: <class 'str'>
The type of the train_data['data']: <class 'list'>

The value of the train_data['version']: KorQuAD_v1.0_train

The number of train_data['data']: 1420
A value of the train_data['data']:
{'paragraphs': [{'qas': [{'answers': [{'text': '교향곡', 'answer_start': 54}], 'id': '6566495-0-0', 'question': '바그너는 괴테의 파우스트를 읽고 무엇을 쓰고자 했는가?'}, {'answers': [{'text': '1악장', 'answer_start': 421}], 'id': '6566495-0-1', 'question': '바그너는 교향곡 작곡을 어디까지 쓴 뒤에 중단했는가?'}, {'answers': [{'text': '베토벤의 교향곡 9번', 'answer_start': 194}], 'id': '6566495-0-2', 'question': '바그너가 파우스트 서곡을 쓸 때 어떤 곡의 영향을 받았는가?'}, {'answers': [{'text': '파우스트', 'answer_start': 15}], 'id': '6566518-0-0', 'question': '1839년 바그너가 교향곡의 소재로 쓰려고 했던 책은?'}, {'answers': [{'text': '합창교향곡', 'answer_start': 354}], 'id': '6566518-0-1', 'question': '파우스트 서곡의 라단조 조성이 영향을 받은 베토벤의 곡은?'}, {'answers': [{'text': '183

# 전처리: raw data -> examples -> features

# Raw 데이터 정형화 (raw data -> examples)
## class KorQuADExample
> 정형화된 데이터를 저장하는 클래스

In [None]:
class KorQuADExample(object):
    """One KorQuAD question together with its context split into words.

    Attributes:
        qas_id, question_text, context_text, answer_text, title, is_impossible:
            stored unchanged from the constructor arguments.
        answers: the list passed as ``answers``, or a fresh empty list when
            the argument is omitted (or ``None``).
        doc_tokens: the whitespace-separated words of ``context_text``.
        char_to_word_offset: one entry per character of ``context_text``
            giving the index in ``doc_tokens`` of the most recently started
            word (``-1`` for whitespace that precedes the first word).
        start_position, end_position: word indices of the answer span. Both
            stay ``0`` unless ``start_position_character`` is given and the
            example is not marked impossible.
    """

    def __init__(self, qas_id, question_text, context_text, answer_text, start_position_character, title, answers=None, is_impossible=False, ):
        self.qas_id = qas_id
        self.question_text = question_text
        self.context_text = context_text
        self.answer_text = answer_text
        self.title = title
        self.is_impossible = is_impossible
        # A literal ``[]`` default would be one list object shared by every
        # instance created without ``answers``; build a new list per instance.
        self.answers = [] if answers is None else answers

        self.start_position, self.end_position = 0, 0

        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        # [Exercise] Build doc_tokens and char_to_word_offset.
        #
        # e.g. for the context "오늘은 마지막 교육 날이다."
        #   doc_tokens          = ['오늘은', '마지막', '교육', '날이다.']
        #   char_to_word_offset = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
        for c in self.context_text:
            if self._is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    # First character after whitespace starts a new word.
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            # Whitespace characters are attributed to the word before them.
            char_to_word_offset.append(len(doc_tokens) - 1)

        self.doc_tokens = doc_tokens
        self.char_to_word_offset = char_to_word_offset

        # Train case: translate the character-level answer span to word indices.
        if start_position_character is not None and not is_impossible:
            self.start_position = char_to_word_offset[start_position_character]
            # Clamp so an answer running past the context ends on the last character.
            last_answer_char = min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
            self.end_position = char_to_word_offset[last_answer_char]

    def _is_whitespace(self, c):
        """Return True for space, tab, CR, LF or U+202F."""
        return c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F

## class KorQuADProcessor
> 데이터를 읽고, 정형화하는 클래스

In [None]:
from tqdm import tqdm

class KorQuADProcessor:
    """Reads KorQuAD JSON files and converts them into ``KorQuADExample`` lists."""

    def get_train_examples(self, data_dir, filename):
        """Load ``data_dir/filename`` and build examples in "train" mode."""
        return self._create_examples(self._load_entries(data_dir, filename), "train")

    def get_dev_examples(self, data_dir, filename):
        """Load ``data_dir/filename`` and build examples in "dev" mode."""
        return self._create_examples(self._load_entries(data_dir, filename), "dev")

    def _load_entries(self, data_dir, filename):
        """Return the ``"data"`` list of the UTF-8 JSON file at ``data_dir/filename``."""
        path = os.path.join(data_dir, filename)
        with open(path, "r", encoding="utf-8") as reader:
            return json.load(reader)["data"]

    def _create_examples(self, input_data, set_type):
        """Convert parsed JSON entries into ``KorQuADExample`` objects.

        Args:
            input_data: iterable of entries, each with ``"title"`` and
                ``"paragraphs"``; every paragraph has ``"context"`` and ``"qas"``.
            set_type: ``"train"`` keeps only the first answer (text and start
                character); any other value keeps the whole ``"answers"`` list.

        Returns:
            One ``KorQuADExample`` per question, in file order.
        """
        is_training = set_type == "train"
        examples = []
        # [Exercise] Normalise input_data into KorQuADExample objects.
        # c.f. is_impossible = input_data['data'][i]['paragraphs'][j]['qas'][k]['is_impossible']
        for entry in tqdm(input_data):
            title = entry["title"]
            for paragraph in entry["paragraphs"]:
                context_text = paragraph["context"]
                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    # A missing "is_impossible" key means the question is answerable.
                    is_impossible = qa.get("is_impossible", False)

                    answer_text, start_position_character, answers = None, None, []
                    if not is_impossible and is_training:
                        first_answer = qa["answers"][0]
                        answer_text = first_answer["text"]
                        start_position_character = first_answer["answer_start"]
                    elif not is_impossible:
                        answers = qa["answers"]

                    examples.append(
                        KorQuADExample(
                            qas_id=qas_id,
                            question_text=question_text,
                            context_text=context_text,
                            answer_text=answer_text,
                            start_position_character=start_position_character,
                            title=title,
                            is_impossible=is_impossible,
                            answers=answers,
                        )
                    )

        return examples

# 실습 결과 출력

In [None]:
# Load the dev split and keep only its first entry as a small sample.
with open(os.path.join('./data', 'KorQuAD_v1.0_dev.json'), 'r', encoding='utf-8') as fin:
    dev_data = json.load(fin)
sample_data = [dev_data['data'][0]]

# The sample is converted in 'train' mode, which fills answer_text and the
# start/end positions and leaves `answers` empty.
exp_preprocessor = KorQuADProcessor()
sample_example = exp_preprocessor._create_examples(sample_data, 'train')

print(len(sample_example))
print(type(sample_example))
print()

# Dump every field of the first example, one per line.
for attribute_name in (
    "qas_id",
    "question_text",
    "context_text",
    "answer_text",
    "title",
    "answers",
    "start_position",
    "end_position",
    "doc_tokens",
    "char_to_word_offset",
):
    print(getattr(sample_example[0], attribute_name))

100%|██████████| 1/1 [00:00<00:00, 387.57it/s]

11
<class 'list'>

6548850-0-0
임종석이 여의도 농민 폭력 시위를 주도한 혐의로 지명수배 된 날은?
1989년 2월 15일 여의도 농민 폭력 시위를 주도한 혐의(폭력행위등처벌에관한법률위반)으로 지명수배되었다. 1989년 3월 12일 서울지방검찰청 공안부는 임종석의 사전구속영장을 발부받았다. 같은 해 6월 30일 평양축전에 임수경을 대표로 파견하여 국가보안법위반 혐의가 추가되었다. 경찰은 12월 18일~20일 사이 서울 경희대학교에서 임종석이 성명 발표를 추진하고 있다는 첩보를 입수했고, 12월 18일 오전 7시 40분 경 가스총과 전자봉으로 무장한 특공조 및 대공과 직원 12명 등 22명의 사복 경찰을 승용차 8대에 나누어 경희대학교에 투입했다. 1989년 12월 18일 오전 8시 15분 경 서울청량리경찰서는 호위 학생 5명과 함께 경희대학교 학생회관 건물 계단을 내려오는 임종석을 발견, 검거해 구속을 집행했다. 임종석은 청량리경찰서에서 약 1시간 동안 조사를 받은 뒤 오전 9시 50분 경 서울 장안동의 서울지방경찰청 공안분실로 인계되었다.
1989년 2월 15일
임종석
[]
0
2
['1989년', '2월', '15일', '여의도', '농민', '폭력', '시위를', '주도한', '혐의(폭력행위등처벌에관한법률위반)으로', '지명수배되었다.', '1989년', '3월', '12일', '서울지방검찰청', '공안부는', '임종석의', '사전구속영장을', '발부받았다.', '같은', '해', '6월', '30일', '평양축전에', '임수경을', '대표로', '파견하여', '국가보안법위반', '혐의가', '추가되었다.', '경찰은', '12월', '18일~20일', '사이', '서울', '경희대학교에서', '임종석이', '성명', '발표를', '추진하고', '있다는', '첩보를', '입수했고,', '12월', '18일', '오전', '7시', '40분', '경', '가스총과', '전자봉으로', '무장한', '특공조', '및', '대공과', '직




# 데이터 전처리 및 텐서 데이터 구축 (examples -> features)

In [None]:
def whitespace_tokenize(text):
    """Strip ``text`` and split it on whitespace; blank input yields ``[]``."""
    stripped = text.strip()
    return stripped.split() if stripped else []

def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
    """Tighten a training answer span to wordpiece granularity.

    Searches the sub-spans of ``doc_tokens[input_start:input_end + 1]`` for one
    whose space-joined tokens equal the space-joined tokenization of
    ``orig_answer_text``. Candidates are tried with increasing start index
    and, for each start, decreasing end index.

    Returns:
        ``(start, end)`` of the first matching sub-span, or
        ``(input_start, input_end)`` when none matches.
    """
    target = " ".join(tokenizer.tokenize(orig_answer_text))
    candidates = (
        (start, end)
        for start in range(input_start, input_end + 1)
        for end in range(input_end, start - 1, -1)
    )
    matching = (
        (start, end)
        for start, end in candidates
        if " ".join(doc_tokens[start : end + 1]) == target
    )
    return next(matching, (input_start, input_end))

# context 내에 position이 있는지 판단
def _new_check_is_max_context(doc_spans, cur_span_index, position):
    """Tell whether span ``cur_span_index`` is the best-scoring span for ``position``.

    Every span (a mapping with ``"start"`` and ``"length"``) that contains
    ``position`` is scored as ``min(left context, right context)`` plus
    ``0.01 * length``. The first span with the highest score wins.

    Returns:
        ``True`` when ``cur_span_index`` equals the winning span index. If no
        span contains ``position`` the winner is ``None``.
    """
    scored_spans = []
    for span_index, doc_span in enumerate(doc_spans):
        span_start = doc_span["start"]
        span_end = span_start + doc_span["length"] - 1
        if position < span_start or position > span_end:
            continue
        context_margin = min(position - span_start, span_end - position)
        scored_spans.append((span_index, context_margin + 0.01 * doc_span["length"]))

    # max() keeps the first maximal item, so ties go to the earliest span.
    best_span_index = max(scored_spans, key=lambda pair: pair[1])[0] if scored_spans else None
    return cur_span_index == best_span_index

## class KorQuADFeatures

In [None]:
class KorQuADFeatures:
    """Plain container for one encoded window of a KorQuAD example.

    Every constructor argument is stored unchanged under the attribute of
    the same name. ``qas_id`` is optional and defaults to ``None``.
    """

    def __init__(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        cls_index,
        p_mask,
        example_index,
        unique_id,
        paragraph_len,
        token_is_max_context,
        tokens,
        token_to_orig_map,
        start_position,
        end_position,
        is_impossible,
        qas_id = None,
    ):
        # Model inputs.
        self.input_ids, self.attention_mask, self.token_type_ids = input_ids, attention_mask, token_type_ids
        self.cls_index, self.p_mask = cls_index, p_mask

        # Bookkeeping that links the window back to its example.
        self.example_index, self.unique_id = example_index, unique_id
        self.paragraph_len, self.token_is_max_context = paragraph_len, token_is_max_context
        self.tokens, self.token_to_orig_map = tokens, token_to_orig_map

        # Answer labels.
        self.start_position, self.end_position = start_position, end_position
        self.is_impossible, self.qas_id = is_impossible, qas_id

## def KorQuAD_convert_example_to_features

In [None]:
def KorQuAD_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training):
    """Turn one ``KorQuADExample`` into a list of ``KorQuADFeatures``.

    The context words are re-tokenized with the module-level ``tokenizer``
    (it is not a parameter of this function; it is the global assigned by
    ``KorQuAD_convert_example_to_features_init``) and then encoded together
    with the question into one or more windows of ``max_seq_length`` ids.
    Window ``k`` is recorded as starting at context sub-token ``k * doc_stride``.

    Args:
        example: object exposing ``doc_tokens``, ``question_text``,
            ``answer_text``, ``start_position``, ``end_position``,
            ``is_impossible`` and ``qas_id``.
        max_seq_length: padded length of every encoded window.
        doc_stride: offset, in context sub-tokens, between consecutive windows.
        max_query_length: the question is truncated to this many ids.
        is_training: when true, answer positions are computed for each window.

    Returns:
        A list of ``KorQuADFeatures`` (one per window) whose ``example_index``
        and ``unique_id`` are both 0, or ``[]`` when, in training mode, the
        answer text is not found inside the word span given by the example.
    """
    # NOTE(review): ``logger`` is not defined in the part of the file visible
    # here; it is expected to exist at module level before this runs.
    features = []
    if is_training and not example.is_impossible:
        # Get start and end position
        start_position = example.start_position
        end_position = example.end_position

        # If the answer cannot be found in the text, then skip this example.
        actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
        cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
        if actual_text.find(cleaned_answer_text) == -1:
            logger.warning(f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'")
            return []

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    # [Exercise]
    # Build all_doc_tokens, tok_to_orig_index and orig_to_tok_index, and use
    # _improve_answer_span to obtain wordpiece-level start/end positions when
    # the real answer and the word-level answer span do not coincide.
    #
    #   all_doc_tokens:    wordpiece tokens produced by the tokenizer
    #   tok_to_orig_index: for each wordpiece, the index of the word it came from
    #   orig_to_tok_index: for each word, the index of its first wordpiece
    #
    # e.g. "1989년 2월 15일 여의도 농민 폭력 시위를 주도한 혐의"
    #   all_doc_tokens = ['1989년', '2월', '15일', '여', '##의', '##도', '농', '##민', '폭', '##력', '시', '##위를', '주', '##도', '##한', '혐', '##의']
    #   tok_to_orig_index = [0, 1, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8]
    #   orig_to_tok_index = [0, 1, 2, 3, 6, 8, 10, 12, 15]
    for i, token in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    if is_training and not example.is_impossible:
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            # Last wordpiece of the end word = one before the next word's first wordpiece.
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1
        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
        )

    # From here on: cut the input to the BERT input length. Each iteration of
    # the loop below encodes one window of the context; the tokens that did not
    # fit ("overflowing_tokens") become the input of the next iteration.

    spans = []
    truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length)

    sequence_added_tokens = 2 # [CLS] context [SEP]
    sequence_pair_added_tokens = 3 # [CLS] context1 [SEP] context2 [SEP]

    span_doc_tokens = all_doc_tokens
    while len(spans) * doc_stride < len(all_doc_tokens):
        # NOTE(review): the ``stride`` value presumably makes consecutive windows
        # advance by ``doc_stride`` tokens, which is what the bookkeeping below
        # (``len(spans) * doc_stride``) assumes -- confirm against the tokenizer docs.
        encoded_dict = tokenizer.encode_plus(
            truncated_query,
            span_doc_tokens,
            truncation="only_second",
            padding="max_length",
            max_length=max_seq_length,
            return_overflowing_tokens=True,
            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
            return_token_type_ids=True,
        )

        # Number of context tokens in this window: whatever is left of the
        # context, capped by the room remaining after query and special tokens.
        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
        )

        # Keep only the ids before the first padding id.
        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
        else:
            non_padded_ids = encoded_dict["input_ids"]

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        # Map each context position in the window to the index of its original word.
        token_to_orig_map = {}
        for i in range(paragraph_len):
            index = len(truncated_query) + sequence_added_tokens + i
            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
        encoded_dict["token_is_max_context"] = {}
        encoded_dict["start"] = len(spans) * doc_stride
        encoded_dict["length"] = paragraph_len

        spans.append(encoded_dict)

        # Stop once nothing overflowed; otherwise encode the overflow next.
        if "overflowing_tokens" not in encoded_dict or (
            "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0
        ):
            break
        span_doc_tokens = encoded_dict["overflowing_tokens"]


    # For every context token of every window, record whether this window is
    # the best-scoring one for that token (see _new_check_is_max_context).
    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index]["paragraph_len"]):
            is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
            index = spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
            spans[doc_span_index]["token_is_max_context"][index] = is_max_context

    for span in spans:
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask starts as all ones, is zeroed from the first context position
        # onward, set back to one on padding and special tokens, and finally
        # zeroed at the [CLS] position.
        p_mask = np.ones_like(span["token_type_ids"])
        p_mask[len(truncated_query) + sequence_added_tokens :] = 0

        # Compare element-wise on an array: ``list == int`` is a single False,
        # so the previous ``np.where(span["input_ids"] == pad_id)`` never
        # selected any padding position.
        pad_token_indices = np.where(np.asarray(span["input_ids"]) == tokenizer.pad_token_id)
        special_token_indices = np.asarray(
            tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True)
        ).nonzero()

        p_mask[pad_token_indices] = 1
        p_mask[special_token_indices] = 1
        p_mask[cls_index] = 0

        span_is_impossible = example.is_impossible
        start_position = 0
        end_position = 0
        if is_training and not span_is_impossible:
            doc_start = span["start"]
            doc_end = span["start"] + span["length"] - 1
            out_of_span = False

            if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                out_of_span = True

            if out_of_span:
                # The answer is not fully inside this window: point both
                # positions at [CLS] and mark the window as impossible.
                start_position = cls_index
                end_position = cls_index
                span_is_impossible = True
            else:
                # Shift from context coordinates to positions in the window.
                doc_offset = len(truncated_query) + sequence_added_tokens

                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        features.append(
            KorQuADFeatures(
                span["input_ids"],
                span["attention_mask"],
                span["token_type_ids"],
                cls_index,
                p_mask.tolist(),
                example_index=0,
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_position=start_position,
                end_position=end_position,
                is_impossible=span_is_impossible,
                qas_id=example.qas_id,
            )
        )
    return features

## def KorQuAD_convert_examples_to_features

In [None]:
def KorQuAD_convert_example_to_features_init(tokenizer_for_convert):
    """Store ``tokenizer_for_convert`` in the module-level ``tokenizer`` global.

    ``KorQuAD_convert_example_to_features`` reads that global instead of
    taking a tokenizer argument. This function is passed as the
    ``initializer`` of the ``multiprocessing.Pool`` created in
    ``KorQuAD_convert_examples_to_features``.
    """
    global tokenizer
    tokenizer = tokenizer_for_convert

def KorQuAD_convert_examples_to_features(
    examples,
    tokenizer,
    max_seq_length,
    doc_stride,
    max_query_length,
    is_training,
):
    """Convert examples to features in a worker pool and pack them into tensors.

    Args:
        examples: sequence of examples accepted by
            ``KorQuAD_convert_example_to_features``.
        tokenizer: handed to every worker through the pool initializer.
        max_seq_length, doc_stride, max_query_length, is_training: forwarded
            unchanged to ``KorQuAD_convert_example_to_features``.

    Returns:
        ``(features, dataset)``. ``features`` is the flat list of features
        with ``example_index`` and ``unique_id`` filled in. ``dataset`` is a
        ``TensorDataset`` of (input_ids, attention_masks, token_type_ids,
        start_positions, end_positions, cls_index, p_mask, is_impossible)
        when ``is_training`` is true, otherwise of (input_ids,
        attention_masks, token_type_ids, feature_index, cls_index, p_mask).
    """
    convert_one = partial(
        KorQuAD_convert_example_to_features,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=is_training,
    )

    # Build the per-example feature lists with multiprocessing.
    with Pool(os.cpu_count(), initializer=KorQuAD_convert_example_to_features_init, initargs=(tokenizer,)) as pool:
        per_example_features = list(
            tqdm(
                pool.imap(convert_one, examples, chunksize=32),
                total=len(examples),
                desc="convert KorQuAD examples to features",
            )
        )

    # Flatten. Examples that produced no feature are skipped and do not
    # consume an example index; unique ids count up from 1000000000.
    features = []
    kept_examples = 0
    for example_features in tqdm(per_example_features, total=len(per_example_features), desc="add example index and unique id"):
        if example_features:
            for feature in example_features:
                feature.example_index = kept_examples
                feature.unique_id = 1000000000 + len(features)
                features.append(feature)
            kept_examples += 1

    # [Exercise] Convert to tensors and build the dataset.
    def _column(attribute, dtype):
        # One tensor holding `attribute` of every feature, in feature order.
        return torch.tensor([getattr(f, attribute) for f in features], dtype=dtype)

    all_input_ids = _column("input_ids", torch.long)
    all_attention_masks = _column("attention_mask", torch.long)
    all_token_type_ids = _column("token_type_ids", torch.long)
    all_cls_index = _column("cls_index", torch.long)
    all_p_mask = _column("p_mask", torch.float)
    all_is_impossible = _column("is_impossible", torch.float)

    if is_training:
        all_start_positions = _column("start_position", torch.long)
        all_end_positions = _column("end_position", torch.long)
        dataset = TensorDataset(
            all_input_ids,
            all_attention_masks,
            all_token_type_ids,
            all_start_positions,
            all_end_positions,
            all_cls_index,
            all_p_mask,
            all_is_impossible,
        )
    else:
        all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask)

    return features, dataset

# 실습 출력

In [None]:
# Convert a single sample example (the second one built earlier) into
# features and a tensor dataset, using the multilingual cased BERT tokenizer.
sample_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
sample_max_seq_length = 384  # passed as max_seq_length
sample_max_query_length = 64  # passed as max_query_length
sample_doc_stride = 128  # passed as doc_stride

# is_training=True, so the dataset also carries start/end position tensors.
sample_features, sample_dataset = KorQuAD_convert_examples_to_features(
    examples=[sample_example[1]],
    tokenizer=sample_tokenizer,
    max_seq_length=sample_max_seq_length,
    doc_stride=sample_doc_stride,
    max_query_length=sample_max_query_length,
    is_training=True,
)

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

convert KorQuAD examples to features: 100%|██████████| 1/1 [00:00<00:00, 81.67it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 1042.32it/s]


# 데이터 후처리

In [None]:
class KorQuADResult:
    """Start and end logits for one feature, keyed by the feature's ``unique_id``."""

    def __init__(self, unique_id, start_logits, end_logits):
        self.start_logits, self.end_logits = start_logits, end_logits
        self.unique_id = unique_id

In [None]:
def _get_best_indexes(logits, n_best_size):
    """Return the indexes of the ``n_best_size`` largest logits, best first.

    Equal logits keep their original relative order. Fewer indexes are
    returned when ``logits`` has fewer than ``n_best_size`` entries.
    """
    ranked = sorted(enumerate(logits), key=lambda pair: pair[1], reverse=True)
    return [index for rank, (index, _score) in enumerate(ranked) if rank < n_best_size]

def _compute_softmax(scores):
    """Compute softmax probabilities over raw logits.

    The largest score is subtracted before exponentiating. An empty (falsy)
    ``scores`` yields ``[]``.
    """
    if not scores:
        return []

    peak = max(scores)
    exp_scores = [math.exp(score - peak) for score in scores]

    # Accumulated left to right in a plain loop so the floating-point total
    # is exactly the one a sequential ``+=`` produces.
    total_sum = 0.0
    for value in exp_scores:
        total_sum += value

    return [value / total_sum for value in exp_scores]

def get_final_text(pred_text, orig_text, do_lower_case):
    """Project the tokenized prediction back to the original text.

    ``orig_text`` is tokenized with ``BasicTokenizer`` and ``pred_text`` is
    located in the space-joined result. Its first and last characters are
    then mapped back to ``orig_text`` by aligning both strings with their
    spaces removed.

    Returns:
        The slice of ``orig_text`` matching the prediction, or ``orig_text``
        unchanged when ``pred_text`` is not found, when the space-stripped
        strings differ in length, or when either end cannot be mapped.
    """

    def _strip_spaces(text):
        # Drop spaces, remembering where every kept character sat in `text`.
        kept = [(position, char) for position, char in enumerate(text) if char != " "]
        ns_to_s_map = OrderedDict(
            (ns_index, position) for ns_index, (position, _char) in enumerate(kept)
        )
        ns_text = "".join(char for _position, char in kept)
        return (ns_text, ns_to_s_map)

    basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    tok_text = " ".join(basic_tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        return orig_text
    end_position = start_position + len(pred_text) - 1

    orig_ns_text, orig_ns_to_s_map = _strip_spaces(orig_text)
    tok_ns_text, tok_ns_to_s_map = _strip_spaces(tok_text)

    # The character-by-character alignment only holds for equal lengths.
    if len(orig_ns_text) != len(tok_ns_text):
        return orig_text

    # Invert the map: position in tok_text -> position in the stripped text.
    tok_s_to_ns_map = {tok_index: ns_index for ns_index, tok_index in tok_ns_to_s_map.items()}

    def _to_orig_position(tok_position):
        # tok_text position -> stripped position -> orig_text position, or None.
        ns_position = tok_s_to_ns_map.get(tok_position)
        if ns_position is None:
            return None
        return orig_ns_to_s_map.get(ns_position)

    orig_start_position = _to_orig_position(start_position)
    if orig_start_position is None:
        return orig_text

    orig_end_position = _to_orig_position(end_position)
    if orig_end_position is None:
        return orig_text

    return orig_text[orig_start_position : (orig_end_position + 1)]


def compute_predictions_logits(
    all_examples,
    all_features,
    all_results,
    n_best_size,
    max_answer_length,
    do_lower_case,
    output_prediction_file,
    output_nbest_file,
    tokenizer,
):
    """Convert start/end logits into a best answer and an n-best list per example.

    Args:
        all_examples: Sequence of examples exposing ``doc_tokens`` and ``qas_id``.
            An example's position in this sequence is matched against
            ``feature.example_index``.
        all_features: Features exposing ``example_index``, ``unique_id``,
            ``tokens``, ``token_to_orig_map`` and ``token_is_max_context``.
        all_results: Model outputs exposing ``unique_id``, ``start_logits`` and
            ``end_logits``.
        n_best_size: Number of best start/end indexes requested per feature, and
            the maximum number of entries kept in each n-best list.
        max_answer_length: Maximum allowed span length, counted in tokens.
        do_lower_case: Forwarded to ``get_final_text``.
        output_prediction_file: Path for the predictions JSON; falsy to skip.
        output_nbest_file: Path for the n-best JSON; falsy to skip.
        tokenizer: Object providing ``convert_tokens_to_string``.

    Returns:
        ``OrderedDict`` mapping each ``qas_id`` to its predicted answer text.
        The text is taken from the highest-ranked n-best entry with non-empty
        text; if every entry has empty text, the top entry's (empty) text is used.
    """
    if output_prediction_file:
        logger.info(f"Writing predictions to: {output_prediction_file}")
    if output_nbest_file:
        logger.info(f"Writing nbest to: {output_nbest_file}")

    example_index_to_features = defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {result.unique_id: result for result in all_results}

    # Record types are built once instead of once per example.
    _PrelimPrediction = namedtuple(
        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
    )
    _NbestPrediction = namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"])

    all_predictions = OrderedDict()
    all_nbest_json = OrderedDict()

    for example_index, example in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        for feature_index, feature in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            num_tokens = len(feature.tokens)
            for start_index in start_indexes:
                # We could hypothetically create invalid predictions, e.g., predict
                # that the start of the span is in the question. We throw out all
                # invalid predictions. Checks that depend only on the start index
                # are evaluated once per start rather than once per (start, end).
                if start_index >= num_tokens:
                    continue
                if start_index not in feature.token_to_orig_map:
                    continue
                if not feature.token_is_max_context.get(start_index, False):
                    continue
                for end_index in end_indexes:
                    if end_index >= num_tokens:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if end_index < start_index:
                        continue
                    if end_index - start_index + 1 > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index],
                        )
                    )

        # Highest combined logit first; the sort is stable, so ties keep insertion order.
        prelim_predictions.sort(key=lambda x: (x.start_logit + x.end_logit), reverse=True)

        seen_predictions = set()
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]

                tok_text = tokenizer.convert_tokens_to_string(tok_tokens)

                # Clean whitespace
                tok_text = " ".join(tok_text.strip().split())
                orig_text = " ".join(orig_tokens)

                final_text = get_final_text(tok_text, orig_text, do_lower_case)
                # Different token spans can project onto the same text; keep the best one.
                if final_text in seen_predictions:
                    continue
            else:
                final_text = ""

            seen_predictions.add(final_text)
            nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))

        # Guarantee at least one entry so the softmax and the lookup below are defined.
        if not nbest:
            nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        probs = _compute_softmax([entry.start_logit + entry.end_logit for entry in nbest])

        nbest_json = []
        for entry, prob in zip(nbest, probs):
            output = OrderedDict()
            output["text"] = entry.text
            output["probability"] = prob
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            nbest_json.append(output)

        # Prefer the best entry with non-empty text. When every entry is empty,
        # fall back to the top entry instead of dereferencing None.
        best_non_null_entry = next((entry for entry in nbest if entry.text), nbest[0])

        all_predictions[example.qas_id] = best_non_null_entry.text
        all_nbest_json[example.qas_id] = nbest_json

    if output_prediction_file:
        with open(output_prediction_file, "w", encoding='utf-8') as writer:
            json.dump(all_predictions, writer, indent='\t', ensure_ascii=False)

    if output_nbest_file:
        with open(output_nbest_file, "w", encoding='utf-8') as writer:
            json.dump(all_nbest_json, writer, indent='\t', ensure_ascii=False)

    return all_predictions

In [None]:
def normalize_answer(s):
    """Normalize an answer string before EM/F1 comparison.

    Steps, applied in this order:
      1. replace quote and bracket characters with a space,
      2. lowercase,
      3. drop every character in ``string.punctuation``,
      4. collapse runs of whitespace and trim both ends.

    Args:
        s: Raw answer text.

    Returns:
        The normalized text.
    """
    # Characters turned into a space (not deleted) so the words they separate
    # stay separate. A single translate pass replaces the chain of regex
    # substitutions and avoids the invalid "\(" / "\)" escapes in plain strings.
    separators = "'\"《》<>〈〉()‘’"
    text = s.translate(str.maketrans(separators, " " * len(separators)))

    text = text.lower()

    ascii_punctuation = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in ascii_punctuation)

    return ' '.join(text.split())

def f1_score(prediction, ground_truth):
    """Character-level F1 between a prediction and one reference answer.

    Both strings are passed through ``normalize_answer`` and split on
    whitespace; the characters of the resulting tokens form two multisets.

        precision = |pred & truth| / |pred|
        recall    = |pred & truth| / |truth|

    Returns:
        The harmonic mean of precision and recall, or ``0`` when the two
        multisets share no character.
    """
    pred_chars = [ch for token in normalize_answer(prediction).split() for ch in token]
    truth_chars = [ch for token in normalize_answer(ground_truth).split() for ch in token]

    # Multiset intersection: each character counts min(pred count, truth count) times.
    overlap = sum((Counter(pred_chars) & Counter(truth_chars)).values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(pred_chars)
    recall = 1.0 * overlap / len(truth_chars)
    return (2 * precision * recall) / (precision + recall)

def exact_match_score(prediction, ground_truth):
    """Return True when prediction and reference are equal after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth

def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score ``prediction`` against every reference and return the best score.

    Args:
        metric_fn: Callable taking ``(prediction, ground_truth)`` and returning a score.
        prediction: Predicted answer.
        ground_truths: Iterable of reference answers.

    Raises:
        ValueError: If ``ground_truths`` is empty.
    """
    return max([metric_fn(prediction, reference) for reference in ground_truths])

def KorQuAD_evaluate(examples, predictions):
    """Compute corpus-level exact-match and F1 percentages.

    Args:
        examples: Iterable of examples exposing ``qas_id`` and ``answers``
            (each answer is indexed with ``"text"``).
        predictions: Mapping from ``qas_id`` to predicted answer text.

    Returns:
        ``{'exact_match': float, 'f1': float}`` with both values in percent.
        Examples without a prediction count toward the total with score 0 and
        are reported on stderr. An empty ``examples`` yields 0.0 for both
        metrics instead of dividing by zero.
    """
    f1 = exact_match = total = 0
    for cnt, example in enumerate(examples):
        total += 1
        qas_id = example.qas_id
        if qas_id not in predictions:
            message = 'Unanswered question ' + qas_id + ' will receive score 0.'
            print(message, file=sys.stderr)
            continue

        ground_truths = [answer["text"] for answer in example.answers]
        prediction = predictions[qas_id]

        # Compute each metric once and reuse the value for the sample log below.
        example_em = metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
        example_f1 = metric_max_over_ground_truths(f1_score, prediction, ground_truths)
        exact_match += example_em
        f1 += example_f1

        # Log the first example and one fixed question id as samples.
        if cnt == 0 or qas_id == "6332405-1-0":
            logger.info("ground truths: {}".format(ground_truths))
            logger.info("prediction: {}".format(prediction))
            logger.info("F1: {:.3f} || EM: {:.3f}\n".format(example_f1, example_em))

    if total == 0:
        return {'exact_match': 0.0, 'f1': 0.0}

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

# 데이터 로딩
## def load_and_cache_examples
> 정형화된 데이터가 저장되어 있는 경우 이를 불러오고,
> 그렇지 않을 경우 데이터를 정형화한 후 저장(caching) 및 반환하는 함수

In [None]:
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Return the dataset for one split, building and caching it when needed.

    The cache file is placed in ``args.data_dir`` (``"."`` when unset) and named
    after the split, the last non-empty path component of
    ``args.model_name_or_path`` and ``args.max_seq_length``. It is reused unless
    ``args.overwrite_cache`` is set.

    Args:
        args: Namespace providing ``data_dir``, ``model_name_or_path``,
            ``max_seq_length``, ``overwrite_cache``, ``doc_stride``,
            ``max_query_length``, ``train_file`` and ``predict_file``.
        tokenizer: Forwarded to ``KorQuAD_convert_examples_to_features``.
        evaluate: Build the dev split when True, the train split otherwise.
        output_examples: Also return the examples and features.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is True.
    """
    input_dir = args.data_dir if args.data_dir else "."
    split_name = "dev" if evaluate else "train"
    model_tag = list(filter(None, args.model_name_or_path.split("/"))).pop()
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(split_name, model_tag, str(args.max_seq_length)),
    )

    cache_usable = os.path.exists(cached_features_file) and not args.overwrite_cache
    if cache_usable:
        logger.info("Loading features from cached file %s", cached_features_file)
        cached = torch.load(cached_features_file)
        features = cached["features"]
        dataset = cached["dataset"]
        examples = cached["examples"]
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        # Exercise: build examples, features and dataset with the preprocessing
        # classes and functions defined for this task.
        processor = KorQuADProcessor()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
            print("eval: ", len(examples))
            examples = examples[:5000]  # use only the first 5000 examples
        else:
            examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
            print("train: ", len(examples))
            examples = examples[:10000]  # use only the first 10000 examples

        features, dataset = KorQuAD_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )

        logger.info("Saving features into cached file %s", cached_features_file)
        payload = {"features": features, "dataset": dataset, "examples": examples}
        torch.save(payload, cached_features_file)

    if not output_examples:
        return dataset
    return dataset, examples, features

# 모델 정의: class BertForKorQuAD

In [None]:
class BertForKorQuAD(BertPreTrainedModel):
    """``BertModel`` encoder followed by a per-token linear layer for span prediction."""

    def __init__(self, config):
        super().__init__(config)

        # Encoder built without the pooling layer; forward() only reads the
        # per-token hidden states.
        self.bert = BertModel(config, add_pooling_layer=False)

        # Linear head giving `config.num_labels` scores per token. forward()
        # unpacks them into exactly two tensors (start, end), so num_labels is
        # expected to be 2.
        self.korquad_outputs = nn.Linear(config.hidden_size, config.num_labels)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor]:
        """Compute start/end logits for every token, plus the loss when labels are given.

        The encoder's first output (the per-token hidden states) is fed through
        the linear head, and the last dimension is split into start and end
        logits.

        Returns:
            ``(start_logits, end_logits) + encoder_outputs[2:]``, prefixed with
            ``total_loss`` when both ``start_positions`` and ``end_positions``
            are provided. ``total_loss`` is the mean of the start and end
            cross-entropy losses.

        See https://huggingface.co/docs/transformers/model_doc/bert
        """
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        token_states = encoder_outputs[0]

        span_logits = self.korquad_outputs(token_states)
        start_logits, end_logits = span_logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        have_labels = start_positions is not None and end_positions is not None
        if have_labels:
            if start_positions.dim() > 1:
                start_positions = start_positions.squeeze(-1)
            if end_positions.dim() > 1:
                end_positions = end_positions.squeeze(-1)

            # Positions outside the model input are clamped to an index one past
            # the last valid position, which the loss is told to ignore.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        span_outputs = (start_logits, end_logits) + encoder_outputs[2:]
        if total_loss is None:
            return span_outputs
        return (total_loss,) + span_outputs

# Evaluate 함수 정의

In [None]:
def evaluate(args, model, tokenizer, prefix=""):
    """Run ``model`` over the dev split and return the KorQuAD EM/F1 scores.

    Args:
        args: Namespace providing ``output_dir``, ``eval_batch_size``, ``device``,
            ``n_best_size``, ``max_answer_length`` and ``do_lower_case`` (plus
            whatever ``load_and_cache_examples`` reads).
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; its output must unpack into start and end logits.
        tokenizer: Forwarded to data loading and to prediction decoding.
        prefix: Tag inserted into the log line and the output file names.

    Returns:
        The dict returned by ``KorQuAD_evaluate``.
    """
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    eval_dataloader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=args.eval_batch_size,
    )

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    started_at = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        # batch[3] holds, per row, the index of the feature the row was built from.
        feature_indices = batch[3]
        with torch.no_grad():
            outputs = model(
                input_ids=batch[0],
                attention_mask=batch[1],
                token_type_ids=batch[2],
            )

        for row, feature_index in enumerate(feature_indices):
            unique_id = int(features[feature_index.item()].unique_id)
            start_logits, end_logits = [to_list(tensor[row]) for tensor in outputs]
            all_results.append(KorQuADResult(unique_id, start_logits, end_logits))

    elapsed = timeit.default_timer() - started_at
    logger.info("  Evaluation done in total %f secs (%f sec per example)", elapsed, elapsed / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    return KorQuAD_evaluate(examples, predictions)

# Train 함수 정의

In [None]:
def train(args, train_dataset, model, tokenizer):
    """Fine-tune ``model`` on ``train_dataset``, evaluating and checkpointing after every epoch.

    Args:
        args: Namespace providing ``train_batch_size``,
            ``gradient_accumulation_steps``, ``num_train_epochs``,
            ``weight_decay``, ``learning_rate``, ``adam_epsilon``,
            ``warmup_steps``, ``max_grad_norm``, ``device`` and ``output_dir``.
        train_dataset: Dataset whose batches provide, in order, input ids,
            attention mask, token type ids, start positions and end positions.
        model: Model whose first output is the loss when positions are supplied.
        tokenizer: Saved next to each checkpoint and forwarded to ``evaluate``.

    Returns:
        ``(global_step, average_loss)`` where ``average_loss`` is the accumulated
        loss divided by the number of optimizer steps. When no optimizer step
        was taken, the accumulated loss is returned undivided.
    """
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule; parameters whose name contains one of the
    # `no_decay` fragments are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info(
        "  Total train batch size (w. accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss = 0.0

    model.zero_grad()
    set_seed(args)
    for _ in trange(int(args.num_train_epochs), position=0, desc="Epoch..."):
        # evaluate() at the end of each epoch switches the model to eval mode,
        # so training mode is re-enabled at the start of every epoch.
        model.train()
        for step, batch in enumerate(tqdm(train_dataloader, position=0, desc="Iteration...")):
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }
            outputs = model(**inputs)
            loss = outputs[0]

            # Scale so the accumulated gradient matches one large-batch step.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

        # Log metrics and save a model checkpoint after each epoch.
        results = evaluate(args, model, tokenizer)
        logger.info("***** Evaluation result *****")
        for key, value in results.items():
            logger.info("eval_{}: {}".format(key, value))

        output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
        # exist_ok: a re-run into the same output directory reaches the same
        # checkpoint names and must not fail on the existing folder.
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

        torch.save(args, os.path.join(output_dir, "training_args.bin"))
        logger.info("Saving model checkpoint to %s", output_dir)

        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
        logger.info("Saving optimizer and scheduler states to %s", output_dir)

    # Guard against division by zero when no optimizer step was taken
    # (e.g. fewer batches than gradient_accumulation_steps).
    return global_step, tr_loss / max(global_step, 1)

# Main 함수 정의

In [None]:
def main(args):
    """Train and/or evaluate ``BertForKorQuAD`` according to ``args``.

    Sets ``args.device`` and ``args.n_gpu``, seeds the random generators, loads
    the configuration, tokenizer and model, then runs training when
    ``args.do_train`` is set and evaluation when ``args.do_eval`` is set.

    Returns:
        Dict of evaluation metrics (empty when evaluation is not run). Metric
        names get a ``_<step>`` suffix when more than one checkpoint is evaluated.

    Raises:
        ValueError: If training is requested into an existing, non-empty output
            directory without ``args.overwrite_output_dir``.
    """
    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    output_dir_in_use = os.path.exists(args.output_dir) and os.listdir(args.output_dir)
    if output_dir_in_use and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    use_cuda = torch.cuda.is_available() and not args.no_cuda
    args.device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    logger.warning("Process device: %s, n_gpu: %s", args.device, args.n_gpu)

    # Set seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Load BERT configurations
    config = BertConfig.from_pretrained(args.model_name_or_path)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case)
    model = BertForKorQuAD.from_pretrained(args.model_name_or_path, config=config)
    model.to(args.device)

    # Train
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
        steps_done, mean_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", steps_done, mean_loss)

        # Save the trained model and the tokenizer
        logger.info("Saving model checkpoint to %s", args.output_dir)
        model.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Reload what was just written so evaluation uses the saved artifacts.
        model = BertForKorQuAD.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluate
    results = {}
    if args.do_eval:
        logger.info("Loading checkpoints saved during training for evaluation")
        checkpoints = [args.output_dir]
        if args.do_train and args.eval_all_checkpoints:
            # Every directory under output_dir that contains a weights file.
            weight_files = sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            checkpoints = [os.path.dirname(path) for path in weight_files]

        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # The text after the last "-" in the checkpoint path tags the
            # results; it is only used when several checkpoints are evaluated.
            step_tag = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""

            # Reload the model
            model = BertForKorQuAD.from_pretrained(checkpoint)
            model.to(args.device)

            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=step_tag)

            suffix = "_{}".format(step_tag) if step_tag else ""
            results.update({key + suffix: value for key, value in result.items()})

    logger.info("Results: {}".format(results))

    return results


# Arguments Parsing과 Main 함수 실행

In [None]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Every option is declared as (flag, add_argument keywords) and registered
    # in listing order, which is also the order in which the parsed values are
    # logged below.
    argument_specs = [
        # Arguments for several paths or file names
        (
            "--model_name_or_path",
            dict(
                type=str,
                default='bert-base-multilingual-cased',
                help="Path to pretrained model or model identifier from huggingface.co/models",
            ),
        ),
        (
            "--data_dir",
            dict(
                type=str,
                default="./data",
                help="The input data dir. Should contain the .json files for the task.",
            ),
        ),
        (
            "--output_dir",
            dict(
                type=str,
                default="./outputs/",
                help="The output directory where the model checkpoints and predictions will be written.",
            ),
        ),
        (
            "--train_file",
            dict(
                type=str,
                default="KorQuAD_v1.0_train.json",
                help="The input training file. If a data dir is specified, will look for the file there",
            ),
        ),
        (
            "--predict_file",
            dict(
                type=str,
                default="KorQuAD_v1.0_dev.json",
                help="The input evaluation file. If a data dir is specified, will look for the file there",
            ),
        ),
        # Hyperparameters
        ("--seed", dict(type=int, default=42, help="random seed for initialization")),
        (
            "--max_seq_length",
            dict(
                type=int,
                default=384,
                help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                "longer than this will be truncated, and sequences shorter than this will be padded.",
            ),
        ),
        (
            "--doc_stride",
            dict(
                type=int,
                default=128,
                help="When splitting up a long document into chunks, how much stride to take between chunks.",
            ),
        ),
        (
            "--max_query_length",
            dict(
                type=int,
                default=64,
                help="The maximum number of tokens for the question. Questions longer than this will "
                "be truncated to this length.",
            ),
        ),
        ("--train_batch_size", dict(type=int, default=8, help="Batch size GPU/CPU for training.")),
        ("--eval_batch_size", dict(type=int, default=8, help="Batch size GPU/CPU for evaluation.")),
        ("--learning_rate", dict(type=float, default=3e-5, help="The initial learning rate for Adam.")),
        (
            "--gradient_accumulation_steps",
            dict(
                type=int,
                default=1,
                help="Number of updates steps to accumulate before performing a backward/update pass.",
            ),
        ),
        ("--weight_decay", dict(type=float, default=0.0, help="Weight decay if we apply some.")),
        ("--adam_epsilon", dict(type=float, default=1e-8, help="Epsilon for Adam optimizer.")),
        ("--max_grad_norm", dict(type=float, default=1.0, help="Max gradient norm.")),
        (
            "--num_train_epochs",
            dict(type=float, default=3.0, help="Total number of training epochs to perform."),
        ),
        ("--warmup_steps", dict(type=int, default=0, help="Linear warmup over warmup_steps.")),
        (
            "--n_best_size",
            dict(
                type=int,
                default=20,
                help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
            ),
        ),
        (
            "--max_answer_length",
            dict(
                type=int,
                default=30,
                help="The maximum length of an answer that can be generated. This is needed because the start "
                "and end predictions are not conditioned on one another.",
            ),
        ),
        # Actions. Flags declared with default=True are already on without
        # being passed, which is what the argument-free parse below relies on.
        ("--do_train", dict(action="store_true", default=True, help="Whether to run training.")),
        ("--do_eval", dict(action="store_true", default=True, help="Whether to run eval on the dev set.")),
        (
            "--do_lower_case",
            dict(action="store_true", default=False, help="Set this flag if you are using an uncased model."),
        ),
        (
            "--eval_all_checkpoints",
            dict(
                action="store_true",
                help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
            ),
        ),
        ("--no_cuda", dict(action="store_true", default=False, help="Whether not to use CUDA when available")),
        (
            "--overwrite_output_dir",
            dict(action="store_true", default=True, help="Overwrite the content of the output directory"),
        ),
        (
            "--overwrite_cache",
            dict(action="store_true", default=False, help="Overwrite the cached training and evaluation sets"),
        ),
    ]
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)

    # An explicit empty argument list: only the defaults above are used.
    args = parser.parse_args(args=[])

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    # Module-level logger used by the functions defined above.
    logger = logging.getLogger(__name__)

    # Logging hyperparameters
    logger.info("Training and evaluation parameters")
    for name, value in vars(args).items():
        logger.info("{}: {}".format(name, value))

    main(args)



Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForKorQuAD: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.predictions.transform.dense.bias', 'bert.pooler.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForKorQuAD from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForKorQuAD from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForKorQuAD were not initialized from the model checkpoi

train:  60407


convert KorQuAD examples to features: 100%|██████████| 10000/10000 [01:09<00:00, 143.26it/s]
add example index and unique id: 100%|██████████| 10000/10000 [00:00<00:00, 831823.57it/s]


Epoch...:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration...: 100%|██████████| 1660/1660 [16:59<00:00,  1.63it/s]

  0%|          | 0/140 [00:00<?, ?it/s][A
  9%|▉         | 13/140 [00:00<00:00, 129.11it/s][A
 19%|█▊        | 26/140 [00:00<00:01, 107.17it/s][A
 27%|██▋       | 38/140 [00:00<00:00, 110.39it/s][A
 39%|███▉      | 55/140 [00:00<00:00, 97.02it/s] [A
 47%|████▋     | 66/140 [00:00<00:00, 80.10it/s][A
 54%|█████▎    | 75/140 [00:00<00:00, 70.01it/s][A
 61%|██████    | 85/140 [00:01<00:00, 72.12it/s][A
 69%|██████▊   | 96/140 [00:01<00:00, 77.88it/s][A
 79%|███████▉  | 111/140 [00:01<00:00, 91.97it/s][A
 88%|████████▊ | 123/140 [00:01<00:00, 98.60it/s][A
100%|██████████| 140/140 [00:01<00:00, 82.65it/s]

eval:  5774




convert KorQuAD examples to features:   0%|          | 0/5000 [00:00<?, ?it/s][A
convert KorQuAD examples to features:   0%|          | 1/5000 [00:00<27:46,  3.00it/s][A
convert KorQuAD examples to features:   1%|          | 33/5000 [00:00<00:54, 90.76it/s][A
convert KorQuAD examples to features:   1%|▏         | 65/5000 [00:01<01:12, 68.30it/s][A
convert KorQuAD examples to features:   3%|▎         | 129/5000 [00:01<00:37, 128.62it/s][A
convert KorQuAD examples to features:   3%|▎         | 161/5000 [00:01<00:32, 151.05it/s][A
convert KorQuAD examples to features:   4%|▍         | 193/5000 [00:01<00:35, 133.96it/s][A
convert KorQuAD examples to features:   5%|▌         | 257/5000 [00:02<00:32, 144.69it/s][A
convert KorQuAD examples to features:   6%|▋         | 321/5000 [00:02<00:29, 156.84it/s][A
convert KorQuAD examples to features:   7%|▋         | 353/5000 [00:02<00:26, 175.82it/s][A
convert KorQuAD examples to features:   8%|▊         | 385/5000 [00:02<00:31, 144.57it