# 트랜스포머 라이브러리 설치

In [1]:
!pip install transformers



# KorQuAD 데이터 다운로드 및 저장

In [1]:
!mkdir ./data
!wget https://korquad.github.io/dataset/KorQuAD_v1.0_train.json -P ./data
!wget https://korquad.github.io/dataset/KorQuAD_v1.0_dev.json -P ./data

--2023-12-12 18:01:17--  https://korquad.github.io/dataset/KorQuAD_v1.0_train.json
Resolving korquad.github.io (korquad.github.io)... 185.199.110.153, 185.199.108.153, 185.199.109.153, ...
Connecting to korquad.github.io (korquad.github.io)|185.199.110.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 38527475 (37M) [application/json]
Saving to: ‘./data/KorQuAD_v1.0_train.json’


2023-12-12 18:01:18 (108 MB/s) - ‘./data/KorQuAD_v1.0_train.json’ saved [38527475/38527475]

--2023-12-12 18:01:18--  https://korquad.github.io/dataset/KorQuAD_v1.0_dev.json
Resolving korquad.github.io (korquad.github.io)... 185.199.111.153, 185.199.109.153, 185.199.108.153, ...
Connecting to korquad.github.io (korquad.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3881058 (3.7M) [application/json]
Saving to: ‘./data/KorQuAD_v1.0_dev.json’


2023-12-12 18:01:18 (92.8 MB/s) - ‘./data/KorQuAD_v1.0_dev.json’ saved [3881058/3881058]


# 패키지, 라이브러리, 유틸리티 함수

In [1]:
import argparse
import os
import logging
import glob
import json
import math
import numpy as np
import random
import re
import string
import sys
import timeit

from collections import defaultdict, Counter, namedtuple, OrderedDict
from functools import partial
from multiprocessing import Pool
from tqdm.notebook import tqdm, trange
from typing import List, Optional, Union, Tuple

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

from transformers import (
    AdamW,
    BertConfig,
    BertModel,
    BertPreTrainedModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
    WEIGHTS_NAME
)

from transformers.models.bert import BasicTokenizer

# Seed 설정
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

# Pytorch Tensor를 List로 변환
def to_list(tensor: torch.tensor) -> List:
    return tensor.detach().cpu().tolist()

# 데이터 타입 및 구조 확인

In [2]:
with open(os.path.join('./data', 'KorQuAD_v1.0_train.json'), 'r', encoding='utf-8') as fin:
    train_data = json.load(fin)

with open(os.path.join('./data', 'KorQuAD_v1.0_dev.json'), 'r', encoding='utf-8') as fin:
    dev_data = json.load(fin)
"""
[실습]
데이터의 타입 및 구조를 확인하시오.
"""
#################### 실습 코드 1 ##################
print("The type of the dataset: {}\n".format(type(train_data)))
print("Keys of the train_data: {}\n".format(train_data.keys()))

print("The type of the train_data['version']: {}".format(type(train_data['version'])))
print("The type of the train_data['data']: {}\n".format(type(train_data['data'])))

print("The value of the train_data['version']: {}\n".format(train_data['version']))

print("The number of train_data['data']: {}".format(len(train_data['data'])))
print("A value of the train_data['data']:")
print("{}\n".format(train_data['data'][0]))

print("Keys of the train_data['data']: {}\n".format(train_data['data'][0].keys()))

print("The type of the train_data['data'][i]['title']: {}".format(type(train_data['data'][0]['title'])))
print("The value of the train_data['data'][i]['title']: {}\n".format(train_data['data'][0]['title']))

print("The type of the train_data['data'][i]['paragraphs']: {}\n".format(type(train_data['data'][0]['paragraphs'])))
print("The number of train_data['data'][i]['paragraphs']: {}".format(len(train_data['data'][0]['paragraphs'])))
print("A value of the train_data['data'][i]['paragraphs'][i]:")
print("{}\n".format(train_data['data'][0]['paragraphs'][0]))

print("Keys of the train_data['data'][i]['paragraphs][i]: {}\n".format(train_data['data'][0]['paragraphs'][0].keys()))
print("The type of the train_data['data'][0]['paragraphs'][0]['context']: {}".format(type(train_data['data'][0]['paragraphs'][0]['context'])))
print("The value of the train_data['data'][0]['paragraphs'][0]['context']:\n{}\n".format(train_data['data'][0]['paragraphs'][0]['context']))

print("The type of the train_data['data'][0]['paragraphs'][0]['qas']: {}".format(type(train_data['data'][0]['paragraphs'][0]['qas'])))
print("A value of the train_data['data'][i]['paragraphs'][i]['qas']:\n{}\n".format(train_data['data'][0]['paragraphs'][0]['qas'][0]))

print()
print(train_data['data'][0]['paragraphs'][0]['context'])
print(train_data['data'][0]['paragraphs'][1]['context'])
###################################################

The type of the dataset: <class 'dict'>

Keys of the train_data: dict_keys(['version', 'data'])

The type of the train_data['version']: <class 'str'>
The type of the train_data['data']: <class 'list'>

The value of the train_data['version']: KorQuAD_v1.0_train

The number of train_data['data']: 1420
A value of the train_data['data']:
{'paragraphs': [{'qas': [{'answers': [{'text': '교향곡', 'answer_start': 54}], 'id': '6566495-0-0', 'question': '바그너는 괴테의 파우스트를 읽고 무엇을 쓰고자 했는가?'}, {'answers': [{'text': '1악장', 'answer_start': 421}], 'id': '6566495-0-1', 'question': '바그너는 교향곡 작곡을 어디까지 쓴 뒤에 중단했는가?'}, {'answers': [{'text': '베토벤의 교향곡 9번', 'answer_start': 194}], 'id': '6566495-0-2', 'question': '바그너가 파우스트 서곡을 쓸 때 어떤 곡의 영향을 받았는가?'}, {'answers': [{'text': '파우스트', 'answer_start': 15}], 'id': '6566518-0-0', 'question': '1839년 바그너가 교향곡의 소재로 쓰려고 했던 책은?'}, {'answers': [{'text': '합창교향곡', 'answer_start': 354}], 'id': '6566518-0-1', 'question': '파우스트 서곡의 라단조 조성이 영향을 받은 베토벤의 곡은?'}, {'answers': [{'text': '183

# 전처리: encoding data

In [3]:
import pickle
from tqdm import tqdm

def encoding_data(documents, tokenizer, n_documents = 1000, is_train=True):
    # 파일 이름 설정
    if is_train:
        encoded_context_path = f'./pickles/contexts_encoded_train_{n_documents}.pkl'
        encoded_qas_path = f'./pickles/qas_encoded_train_{n_documents}.pkl'
    else:
        encoded_context_path = f'./pickles/contexts_encoded_dev_{n_documents}.pkl'
        encoded_qas_path = f'./pickles/qas_encoded_dev_{n_documents}.pkl'

    # 이미 파일이 존재할 경우
    if os.path.exists(encoded_context_path) and os.path.exists(encoded_qas_path):
        return

    # pickles 폴더 존재 여부 확인
    if not os.path.exists('./pickles'):
      os.mkdir('./pickles')

    contexts_encoded = []
    qas_encoded = []

    paragraphs = []

    for d_i in documents[:n_documents]:
        for p_i in d_i['paragraphs']:
            paragraphs.append(p_i)

#################### 실습 코드 2 ##################
    # tokenizer로 context encoding 후 저장
    contexts_encoded = []
    for idx, paragraph_i in enumerate(paragraphs):
        contexts_encoded.append(tokenizer.encode(paragraph_i['context']))
#################################################

    with open(encoded_context_path, 'wb') as f:
        pickle.dump(contexts_encoded, f)
        f.close()

    # tokenizer로 question answer encoding 후 저장
    for idx, paragraph_i in tqdm(enumerate(paragraphs)):
        context = paragraph_i['context']
        qas = paragraph_i['qas']

        qas_encoded_element = []

        for qa in qas:
            question = tokenizer.encode(qa['question'])

            if is_train:
#################### 실습 코드 3 ##################
                # 학습 데이터의 경우, 다음 3가지 결과를 저장
                # 1) question을 tokenize 한 결과
                # 2) tokenize된 context 내에서 정답의 token start index
                # 3) tokenize된 context 내에서 정답의 token end index

                answer = qa['answers'][0]['text']
                index_start = qa['answers'][0]['answer_start']
                index_end = index_start + len(answer)
                qa['answers'][0]['answer_end'] = index_end

                token_start = len(tokenizer.tokenize(context[0:index_start]))
                token_end = token_start + len(tokenizer.tokenize(context[index_start:index_end]))
#################################################

                qas_encoded_element.append({'question':question, 'token_start':token_start, 'token_end':token_end})

            else:
                # 평가 데이터의 경우
                # 1) question을 tokenize 한 결과 
                # 2) question id 저장
                id_question=qa['id']
                qas_encoded_element.append({'question':question, 'id':id_question})

        qas_encoded.append(qas_encoded_element)

    # qa 쌍에 대한 데이터 저장
    with open(encoded_qas_path, 'wb') as f:
        pickle.dump(qas_encoded, f)
        f.close()

In [4]:
# pkl 파일 생성이 잘못된 경우에만 주석을 풀고 코드 실행
# %cd pickles
# !rm *
# %cd ..
# !ls -al

In [5]:
# bert tokenizer
model_name_or_path = "klue/bert-base"
tokenizer = BertTokenizer.from_pretrained(model_name_or_path, do_lower_case=False)

# 학습 및 평가에 사용할 데이터 수 조절 (qa pair 수가 아닌 document 수)
n_train_data = 1000
n_dev_data = 500

# pkl 파일 생성
encoding_data(train_data['data'], tokenizer, n_documents=n_train_data)
encoding_data(dev_data['data'], tokenizer, n_documents=n_dev_data, is_train=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors
6854it [00:29, 229.57it/s]
964it [00:00, 1088.70it/s]


# 전처리: prepare_data

In [6]:
def prepare_data(contexts_encoded, qas_encoded, is_train=True):
    # context: [CLS] + context + [SEP]
    # question: [CLS] + question + [SEP]
    # model input: [CLS] + question + [SEP] + Segment (context의 일부) + [SEP]

    datas = []
    labels = []

    for idx, qas in enumerate(qas_encoded):
        # [CLS] 제거 ([CLS] token id = 2)
        contexts_encoded[idx].remove(2)
        # [SEP] 제거 ([SEP] token id = 3)
        contexts_encoded[idx].remove(3)

        for qa in qas:
            data=[]
            # [CLS] Question [SEP] Segment [SEP] (Total Length: 최대 길이는 512이지만 384까지만 입력 받도록 구현)
            size_segment = 383 - len(qa['question'])

            pos_start = 0
            pos_end = size_segment

            segment = {}
            while pos_start < len(contexts_encoded[idx]):
                if pos_end > len(contexts_encoded[idx]):
#################### 실습 코드 4 ##################
                    # input_ids
                    #     1) [CLS] Question [SEP] Segment [SEP] => input_ids 그대로 사용
                    #     2) Padding => max_seq_len의 남은 token들을 모두 [PAD] token의 id인 0 사용

                    # token type ids
                    #     1) [CLS] Question [SEP] => 0
                    #     2) Segment [SEP] => 1
                    #     3) Padding => 0

                    # attention_mask
                    #     1) [CLS] Question [SEP] Segment [SEP] => 1
                    #     2) Padding => 0

                    # [CLS] Q [SEP] S [SEP] Padding
                    
                    # Not Padding
                    segment['input_ids'] = qa['question'] + contexts_encoded[idx][pos_start:len(contexts_encoded[idx])] + [3]
                    segment['token_type_ids'] = [0] * len(qa['question']) + [1] * (len(contexts_encoded[idx]) - pos_start + 1)
                    segment['attention_mask'] = [1] * (len(qa['question']) + len(contexts_encoded[idx]) - pos_start + 1)

                    # Padding. ([PAD] token id = 0)
                    segment['input_ids'] += [0] * (pos_end - len(contexts_encoded[idx]))
                    segment['token_type_ids'] += [0] * (pos_end - len(contexts_encoded[idx]))
                    segment['attention_mask'] += [0] * (pos_end - len(contexts_encoded[idx]))
#################################################

                else:
                    segment['input_ids'] = qa['question'] + contexts_encoded[idx][pos_start:pos_end] + [3]
                    segment['token_type_ids'] = [0] * len(qa['question']) + [1] * (pos_end - pos_start + 1)
                    segment['attention_mask'] = [1] * (len(qa['question']) + pos_end - pos_start + 1)

                data.append(segment)

                pos_start += size_segment
                pos_end += size_segment

            datas.append(data)

            if is_train:
                # 학습에 사용되는 정답 토큰의 위치 정보
                #     1) datas 내에서 정답을 포함하는 segment index
                #     2) 정답을 포함하는 segment 내에서 token index
                
                label_start = (int(qa['token_start'] / size_segment), len(qa['question']) + qa['token_start'] % size_segment)
                label_end = (int(qa['token_end'] / size_segment), len(qa['question']) + qa['token_end'] % size_segment)
                labels.append({'start':label_start, 'end':label_end})
            else:
                labels.append(qa['id'])


    return datas, labels

# 데이터셋 클래스 구현

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class TrainDatasetKorquad(Dataset):
    """
    Dataset for "train" set.
    """

    def __init__(self, datas, labels, max_segment):
        self.datas = []
        self.labels_segment = []
        self.labels_start = []
        self.labels_end = []

        for idx, label in enumerate(labels):
            if len(datas[idx]) > max_segment:
                continue

            self.datas.append(datas[idx])

#################### 실습 코드 5 ##################
            # label_segment, labels_start, labels_end에 원소를 추가
            # 1) label_segment: 정답이 존재하는 segment만 1, 나머지 segment는 0으로 마킹된 list
            # 2) labels_start: label에 존재하는 start
            # 3) labels_end: label에 존재하는 end

            label_segment = [0] * len(datas[idx])
            for index in range(label['start'][0], label['end'][0] + 1):
                label_segment[index] = 1

            self.labels_segment.append(label_segment)

            self.labels_start.append(label['start'])
            self.labels_end.append(label['end'])
#################################################

        print(f"Train data: {len(self.datas)}")


    def __len__(self):
        return len(self.datas)


    def __getitem__(self,idx):
#################### 실습 코드 6 ##################
        # BERT input 구성
        # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel와 아래 dev dataset 참고
        item = {'input_ids':[], 'attention_mask':[], 'token_type_ids':[]}

        for data_i in self.datas[idx]:
            item['input_ids'].append(data_i['input_ids'])
            item['attention_mask'].append(data_i['attention_mask'])
            item['token_type_ids'].append(data_i['token_type_ids'])

        # datas[idx] = [{'input_ids: [], 'attention_mask: [], 'token_type_ids': []}]
        # item = {'input_ids': [[], [], [], ...], 'attention_mask': [[], [], [], ...], 'token_type_ids':[[], [], [], ...]}

        batch = {k: torch.tensor(v, dtype = torch.long) for k, v in item.items()}
        batch['label_segment'] = torch.tensor(self.labels_segment[idx], dtype = torch.float)
#################################################
        
        batch['label_start'] = (self.labels_start[idx][0], torch.tensor(self.labels_start[idx][1], dtype = torch.long))
        batch['label_end'] = (self.labels_end[idx][0], torch.tensor(self.labels_end[idx][1], dtype = torch.long))

        return batch

class DevDatasetKorquad(Dataset):
    """
    Dataset for "dev" set.
    """

    def __init__(self, datas, ids, max_segment):
        self.datas = []
        self.ids=ids

        for idx, data in enumerate(datas):
            if idx == 0:
                print(data)
            if len(data) < max_segment:
                self.datas.append(data)
            elif len(data) >= max_segment:
                self.datas.append(data[0:max_segment])

        print(f"Dev data: {len(self.datas)}")

    def __getitem__(self,idx):
        item = {'input_ids':[], 'attention_mask':[], 'token_type_ids':[]}
        for data_i in self.datas[idx]:
            item['input_ids'].append(data_i['input_ids'])
            item['attention_mask'].append(data_i['attention_mask'])
            item['token_type_ids'].append(data_i['token_type_ids'])

        batch = {k: torch.tensor(v, dtype = torch.long) for k, v in item.items()}
        batch['id'] = self.ids[idx]

        return batch

    def __len__(self):
        return len(self.datas)

In [8]:
# Train dataset
with open(f'./pickles/contexts_encoded_train_{n_train_data}.pkl', 'rb') as f:
    train_contexts_encoded = pickle.load(f)
    f.close()

with open(f'./pickles/qas_encoded_train_{n_train_data}.pkl', 'rb') as f:
    train_qas_encoded = pickle.load(f)
    f.close()

train_datas, train_label = prepare_data(train_contexts_encoded, train_qas_encoded)
train_dataset = TrainDatasetKorquad(datas=train_datas, labels=train_label, max_segment=12)

# Dev dataset
with open(f'./pickles/contexts_encoded_dev_{n_dev_data}.pkl', 'rb') as f:
    dev_contexts_encoded = pickle.load(f)
    f.close()

with open(f'./pickles/qas_encoded_dev_{n_dev_data}.pkl', 'rb') as f:
    dev_qas_encoded = pickle.load(f)
    f.close()

dev_datas, ids = prepare_data(dev_contexts_encoded, dev_qas_encoded, is_train=False)
dev_dataset = DevDatasetKorquad(datas=dev_datas, ids=ids, max_segment=12)

Train data: 42966
[{'input_ids': [2, 30251, 2052, 7915, 6290, 5206, 5754, 2138, 4682, 2470, 4456, 2200, 7119, 2113, 2393, 860, 721, 2073, 35, 3, 9190, 2440, 22, 2429, 3749, 2210, 7915, 6290, 5206, 5754, 2138, 4682, 2470, 4456, 12, 5206, 10091, 2491, 2294, 2295, 2170, 2133, 2470, 2520, 2595, 2090, 2536, 13, 3603, 7119, 2113, 2393, 2496, 2359, 2062, 18, 9190, 2440, 23, 2429, 3710, 2210, 3671, 16311, 2736, 20809, 13164, 2144, 2259, 30251, 2079, 4858, 2251, 2354, 17113, 2069, 14536, 2757, 2886, 2062, 18, 555, 2073, 1897, 26, 2429, 3740, 2210, 7141, 2174, 2165, 2170, 1510, 2113, 2382, 2069, 3661, 2200, 6430, 7488, 3728, 2178, 2283, 2520, 2090, 2536, 4456, 2116, 4140, 2496, 2359, 2062, 18, 3858, 2073, 3710, 2429, 3801, 2210, 97, 3619, 2210, 3734, 3671, 13837, 4622, 27135, 30251, 2052, 6712, 3913, 2138, 3835, 19521, 1513, 4000, 18479, 2138, 11774, 2371, 2088, 16, 3710, 2429, 3801, 2210, 4400, 27, 2067, 4064, 2377, 591, 5809, 2859, 2145, 4299, 2126, 6233, 7747, 2470, 22010, 2446, 1116, 15982, 

# 모델 정의: class BertForKorQuAD

In [9]:
class SelfAttention(nn.Module):
    # 각 Segment에 해당하는 CLS 사이의 self attention 연산

    def __init__(self, d_model):
        super().__init__()

        self.w_q=nn.Linear(d_model, d_model)
        self.w_k=nn.Linear(d_model, d_model)
        self.w_v=nn.Linear(d_model, d_model)

        self.temperature=d_model**0.5
        self.dropout=nn.Dropout(0.1)

    def forward(self,x):
        q=self.w_q(x)
        k=self.w_k(x)
        v=self.w_v(x)

        attn=torch.matmul(q/self.temperature, k.transpose(0,1))
        attn=self.dropout(nn.functional.softmax(attn, dim=-1))

        output=torch.matmul(attn, v)

        return output


class BertForKorQuAD(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config, add_pooling_layer=False)
        self.attn = SelfAttention(config.hidden_size)
        self.loss_fct_ce = nn.CrossEntropyLoss()
        self.loss_fct_bce = nn.BCEWithLogitsLoss()
        """
        [실습]
        start_position과 end_position을 학습하는 MLP를 선언하시오.
        """
#################### 실습 코드 7 ##################
        # 각 segment에 대해 정답이 존재하는 영역인지 0 ~ 1로 판단하는 Linear Layer (output dim: 1)
        # self.pooler_segment = 
        self.pooler_segment = nn.Linear(config.hidden_size, 1)

        # 각 token에 대해 정답 token의 start token인지, end token인지 0 ~ 1로 판단하는 Linear Layer (output dim: 2)
        # self.korquad_outputs = 
        self.korquad_outputs = nn.Linear(config.hidden_size, 2)
#################################################

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        label_segment: Optional[torch.Tensor] = None,
        label_start: Optional[torch.Tensor] = None,
        label_end: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor]:

#################### 실습 코드 8 ##################
        # bert로 각 segment에 대한 hidden state를 계산
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # outputs[0].shape: N_segment x max_seq_len ( = [CLS] Q [SEP] C [SEP] ) x hidden_size
        
        sequence_output = outputs[0]

        # segment logit 계산
        # 1) 각 segment의 첫번째 토큰 ([CLS] token)을 stack
        # 2) CLS token 사이에 attention 연산 수행
        # 3) pooler_segment layer를 통해서 segment logit 계산
        CLSs = []
        for segment in sequence_output:
            CLSs.append(segment[0])

        output_segment = self.pooler_segment(self.attn(torch.stack(CLSs))).squeeze(-1)

        # span logit 계산
        # 1) korquad_outputs layer와 각 token의 hidden state 를 통해서
        #    각 token이 start token, end token에 해당할 logit값 계산

        output_span = self.korquad_outputs(sequence_output)
###################################################

        logits_start, logits_end = output_span.split(1, dim=-1)
        # n(segments) * max_length
        logits_start = logits_start.squeeze(-1)
        # n(segments)*max_length
        logits_end = logits_end.squeeze(-1)

        total_loss = None

        # train인 경우 정답이 주어짐
        if label_start is not None and label_end is not None:
#################### 실습 코드 9 ##################
            # segment_loss 계산
            # 1) 각 segment가 정답을 포함하는지 여부는 label_segment에 저장되어 있음
            # 2) binary cross entropy loss 활용
            segment_loss = self.loss_fct_bce(output_segment, label_segment)
            
            # span_loss 계산
            # 1) 정답을 포함하는 segment에 대한 logit값만 loss 계산에 사용
            #      정답 segment index는 label_start[0] 또는 label_end[0]에 저장되어 있음
            #      정답을 포함하는 segment내에서 정답 span의 start index와 end index는 label_start[1]과 label_end[1]에 저장되어 있음
            # 2) cross entropy loss 활용하여 start_logit에 대한 loss와 end_logit에 대한 loss를 계산
            preds_start = logits_start[label_start[0].item()].unsqueeze(0)
            preds_end = logits_end[label_end[0].item()].unsqueeze(0)

            start_loss = self.loss_fct_ce(preds_start, label_start[1])
            end_loss = self.loss_fct_ce(preds_end, label_end[1])
            
            # 최종 loss는 start_loss와 end_loss + segment_loss의 합을 사용
            total_loss = segment_loss + start_loss + end_loss

###################################################
            output = (preds_start, preds_end) + outputs[2:]
        else:
            output = (output_segment, output_span) + outputs[2:]



        return ((total_loss,) + output) if total_loss is not None else output

# Train 함수 정의

In [10]:
def train(args, train_dataset, model, tokenizer):
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=1, shuffle=True)

    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # optimizer & schedule 설정
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info(
        "  Total train batch size (w. accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss = 0.0

    model.zero_grad()
    set_seed(args)
#################### 실습 코드 10 #################
    # 학습 프로세스 구현

    # 1) 1st for loop: epoch 수 만큼 반복
    for _ in trange(int(args.num_train_epochs), position=0, desc="Epoch..."):
        model.train()

        # 2) 2nd for loop: dataloader에서 매 batch를 불러와 학습
        for step, batch in enumerate(tqdm(train_dataloader, position=0, desc="Iteration...")):
            # 3) batch의 경우 model과 동일한 device로 세팅해야함
            # N개의 segment를 n개의 data를 가지는 batch처럼 처리하기 때문에 squeeze(0) 해줘야 함

            inputs = {k: v.to(args.device).squeeze(0) for k, v in batch.items() if (k != 'label_start') and k != 'label_end'}
            inputs['label_start'] = (batch['label_start'][0], batch['label_start'][1].to(args.device))
            inputs['label_end'] = (batch['label_end'][0], batch['label_end'][1].to(args.device))

            outputs = model(**inputs)
            loss = outputs[0]

            # 4) gradient accumulation step 적용
            if args.gradient_accumulation_steps > 1:
                # loss를 단순히 누적할 경우 gradient_accumulation_step만큼 커지게 되므로 gradient_accumulation_step로 나눠줘야 함
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

        # Log metrics and Save model checkpoint!
        # 5) 매 2nd for loop가 끝나면 평가 함수 실행
        results = evaluate(args, dev_dataset, model, tokenizer, n_dev_data)
        logger.info("***** Evaluation result *****")
        for key, value in results.items():
            logger.info("eval_{}: {}".format(key, value))

        # 6) model, optimizer, scheduler, args 저장
        #    optimizer, scheduler, args는 학습을 중단했다가 다시 이어서 진행할 경우 활용
        output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
        os.makedirs(output_dir)
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

        torch.save(args, os.path.join(output_dir, "training_args.bin"))
        logger.info("Saving model checkpoint to %s", output_dir)

        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
        logger.info("Saving optimizer and scheduler states to %s", output_dir)
###################################################

    return global_step, tr_loss / global_step

# Evaluate 함수 정의

In [11]:
def evaluate(args, eval_dataset, model, tokenizer, n_documents=None):
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    start_time = timeit.default_timer()

    json_preds = {}

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()

        with torch.no_grad():
            inputs = {
                "input_ids": batch["input_ids"].squeeze(0).to(args.device),
                "attention_mask": batch["attention_mask"].squeeze(0).to(args.device),
                "token_type_ids": batch["token_type_ids"].squeeze(0).to(args.device),
            }

            outputs = model(**inputs)

#################### 실습 코드 11 #################
            # model input에 "input_ids", "attention_mask", "token_type_ids"만 주어졌을 때 모델 output을 고려
            # 1) 입력으로 들어간 segment 중에서 어떤 segment 내에 정답이 존재하는지 판단
            # 2) 정답이 존재하는 segment 내에서 정답 span의 pos_start와 pos_end 찾기
            output_segment = outputs[0].squeeze(-1)
            id_segment = torch.argmax(output_segment).item()

            logits_start, logits_end = outputs[1].split(1,dim=-1)
            logits_start = logits_start.squeeze(-1)
            logits_end = logits_end.squeeze(-1)

            pos_start = torch.argmax(logits_start[id_segment]).item()
            pos_end = torch.argmax(logits_end[id_segment]).item()

#################################################

        if(pos_start > pos_end): # 예측한 span의 범위가 start index가 end index보다 뒤일 때
            json_preds[batch['id'][0]] = ""
        else:
            ans = tokenizer.decode(batch["input_ids"][0][id_segment][pos_start:pos_end])
            json_preds[batch['id'][0]] = ans

        del outputs, logits_start, logits_end


    with open('./predictions.json', 'w') as f:
        json.dump(json_preds, f, ensure_ascii=False)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(eval_dataset))

    with open(os.path.join('./data', 'KorQuAD_v1.0_dev.json'), 'r', encoding='utf-8') as fin:
        dev_data = json.load(fin)['data']
    # Compute the F1 and exact scores.
    if n_documents:
      dev_data = dev_data[:n_documents]
    results = KorQuAD_evaluate(dev_data, json_preds)

    return results

# MRC 평가 함수

In [12]:
def normalize_answer(s):
    def remove_(text):
        text = re.sub("'", " ", text)
        text = re.sub('"', " ", text)
        text = re.sub('《', " ", text)
        text = re.sub('》', " ", text)
        text = re.sub('<', " ", text)
        text = re.sub('>', " ", text)
        text = re.sub('〈', " ", text)
        text = re.sub('〉', " ", text)
        text = re.sub("\(", " ", text)
        text = re.sub("\)", " ", text)
        text = re.sub("‘", " ", text)
        text = re.sub("’", " ", text)
        return text

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_punc(lower(remove_(s))))

# EM = 정확하게 일치하는 경우에만 정답을 맞춤
# F1 = 일부 문자열이 일치하면 부분 점수를 부여
def f1_score(prediction, ground_truth):

#################### 실습 코드 12 #################
    # f1-score를 구하시오.

    # 단위: string
    # precision: 0 <= (pred & true) / pred <= 1
    # recall: 0 <= (pred & true) / true <= 1

    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()

    prediction_Char = []
    for tok in prediction_tokens:
        now = [a for a in tok]
        prediction_Char.extend(now)

    ground_truth_Char = []
    for tok in ground_truth_tokens:
        now = [a for a in tok]
        ground_truth_Char.extend(now)

    common = Counter(prediction_Char) & Counter(ground_truth_Char)
    num_same = sum(common.values())
    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(prediction_Char)
    recall = 1.0 * num_same / len(ground_truth_Char)
    f1 = (2 * precision * recall) / (precision + recall)
###################################################

    return f1

def exact_match_score(prediction, ground_truth):
#################### 실습 코드 13 #################
    # EM score 구현
    em = (normalize_answer(prediction) == normalize_answer(ground_truth))
#################################################

    return em

def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)

    return max(scores_for_ground_truths)


def KorQuAD_evaluate(dev_data, predictions):
    f1 = exact_match = total = 0

    for document in dev_data:
        for paragraph_i in document['paragraphs']:
            for qa in paragraph_i['qas']:
                total += 1

                if qa['id'] not in predictions:
                    message = 'Unanswered question ' + qa['id'] + ' will receive score 0.'
                    print(message, file=sys.stderr)
                    continue

                ground_truths = [answer["text"] for answer in qa['answers']]
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)

                if total == 1:
                    logger.info("question id: {}".format(qa['id']))
                    logger.info("ground truths: {}".format(ground_truths))
                    logger.info("prediction: {}".format(prediction))
                    logger.info("F1: {:.3f} || EM: {:.3f}\n".format(metric_max_over_ground_truths(f1_score, prediction, ground_truths), metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)))

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

# Main함수 정의

In [13]:
def main(args):
    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    logger.warning(
        "Process device: %s, n_gpu: %s",
        args.device,
        args.n_gpu,
    )

    # Set seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Load BERT configurations
    config = BertConfig.from_pretrained(args.model_name_or_path)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case)
    model = BertForKorQuAD.from_pretrained(args.model_name_or_path, config=config)
    model.to(args.device)

    # Train
    if args.do_train:
        # train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        # Save the trained model and the tokenizer
        logger.info("Saving model checkpoint to %s", args.output_dir)
        model.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        model = BertForKorQuAD.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluate
    results = {}
    if args.do_eval:
        logger.info("Loading checkpoints saved during training for evaluation")
        checkpoints = [args.output_dir]
        if args.do_train and args.eval_all_checkpoints:
                checkpoints = list(
                    os.path.dirname(c)
                    for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
                )

        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = BertForKorQuAD.from_pretrained(checkpoint)
            model.to(args.device)

            # Evaluate
            result = evaluate(args, dev_dataset, model, tokenizer, n_dev_data)

            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

    logger.info("Results: {}".format(results))
    print("Results: {}".format(results))

    return results


# Arguments Parsing과 Main 함수 실행

In [14]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Arguments for several paths or file names
    parser.add_argument(
        "--model_name_or_path",
        default="klue/bert-base",
        type=str,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--data_dir",
        default="./data",
        type=str,
        help="The input data dir. Should contain the .json files for the task.",
    )
    parser.add_argument(
        "--output_dir",
        default="./outputs/",
        type=str,
        help="The output directory where the model checkpoints and predictions will be written.",
    )
    parser.add_argument(
        "--train_file",
        default="KorQuAD_v1.0_train.json",
        type=str,
        help="The input training file. If a data dir is specified, will look for the file there",
    )
    parser.add_argument(
        "--predict_file",
        default="KorQuAD_v1.0_dev.json",
        type=str,
        help="The input evaluation file. If a data dir is specified, will look for the file there",
    )

    # Hyperparameters
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.",
    )
    parser.add_argument("--train_batch_size", default=1, type=int, help="Batch size GPU/CPU for training.")
    parser.add_argument(
        "--eval_batch_size", default=1, type=int, help="Batch size GPU/CPU for evaluation."
    )
    # RNN SGD 1e-2 / Adam 1e-4     | BERT, GPT 5e-5 ~ 1e-5
    parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=32,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.",
    )

    # Actions
    parser.add_argument("--do_train", default=True, action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", default=True, action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case", default=False, action="store_true", help="Set this flag if you are using an uncased model."
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", default=False, action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", default=True, action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", default=False, action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    args = parser.parse_args(args=[])

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    logger = logging.getLogger(__name__)

    # Logging hyperparameters
    logger.info("Training and evaluation parameters")
    for k, v in args.__dict__.items():
        logger.info("{}: {}".format(k, v))

    main(args)

12/12/2023 20:13:18 - INFO - __main__ -   Training and evaluation parameters
12/12/2023 20:13:18 - INFO - __main__ -   model_name_or_path: klue/bert-base
12/12/2023 20:13:18 - INFO - __main__ -   data_dir: ./data
12/12/2023 20:13:18 - INFO - __main__ -   output_dir: ./outputs/
12/12/2023 20:13:18 - INFO - __main__ -   train_file: KorQuAD_v1.0_train.json
12/12/2023 20:13:18 - INFO - __main__ -   predict_file: KorQuAD_v1.0_dev.json
12/12/2023 20:13:18 - INFO - __main__ -   seed: 42
12/12/2023 20:13:18 - INFO - __main__ -   max_seq_length: 384
12/12/2023 20:13:18 - INFO - __main__ -   doc_stride: 128
12/12/2023 20:13:18 - INFO - __main__ -   max_query_length: 64
12/12/2023 20:13:18 - INFO - __main__ -   train_batch_size: 1
12/12/2023 20:13:18 - INFO - __main__ -   eval_batch_size: 1
12/12/2023 20:13:18 - INFO - __main__ -   learning_rate: 3e-05
12/12/2023 20:13:18 - INFO - __main__ -   gradient_accumulation_steps: 32
12/12/2023 20:13:18 - INFO - __main__ -   weight_decay: 0.0
12/12/2023 2

Epoch...:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration...: 100%|██████████| 42966/42966 [24:03<00:00, 29.76it/s]
12/12/2023 20:37:25 - INFO - __main__ -   ***** Running evaluation *****
12/12/2023 20:37:25 - INFO - __main__ -     Num examples = 5774
12/12/2023 20:37:25 - INFO - __main__ -     Batch size = 1
Evaluating: 100%|██████████| 5774/5774 [01:04<00:00, 88.85it/s]
12/12/2023 20:38:30 - INFO - __main__ -     Evaluation done in total 64.990641 secs (0.011256 sec per example)
12/12/2023 20:38:30 - INFO - __main__ -   question id: 6548850-0-0
12/12/2023 20:38:30 - INFO - __main__ -   ground truths: ['1989년 2월 15일']
12/12/2023 20:38:30 - INFO - __main__ -   prediction: 1989년 2월 15일
12/12/2023 20:38:30 - INFO - __main__ -   F1: 1.000 || EM: 1.000

12/12/2023 20:38:31 - INFO - __main__ -   ***** Evaluation result *****
12/12/2023 20:38:31 - INFO - __main__ -   eval_exact_match: 71.49289920332525
12/12/2023 20:38:31 - INFO - __main__ -   eval_f1: 81.72007879927816
12/12/2023 20:38:31 - INFO - __main__ -   Saving model checkpoint to

Results: {'exact_match': 72.75718739175615, 'f1': 82.63539641928865}
