# 문서 검색 효율화를 위한 기계독해
- 1차 모의경진대회(22.11.14 ~ 22.11.25)
- 자연어 기계독해(Machine Reading Comprehension) 과제

## 데이터 구조

```
$ MRC/
├── DATA/
│   ├── train.json
│   ├── test.json
│   └── sample_submission.csv
├── prediction.csv (코드 실행 후 생성)
├── results/ (코드 실행 후 생성)
```

#0. 사전 준비

##0.1 구글 드라이브 마운트

In [1]:
# 구글 Colaboratory 를 사용하기 위해 구글 계정으로 로그인합니다. 
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##0.2 라이브러리 설치

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


##1. 라이브러리 불러오기

In [3]:
import os
import sys
import csv
import copy
import json
import random
import shutil
import numpy as np
import pandas as pd
from time import time
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from datetime import datetime, timezone, timedelta

from transformers import ElectraTokenizerFast
from transformers import ElectraForQuestionAnswering

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

##2. 하이퍼파라미터 및 기타 인자 설정

###2.1 데이터 경로

In [4]:
PROJECT_DIR = '/content/drive/MyDrive/YDS/AIConnet_YDS_1/NLP_MRC' # 프로젝트 디렉토리 설정
DATA_DIR= '/content/drive/MyDrive/YDS/AIConnet_YDS_1/NLP_MRC/DATA' # 데이터 디렉토리 설정

###2.2 시드 설정

In [5]:
# 난수 생성기가 항상 일정한 값을 출력하게 하기 위해 seed 고정
RANDOM_SEED = 42

torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

###2.3 하이퍼파라미터 설정

In [6]:
LEARNING_RATE = 5.0e-4     # 학습률(learning rate)은 경사하강법(gradient descent)을 통해 내리막길을 내려갈 때의 보폭
BATCH_SIZE = 20    # 배치(batch)는 모델의 가중치(weights)를 업데이트하는 학습 데이터의 단위. 여기서는 16개를 학습할 때마다 모델의 가중치(weights)를 업데이트한다는 것
PIN_MEMORY = True
NUM_WORKERS = 0
EPOCHS = 2     # 에폭은 전체 학습 데이터를 학습에 사용하는 횟수. 주어진 학습 데이터를 여러번 학습할 수 있음
DROP_LAST = False
EARLY_STOPPING_MODE = min
EARLY_STOPPING_PATIENCE = 10
EARLY_STOPPING_TARGET = 'val_loss'     # validation set의 loss를 기준으로 early_stopping 여부를 결정할 것
LOGGING_INTERVAL = 200

###2.4 디바이스 설정

In [7]:
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# !nvidia-smi
print('Device:', device)
print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

Device: cuda
Current cuda device: 0
Count of using GPUs: 1


In [9]:
!nvidia-smi

Mon Nov 21 11:27:18 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

##3. Dataset 정의

In [10]:
class QADataset(Dataset):     # 데이터를 input으로 변환해주는 Dataset 클래스를 상속하여, QA(Question Answering) 과제에 맞게 커스터마이징한다
    
    def __init__ (self, data_dir: str, tokenizer, max_seq_len: int, mode = 'train'):     # Dataset 클래스는 기본적으로 __init__, __len__, __getitem__를 정의해 주어야 한다
        self.mode = mode
        self.data = json.load(open(data_dir, 'r', encoding='utf8'))
        
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        
        if mode == 'test':
            self.encodings, self.question_ids = self.preprocess()
        else:
            self.encodings, self.answers = self.preprocess()
        
    def __len__(self):     # index를 통해 input을 순차적으로 읽어오기 위해서는 데이터의 길이가 먼저 확인되어야 한다. __len__ 함수는 input의 길이를 반환해주는 함수
        return len(self.encodings.input_ids)

    def __getitem__(self, index: int):     # input의 길이가 확인되면 index를 통해 데이터를 불러올 수 있다. __getitem__ 함수는 index에 해당하는 input 데이터를 반환해주는 함수
        return {key: torch.tensor(val[index]) for key, val in self.encodings.items()}

    
    def preprocess(self):
        contexts, questions, answers, question_ids = self.read_squad()     # SQuAD(Stanford Question Answering Dataset) 형식의 데이터에서 contexts, questions, answers, question_ids를 읽어오는 함수
        if self.mode == 'test':
            encodings = self.tokenizer(contexts, questions, truncation=True, max_length = self.max_seq_len, padding=True)
            return encodings, question_ids
        else: # train or val
            self.add_end_idx(answers, contexts)     # train.json에는 질문에 대한 답이 context 내에서 시작되는 index인 'answer_srart'만 있기 때문에, 추가로 'answer_end'를 찾아주는 함수
            encodings = self.tokenizer(contexts, questions, truncation=True, max_length = self.max_seq_len, padding=True)
            self.add_token_positions(encodings, answers)
        
            return encodings, answers
        
    
    def read_squad(self):     # SQuAD(Stanford Question Answering Dataset) 형식의 데이터에서 contexts, questions, answers, question_ids를 읽어오는 함수
        contexts = []
        questions = []
        question_ids = []
        answers = []
        
        # train - val split
        if self.mode == 'train':
            self.data['data'] = self.data['data'][:-1*int(len(self.data['data'])*0.1)]
        elif self.mode == 'val':
            self.data['data'] = self.data['data'][-1*int(len(self.data['data'])*0.1):]
        
        
        till = len(self.data['data'])
        

        for group in self.data['data'][:till]:
            for passage in group['paragraphs']:
                context = passage['context']
                for qa in passage['qas']:
                    question = qa['question']
                    if self.mode == 'test':
                        contexts.append(context)
                        questions.append(question)
                        question_ids.append(qa['question_id'])
                    else: # train or val
                        for ans in qa['answers']:
                            contexts.append(context)
                            questions.append(question)

                            if qa['is_impossible']:
                                answers.append({'text':'','answer_start':-1})
                            else:
                                answers.append(ans)
                
        # return formatted data lists
        return contexts, questions, answers, question_ids
    
    
    def add_end_idx(self, answers, contexts):     # train.json에는 질문에 대한 답이 context 내에서 시작되는 index인 'answer_srart'만 있기 때문에, 추가로 'answer_end'를 찾아주는 함수
        for answer, context in zip(answers, contexts):
            gold_text = answer['text']
            start_idx = answer['answer_start']
            end_idx = start_idx + len(gold_text)

            # in case the indices are off 1-2 idxs
            if context[start_idx:end_idx] == gold_text:
                answer['answer_end'] = end_idx
            else:
                for n in [1, 2]:
                    if context[start_idx-n:end_idx-n] == gold_text:
                        answer['answer_start'] = start_idx - n
                        answer['answer_end'] = end_idx - n
                    elif context[start_idx+n:end_idx+n] == gold_text:
                        answer['answer_start'] = start_idx + n
                        answer['answer_end'] = end_idx + n
                        

    def add_token_positions(self, encodings, answers):
        # should use Fast tokenizer
        start_positions = []
        end_positions = []
        for i in range(len(answers)):
            if answers[i]['answer_start'] == -1:
                # set [CLS] token as answer if is_impossible
                start_positions.append(0)
                end_positions.append(1)
            else:
                start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))

                assert 'answer_end' in answers[i].keys(), f'no answer_end at {i}'
                end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

            # answer passage truncated
            if start_positions[-1] is None:
                start_positions[-1] = tokenizer.model_max_length                
            # end position cannot be found, shift until found
            shift = 1
            while end_positions[-1] is None:
                end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
                shift += 1
                
        # char-based -> token based
        encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

##4. 모델 정의

In [11]:
class electra(nn.Module):     # pytorch의 모든 neural network 모델들은 torch.nn.Module 클래스를 상속해야 한다. 기본적으로 __init__()과 forward 함수가 override(재정의)되어야 하며, forward 함수는 모델의 계산을 실행하는 것을 뜻한다.

    def __init__(self, pretrained, **kwargs):
        super(electra, self).__init__()

        self.model = ElectraForQuestionAnswering.from_pretrained(pretrained)     # Hugging Face에서 pretrain된 모델을 가져와서 model 변수에 저장한다.
        

    def forward(self, input_ids, attention_mask, start_positions=None, end_positions=None):
        
        outputs = self.model(input_ids = input_ids, 
                             attention_mask = attention_mask,
                             start_positions = start_positions,
                             end_positions = end_positions)
        
        return outputs

##5. Utils 정의
###5.1 EarlyStopper

In [12]:
class EarlyStopper():     # 일정 기간 모델 성능에 개선이 없으면, 학습을 중단하는 기능

    def __init__(self, patience: int, mode:str)-> None:
        self.patience = patience
        self.mode = mode

        # Initiate
        self.patience_counter = 0
        self.stop = False
        self.best_loss = np.inf

        print(f"Initiated early stopper, mode: {self.mode}, best score: {self.best_loss}, patience: {self.patience}")

        
    def check_early_stopping(self, loss: float)-> None:
        loss = -loss if self.mode == 'max' else loss  # get max value if mode set to max

        if loss > self.best_loss:
            # got worse score
            self.patience_counter += 1

            print(f"Early stopper, counter {self.patience_counter}/{self.patience}, best:{abs(self.best_loss)} -> now:{abs(loss)}")
            
            if self.patience_counter == self.patience:
                print(f"Early stopper, stop")
                self.stop = True  # end

        elif loss <= self.best_loss:
            # got better score
            self.patience_counter = 0
            
            print(f"Early stopper, counter {self.patience_counter}/{self.patience}, best:{abs(self.best_loss)} -> now:{abs(loss)}")
            print(f"Set counter as {self.patience_counter}")
            print(f"Update best score as {abs(loss)}")
            
            self.best_loss = loss
            
        else:
            print('debug')

###5.2 Trainer

In [13]:
class Trainer():     # 학습을 위한 Trainer 클래스 정의

    def __init__(self,
                 model,
                 optimizer,
                 loss,
                 metrics,
                 device,
                 tokenizer,
                 interval=100):
        
        self.model = model
        self.optimizer = optimizer
        self.loss = loss
        self.metrics = metrics
        self.device = device
        self.interval = interval
        self.tokenizer = tokenizer

        # History
        self.loss_sum = 0  # Epoch loss sum
        self.loss_mean = 0 # Epoch loss mean
        self.y = list()
        self.y_preds = list()
        self.score_dict = dict()  # metric score
        self.elapsed_time = 0
        

    def train(self, mode, dataloader, tokenizer, epoch_index=0):
        
        start_timestamp = time()
        self.model.train() if mode == 'train' else self.model.eval()     # 모델을 train(eval) mode로 전환.  train(eval) mode에서는 dropout, batchnorm이 적용된다(적용되지 않는다)
 
        for batch_index, batch in enumerate(tqdm(dataloader, leave=True)):
            
            self.optimizer.zero_grad()     # 파라미터 업데이트는 batch 단위로 이루어지고, 매 batch마다 이전 스텝에서 계산된 gradient를 초기화해주어야 함
            # pull all the tensor batches required for training
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            start_positions = batch['start_positions'].to(self.device)
            end_positions = batch['end_positions'].to(self.device)
            
            # train model on batch and return outputs (incl. loss)
            # Inference
            outputs = self.model(input_ids, attention_mask=attention_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)
            
            loss = outputs.loss
            start_score = outputs.start_logits
            end_score = outputs.end_logits
            
            
            start_idx = torch.argmax(start_score, dim=1).cpu().tolist()
            end_idx = torch.argmax(end_score, dim=1).cpu().tolist()
            
            # Update
            if mode == 'train':
                loss.backward()     # backpropagation
                self.optimizer.step()     # 파라미터 업데이트
                
            elif mode in ['val', 'test']:
                pass
            
            # History
            self.loss_sum += loss.item()
            
            # create answer; list of strings
            for i in range(len(input_ids)):
                if start_idx[i] > end_idx[i]:
                    output = ''
                
                self.y_preds.append(self.tokenizer.decode(input_ids[i][start_idx[i]:end_idx[i]]))
                self.y.append(self.tokenizer.decode(input_ids[i][start_positions[i]:end_positions[i]]))


            # Logging
            if batch_index % self.interval == 0:
                print(f"batch: {batch_index}/{len(dataloader)} loss: {loss.item()}")
                
        # Epoch history
        self.loss_mean = self.loss_sum / len(dataloader)  # Epoch loss mean

        # Metric
        score = self.metrics(self.y, self.y_preds)
        self.score_dict['metric_name'] = score

        # Elapsed time
        end_timestamp = time()
        self.elapsed_time = end_timestamp - start_timestamp

    def clear_history(self):
        self.loss_sum = 0
        self.loss_mean = 0
        self.y_preds = list()
        self.y = list()
        self.score_dict = dict()
        self.elapsed_time = 0

###5.3 Recorder

In [14]:
class Recorder():

    def __init__(self,
                 record_dir: str,
                 model: object,
                 optimizer: object):
        
        self.record_dir = record_dir
        self.record_filepath = os.path.join(self.record_dir, 'record.csv')
        self.weight_path = os.path.join(record_dir, 'model.pt')

        self.model = model
        self.optimizer = optimizer

        
    def set_model(self, model: 'model'):
        self.model = model


    def add_row(self, row_dict: dict):

        fieldnames = list(row_dict.keys())

        with open(self.record_filepath, newline='', mode='a') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)

            if f.tell() == 0:
                writer.writeheader()

            writer.writerow(row_dict)
            print(f"Write row {row_dict['epoch_index']}")

            
    def save_weight(self, epoch: int)-> None:
        check_point = {
            'epoch': epoch + 1,
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
        }
        
        torch.save(check_point, self.weight_path)
        print(f"Recorder, epoch {epoch} Model saved: {self.weight_path}")

##6. 모델 학습

###6.1 모델과 기타 utils 설정

In [15]:
# Load model
model = electra(pretrained='monologg/koelectra-small-v3-discriminator').to(device)

# Set optimizer, loss function, metric function
# optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)
# optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
loss = F.cross_entropy
metrics = accuracy_score

# Set tokenizer
tokenizer = ElectraTokenizerFast.from_pretrained('monologg/koelectra-small-v3-discriminator')
# klue/roberta-large
# monologg/koelectra-small-v3-discriminator

# Set Trainer
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  loss=loss,
                  metrics=metrics,
                  device=device,
                  tokenizer=tokenizer,
                  interval=LOGGING_INTERVAL)

# Set earlystopper
early_stopper = EarlyStopper(patience=EARLY_STOPPING_PATIENCE,
                            mode=min)

# Set train serial
kst = timezone(timedelta(hours=9))
train_serial = datetime.now(tz=kst).strftime("%Y%m%d_%H%M%S")


# Set recorder 
RECORDER_DIR = os.path.join(PROJECT_DIR, 'results', 'train', train_serial)
os.makedirs(RECORDER_DIR, exist_ok=True)

recorder = Recorder(record_dir=RECORDER_DIR,
                    model=model,
                    optimizer=optimizer)

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized: ['qa_outputs.weigh

Initiated early stopper, mode: <built-in function min>, best score: inf, patience: 10


###6.2 Dataset & Dataloader 설정

In [16]:
'''
# torch.utils.data.Dataset : 데이터를 input으로 변환
train_dataset = QADataset(data_dir=os.path.join(DATA_DIR, 'train.json'), tokenizer = tokenizer, max_seq_len = 512, mode = 'train')
val_dataset = QADataset(data_dir=os.path.join(DATA_DIR, 'train.json'), tokenizer = tokenizer, max_seq_len = 512, mode = 'val')

# torch.utils.data.DataLoader : input을 배치 단위로 리턴해주는 기능
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              num_workers=NUM_WORKERS, 
                              shuffle=True,
                              pin_memory=PIN_MEMORY,
                              drop_last=DROP_LAST)

val_dataloader = DataLoader(dataset=val_dataset,
                            batch_size=BATCH_SIZE,
                            num_workers=NUM_WORKERS, 
                            shuffle=True,
                            pin_memory=PIN_MEMORY,
                            drop_last=DROP_LAST)

print(f"Load data, train:{len(train_dataset)} val:{len(val_dataset)}")
'''

'\n# torch.utils.data.Dataset : 데이터를 input으로 변환\ntrain_dataset = QADataset(data_dir=os.path.join(DATA_DIR, \'train.json\'), tokenizer = tokenizer, max_seq_len = 512, mode = \'train\')\nval_dataset = QADataset(data_dir=os.path.join(DATA_DIR, \'train.json\'), tokenizer = tokenizer, max_seq_len = 512, mode = \'val\')\n\n# torch.utils.data.DataLoader : input을 배치 단위로 리턴해주는 기능\ntrain_dataloader = DataLoader(dataset=train_dataset,\n                              batch_size=BATCH_SIZE,\n                              num_workers=NUM_WORKERS, \n                              shuffle=True,\n                              pin_memory=PIN_MEMORY,\n                              drop_last=DROP_LAST)\n\nval_dataloader = DataLoader(dataset=val_dataset,\n                            batch_size=BATCH_SIZE,\n                            num_workers=NUM_WORKERS, \n                            shuffle=True,\n                            pin_memory=PIN_MEMORY,\n                            drop_last=DROP_LAST)\n\npri

In [16]:
# torch.utils.data.Dataset : 데이터를 input으로 변환
train_dataset = QADataset(data_dir=os.path.join(DATA_DIR, 'AI_train.json'), tokenizer = tokenizer, max_seq_len = 512, mode = 'train')
# train_dataset2 = QADataset(data_dir=os.path.join(DATA_DIR, 'train.json'), tokenizer = tokenizer, max_seq_len = 512, mode = 'train')

print(f"Load data, train:{len(train_dataset)}")
# print(f"Load data, train:{len(train_dataset2)}")

Load data, train:106128


In [None]:
'''
train_dataset3 = train_dataset + train_dataset2
print(f"Load data, train:{len(train_dataset3)} ") # val:{len(val_dataset)}")
'''

In [17]:
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              num_workers=NUM_WORKERS, 
                              shuffle=True,
                              pin_memory=PIN_MEMORY,
                              drop_last=DROP_LAST)

In [18]:
val_dataset = QADataset(data_dir=os.path.join(DATA_DIR, 'AI_train.json'), tokenizer = tokenizer, max_seq_len = 512, mode = 'val')
# val_dataset2 = QADataset(data_dir=os.path.join(DATA_DIR, 'train.json'), tokenizer = tokenizer, max_seq_len = 512, mode = 'val')

print(f"Load data, train:{len(val_dataset)}")
# print(f"Load data, train:{len(val_dataset2)}")

Load data, train:13512


In [None]:
'''
val_dataset3 = val_dataset + val_dataset2
print(f"Load data, val:{len(val_dataset3)} ") 
'''

In [19]:
val_dataloader = DataLoader(dataset=val_dataset,
                            batch_size=BATCH_SIZE,
                            num_workers=NUM_WORKERS, 
                            shuffle=True,
                            pin_memory=PIN_MEMORY,
                            drop_last=DROP_LAST)

###6.3 Epoch 단위 학습 진행

In [None]:
# PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512

In [20]:
# Train
for epoch_index in range(EPOCHS):

    # Set Recorder row
    row_dict = dict()
    row_dict['epoch_index'] = epoch_index
    row_dict['train_serial'] = train_serial

    """
    Train
    """
    print(f"Train {epoch_index}/{EPOCHS}")
    print(f"--Train {epoch_index}/{EPOCHS}")
    trainer.train(dataloader=train_dataloader, epoch_index=epoch_index, tokenizer=tokenizer, mode='train')

    row_dict['train_loss'] = trainer.loss_mean
    row_dict['train_elapsed_time'] = trainer.elapsed_time 

    for metric_str, score in trainer.score_dict.items():
        row_dict[f"train_{metric_str}"] = score
    trainer.clear_history()

    """
    Validation
    """
    print(f"Val {epoch_index}/{EPOCHS}")
    print(f"--Val {epoch_index}/{EPOCHS}")
    trainer.train(dataloader=val_dataloader, epoch_index=epoch_index, tokenizer=tokenizer, mode='val')

    row_dict['val_loss'] = trainer.loss_mean
    row_dict['val_elapsed_time'] = trainer.elapsed_time 

    for metric_str, score in trainer.score_dict.items():
        row_dict[f"val_{metric_str}"] = score
    trainer.clear_history()

    """
    Record
    """
    recorder.add_row(row_dict)

    """
    Early stopper
    """
    early_stopping_target = EARLY_STOPPING_TARGET
    early_stopper.check_early_stopping(loss=row_dict[early_stopping_target])

    if early_stopper.patience_counter == 0:
        recorder.save_weight(epoch=epoch_index)
        best_row_dict = copy.deepcopy(row_dict)

    if early_stopper.stop == True:
        print(f"Early stopped, counter {early_stopper.patience_counter}/{EARLY_STOPPING_PATIENCE}")

        break

Train 0/2
--Train 0/2


  0%|          | 1/5306 [00:02<3:33:17,  2.41s/it]

batch: 0/5306 loss: 6.097280502319336


  4%|▍         | 201/5306 [00:57<23:41,  3.59it/s]

batch: 200/5306 loss: 2.0061466693878174


  8%|▊         | 401/5306 [01:54<23:15,  3.52it/s]

batch: 400/5306 loss: 2.081408977508545


 11%|█▏        | 601/5306 [02:52<22:15,  3.52it/s]

batch: 600/5306 loss: 1.7726274728775024


 15%|█▌        | 801/5306 [03:50<21:52,  3.43it/s]

batch: 800/5306 loss: 1.4184608459472656


 19%|█▉        | 1001/5306 [04:48<20:41,  3.47it/s]

batch: 1000/5306 loss: 1.1594948768615723


 23%|██▎       | 1201/5306 [05:46<19:45,  3.46it/s]

batch: 1200/5306 loss: 1.193901777267456


 26%|██▋       | 1401/5306 [06:44<18:48,  3.46it/s]

batch: 1400/5306 loss: 1.036330223083496


 30%|███       | 1601/5306 [07:42<17:52,  3.45it/s]

batch: 1600/5306 loss: 1.201899528503418


 34%|███▍      | 1801/5306 [08:40<17:00,  3.43it/s]

batch: 1800/5306 loss: 1.2312896251678467


 38%|███▊      | 2001/5306 [09:38<15:55,  3.46it/s]

batch: 2000/5306 loss: 1.4633071422576904


 41%|████▏     | 2201/5306 [10:36<15:05,  3.43it/s]

batch: 2200/5306 loss: 1.4702510833740234


 45%|████▌     | 2401/5306 [11:34<14:33,  3.33it/s]

batch: 2400/5306 loss: 1.166877031326294


 49%|████▉     | 2601/5306 [12:32<13:13,  3.41it/s]

batch: 2600/5306 loss: 1.8957993984222412


 53%|█████▎    | 2801/5306 [13:30<12:02,  3.47it/s]

batch: 2800/5306 loss: 1.2905688285827637


 57%|█████▋    | 3001/5306 [14:28<11:14,  3.42it/s]

batch: 3000/5306 loss: 1.282942771911621


 60%|██████    | 3201/5306 [15:26<10:06,  3.47it/s]

batch: 3200/5306 loss: 0.3872619569301605


 64%|██████▍   | 3401/5306 [16:24<09:11,  3.46it/s]

batch: 3400/5306 loss: 0.9050285220146179


 68%|██████▊   | 3601/5306 [17:22<08:11,  3.47it/s]

batch: 3600/5306 loss: 1.0574480295181274


 72%|███████▏  | 3801/5306 [18:20<07:17,  3.44it/s]

batch: 3800/5306 loss: 0.9419869184494019


 75%|███████▌  | 4001/5306 [19:18<06:18,  3.45it/s]

batch: 4000/5306 loss: 1.0904759168624878


 79%|███████▉  | 4201/5306 [20:16<05:19,  3.45it/s]

batch: 4200/5306 loss: 1.002315878868103


 83%|████████▎ | 4401/5306 [21:14<04:19,  3.48it/s]

batch: 4400/5306 loss: 1.1483268737792969


 87%|████████▋ | 4601/5306 [22:12<03:22,  3.48it/s]

batch: 4600/5306 loss: 1.1201850175857544


 90%|█████████ | 4801/5306 [23:10<02:25,  3.47it/s]

batch: 4800/5306 loss: 1.1958630084991455


 94%|█████████▍| 5001/5306 [24:08<01:28,  3.46it/s]

batch: 5000/5306 loss: 1.2316328287124634


 98%|█████████▊| 5201/5306 [25:05<00:30,  3.44it/s]

batch: 5200/5306 loss: 1.1553043127059937


100%|██████████| 5306/5306 [25:36<00:00,  3.45it/s]


Val 0/2
--Val 0/2


  0%|          | 2/675 [00:00<01:02, 10.78it/s]

batch: 0/675 loss: 0.8037025332450867


 30%|██▉       | 202/675 [00:19<00:46, 10.18it/s]

batch: 200/675 loss: 1.1759521961212158


 60%|█████▉    | 402/675 [00:38<00:25, 10.50it/s]

batch: 400/675 loss: 1.2113394737243652


 89%|████████▉ | 602/675 [00:57<00:06, 10.55it/s]

batch: 600/675 loss: 2.0144705772399902


100%|██████████| 675/675 [01:04<00:00, 10.50it/s]


Write row 0
Early stopper, counter 0/10, best:inf -> now:1.0793779963034171
Set counter as 0
Update best score as 1.0793779963034171
Recorder, epoch 0 Model saved: /content/drive/MyDrive/YDS/AIConnet_YDS_1/NLP_MRC/results/train/20221121_202740/model.pt
Train 1/2
--Train 1/2


  0%|          | 1/5306 [00:00<25:55,  3.41it/s]

batch: 0/5306 loss: 1.0723776817321777


  4%|▍         | 201/5306 [00:58<24:35,  3.46it/s]

batch: 200/5306 loss: 0.7653268575668335


  8%|▊         | 401/5306 [01:56<23:34,  3.47it/s]

batch: 400/5306 loss: 1.07666015625


 11%|█▏        | 601/5306 [02:54<22:40,  3.46it/s]

batch: 600/5306 loss: 1.2659580707550049


 15%|█▌        | 801/5306 [03:52<21:39,  3.47it/s]

batch: 800/5306 loss: 0.962070107460022


 19%|█▉        | 1001/5306 [04:50<20:50,  3.44it/s]

batch: 1000/5306 loss: 0.5896621346473694


 23%|██▎       | 1201/5306 [05:47<19:42,  3.47it/s]

batch: 1200/5306 loss: 0.8851637840270996


 26%|██▋       | 1401/5306 [06:45<18:49,  3.46it/s]

batch: 1400/5306 loss: 1.3529481887817383


 30%|███       | 1601/5306 [07:43<17:49,  3.46it/s]

batch: 1600/5306 loss: 0.8355503678321838


 34%|███▍      | 1801/5306 [08:41<16:49,  3.47it/s]

batch: 1800/5306 loss: 1.139141321182251


 38%|███▊      | 2001/5306 [09:39<15:56,  3.46it/s]

batch: 2000/5306 loss: 0.7301837801933289


 41%|████▏     | 2201/5306 [10:37<14:55,  3.47it/s]

batch: 2200/5306 loss: 0.9010317325592041


 45%|████▌     | 2401/5306 [11:35<14:02,  3.45it/s]

batch: 2400/5306 loss: 1.2282500267028809


 49%|████▉     | 2601/5306 [12:33<13:01,  3.46it/s]

batch: 2600/5306 loss: 0.8904396891593933


 53%|█████▎    | 2801/5306 [13:31<12:36,  3.31it/s]

batch: 2800/5306 loss: 1.0048015117645264


 57%|█████▋    | 3001/5306 [14:28<11:23,  3.37it/s]

batch: 3000/5306 loss: 0.5522158145904541


 60%|██████    | 3201/5306 [15:26<10:06,  3.47it/s]

batch: 3200/5306 loss: 1.0766043663024902


 64%|██████▍   | 3401/5306 [16:24<09:10,  3.46it/s]

batch: 3400/5306 loss: 1.147250771522522


 68%|██████▊   | 3601/5306 [17:22<08:10,  3.48it/s]

batch: 3600/5306 loss: 1.2335906028747559


 72%|███████▏  | 3801/5306 [18:20<07:17,  3.44it/s]

batch: 3800/5306 loss: 1.0735461711883545


 75%|███████▌  | 4001/5306 [19:17<06:15,  3.47it/s]

batch: 4000/5306 loss: 1.0574977397918701


 79%|███████▉  | 4201/5306 [20:15<05:17,  3.48it/s]

batch: 4200/5306 loss: 1.5012662410736084


 83%|████████▎ | 4401/5306 [21:13<04:20,  3.47it/s]

batch: 4400/5306 loss: 1.0503668785095215


 87%|████████▋ | 4601/5306 [22:11<03:49,  3.07it/s]

batch: 4600/5306 loss: 0.8314367532730103


 90%|█████████ | 4801/5306 [23:17<02:48,  3.00it/s]

batch: 4800/5306 loss: 0.7750580310821533


 94%|█████████▍| 5001/5306 [24:24<01:40,  3.05it/s]

batch: 5000/5306 loss: 0.6859911680221558


 98%|█████████▊| 5201/5306 [25:30<00:34,  3.01it/s]

batch: 5200/5306 loss: 1.1635366678237915


100%|██████████| 5306/5306 [26:05<00:00,  3.39it/s]


Val 1/2
--Val 1/2


  0%|          | 1/675 [00:00<01:38,  6.83it/s]

batch: 0/675 loss: 0.7045469284057617


 30%|██▉       | 202/675 [00:28<01:06,  7.08it/s]

batch: 200/675 loss: 1.2186768054962158


 60%|█████▉    | 402/675 [00:57<00:38,  7.08it/s]

batch: 400/675 loss: 1.264329195022583


 89%|████████▉ | 602/675 [01:25<00:10,  7.07it/s]

batch: 600/675 loss: 0.9629223346710205


100%|██████████| 675/675 [01:36<00:00,  7.02it/s]


Write row 1
Early stopper, counter 0/10, best:1.0793779963034171 -> now:1.0201761136673115
Set counter as 0
Update best score as 1.0201761136673115
Recorder, epoch 1 Model saved: /content/drive/MyDrive/YDS/AIConnet_YDS_1/NLP_MRC/results/train/20221121_202740/model.pt


In [33]:
type(val_dataloader)

torch.utils.data.dataloader.DataLoader

In [22]:
row_dict
# train_loss = row_dict['train_loss']
# val_loss = row_dict['val_loss']
# plt.show()

{'epoch_index': 1,
 'train_serial': '20221121_192143',
 'train_loss': 0.8400932408662891,
 'train_elapsed_time': 1499.973861694336,
 'train_metric_name': 0.6905717624001206,
 'val_loss': 0.9580538479028943,
 'val_elapsed_time': 89.3628511428833,
 'val_metric_name': 0.6887211367673179}

##7. 추론

###7.1 테스트 Dataset & Dataloader 설정

In [21]:
# Load data
test_dataset = QADataset(data_dir=os.path.join(DATA_DIR, 'test.json'), tokenizer = tokenizer, max_seq_len = 512, mode = 'test')

question_ids = test_dataset.question_ids

test_dataloader = DataLoader(dataset=test_dataset,
                            batch_size=BATCH_SIZE,
                            num_workers=NUM_WORKERS, 
                            shuffle=False,
                            pin_memory=PIN_MEMORY,
                            drop_last=DROP_LAST)

###7.2 모델 로드

In [22]:
# Load model

model = electra(pretrained='monologg/koelectra-small-v3-discriminator').to(device)

checkpoint = torch.load(os.path.join(RECORDER_DIR, 'model.pt'))

model.load_state_dict(checkpoint['model'])

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized: ['qa_outputs.weigh

<All keys matched successfully>

In [23]:
RECORDER_DIR

'/content/drive/MyDrive/YDS/AIConnet_YDS_1/NLP_MRC/results/train/20221121_202740'

###7.3 추론 진행

In [27]:
model.eval()     # 모델을 eval mode로 전환. train mode와 달리 eval mode에서는 dropout, batchnorm이 적용되지 않는다

pred_df = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

for batch_index, batch in enumerate(tqdm(test_dataloader, leave=True)):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Inference
    outputs = model(input_ids, attention_mask=attention_mask)

    start_score = outputs.start_logits
    end_score = outputs.end_logits

    start_idx = torch.argmax(start_score, dim=1).cpu().tolist()
    end_idx = torch.argmax(end_score, dim=1).cpu().tolist()

    y_pred = []
    for i in range(len(input_ids)):
        if start_idx[i] > end_idx[i]:
            output = ''

        ans_txt = tokenizer.decode(input_ids[i][start_idx[i]:end_idx[i]]).replace('#','')

        if ans_txt == '[CLS]':
            ans_txt == ''

        y_pred.append(ans_txt)


    q_end_idx = BATCH_SIZE*batch_index + len(y_pred)
    for q_id, pred in zip(question_ids[BATCH_SIZE*batch_index:q_end_idx], y_pred):
        pred_df.loc[pred_df['question_id'] == q_id,'answer_text'] = pred

100%|██████████| 81/81 [00:08<00:00,  9.29it/s]


In [28]:
pred_df

Unnamed: 0,question_id,answer_text
0,QUES_cyOI2451l1,한국원자력안전기술원
1,QUES_pz2vbWpWWo,가출청소년 문제
2,QUES_1g3jI4y7eo,[CLS]
3,QUES_qzwOZwaeeY,Prime Air
4,QUES_hfdtXCtdzf,장애인케어서비스
...,...,...
1621,QUES_JtsKBSQITG,
1622,QUES_IajaDLmxvq,
1623,QUES_lR6hjzsptY,
1624,QUES_ACwZJGYBfp,


###7.4 결과 저장

In [29]:
# Set predict serial
kst = timezone(timedelta(hours=9))
predict_timestamp = datetime.now(tz=kst).strftime("%Y%m%d_%H%M%S")
predict_serial = predict_timestamp
predict_serial

PREDICT_DIR = os.path.join(PROJECT_DIR, 'results', 'predict', predict_serial)
os.makedirs(PREDICT_DIR, exist_ok=True)

pred_df.to_csv(os.path.join(PREDICT_DIR, 'prediction.csv'), index=False)

In [31]:
# pred_df.to_csv('prediction1.csv')