## Load Libraries

In [None]:
# install 
!pip install datasets transformers numpy sentencepiece accelerate -U evaluate

In [2]:
import os
import random
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader

import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

from tokenization_kobert import KoBertTokenizer

## Set Hyperparameters

In [3]:
SEED = 456 # random seed 고정 (재현성을 위해)
random.seed(SEED)
np.random.seed(SEED) 
torch.manual_seed(SEED) 
torch.cuda.manual_seed(SEED) 
torch.cuda.manual_seed_all(SEED) 

In [7]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') # 디바이스 설정
DEVICE

device(type='cuda')

In [4]:
# 경로 설정
BASE_DIR = os.getcwd() # 현재 디렉토리
DATA_DIR = os.path.join(BASE_DIR, '../data') 
OUTPUT_DIR = os.path.join(BASE_DIR, '../output')

## Load Tokenizer and Model

아래 조건에 맞춰 스스로 코드를 짜봅시다!
- 사용할 모델 : https://huggingface.co/monologg/kobert
- tokenizer는 ./SK_Day2/text-cls-ynat/tokenization_kobert.py 를 사용
- 모델 로드에는 AutoModelForSequenceClassification 클래스를 사용

In [None]:
model_name = 'monologg/kobert' 
tokenizer = KoBertTokenizer.from_pretrained(model_name) 
# 다양한 Transformer 모델의 구조를 자동으로 인식하고, 이에 적합한 분류 모델을 생성해주는 AutoModelForSequenceClassification 클래스를 사용
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7).to(DEVICE)

## Define Dataset

- pd.read_csv 를 사용하여 데이터 불러오기
- 불러온 데이터를 train/valid 셋으로 분리 (train set : valid set = 7:3)

In [6]:
data = pd.read_csv('./data/train_original.csv') # 데이터 불러오기
dataset_train, dataset_valid = train_test_split(data, test_size=0.3, random_state=SEED) # train, valid 데이터셋 분리

- BERTDataset 클래스 완성하기

In [7]:
class BERTDataset(Dataset):
    def __init__(self, data, tokenizer):
        input_texts = data['text'] # input
        targets = data['target'] # label
        
        self.inputs = []; self.labels = [] 
        for text, label in zip(input_texts, targets):
            tokenized_input = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt') # tokenizer로 text를 tokenizing
            self.inputs.append(tokenized_input) # input을 inputs에 저장
            self.labels.append(torch.tensor(label)) # label을 labels에 저장
    
    def __getitem__(self, idx): 
        return {
            'input_ids': self.inputs[idx]['input_ids'].squeeze(0), # [1, seq_len] -> [seq_len]
            'attention_mask': self.inputs[idx]['attention_mask'].squeeze(0), 
            'labels': self.labels[idx].squeeze(0) 
        }
    
    def __len__(self):
        return len(self.labels)

In [8]:
data_train = BERTDataset(dataset_train, tokenizer) # train 데이터셋 생성
data_valid = BERTDataset(dataset_valid, tokenizer) # valid 데이터셋 생성

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # padding을 위한 data_collator 생성

## Define Metric

In [10]:
f1 = evaluate.load('f1') # f1 score를 계산하기 위한 함수
def compute_metrics(eval_pred):
    predictions, labels = eval_pred # model의 output과 실제 label을 받음
    predictions = np.argmax(predictions, axis=1) # model의 output에서 가장 높은 값을 가진 index를 예측값으로 사용
    return f1.compute(predictions=predictions, references=labels, average='macro') # f1 score 계산, macro: label별 f1 score의 평균


## Train Model

In [11]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    logging_strategy='steps',
    evaluation_strategy='steps',
    save_strategy='steps',
    logging_steps=100,
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    learning_rate= 2e-05,
    adam_beta1 = 0.9,
    adam_beta2 = 0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True, 
    seed=SEED,
    auto_find_batch_size=True 
)

- Trainer의 인자들을 넣어봅시다!

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

- 훈련을 시켜봅시다! 
- (훈련 시간이 너무 오래걸려서 로그가 찍히는 것만 확인하고 중지 시키시면 됩니다!)

In [13]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhanadul[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.15.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Step,Training Loss,Validation Loss,F1
100,1.127,0.594221,0.833971
200,0.5103,0.465016,0.8515
300,0.4129,0.438041,0.854132




TrainOutput(global_step=324, training_loss=0.6600066290961372, metrics={'train_runtime': 475.2761, 'train_samples_per_second': 86.495, 'train_steps_per_second': 0.682, 'total_flos': 1.08167179477248e+16, 'train_loss': 0.6600066290961372, 'epoch': 3.0})

## Evaluate Model

- test set 에 대해 예측을 해봅시다!

In [5]:
# 데이터 읽어오기
DATA_DIR = './data'
dataset_test = pd.read_csv(os.path.join(DATA_DIR, 'test_original.csv'))

In [9]:
model_name = 'monologg/kobert'
tokenizer = KoBertTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained('/home/hana/nas2/SK/output_clean/checkpoint-300', num_labels=7).to(DEVICE)
model.eval() # model을 evaluation 모드로 변경
preds = [] 
for idx, sample in dataset_test.iterrows():
    inputs = tokenizer(sample['text'], return_tensors="pt").to(DEVICE)
    with torch.no_grad(): # gradient 계산 비활성화
        logits = model(**inputs).logits # model의 output
        pred = torch.argmax(torch.nn.Softmax(dim=1)(logits), dim=1).cpu().numpy() # model의 output에서 가장 높은 값을 가진 index를 예측값으로 사용
        preds.extend(pred) 

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


- 결과 csv 파일로 저장

In [11]:
dataset_test['target'] = preds
BASE_DIR = os.getcwd()
dataset_test.to_csv(os.path.join(BASE_DIR, 'test_output.csv'), index=False)

In [12]:
trainer.evaluate()

NameError: name 'trainer' is not defined