<!-- ---
- Project: 2023 Winter School
- Author: Gyu-min Lee
- Version: 0.5
- Changelog
    - 0.1 -- Initiated the file
    - 0.5 -- First Draft
--- -->

2023 전산언어학 겨울학교 3일차 3교시

# Language Models 

## Project: NSMC Classification with 🤗 Transformers

- NSMC(Naver Sentiment Movie Corpus)는 네이버 영화 리뷰를 기반으로 구축된 감성 분석용 말뭉치입니다
- 여기서는 🤗 Model Hub에서 일반적인 한국어 텍스트로 구축된 BERT 모델과, 댓글로 구축한 BERT 모델을 각각 불러와 Fine-Tuning하여 결과를 비교해 봅니다
- 📔NOTE: 빠른 실행을 위해 Runtime 유형을 'GPU'로 해 주세요

In [None]:
# Clone the NSMC corpus repository into ./nsmc (shell command run from the notebook)
!git clone https://github.com/e9t/nsmc.git

In [None]:
# Install a pinned release (4.26.0) of the transformers package
!pip install transformers==4.26.0 

In [None]:
# STEP0: Load libraries
import csv 
import random

import torch
from torch.utils.data import Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from transformers import pipeline
from transformers import Trainer, TrainingArguments

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

from tqdm import tqdm

In [None]:
# STEP1: Prepare models
# Model Hub identifiers of the two pretrained BERT checkpoints being compared.
BERT_GENERAL_NAME = "snunlp/KR-BERT-char16424"  # https://huggingface.co/snunlp/KR-BERT-char16424
BERT_COMMENT_NAME = "beomi/kcbert-base"  # https://huggingface.co/beomi/kcbert-base

# Each checkpoint is loaded as a sequence classifier with two output labels,
# together with the tokenizer published under the same identifier.
bert_general = AutoModelForSequenceClassification.from_pretrained(
    BERT_GENERAL_NAME,
    num_labels=2,
)
bert_general_tknizer = AutoTokenizer.from_pretrained(BERT_GENERAL_NAME)

bert_comment = AutoModelForSequenceClassification.from_pretrained(
    BERT_COMMENT_NAME,
    num_labels=2,
)
bert_comment_tknizer = AutoTokenizer.from_pretrained(BERT_COMMENT_NAME)

In [None]:
# Display the loaded model object as the cell output
bert_general

In [None]:
from transformers import BertForSequenceClassification

# IPython `??` suffix: show the help/source of the class in the notebook
BertForSequenceClassification??

In [None]:
# STEP2: Prepare the data

DATA_PATH = './nsmc/ratings.txt'

# The file is read as tab-separated values with one header row. Column 1 is
# used as the review text and column 2 as the integer label; column 0 is unused.
# - encoding='utf-8': the corpus is Korean text, so do not depend on the
#   platform's default encoding.
# - newline='': recommended by the csv module so the reader handles line
#   endings itself.
# - quoting=csv.QUOTE_NONE: free-form review text can contain literal
#   double-quote characters; with the default quoting rules the reader treats
#   them as field quotes and may merge several rows into a single field.
with open(DATA_PATH, encoding='utf-8', newline='') as f:
    nsmc_reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

    next(nsmc_reader, None)  # skip the header row without materializing the file

    nsmc = [{"inputs": row[1], "labels": int(row[2])}
            for row in nsmc_reader]

class NsmcDataset(Dataset):
    """Map-style dataset over NSMC records that tokenizes each review on access.

    Args:
        processed_data: indexable collection of dicts, each holding an
            ``"inputs"`` entry (the review text) and a ``"labels"`` entry.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True)``;
            its result must expose an ``input_ids`` attribute.
    """

    def __init__(self, processed_data, tokenizer):
        self.tokenizer = tokenizer
        self.processed_data = processed_data

    def __len__(self):
        """Return the number of stored records."""
        return len(self.processed_data)

    def __getitem__(self, idx):
        """Return the raw text, its token ids and the label of record ``idx``."""
        text = self.processed_data[idx]['inputs']
        # Tokenization happens lazily, every time an item is requested.
        encoded = self.tokenizer(text, truncation=True)
        token_ids = encoded.input_ids
        label = self.processed_data[idx]['labels']

        return dict(inputs=text, input_ids=token_ids, labels=label)

def construct_datasets(dataset: list, tokenizer, random_state: int=263) -> dict:
    """Split ``dataset`` into train/dev/test portions in the ratio 0.7 / 0.2 / 0.1.

    Each portion is wrapped in an NsmcDataset that tokenizes with ``tokenizer``.
    Only the first hundredth of the dev portion is kept in the result.

    Args:
        dataset: list of records to split.
        tokenizer: tokenizer handed to every NsmcDataset.
        random_state: seed passed to both ``train_test_split`` calls.

    Returns:
        dict with the keys ``"train"``, ``"dev"`` and ``"test"``.
    """
    # Hold out 30% first, then divide the held-out part 2:1 into dev and test.
    train_part, held_out = train_test_split(dataset, test_size=0.3, random_state=random_state)
    dev_part, test_part = train_test_split(held_out, test_size=1/3, random_state=random_state)

    # the size of 'dev' is reduced only for demonstration purpose
    dev_keep = int(len(dev_part)/100)

    splits = {}
    splits["train"] = NsmcDataset(train_part, tokenizer)
    splits["dev"] = NsmcDataset(dev_part[:dev_keep], tokenizer)
    splits["test"] = NsmcDataset(test_part, tokenizer)
    return splits
    
# Build the train/dev/test splits once per tokenizer; both calls receive the
# same records and rely on the default random_state.
nsmc_general = construct_datasets(dataset=nsmc, tokenizer=bert_general_tknizer)
nsmc_comment = construct_datasets(dataset=nsmc, tokenizer=bert_comment_tknizer)


In [None]:
# Show how the comment-model tokenizer splits a sample sentence into tokens
bert_comment_tknizer.tokenize('이런거 정말 읽을수 있는 거임? ㅋㅋㅋ')

In [None]:
# STEP3: Set up a trainer

# Settings shared by both fine-tuning runs; only the output and logging
# directories differ between the two models.
_shared_training_kwargs = dict(
    num_train_epochs=0.1,  # probably increase epochs for better result
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    evaluation_strategy='steps',  # or, 'epoch'
    logging_steps=50,
    save_steps=100,
    load_best_model_at_end=True,
    # no_cuda=True
)

training_args_general = TrainingArguments(
    output_dir='./checkpoints/general',
    logging_dir='./checkpoints/general/logs',
    **_shared_training_kwargs,
)

training_args_comment = TrainingArguments(
    output_dir='./checkpoints/comment',
    logging_dir='./checkpoints/comment/logs',
    **_shared_training_kwargs,
)

In [None]:
def metrics(model_output) -> dict:
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        model_output: object exposing ``label_ids`` (the reference labels) and
            ``predictions``; the argmax over the last axis of ``predictions``
            is taken as the predicted class.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    gold = model_output.label_ids
    predicted = model_output.predictions.argmax(-1)

    return dict(
        accuracy=accuracy_score(gold, predicted),
        f1=f1_score(gold, predicted, average='weighted'),
    )

In [None]:
def _build_trainer(model, args, splits, tokenizer):
    """Create a Trainer that trains on splits["train"] and evaluates on splits["dev"]."""
    return Trainer(
        model=model,
        args=args,
        train_dataset=splits["train"],
        eval_dataset=splits["dev"],
        tokenizer=tokenizer,
        compute_metrics=metrics,
    )


# One Trainer per model, each paired with its own arguments, data and tokenizer.
trainer_general = _build_trainer(bert_general, training_args_general,
                                 nsmc_general, bert_general_tknizer)

trainer_comment = _build_trainer(bert_comment, training_args_comment,
                                 nsmc_comment, bert_comment_tknizer)

In [None]:
# STEP4: train

# check GPU availability
# (the cell output is the boolean returned by the call)

torch.cuda.is_available()

In [None]:
# Fine-tune the general-text BERT model with the settings in training_args_general
trainer_general.train()

In [None]:
# Fine-tune the comment-text BERT model with the settings in training_args_comment
trainer_comment.train()

In [None]:
# STEP5: predict
# `is_available` must be *called*: the bare function object is always truthy,
# which would select 'cuda:0' even on a machine without a GPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

pipe_general = pipeline(task='text-classification',
                        model=bert_general,
                        tokenizer=bert_general_tknizer,
                        device=DEVICE,
                        )

pipe_comment = pipeline(task='text-classification',
                        model=bert_comment,
                        tokenizer=bert_comment_tknizer,
                        device=DEVICE,
                        )

# Predict on a small random subset of the test split to keep the demo fast.
NUM_TEST_SAMPLES = 100

# Draw ONE set of indices and reuse it everywhere. Sampling the texts and the
# labels independently pairs each text with an unrelated label, and sampling
# separately per model makes the two models' predictions incomparable.
# NOTE(review): sharing indices across the two models assumes both test splits
# hold the same records in the same order (the case when both are built from
# the same data with the same random_state) -- confirm if the split changes.
num_test_examples = min(len(nsmc_general['test']), len(nsmc_comment['test']))
test_indices = random.sample(range(num_test_examples),
                             min(NUM_TEST_SAMPLES, num_test_examples))

# Index only the sampled items: every dataset access tokenizes the text, so
# walking the whole split just to pick 100 records is wasted work.
test_samples_general = [nsmc_general['test'][idx] for idx in test_indices]
test_samples_comment = [nsmc_comment['test'][idx] for idx in test_indices]

test_inputs_general = [sample['inputs'] for sample in test_samples_general]
test_labels_general = [sample['labels'] for sample in test_samples_general]
test_inputs_comment = [sample['inputs'] for sample in test_samples_comment]
test_labels_comment = [sample['labels'] for sample in test_samples_comment]

test_preds_general = pipe_general(test_inputs_general)
test_preds_comment = pipe_comment(test_inputs_comment)



In [None]:
# Show the first 10 pipeline results of the general model
test_preds_general[:10]

In [None]:
# Show the first 10 pipeline results of the comment model
test_preds_comment[:10]

In [None]:
def metrics_for_pipe_result(pipe_res, labels):
    """Score text-classification pipeline results against reference labels.

    Args:
        pipe_res: iterable of result dicts with a ``"label"`` entry;
            ``"LABEL_0"`` is mapped to class 0, any other value to class 1.
        labels: reference class ids, in the same order as ``pipe_res``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"`` (weighted F1).
    """
    predictions = [int(res["label"] != "LABEL_0") for res in pipe_res]

    scores = {}
    scores["accuracy"] = accuracy_score(labels, predictions)
    scores["f1"] = f1_score(labels, predictions, average='weighted')
    return scores

In [None]:
# Accuracy and weighted F1 of the general model on the sampled test items
metrics_for_pipe_result(test_preds_general, test_labels_general)

In [None]:
# Accuracy and weighted F1 of the comment model on the sampled test items
metrics_for_pipe_result(test_preds_comment, test_labels_comment)

In [None]:
NUM_PRINT = 10

sample_indices = random.sample(range(len(test_preds_general)), NUM_PRINT)

# (header line, prediction list) for each model, in the order they are printed
model_reports = (
    ("Prediction with general bert:", test_preds_general),
    ("Prediction with comment bert:", test_preds_comment),
)

for position in sample_indices:
    # The review text and the reference label come from the 'general' lists.
    print(f"Result for index: {position}")
    print("Prediction on:")
    print(f"\t{test_inputs_general[position]}")
    print("Answer:")
    gold_tag = 'POS' if test_labels_general[position]==1 else 'NEG'
    print(f"\t{gold_tag}")

    for header, preds in model_reports:
        print(header)
        pred = preds[position]
        pred_tag = 'POS' if pred['label']=='LABEL_1' else 'NEG'
        print(f"\t{pred_tag}", end='  ')
        print(f"\t{pred['score']:04.2f}")

    print('\n')

In [None]:
# EXTRA: save and load trained model

# Path where the model was saved; the tokenizer is loaded from the same directory.
CHECKPOINT_PATH = "./checkpoints/comment/checkpoint-100"

loaded_model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_PATH)
loaded_tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH)


In [None]:
# Wrap the reloaded model and tokenizer in a classification pipeline; no
# `device` argument is passed here.
pipe_loaded = pipeline(
    'text-classification',
    model=loaded_model,
    tokenizer=loaded_tokenizer,
)

# Classify one sample sentence; the result is shown as the cell output.
pipe_loaded("이런 댓글도 해석할 수 있으려나요...?")


In [None]:
# Recursively archive the saved checkpoint directory into ./model_save.zip
!zip -r './model_save.zip' './checkpoints/comment/checkpoint-100'