<!-- ---
- Project: 2023 Winter School
- Author: Gyu-min Lee
- Version: 0.10
- Changelog
    - 0.1 -- Initiated the file
    - 0.5 -- First Draft
    - 0.9 -- Proofread
        - 0.9.5 -- Dataset from SNLI to CoLA
    - 0.10 -- Restructured with Tensorboard integration
--- -->

2023 전산언어학 겨울학교 5일차 1교시

# Syntax 

## Project: CoLA with ALBERT 

- 🤗 Hub의 모델과 데이터셋을 불러와 파인튜닝 및 성능 평가를 진행합니다
- 📔NOTE: 빠른 실행을 위해 Runtime 유형을 'GPU'로 해 주세요

In [None]:
# Install pinned versions of the Hugging Face libraries used in this notebook
!pip install transformers==4.26.0 datasets==2.9.0

In [None]:
# STEP0: Load libraries
import random

import torch

from sklearn.metrics import accuracy_score, f1_score

import datasets

from transformers import pipeline
from transformers import Trainer, TrainingArguments

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

from tqdm import tqdm

In [None]:
# STEP1: Prepare data
# Load the "cola" configuration of the "glue" dataset.
#   Dataset card:  https://huggingface.co/datasets/glue
#   CoLA homepage: https://nyu-mll.github.io/CoLA/
dataset = datasets.load_dataset(path="glue", name="cola")

# Show the loaded object as the cell output
dataset

In [None]:
# Inspect the column schema (features) of the training split
dataset['train'].features

In [None]:
# STEP2: Prepare models
# Model card: https://huggingface.co/albert-base-v2
MODEL_NAME = "albert-base-v2"

# Sequence classifier loaded from the checkpoint, configured for two labels
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)
# Tokenizer loaded from the same checkpoint name as the model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def preprocess(dataset):
    """Lowercase the sentence of one example and tokenize it.

    Args:
        dataset: Despite its name, a single example: a mapping whose
            'sentence' entry is a string (``.lower()`` is called on it).

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for the lowercased
        sentence when called with ``truncation=True``.
    """
    lowered = dataset['sentence'].lower()
    return tokenizer(lowered, truncation=True)

In [None]:
# Apply `preprocess` to the two splits used for training and evaluation
for split_name in ('train', 'validation'):
    dataset[split_name] = dataset[split_name].map(preprocess)

In [None]:
# Rename the "label" column to "labels" in both splits handed to the Trainer
for split_name in ('train', 'validation'):
    dataset[split_name] = dataset[split_name].rename_column("label", "labels")

In [None]:
# STEP3: Set up a trainer

def metrics(model_output) -> dict:
    """Compute accuracy and weighted F1 from an evaluation output.

    Args:
        model_output: Object exposing ``label_ids`` (the reference labels)
            and ``predictions`` (an array; the argmax over its last axis is
            taken as the predicted class).

    Returns:
        A dict with the keys "accuracy" and "f1".
    """
    gold = model_output.label_ids
    predicted = model_output.predictions.argmax(-1)

    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average='weighted'),
    }
            
# Training configuration for the Trainer
training_args = TrainingArguments(
    # Checkpoints go here; logs go to a sub-directory, which is the same
    # path given to `%tensorboard --logdir` in this notebook.
    output_dir='./checkpoints',
    logging_dir='./checkpoints/logs',
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    evaluation_strategy='steps',  # or, 'epoch'
    logging_steps=50,
    save_steps=100,
    load_best_model_at_end=True,
)

# Bundle the model, training arguments, data splits, tokenizer and the
# metric function into a single Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=metrics,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)
                          

In [None]:
# Check whether PyTorch can see a CUDA GPU (see the Runtime note at the top)
torch.cuda.is_available()

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
# Open TensorBoard on the directory configured as `logging_dir` for training
%tensorboard --logdir ./checkpoints/logs

In [None]:
# STEP4: train
# Run fine-tuning with the Trainer configured in STEP3
trainer.train()

In [None]:
# STEP5: predict
# Use the first GPU when one is available, otherwise fall back to the CPU.
# NOTE: `is_available` has to be *called*; the bare function object is always
# truthy, which would select 'cuda:0' even on a CPU-only runtime.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Text-classification pipeline built from the fine-tuned model and its tokenizer
pipe = pipeline(task='text-classification',
                model=model,
                tokenizer=tokenizer,
                device=DEVICE,
                )

# Raw sentences of the test split
test_inputs = [data['sentence'] for data in dataset['test']]

preds = pipe(test_inputs)


In [None]:
# Preview the first ten predictions
preds[:10]

In [None]:
# The following evaluation cannot be run because
# the gold labels of the CoLA test split are not public

# accuracy = accuracy_score(test_labels, preds)
# f1 = f1_score(test_labels, preds, average='weighted')

# print(f"Accuracy: \t{accuracy:04.2f}")
# print(f"F1: \t{f1:04.2f}")

In [None]:
# EXTRA: saving the dataset
import json

# NOTE: despite the variable name, this is the *validation* split
test_dict = dataset['validation'].to_dict()
# Available exporters: to_dict, to_csv, to_pandas

# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 rather than left to the platform default.
with open('out.json', 'w', encoding='utf-8') as f:
    json.dump(test_dict, f, ensure_ascii=False, indent=4)

In [None]:
# List the keys of the exported dict
test_dict.keys()

In [None]:
# Show the first 30 lines of the JSON file written above
!head -n 30 out.json

In [None]:
# Install pandas and openpyxl (presumably needed for the .xlsx export below)
!pip install pandas openpyxl

In [None]:
import pandas as pd

# Convert the validation split to a pandas object (named test_df in this notebook)
test_df = dataset['validation'].to_pandas()

In [None]:
# Display the converted table
test_df

In [None]:
# Remove the three tokenizer output columns in a single call
test_df = test_df.drop(
    columns=['input_ids', 'token_type_ids', 'attention_mask'],
)
# Display the trimmed table
test_df

In [None]:
# Write the table to an Excel workbook: header row kept, index column omitted
test_df.to_excel('test_excel.xlsx', header=True, index=False)