<!-- ---
- Project: 2023 Winter School
- Author: Gyu-min Lee
- Version: 0.10
- Changelog
    - 0.1 -- Initiated the file
    - 0.5 -- First Draft
    - 0.9 -- Proofread
    - 0.10 -- Restructured with Tensorboard integration
--- -->

2023 전산언어학 겨울학교 5일차 2교시

# Syntax 

## Project: SNLI with ALBERT 

- 🤗 Hub의 모델과 데이터셋을 불러와 파인튜닝 및 성능 평가를 진행합니다.
- 📔 NOTE: 빠른 실행을 위해 Runtime 유형을 'GPU'로 설정해 주세요.

In [None]:
# Install transformers and datasets at pinned versions (shell command run from the notebook).
!pip install transformers==4.26.0 datasets==2.9.0

In [None]:
# STEP0: Load libraries
import random

import torch

from sklearn.metrics import accuracy_score, f1_score

import datasets

from transformers import pipeline
from transformers import Trainer, TrainingArguments

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

from tqdm import tqdm

In [None]:
# STEP1: Prepare data
# Hub identifier of the corpus (dataset card: https://huggingface.co/datasets/snli)
DATASET_NAME = "snli"

# Load the dataset by name; leaving the bare name on the last line makes the
# notebook display the loaded object.
dataset = datasets.load_dataset(path=DATASET_NAME)
dataset

In [None]:
# STEP2: Prepare models
# Hub identifier of the checkpoint (model card: https://huggingface.co/albert-base-v2)
MODEL_NAME = "albert-base-v2"

# The tokenizer and the classifier are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# num_labels=3: three output classes (ids 0-2, as decoded later in the notebook).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
)

In [None]:
def preprocess(dataset):
    """Tokenize one example's premise and hypothesis as a single string.

    The two fields are joined with one space, lower-cased, and handed to the
    notebook-level ``tokenizer`` with truncation enabled.

    Args:
        dataset: A mapping with string ``'premise'`` and ``'hypothesis'``
            entries. Despite the name, the notebook passes this function to
            ``.map(preprocess)``, so it receives the items ``.map`` supplies.

    Returns:
        Whatever ``tokenizer`` returns for the combined, lower-cased text.
    """
    combined = ' '.join((dataset['premise'], dataset['hypothesis']))
    return tokenizer(combined.lower(), truncation=True)

In [None]:
# Shrink every split to a fixed number of randomly chosen rows.
# This sampling is for demonstration purposes only.
SUBSET_SIZES = (('train', 7000), ('test', 2000), ('validation', 1000))

for split_name, subset_size in SUBSET_SIZES:
    # Draw `subset_size` distinct row indices, then keep only those rows.
    row_ids = random.sample(range(len(dataset[split_name])), subset_size)
    dataset[split_name] = dataset[split_name].select(row_ids)

In [None]:
# Apply `preprocess` to each split and store the mapped result back in place.
for split_name in ('train', 'test', 'validation'):
    dataset[split_name] = dataset[split_name].map(preprocess)

In [None]:
SPLIT_NAMES = ('train', 'test', 'validation')


def _has_valid_label(example):
    """Return True unless the example's 'labels' value is -1."""
    # NOTE(review): -1 presumably marks examples without a gold label —
    # confirm against the dataset card.
    return example['labels'] != -1


# First pass: rename the target column from "label" to "labels" in every split.
for split_name in SPLIT_NAMES:
    dataset[split_name] = dataset[split_name].rename_column("label", "labels")

# Second pass: drop the rows whose label is -1.
for split_name in SPLIT_NAMES:
    dataset[split_name] = dataset[split_name].filter(_has_valid_label)

In [None]:
# STEP3: Set up a trainer

def metrics(model_output) -> dict:
    """Compute accuracy and weighted F1 from one set of model outputs.

    Args:
        model_output: Object exposing ``label_ids`` (the reference labels)
            and ``predictions`` (an array that is arg-maxed over its last
            axis to obtain the predicted class ids).

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    gold = model_output.label_ids
    predicted = model_output.predictions.argmax(-1)

    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average='weighted'),
    }
            
# Hyperparameters and bookkeeping settings for the fine-tuning run.
training_args = TrainingArguments(
    # Checkpoints and logs live under the same ./checkpoints tree; the
    # TensorBoard cell reads from ./checkpoints/logs.
    output_dir='./checkpoints',
    logging_dir='./checkpoints/logs',
    # Probably increase the number of epochs for a better result.
    num_train_epochs=2,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    # Evaluate on a step schedule; 'epoch' is the alternative.
    # NOTE(review): eval_steps is not given; the Transformers docs say it then
    # falls back to logging_steps — confirm for the pinned version.
    evaluation_strategy='steps',
    logging_steps=50,
    save_steps=100,
    load_best_model_at_end=True,
)

# Bundle the model, its tokenizer, the run settings, the metric function, and
# the train/validation splits into a single Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)
                          

In [None]:
# Ask PyTorch whether CUDA is available; the boolean result is shown as the cell output.
torch.cuda.is_available()

In [None]:
# Load the TensorBoard notebook extension, then open a dashboard on the same
# directory that TrainingArguments is given as logging_dir.
%load_ext tensorboard
%tensorboard --logdir ./checkpoints/logs

In [None]:
# STEP4: train
# Start fine-tuning with the Trainer configured in STEP3.
trainer.train()

In [None]:
# STEP5: predict
# Use the GPU only when CUDA is really usable. `is_available` must be *called*:
# the bare function object is always truthy, which would pick 'cuda:0' even on
# a CPU-only runtime.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Wrap the fine-tuned model and its tokenizer in a text-classification pipeline.
pipe = pipeline(task='text-classification',
                model=model,
                tokenizer=tokenizer,
                device=DEVICE,
                )

# Build the test inputs the way `preprocess` builds the training text: premise
# and hypothesis joined by one space.
# NOTE(review): `preprocess` also lower-cases the text and this cell does not;
# confirm the tokenizer lower-cases on its own.
test_inputs = [data['premise'] + ' ' + data['hypothesis'] for data in dataset['test']]
test_labels = [data['labels'] for data in dataset['test']]

# Run the pipeline over all test inputs; later cells read the 'label' key of
# each returned item.
preds = pipe(test_inputs)


In [None]:
# Show the first ten pipeline outputs as the cell output.
preds[:10]

In [None]:
# Print the first few test examples: the input text on one line, then the gold
# label and the predicted label separated by a tab, followed by blank lines.
# The loop variable is deliberately not named `input`: at notebook top level
# that name would replace the built-in `input()` for the rest of the session.
NUM_PREVIEW = 10

# Slicing caps the preview at NUM_PREVIEW items, so no manual counter is needed.
for text, label, pred in zip(test_inputs[:NUM_PREVIEW],
                             test_labels[:NUM_PREVIEW],
                             preds[:NUM_PREVIEW]):
    print(text)
    print(label, end='\t')
    print(pred['label'])
    print('\n')

In [None]:
# Display the feature definition of the 'labels' column in the training split
# (presumably useful for relating integer ids to class names — TODO confirm).
dataset['train'].features['labels']

In [None]:
# Convert each pipeline output to an integer class id by its 'label' string:
# "LABEL_0" -> 0, "LABEL_1" -> 1, and any other value -> 2.
LABEL_TO_ID = {"LABEL_0": 0, "LABEL_1": 1}

preds = [LABEL_TO_ID.get(pred['label'], 2) for pred in preds]


In [None]:
# Score the test-set predictions with accuracy and weighted-average F1, then
# print both to two decimal places.
# NOTE(review): `accruacy` is a misspelling of "accuracy"; the name is left
# unchanged here in case other cells refer to it.
accruacy = accuracy_score(y_true=test_labels, y_pred=preds)
f1 = f1_score(y_true=test_labels, y_pred=preds, average='weighted')

print(f"Accuracy: \t{accruacy:04.2f}")
print(f"F1: \t{f1:04.2f}")