In [None]:
!pip install transformers datasets accelerate scikit-learn


In [11]:
#library
import os
os.environ["WANDB_DISABLED"] = "true"
import time
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score

# Fix the random seeds (for reproducibility)
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's built-in ``random`` module, NumPy's global generator,
    PyTorch's CPU generator and the generators of all CUDA devices.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import keeps this block self-contained; the import cell does not
    # import `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without a GPU: the call is silently ignored in that case.
    torch.cuda.manual_seed_all(seed)

# Seed all generators once, before any data or model is created.
set_seed(42)

# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Current device: {device}")

Current device: cuda


In [3]:
# Load the NSMC ratings files straight from GitHub.
# `load_dataset` and `AutoTokenizer` come from the import cell at the top of
# the notebook, so they are not re-imported here.
data_files = {
    "train": "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt",
    "test": "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt"
}

# The files are tab-separated text, so use the generic "csv" loader with a
# tab delimiter.
dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

# Sanity-check that both splits were loaded.
print(f"Train set size: {len(dataset['train'])}")
print(f"Test set size: {len(dataset['test'])}")

# Load the tokenizer that belongs to the checkpoint being fine-tuned.
model_checkpoint = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading data:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.89M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Train set size: 150000
Test set size: 50000


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Train set size: 150000
Test set size: 50000


In [None]:

# Drop every example whose review text is missing (loaded as None).
dataset = dataset.filter(lambda example: example["document"] is not None)

# Tokenization step applied to batches of examples
def preprocess_function(examples):
    """Tokenize the ``document`` field of a batch of examples.

    Args:
        examples: Batch mapping that contains a ``"document"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    tokenize_options = dict(
        truncation=True,
        # A max_length of about 128 is enough for NSMC and keeps things fast.
        max_length=128,
        # No padding here; it is deferred to dynamic padding at batch time.
        padding=False,
    )
    return tokenizer(examples["document"], **tokenize_options)

# Tokenize both splits in batches (batched=True passes many examples per call),
# drop the columns that are no longer needed, and rename the target column
# from "label" to "labels".
encoded_datasets = (
    dataset
    .map(preprocess_function, batched=True)
    .remove_columns(["id", "document"])
    .rename_column("label", "labels")
)

# Return torch tensors when the datasets are indexed.
encoded_datasets.set_format("torch")

# Split handles used by the trainers below.
# NOTE(review): the "test" split is used as the evaluation set, so every
# "validation" metric reported later is measured on the test split.
train_dataset, eval_dataset = encoded_datasets["train"], encoded_datasets["test"]

In [8]:

def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation result.

    Args:
        eval_pred: Pair that unpacks into ``(predictions, labels)``; the
            predictions are reduced with an argmax over axis 1.

    Returns:
        Dict with a single ``"accuracy"`` entry.
    """
    logits, gold_labels = eval_pred
    # The index of the largest score on axis 1 is the predicted class.
    predicted_classes = np.argmax(logits, axis=1)
    return {"accuracy": accuracy_score(gold_labels, predicted_classes)}

In [None]:
# Build a fresh model for each experiment
def get_model():
    """Load ``model_checkpoint`` as a 2-label sequence classifier on ``device``.

    Returns:
        The model returned by ``AutoModelForSequenceClassification`` after it
        has been moved to the module-level ``device``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=2
    )
    return classifier.to(device)

# Dynamic-padding collator: padding is applied per batch at collation time
# instead of during tokenization.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Baseline run: training WITHOUT length bucketing
training_args_step4 = TrainingArguments(
    output_dir="./results_step4",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    group_by_length=False,
    # Mixed precision needs a GPU; stay in full precision on the CPU fallback.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Explicitly disable experiment trackers instead of relying on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

# Re-seed right before the model is built so the newly initialised classifier
# head starts from the same weights in every run being compared.
set_seed(42)

# Define trainer
trainer_step4 = Trainer(
    model=get_model(),
    args=training_args_step4,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Measure the wall-clock duration of training with a monotonic clock.
# The timed span covers the whole `train()` call, which also includes the
# per-epoch evaluation and checkpoint saving configured in the arguments.
print(">>> STEP 4: Training WITHOUT Bucketing...")
start_time_4 = time.perf_counter()
train_result_4 = trainer_step4.train()
end_time_4 = time.perf_counter()

# Save results; the final evaluation runs outside the timed span.
step4_time = end_time_4 - start_time_4
step4_acc = trainer_step4.evaluate()['eval_accuracy']

print(f"STEP 4 Training Time: {step4_time:.2f} seconds")
print(f"STEP 4 Accuracy: {step4_acc:.4f}")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_step4 = Trainer(


>>> STEP 4: Training WITHOUT Bucketing...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2441,0.238908,0.901814
2,0.1745,0.258919,0.906874
3,0.1136,0.305589,0.905454


STEP 4 Training Time: 1988.23 seconds
STEP 4 Accuracy: 0.9069


In [None]:
# Same configuration as the baseline run, except that length bucketing is on
training_args_step5 = TrainingArguments(
    output_dir="./results_step5",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    group_by_length=True,           # group samples of similar length
    # Mixed precision needs a GPU; stay in full precision on the CPU fallback.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Explicitly disable experiment trackers instead of relying on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

# Re-seed right before the model is built so the newly initialised classifier
# head starts from the same weights in every run being compared.
set_seed(42)

# Trainer set up the same way as for the baseline run
trainer_step5 = Trainer(
    model=get_model(),
    args=training_args_step5,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Measure the wall-clock duration of training with a monotonic clock.
# The timed span covers the whole `train()` call, which also includes the
# per-epoch evaluation and checkpoint saving configured in the arguments.
print("\n>>> STEP 5: Training WITH Bucketing...")
start_time_5 = time.perf_counter()
train_result_5 = trainer_step5.train()
end_time_5 = time.perf_counter()

# Save the results; the final evaluation runs outside the timed span.
step5_time = end_time_5 - start_time_5
step5_acc = trainer_step5.evaluate()['eval_accuracy']

print(f"STEP 5 Training Time: {step5_time:.2f} seconds")
print(f"STEP 5 Accuracy: {step5_acc:.4f}")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_step5 = Trainer(



>>> STEP 5: Training WITH Bucketing...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2437,0.250168,0.902934
2,0.1739,0.256222,0.907474
3,0.1154,0.304459,0.907734


STEP 5 Training Time: 1383.28 seconds
STEP 5 Accuracy: 0.9077


In [None]:
# Side-by-side summary of the standard (step 4) and bucketing (step 5) runs.
time_saved = step4_time - step5_time
# Label the time delta by its sign so a slower bucketing run is not
# reported as "faster" with a negative number.
direction = "faster" if time_saved >= 0 else "slower"

print("FINAL COMPARISON REPORT")
print(f"Standard Accuracy : {step4_acc*100:.2f}%")
print(f"Bucketing Accuracy: {step5_acc*100:.2f}%")
print(f"Accuracy Gap: {(step5_acc - step4_acc)*100:.2f}%p")
print(f"Standard Time: {step4_time:.2f} sec")
print(f"Bucketing Time: {step5_time:.2f} sec")
print(f"Speed Improvement: {abs(time_saved):.2f} sec {direction}")

# The 90% target must hold for both runs, not only for the standard one.
if min(step4_acc, step5_acc) >= 0.9:
    print("Validation accuracy >= 90%")
else:
    print("Accuracy is less than 90%")

if step5_time < step4_time:
    print("Bucketing saved me some time for real")
else:
    # Covers both "equal" and "bucketing was slower".
    print("Bucketing did not save any time")

FINAL COMPARISON REPORT
Standard Accuracy : 90.69%
Bucketing Accuracy: 90.77%
Accuracy Gap: 0.09%p
Standard Time: 1988.23 sec
Bucketing Time: 1383.28 sec
Speed Improvement: 604.96 sec faster
Validation accuracy > 90%
Bucketting saved me some time for real


회고:
bucketing On/Off 설정 하나만으로 학습 시간이 크게 달라져서(1988초 → 1383초, 약 30% 단축) 꽤 놀라웠다.
모델 성능 차이도 (기존에 우리가 진행한 것에 비해) 많이 놀랍긴 했으나, 사실 지금 우리가 하는 건 일반화 능력이 있는 사전 학습(Pre-Trained) 모델을 우리의 태스크에 맞춤형으로 바꾸는 Fine-Tuning이라는 점을 감안하면 어찌 보면 당연한 결과라고 생각된다.