In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## Dataset load

In [126]:
# Load the "mnli" configuration of GLUE from the "nyu-mll/glue" dataset repo.
mnli = load_dataset(path="nyu-mll/glue", name="mnli")

In [128]:
# Display the DatasetDict to inspect the available splits, columns and row counts.
mnli

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [130]:
# Peek at the hypothesis sentence of the first training example.
mnli["train"][0]["hypothesis"]

'Product and geography are what make cream skimming work. '

In [132]:
# Peek at the premise sentence that the hypothesis above is paired with.
mnli["train"][0]["premise"]

'Conceptually cream skimming has two basic dimensions - product and geography.'

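## train data를 split
* GLUE MNLI의 train split을 8:2 비율로 train data와 validation data로 나눔 (seed=42로 고정)
* 빠른 실험을 위해 그중 train 1,000개, validation 200개만 선택해서 사용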

In [134]:
# Carve a held-out set out of the official training split: 80% stays for
# training, 20% becomes validation. The fixed seed keeps the split repeatable.
mnli_train_val = mnli["train"].train_test_split(
    seed=42,
    test_size=0.2,
)

In [136]:
# Display the resulting DatasetDict ("train" / "test") to check the split sizes.
mnli_train_val

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 314161
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 78541
    })
})

In [138]:
# Keep the experiment small: take only the first 1,000 rows of the new
# training part and the first 200 rows of the held-out part.
mnli_train = mnli_train_val["train"].select(range(1000))
mnli_val = mnli_train_val["test"].select(range(200))

In [140]:
# Confirm the training subset has the expected columns and 1,000 rows.
mnli_train

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 1000
})

In [142]:
# Confirm the validation subset has the expected columns and 200 rows.
mnli_val

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 200
})

In [144]:
# Tokenizer matching the checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def preprocess_function(data):
    """Tokenize a batch of premise/hypothesis pairs as two-segment inputs.

    Args:
        data: A batch handed over by ``Dataset.map(..., batched=True)``; its
            "premise" and "hypothesis" entries are passed to the tokenizer as
            the first and second text segment.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and intentionally left unpadded.
    """
    # No `padding=True` here: inside a batched map it would pad every
    # 1000-row chunk to that chunk's longest sequence and store the padding
    # in the dataset. The Trainer in this notebook is given the tokenizer, so
    # its default collator pads each mini-batch to its own longest sequence.
    return tokenizer(data["premise"], data["hypothesis"], truncation=True)


# Tokenize the two small fine-tuning subsets and the official matched
# validation split that is used for the final evaluation.
mnli_train_tokenized = mnli_train.map(preprocess_function, batched=True)
mnli_val_tokenized = mnli_val.map(preprocess_function, batched=True)
validation_matched_tokenized = mnli["validation_matched"].map(preprocess_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

## Model 구현

In [146]:
# Pretrained BERT encoder with a 3-way classification head on top. As the
# warning printed below this cell states, the head weights are newly
# initialized and must be trained.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=3,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [160]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters and bookkeeping for the fine-tuning run.
# NOTE(review): metric_for_best_model is not set, so the "best" checkpoint is
# presumably chosen by validation loss (the documented default) - confirm for
# the installed transformers version.
training_args = TrainingArguments(
    output_dir="hf_mnli",
    # Run control
    do_train=True,
    do_eval=True,
    # Optimisation
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Log, evaluate and checkpoint on the same per-epoch cadence so that the
    # best checkpoint can be reloaded once training finishes.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [162]:
import evaluate
import numpy as np

# Load the "accuracy" metric object; compute_metrics() below calls its .compute().
accuracy = evaluate.load("accuracy")

def compute_metrics(pred):
    """Turn raw model scores into the accuracy metric.

    Args:
        pred: A two-item iterable ``(scores, labels)``. The scores are reduced
            with an argmax over their last axis to get one predicted class id
            per example.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted ids versus
        the reference labels.
    """
    scores, gold_labels = pred
    return accuracy.compute(
        predictions=np.argmax(scores, axis=-1),
        references=gold_labels,
    )

## 학습 코드

In [164]:
from transformers import EarlyStoppingCallback


# Wire the model, hyper-parameters, tokenized subsets and metric into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=mnli_train_tokenized,
    eval_dataset=mnli_val_tokenized,
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated for Trainer.__init__ (it is what triggers the
    # warning shown under this cell); `processing_class=` is the replacement
    # and is likewise used to build the default padding collator.
    processing_class=tokenizer,
    # Optional: uncomment to stop as soon as one evaluation fails to improve
    # on the best checkpoint so far.
    # callbacks = [EarlyStoppingCallback(early_stopping_patience=1)]
)

  trainer = Trainer(


In [166]:
# Run fine-tuning with the settings in training_args; the table printed below
# reports training loss, validation loss and accuracy once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.0879,1.090286,0.415
2,1.0685,1.068494,0.41
3,1.0092,1.022075,0.465
4,0.904,1.016271,0.44
5,0.7783,0.981894,0.475
6,0.6689,0.956881,0.53
7,0.5738,0.987727,0.54
8,0.4996,0.960167,0.55
9,0.4633,1.008297,0.54
10,0.4326,0.986872,0.555


TrainOutput(global_step=160, training_loss=0.7486150115728378, metrics={'train_runtime': 321.6257, 'train_samples_per_second': 31.092, 'train_steps_per_second': 0.497, 'total_flos': 966119580720000.0, 'train_loss': 0.7486150115728378, 'epoch': 10.0})

## 훈련에서 Epoch의 의미
* 한 Epoch = 전체 학습 데이터셋을 한 번 다 사용해서 학습한 것
* Epoch 1 끝나면 모델은 초기 가중치에서 한 번 학습된 상태
* Epoch 2에서는 Epoch 1에서 이어진 모델 상태를 다시 학습
* Epoch 10까지 같은 모델을 점점 개선하는 과정을 거치는 것

## load_best_model_at_end = True 설정 시
* 현재까지 학습된 모델 상태로 validation dataset에 대해 평가 (eval_loss, accuracy, f1 등 metric 이용)
* `save_strategy = 'epoch'`이므로 매 Epoch이 끝날 때마다 현재 모델을 checkpoint로 저장하고, Trainer는 그중 metric이 가장 좋았던 checkpoint가 어느 것인지 기록해 둠
  - 실제로 output_dir = 'hf_mnli' 경로에 10개의 checkpoints 폴더가 있음 (Epoch마다 1개)
  - config, optimizer, tokenizer, tokenizer_config, training_args 등을 저장함
* 학습이 끝나면 가장 좋았던 시점의 checkpoints를 불러옴
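  - (예시) Epoch 1 -> validation loss : 0.301 기록
  - (예시) Epoch 2 -> validation loss : 0.285 기록 (best)
  - ...
  - Epoch n -> validation loss가 best보다 좋지 않으면 best를 갱신하지 않음
  - 이번 학습에서는 Epoch 6의 validation loss(0.9569)가 가장 낮음 (`metric_for_best_model`을 지정하지 않으면 기본적으로 validation loss 기준으로 best를 선택)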

In [187]:
# Evaluate on the official MNLI validation_matched split (9,815 rows), which
# was not used during fine-tuning.
# NOTE(review): with load_best_model_at_end=True the weights evaluated here
# are presumably those of the best checkpoint, not of the last epoch - confirm.
trainer.evaluate(validation_matched_tokenized)

{'eval_loss': 1.0123803615570068,
 'eval_accuracy': 0.5257259296994397,
 'eval_runtime': 76.5501,
 'eval_samples_per_second': 128.217,
 'eval_steps_per_second': 2.012,
 'epoch': 10.0}

* validation_matched에 대한 eval_accuracy는 약 52.6% (0.5257)