### Check. Library

In [1]:
import tensorflow as tf
import numpy as np
import transformers
import datasets

In [2]:
print(tensorflow.__version__)
print(numpy.__version__)
print(transformers.__version__)
print(datasets.__version__)

2.6.0
1.21.4
4.11.3
1.14.0


### Load. Library

In [2]:
# 파일 저장 커스텀 모듈

import sys
sys.path.append("../custom")

from importlib import reload
import custom_utils
reload(custom_utils)

from custom_utils import save_var, load_var


In [3]:
# tf version 및 gpu 확인
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))
print(tf.test.gpu_device_name())

2.6.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
/device:GPU:0


In [3]:
import torch

# Check for and set up GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_cuda_memory_summary():
    # Obtain and print GPU memory summary
    memory_summary = torch.cuda.memory_summary(device=device, abbreviated=False)
    print(memory_summary)
    
def empty_cuda_cache():
    # Run your deep learning code on the GPU
    torch.cuda.empty_cache()

# STEP 1. NSMC 데이터 분석 및 Huggingface dataset 구성

In [4]:
import datasets
from datasets import load_dataset

ds = load_dataset('nsmc')
print(ds)

Using custom data configuration default
Reusing dataset nsmc (/aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})


In [5]:
ds_split = ds['train'].train_test_split(test_size=0.2)

Loading cached split indices for dataset at /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-a3c98fba040dae08.arrow and /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-4552e18e170dc7b2.arrow


# STEP 2. klue/bert-base model 및 tokenizer 불러오기

In [6]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

huggingface_tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
huggingface_model = AutoModelForSequenceClassification.from_pretrained('klue/bert-base', num_labels = 2)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

# STEP 3. 위에서 불러온 tokenizer으로 데이터셋을 전처리하고, model 학습 진행해 보기

## tokenize

In [7]:
def transform(data):
    return huggingface_tokenizer(
        data['document'],
        truncation = True,
        padding = 'max_length',
        return_token_type_ids = False,
        )

In [8]:
hf_dataset_train = ds_split['train'].map(transform, batched=True)
hf_dataset_val = ds_split['test'].map(transform, batched=True)
hf_dataset_test = ds['test'].map(transform, batched=True)

  0%|          | 0/120 [00:00<?, ?ba/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [60]:
print(hf_dataset_train[0])

{'id': '5050479', 'label': 0, 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [15]:
prefix = 't0'

In [16]:
save_var(hf_dataset_train, f"{prefix}_hf_dataset_train")
save_var(hf_dataset_val, f"{prefix}_hf_dataset_val")
save_var(hf_dataset_test, f"{prefix}_hf_dataset_test")

In [17]:
hf_dataset_train = load_var(f"{prefix}_hf_dataset_train")
hf_dataset_val = load_var(f"{prefix}_hf_dataset_val")
hf_dataset_test = load_var(f"{prefix}_hf_dataset_test")

## train - 0

In [10]:
import os
import numpy as np
from transformers import Trainer, TrainingArguments

output_dir = 'results'

In [42]:
training_arguments = TrainingArguments(
    output_dir,                        # output이 저장될 경로
    evaluation_strategy="epoch",       # evaluation하는 빈도
    learning_rate = 2e-5,              # learning_rate
    per_device_train_batch_size = 4,   # 각 device 당 batch size
    per_device_eval_batch_size = 4,    # evaluation 시에 batch size
    num_train_epochs = 1,              # train 시킬 총 epochs
    weight_decay = 0.01,               # weight decay
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


> batch size 를 '8' 만 되어도 메모리 부족이 된다. 하...  
> '4' 로 해야 겨우 학습을 진행 할 수 있다.

In [14]:
from datasets import load_metric
metric = load_metric('accuracy')

def compute_metrics(eval_pred):    
    predictions,labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return metric.compute(predictions=predictions, references = labels)

In [43]:
trainer = Trainer(
    model=huggingface_model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=hf_dataset_train,    # training dataset
    eval_dataset=hf_dataset_val,       # evaluation dataset
    compute_metrics=compute_metrics,
)

In [78]:
del trainer

In [21]:
empty_cuda_cache()

In [28]:
get_cuda_memory_summary()

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  432881 KB |  432881 KB |  432881 KB |       0 B  |
|       from large pool |  432384 KB |  432384 KB |  432384 KB |       0 B  |
|       from small pool |     497 KB |     497 KB |     497 KB |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |  432881 KB |  432881 KB |  432881 KB |       0 B  |
|       from large pool |  432384 KB |  432384 KB |  432384 KB |       0 B  |
|       from small pool |     497 KB |     497 KB |     497 KB |       0 B  |
|---------------------------------------------------------------

> trainer 생성을 하면 CUDA memory 에 올라가는데, 16G 거의 꽉 찬다.  
> 그런데 trainer 변수를 메모리에서 제거하고, torch.cuda.empty_cache() 로 메모리를 비우려고 해도 비워지지 않는다.  
> 결과적으로 같은 설정으로 다시 학습시키려하면, 메모리 부족으로 진행이 되지 않는다.  
> kernel 을 재시작하여 처음부터 다시 해야한다. 하...  

In [29]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 120000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 90000


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3861,0.473481,0.8926
1,0.3542,0.489593,0.89224
1,0.3542,0.489593,0.89224


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2500
Configuration saved in results/checkpoint-2500/config.json
Model weights saved in results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-3000
Configuration saved in results/checkpoint-3000/config.json
Model weights save

RuntimeError: [enforce fail at inline_container.cc:298] . unexpected pos 98344640 vs 98344592

> 1 에퐄에 3시간 넘게 걸린다.  
> 4시간쯤 지났을 때, 하드디스크 용량 초과로 학습이 중단되었다.  
> 500 마다 checkpoint 저장이 되는데, 그 결과 90기가 이상 쌓이면서 하드디스크 용량이 초과되었다. 하...  

In [31]:
result = trainer.evaluate(hf_dataset_test)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 4


In [32]:
result

{'eval_loss': 0.4895934462547302, 'eval_accuracy': 0.89224}

In [33]:
prefix = "t0"

In [34]:
histories = {}

In [35]:
histories[prefix] = result

In [36]:
save_var(histories, "histories")

Epoch 1 and more - about 34,000 steps  
> Training Loss: 0.354200  
> Validation Loss: 0.489593  
> Accuracy: 0.892240  

## 아래 fine-tuning 비교를 위해 6,500 steps 에서 평가

In [49]:
training_arguments = TrainingArguments(
    output_dir,                        # output이 저장될 경로
    evaluation_strategy="steps",       # evaluation하는 빈도
    eval_steps=6000,
    learning_rate = 2e-5,              # learning_rate
    per_device_train_batch_size = 4,   # 각 device 당 batch size
    per_device_eval_batch_size = 4,    # evaluation 시에 batch size
    num_train_epochs = 1,              # train 시킬 총 epochs
    weight_decay = 0.01,               # weight decay
    save_total_limit=1,                # Limit the total number of checkpoints
    save_steps=500,                    # Save a checkpoint every 500 steps
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [46]:
# Load the model checkpoint
prefix = "t0"
model_checkpoint = output_dir + f"/{prefix}_checkpoint-500"
ckp_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

loading configuration file results/t0_checkpoint-500/config.json
Model config BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file results/t0_checkpoint-500/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized f

In [50]:
trainer = Trainer(
    model=ckp_model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=hf_dataset_train,    # training dataset
    eval_dataset=hf_dataset_val,       # evaluation dataset
    compute_metrics=compute_metrics,
)

In [51]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 120000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 30000


Step,Training Loss,Validation Loss,Accuracy
6000,0.12,0.694401,0.883133


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1500] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-2500
Configurat

KeyboardInterrupt: 

6,500 steps  
> Training Loss: 0.120000  
> Validation Loss: 0.694401  
> Accuracy: 0.883133  

# STEP 4. Fine-tuning을 통하여 모델 성능(accuarcy) 향상시키기

데이터 전처리, TrainingArguments 등을 조정하여 모델의 정확도를 90% 이상으로 끌어올려봅시다.

## train - 1

In [11]:
training_arguments = TrainingArguments(
    output_dir=output_dir,              # Output directory
    evaluation_strategy="steps",       # Evaluation frequency
    eval_steps=10000,                    # Evaluation steps
    learning_rate=2e-5,                # Learning rate
    per_device_train_batch_size=4,      # Batch size per GPU
    per_device_eval_batch_size=4,       # Evaluation batch size per GPU
    num_train_epochs=2,                # Total number of training epochs
    weight_decay=0.01,                 # Weight decay
    lr_scheduler_type="cosine_with_restarts",  # Learning rate scheduler type
    warmup_steps=500,                  # Number of warmup steps
    save_total_limit=1,                # Limit the total number of checkpoints
    save_steps=500,                    # Save a checkpoint every 500 steps
)

In [None]:
# Load the model checkpoint
prefix = "t1"
model_checkpoint = output_dir + f"/archived/{prefix}_checkpoint-6500"
ckp_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

loading configuration file results/archived/t1_checkpoint-6500/config.json
Model config BertConfig {
  "_name_or_path": "results/checkpoint-6000",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file results/archived/t1_checkpoint-6500/pytorch_model.bin


In [41]:
trainer = Trainer(
    model=ckp_model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=hf_dataset_train,    # training dataset
    eval_dataset=hf_dataset_val,       # evaluation dataset
    compute_metrics=compute_metrics,
)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 120000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 30000


Step,Training Loss,Validation Loss,Accuracy
500,0.1983,0.686171,0.8911


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 4
Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin


KeyboardInterrupt: 

TrainingArguments 수정 후 6,500 step 에서,  
> Training Loss: 0.198300  
> Validation Loss: 0.686171  
> Accuracy: 0.891100  

In [15]:
trainer = Trainer(
    model=huggingface_model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=hf_dataset_train,    # training dataset
    eval_dataset=hf_dataset_val,       # evaluation dataset
    compute_metrics=compute_metrics,
)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 120000
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 60000


Step,Training Loss,Validation Loss,Accuracy
10000,0.4541,0.483779,0.879267
20000,0.4196,0.453592,0.891867
30000,0.3688,0.451635,0.892567
40000,0.2826,0.506669,0.898667
50000,0.3049,0.466787,0.900367
60000,0.2751,0.457565,0.901933


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1500] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-2500
Configurat

TrainOutput(global_step=60000, training_loss=0.3681246737162272, metrics={'train_runtime': 29739.8295, 'train_samples_per_second': 8.07, 'train_steps_per_second': 2.017, 'total_flos': 6.31466532864e+16, 'train_loss': 0.3681246737162272, 'epoch': 2.0})

2 epochs - 60,000 steps  
> Training Loss: 0.275100  
> Validation Loss: 0.457565  
> Accuracy: 0.901933  

In [19]:
result = trainer.evaluate(hf_dataset_test)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 4


In [20]:
result

{'eval_loss': 0.4542900025844574,
 'eval_accuracy': 0.90268,
 'eval_runtime': 1649.7985,
 'eval_samples_per_second': 30.307,
 'eval_steps_per_second': 7.577,
 'epoch': 2.0}

In [18]:
prefix = "t1"

In [17]:
histories

{'t0': {'eval_loss': 0.4895934462547302, 'eval_accuracy': 0.89224}}

In [21]:
histories[prefix] = result

In [22]:
save_var(histories, "histories")

In [16]:
histories = load_var("histories")

Evaluation
> eval_loss: 0.454290  
> eval_accuracy: 0.90268  

# STEP 5. Bucketing을 적용하여 학습시키고, STEP 4의 결과와의 비교

아래 링크를 바탕으로 bucketing과 dynamic padding이 무엇인지 알아보고, 이들을 적용하여 model을 학습시킵니다.

- [Data Collator](https://huggingface.co/docs/transformers/v4.30.0/en/main_classes/data_collator)
- [Training Arguments](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) - group_by_length


## train - 2

STEP 4에 학습한 결과와 bucketing을 적용하여 학습시킨 결과를 비교해보고, 모델 성능 향상과 훈련 시간 두 가지 측면에서 각각 어떤 이점이 있는지 비교해봅시다.

In [27]:
from transformers import DataCollatorWithPadding

In [26]:
data_collator = DataCollatorWithPadding(huggingface_tokenizer)

In [28]:
training_arguments = TrainingArguments(
    output_dir=output_dir,              # Output directory
    evaluation_strategy="steps",       # Evaluation frequency
    eval_steps=10000,                    # Evaluation steps
    learning_rate=2e-5,                # Learning rate
    per_device_train_batch_size=4,      # Batch size per GPU
    per_device_eval_batch_size=4,       # Evaluation batch size per GPU
    num_train_epochs=1,                # Total number of training epochs
    weight_decay=0.01,                 # Weight decay
    lr_scheduler_type="cosine_with_restarts",  # Learning rate scheduler type
    warmup_steps=500,                  # Number of warmup steps
    save_total_limit=1,                # Limit the total number of checkpoints
    save_steps=500,                    # Save a checkpoint every 500 steps
    group_by_length=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [29]:
del huggingface_model
huggingface_model = AutoModelForSequenceClassification.from_pretrained('klue/bert-base', num_labels = 2)

loading configuration file https://huggingface.co/klue/bert-base/resolve/main/config.json from cache at /aiffel/.cache/huggingface/transformers/fbd0b2ef898c4653902683fea8cc0dd99bf43f0e082645b913cda3b92429d1bb.99b3298ed554f2ad731c27cdb11a6215f39b90bc845ff5ce709bb4e74ba45621
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file https://huggingface.co/klue/bert-base/resolve/main/pytorch_model.bin from cache at /aiffel/.cache/huggingface/transform

In [30]:
trainer = Trainer(
    model=huggingface_model,           # 학습시킬 model
    args=training_arguments,           # TrainingArguments을 통해 설정한 arguments
    train_dataset=hf_dataset_train,    # training dataset
    eval_dataset=hf_dataset_val,       # evaluation dataset
    compute_metrics=compute_metrics,
    data_collator=data_collator
)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 120000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 30000


Step,Training Loss,Validation Loss,Accuracy
10000,0.444,0.43573,0.8845
20000,0.3823,0.401782,0.8966
30000,0.35,0.391521,0.899567


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1500] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-2500
Configurat

TrainOutput(global_step=30000, training_loss=0.4265346171061198, metrics={'train_runtime': 14857.1138, 'train_samples_per_second': 8.077, 'train_steps_per_second': 2.019, 'total_flos': 3.15733266432e+16, 'train_loss': 0.4265346171061198, 'epoch': 1.0})

30,000 step

> Training Loss: 0.35  
> Validation Loss: 0.391521  
> Accuracy: 0.898567  

> 1 epoch, 30,000 step 에서,  
> Data Collator, group_by_length 를 썼을 때,  
> 이전 결과인 0.368800 / 0.451635 / 0.892567 에 비해서  
> Loss 는 좀 더 낮아지고, Accuracy 는 좀 더 올라간다.  

In [31]:
result = trainer.evaluate(hf_dataset_test)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 4


In [32]:
result

{'eval_loss': 0.39486292004585266,
 'eval_accuracy': 0.8979,
 'eval_runtime': 1651.5242,
 'eval_samples_per_second': 30.275,
 'eval_steps_per_second': 7.569,
 'epoch': 1.0}

In [33]:
prefix = "t2"

In [34]:
histories[prefix] = result

In [35]:
save_var(histories, "histories")

Evaluation
> eval_loss: 0.394862...  
> eval_accuracy: 0.8979  

> 시간상 30,000 step 밖에 진행을 못했지만,  
> 50,000 step 에서 0.9 에 도달했던 앞선 결과와 비교하면,  
> 40,000 step 에서 0.9 에 도달할 것으로 판단된다.  

### 추가 학습

In [None]:
res = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 120000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 30000


Step,Training Loss,Validation Loss


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-1500] due to args.save_total_limit
Saving model checkpoint to results/checkpoint-2500
Configurat

# 회고

- 이틀 안에 1 epoch 당 3~4시간 걸리는 학습을 진행하기는 좀 버거웠다.
  - 이틀이라는 시간은 좀 짧은 시간이었지만, 
  - 별다른 커스터마이징과 큰 파인튜닝 없이도 이 정도 성능이라면 실제 간단한 프로젝트에서는 괜찮지 않을까.
- 아쉬운 점
  - 학습 데이터를 많이 줄여서 빠른 학습으로 TrainingArguments 파라미터를 이것저것 조정해봤어도 좋았을 것 같다.
  - Loss/Accuracy 값으로 결과를 확인하는 것 외에 어떤 문장을 잘못 판단했는지 분석해보지 못한 것도 좀 아쉽다. 
  - nsmc 데이터를 가지고 klue/bert-base 모델의 결과와 이전에 다른 모델을 썼던 결과를 비교해보면 좋겠다.