In [74]:
# 필요한 라이브러리 가져오기
import os
import sys
import math
import torch
import wandb
import logging
import datasets
import argparse
import evaluate
import transformers

from typing import Optional
from itertools import chain
from dataclasses import dataclass, field

from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator
)
from transformers.trainer_utils import get_last_checkpoint

In [75]:
# Connect to Weights & Biases and start a run in the target project.
# The run name is supplied at creation time instead of being assigned to
# wandb.run.name afterwards, so the run never exists under a temporary name.
wandb.init(project='Hanghae99', name='gpt-finetuning-1')

In [76]:
# Argument definitions
@dataclass
class Arguments:
    """Model and data options for the fine-tuning run.

    Raises:
        ValueError: if ``torch_dtype`` is not one of the declared choices, or
            if ``block_size`` / ``num_workers`` is not a positive integer.
    """

    # Name of the pre-trained model on the HuggingFace hub to start from
    model_name_or_path: Optional[str] = field(default=None)
    # Precision (data type) used for the model weights
    torch_dtype: Optional[str] = field(default=None, metadata={'choices': ['auto', 'bfloat16', 'float16', 'float32']})
    # Name of the HuggingFace hub dataset used for fine-tuning
    dataset_name: Optional[str] = field(default=None)
    # Configuration name of that dataset on the HuggingFace hub
    dataset_config_name: Optional[str] = field(default=None)
    # Length (in tokens) of each input block used for fine-tuning
    block_size: int = field(default=1024)
    # Number of worker processes used when loading / preprocessing data
    num_workers: Optional[int] = field(default=None)

    def __post_init__(self):
        # Instances built from a plain dict never pass through argparse, so the
        # `choices` declared on torch_dtype are re-checked here.
        allowed_dtypes = self.__dataclass_fields__["torch_dtype"].metadata["choices"]
        if self.torch_dtype is not None and self.torch_dtype not in allowed_dtypes:
            raise ValueError(
                f"torch_dtype must be one of {allowed_dtypes}, got {self.torch_dtype!r}"
            )
        # block_size is used as a divisor and slice step when grouping texts.
        if self.block_size <= 0:
            raise ValueError(f"block_size must be a positive integer, got {self.block_size!r}")
        # None means "no multiprocessing"; any explicit count must be positive.
        if self.num_workers is not None and self.num_workers <= 0:
            raise ValueError(f"num_workers must be a positive integer or None, got {self.num_workers!r}")

# Every run setting lives in this one dict. It is parsed into two objects
# below: the custom Arguments dataclass and HuggingFace's TrainingArguments.
arg_dict = {
    "model_name_or_path": "gpt2",
    "torch_dtype": "float32",
    "dataset_name": "wikitext",
    "dataset_config_name": "wikitext-2-raw-v1",
    "block_size": 128,
    "num_workers": 2,
    "output_dir": "./gpt2-output",
    "overwrite_output_dir": True,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "logging_strategy": "epoch",
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 2,
    "remove_unused_columns": False,
    # wandb.init() is called explicitly in this notebook, so report to wandb
    # only instead of relying on the "all installed integrations" default,
    # which transformers announces will change in v5.
    "report_to": "wandb"
}

# Split the dict into (Arguments, TrainingArguments) instances.
parser = HfArgumentParser((Arguments, TrainingArguments))
args, training_args = parser.parse_dict(arg_dict)

[INFO|training_args.py:2164] 2025-05-11 14:40:05,975 >> PyTorch: setting up devices
[INFO|training_args.py:1835] 2025-05-11 14:40:05,976 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [77]:
# Logger setup

# Route every log record to stdout using a timestamped format.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger = logging.getLogger()

# Log level configured for this process by the training arguments.
log_level = training_args.get_process_log_level()

if training_args.should_log:
    # Switch the transformers log level to INFO
    transformers.utils.logging.set_verbosity_info()

# Apply the same level to our root logger and to both HuggingFace libraries.
logger.setLevel(log_level)
transformers.utils.logging.set_verbosity(log_level)
datasets.utils.logging.set_verbosity(log_level)

# Remaining HuggingFace logger options
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

logger.info(f"Training/evaluation parameters {training_args}")

05/11/2025 14:40:05 - INFO - root - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,

In [78]:
# Load the dataset
raw_datasets = load_dataset(
    args.dataset_name,
    args.dataset_config_name
)

print(raw_datasets)

# Model config, tokenizer and model.
# These are loaded BEFORE the validation split is preprocessed, because the
# preprocessing below needs the tokenizer. Previously this cell tokenized
# first and relied on functions defined in a later cell, so it only worked
# with leftover kernel state.
config = AutoConfig.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
    args.model_name_or_path,
    config=config,
    torch_dtype=args.torch_dtype
)


def _prepare_eval_split(split, split_tokenizer, split_block_size, num_proc=None):
    """Tokenize one dataset split and regroup it into fixed-length blocks.

    Self-contained on purpose: it depends only on its arguments, so this cell
    runs on a fresh kernel.

    Args:
        split: the dataset split to preprocess.
        split_tokenizer: tokenizer called on the batch's text column.
        split_block_size: number of tokens per output block.
        num_proc: worker count forwarded to the grouping ``map`` call.

    Returns:
        The mapped dataset; the grouping step adds a ``labels`` column that is
        a copy of ``input_ids``.
    """
    names = split.column_names
    text_column = "text" if "text" in names else names[0]

    def _tokenize(batch):
        return split_tokenizer(batch[text_column])

    def _group(batch):
        # Concatenate all token lists of the batch, per column.
        merged = {key: list(chain(*batch[key])) for key in batch.keys()}
        # Drop the tail that does not fill a whole block.
        usable = (len(merged[list(batch.keys())[0]]) // split_block_size) * split_block_size
        grouped = {
            key: [tokens[i : i + split_block_size] for i in range(0, usable, split_block_size)]
            for key, tokens in merged.items()
        }
        # Next-token prediction: the labels are the inputs themselves.
        grouped["labels"] = grouped["input_ids"].copy()
        return grouped

    tokenized = split.map(_tokenize, batched=True, remove_columns=names)
    return tokenized.map(_group, batched=True, num_proc=num_proc)


eval_dataset = None
if "validation" in raw_datasets:
    # Never build blocks longer than the model / tokenizer can handle.
    eval_block_size = min(args.block_size, getattr(config, "max_position_embeddings", 1024))
    if tokenizer.model_max_length is not None:
        eval_block_size = min(eval_block_size, tokenizer.model_max_length)
    eval_dataset = _prepare_eval_split(
        raw_datasets["validation"],
        tokenizer,
        eval_block_size,
        num_proc=args.num_workers
    )

print(eval_dataset)

Overwrite dataset info from restored data version if exists.


05/11/2025 14:40:12 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3


05/11/2025 14:40:12 - INFO - datasets.info - Loading Dataset info from /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3


Found cached dataset wikitext (/Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3)


05/11/2025 14:40:12 - INFO - datasets.builder - Found cached dataset wikitext (/Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3)


Loading Dataset info from /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3


05/11/2025 14:40:12 - INFO - datasets.info - Loading Dataset info from /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3


Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-83d8f972563338fa.arrow


DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})
05/11/2025 14:40:12 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-83d8f972563338fa.arrow


Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-cc2549edca868b7c_00000_of_00002.arrow


05/11/2025 14:40:12 - INFO - datasets.arrow_dataset - Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-cc2549edca868b7c_00000_of_00002.arrow


Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-cc2549edca868b7c_00001_of_00002.arrow


05/11/2025 14:40:12 - INFO - datasets.arrow_dataset - Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-cc2549edca868b7c_00001_of_00002.arrow


Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-cc2549edca868b7c_*_of_00002.arrow


05/11/2025 14:40:12 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-cc2549edca868b7c_*_of_00002.arrow


Concatenating 2 shards


05/11/2025 14:40:12 - INFO - datasets.arrow_dataset - Concatenating 2 shards
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1930
})


[INFO|configuration_utils.py:693] 2025-05-11 14:40:12,970 >> loading configuration file config.json from cache at /Users/seungchanhong/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
[INFO|configuration_utils.py:765] 2025-05-11 14:40:12,972 >> Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_param

In [79]:
# Model and tokenizer setup
# Load the checkpoint named in the run arguments (not a hard-coded model id),
# with the configured precision, so that changing arg_dict is enough to
# switch models.
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
    args.model_name_or_path,
    torch_dtype=args.torch_dtype
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}"

# Grow the embedding matrix only when the tokenizer has more entries than the
# model currently has embedding rows.
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))

# Column holding the raw text: "text" when present, otherwise the first column.
column_names = list(raw_datasets["train"].features)
text_column_name = "text" if "text" in column_names else column_names[0]

def tokenize_function(examples):
    """Tokenize the text column of a batch.

    Reads the module-level ``tokenizer`` and ``text_column_name``.

    Args:
        examples: batch mapping that contains the ``text_column_name`` column.

    Returns:
        Whatever the tokenizer returns for that column.
    """
    texts = examples[text_column_name]
    return tokenizer(texts)

# Tokenize every split exactly once. An identical raw_datasets.map(...) call
# used to run just before this one and its result was immediately overwritten,
# so only the multi-process version inside main_process_first is kept.
with training_args.main_process_first(desc="dataset map tokenization"):
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=args.num_workers,
        remove_columns=column_names
    )

# Block length: the requested size, clamped so that a block never exceeds the
# model's position limit or the tokenizer's maximum length.
max_pos_embeddings = getattr(config, "max_position_embeddings", 1024)
block_size = min(args.block_size, max_pos_embeddings)
if tokenizer.model_max_length is not None:
    block_size = min(block_size, tokenizer.model_max_length)

def group_texts(examples):
    """Concatenate a batch of token lists and cut it into fixed-size blocks.

    Reads the module-level ``block_size``.

    For every column of the batch (``input_ids`` must be one of them), all
    lists are joined into one long list, the tail that does not fill a whole
    block is dropped, and the rest is split into chunks of ``block_size``
    items. For example, with ``block_size = 3``,
    ``{"input_ids": [[1, 2, 3, 4], [5, 6, 7]]}`` becomes
    ``{"input_ids": [[1, 2, 3], [4, 5, 6]], "labels": [[1, 2, 3], [4, 5, 6]]}``.

    Args:
        examples: batch mapping of column name to a list of token lists.

    Returns:
        dict with the same columns, chunked, plus ``labels`` (a shallow copy
        of the chunked ``input_ids``).
    """
    # Flatten each column into a single long list.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    # Largest multiple of block_size that fits, measured on the first column.
    first_column = list(examples.keys())[0]
    usable_length = (len(flattened[first_column]) // block_size) * block_size
    chunk_starts = range(0, usable_length, block_size)

    # Slice every column at the same offsets.
    result = {}
    for column, tokens in flattened.items():
        result[column] = [tokens[start : start + block_size] for start in chunk_starts]

    # Next-token prediction: the labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result
    
# Regroup every tokenized split into fixed-length blocks via group_texts,
# inside the training arguments' main_process_first context.
with training_args.main_process_first(desc="grouping texts together"):
    lm_datasets = tokenized_datasets.map(group_texts, num_proc=args.num_workers, batched=True)

[INFO|configuration_utils.py:693] 2025-05-11 14:40:13,737 >> loading configuration file config.json from cache at /Users/seungchanhong/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
[INFO|configuration_utils.py:765] 2025-05-11 14:40:13,740 >> Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_param

05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-e5c2b46b2e42895b.arrow


Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-18838ae4e0f2bf1f.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-18838ae4e0f2bf1f.arrow


Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-83d8f972563338fa.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-83d8f972563338fa.arrow


Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-e5c2b46b2e42895b_00000_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-e5c2b46b2e42895b_00000_of_00002.arrow


Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-e5c2b46b2e42895b_00001_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-e5c2b46b2e42895b_00001_of_00002.arrow


Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-e5c2b46b2e42895b_*_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-e5c2b46b2e42895b_*_of_00002.arrow


Concatenating 2 shards


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Concatenating 2 shards


Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-18838ae4e0f2bf1f_00000_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-18838ae4e0f2bf1f_00000_of_00002.arrow


Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-18838ae4e0f2bf1f_00001_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-18838ae4e0f2bf1f_00001_of_00002.arrow


Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-18838ae4e0f2bf1f_*_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-18838ae4e0f2bf1f_*_of_00002.arrow


Concatenating 2 shards


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Concatenating 2 shards


Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-83d8f972563338fa_00000_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-83d8f972563338fa_00000_of_00002.arrow


Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-83d8f972563338fa_00001_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-83d8f972563338fa_00001_of_00002.arrow


Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-83d8f972563338fa_*_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-83d8f972563338fa_*_of_00002.arrow


Concatenating 2 shards


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Concatenating 2 shards


Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-7a8cbaace6482f50_00000_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-7a8cbaace6482f50_00000_of_00002.arrow


Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-7a8cbaace6482f50_00001_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-7a8cbaace6482f50_00001_of_00002.arrow


Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-7a8cbaace6482f50_*_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-7a8cbaace6482f50_*_of_00002.arrow


Concatenating 2 shards


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Concatenating 2 shards


Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-6df5f404cf046b04_00000_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-6df5f404cf046b04_00000_of_00002.arrow


Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-6df5f404cf046b04_00001_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-6df5f404cf046b04_00001_of_00002.arrow


Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-6df5f404cf046b04_*_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-6df5f404cf046b04_*_of_00002.arrow


Concatenating 2 shards


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Concatenating 2 shards


Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-cc2549edca868b7c_00000_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Process #0 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-cc2549edca868b7c_00000_of_00002.arrow


Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-cc2549edca868b7c_00001_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Process #1 will write at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-cc2549edca868b7c_00001_of_00002.arrow


Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-cc2549edca868b7c_*_of_00002.arrow


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/seungchanhong/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/0.0.0/b08601e04326c79dfdd32d625aee71d232d685c3/cache-cc2549edca868b7c_*_of_00002.arrow


Concatenating 2 shards


05/11/2025 14:40:14 - INFO - datasets.arrow_dataset - Concatenating 2 shards


In [80]:
train_dataset = lm_datasets["train"]

# Build the trainer with both the training and the validation dataset.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated and triggers a FutureWarning on every construction.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=default_data_collator
)

# Look for a checkpoint left in output_dir. The directory may not exist yet
# on a first run, and get_last_checkpoint lists the directory, so guard it.
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
    # Returns None when the directory contains no checkpoint.
    last_checkpoint = get_last_checkpoint(training_args.output_dir)

# An explicit resume_from_checkpoint (possibly outside output_dir) wins;
# otherwise fall back to the last checkpoint found in output_dir, if any.
if training_args.resume_from_checkpoint is not None:
    checkpoint = training_args.resume_from_checkpoint
else:
    checkpoint = last_checkpoint

train_result = trainer.train(resume_from_checkpoint=checkpoint)

trainer.save_model()

# Log and persist the training metrics.
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

# Final evaluation pass, only when a validation set is available.
if eval_dataset is not None:
    eval_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eval_metrics)
    trainer.save_metrics("eval", eval_metrics)

trainer.save_state()

  trainer = Trainer(
[INFO|trainer.py:2414] 2025-05-11 14:40:14,804 >> ***** Running training *****
[INFO|trainer.py:2415] 2025-05-11 14:40:14,804 >>   Num examples = 18,667
[INFO|trainer.py:2416] 2025-05-11 14:40:14,804 >>   Num Epochs = 3
[INFO|trainer.py:2417] 2025-05-11 14:40:14,805 >>   Instantaneous batch size per device = 4
[INFO|trainer.py:2420] 2025-05-11 14:40:14,805 >>   Total train batch size (w. parallel, distributed & accumulation) = 4
[INFO|trainer.py:2421] 2025-05-11 14:40:14,805 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2422] 2025-05-11 14:40:14,805 >>   Total optimization steps = 14,001
[INFO|trainer.py:2423] 2025-05-11 14:40:14,805 >>   Number of trainable parameters = 124,439,808
[INFO|integration_utils.py:831] 2025-05-11 14:40:14,806 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss
1,3.4695,3.40876
2,3.1586,3.401
3,3.0151,3.413268


[INFO|trainer.py:4307] 2025-05-11 14:51:32,917 >> 
***** Running Evaluation *****
[INFO|trainer.py:4309] 2025-05-11 14:51:32,918 >>   Num examples = 1930
[INFO|trainer.py:4312] 2025-05-11 14:51:32,918 >>   Batch size = 8
[INFO|trainer.py:3984] 2025-05-11 14:51:49,076 >> Saving model checkpoint to ./gpt2-output/checkpoint-4667
[INFO|configuration_utils.py:419] 2025-05-11 14:51:49,077 >> Configuration saved in ./gpt2-output/checkpoint-4667/config.json
[INFO|configuration_utils.py:911] 2025-05-11 14:51:49,077 >> Configuration saved in ./gpt2-output/checkpoint-4667/generation_config.json
[INFO|modeling_utils.py:3572] 2025-05-11 14:51:49,298 >> Model weights saved in ./gpt2-output/checkpoint-4667/model.safetensors
[INFO|tokenization_utils_base.py:2510] 2025-05-11 14:51:49,299 >> tokenizer config file saved in ./gpt2-output/checkpoint-4667/tokenizer_config.json
[INFO|tokenization_utils_base.py:2519] 2025-05-11 14:51:49,300 >> Special tokens file saved in ./gpt2-output/checkpoint-4667/special

***** train metrics *****
  epoch                    =        3.0
  total_flos               =  3406921GF
  train_loss               =     3.2144
  train_runtime            = 0:34:45.63
  train_samples_per_second =     26.851
  train_steps_per_second   =      6.713




***** eval metrics *****
  epoch                   =        3.0
  eval_loss               =     3.4133
  eval_runtime            = 0:00:15.78
  eval_samples_per_second =    122.288
  eval_steps_per_second   =     15.334
