## Week7 Basic Homework

- week4 basic에서 실행한 모델 학습을 재연 및 wandb를 사용하여 metric 관찰

In [1]:
import os
import sys
import torch
import wandb
import logging
import datasets
import evaluate
import transformers
import numpy as np

from typing import Optional
from itertools import chain
from dataclasses import dataclass, field

from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_utils import get_last_checkpoint


In [2]:
#device setting
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.backends.cuda.is_built():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

mps


In [3]:
# wandb 접속
wandb.init(project='Hanghae99')
wandb.run.name = 'gpt-finetuning'

[34m[1mwandb[0m: Currently logged in as: [33mimsta[0m ([33mimsta-hub[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


## [MY CODE] Arguments 정의

In [4]:
@dataclass
class Arguments:
    model_name_or_path: Optional[str] = field(default=None)  # HuggingFace hub에서 pre-trained 모델로 사용할 모델의 이름
    torch_dtype: Optional[str] = field(default=None, metadata={'choices': ['auto', 'bfloat16', 'float16', 'float32']})  # 우리 모델의 precision(data type이라고 이해하시면 됩니다)

    dataset_name: Optional[str] = field(default=None)  # Fine-tuning으로 사용할 huggingface hub에서의 dataset 이름
    dataset_config_name: Optional[str] = field(default=None)  # Fine-tuning으로 사용할 huggingface hub에서의 dataset configuration
    block_size: int = field(default=1024)  # Fine-tuning에 사용할 input text의 길이
    num_workers: Optional[int] = field(default=None)  # Data를 업로드하거나 전처리할 때 사용할 worker 숫자


args = Arguments(model_name_or_path="distilbert/distilbert-base-uncased",
                 torch_dtype='auto',
                 dataset_name="nyu-mll/glue",
                 dataset_config_name="mnli",
                 num_workers=1
                 )
training_args = TrainingArguments(output_dir=os.path.join(os.getcwd(),'my_model'),
                                  learning_rate=1e-4,
                                  per_device_train_batch_size=32,
                                  per_device_eval_batch_size=32,
                                  num_train_epochs=5,
                                  weight_decay=0.01,
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  load_best_model_at_end=True,
                                  report_to = "wandb",
                                  push_to_hub=False
                                  )

## [MY CODE] logger 정의

In [5]:
logger = logging.getLogger()

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

if training_args.should_log:
    transformers.utils.logging.set_verbosity_info()  # log level을 INFO로 변경

# log level: 10 DEBUG, 20 INFO, 30 WARNING, 40 ERROR, 50 CRITICAL
log_level = training_args.get_process_log_level()

# 우리가 가지고 있는 logger와 HuggingFace의 logger의 log level 설정
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)

# 기타 HuggingFace logger option들을 설정
transformers.utils.logging.enable_default_handler() # logger 기능 활성화
transformers.utils.logging.enable_explicit_format() # 포맷 설정: [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE

In [6]:
# Training arguments 확인
logger.info(f"Training/evaluation parameters {training_args}")

02/05/2025 21:46:08 - INFO - root - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=

## [MY CODE] Dataset 및 Model 로드

In [7]:
raw_datasets = load_dataset(
    args.dataset_name,
    args.dataset_config_name
)

Overwrite dataset info from restored data version if exists.


02/05/2025 21:46:15 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c


02/05/2025 21:46:15 - INFO - datasets.info - Loading Dataset info from /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c


Found cached dataset glue (/Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c)


02/05/2025 21:46:15 - INFO - datasets.builder - Found cached dataset glue (/Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c)


Loading Dataset info from /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c


02/05/2025 21:46:15 - INFO - datasets.info - Loading Dataset info from /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c


In [8]:
config = AutoConfig.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name_or_path,
    config=config,
    torch_dtype=args.torch_dtype,
)

[INFO|configuration_utils.py:695] 2025-02-05 21:46:17,216 >> loading configuration file config.json from cache at /Users/jh/.cache/huggingface/hub/models--distilbert--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
[INFO|configuration_utils.py:762] 2025-02-05 21:46:17,222 >> Model config DistilBertConfig {
  "_name_or_path": "distilbert/distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.47.1",
  "vocab_size": 30522
}

[INFO|configuration_utils.py:695] 2025-02-05 21:46:17,440 >> loading configuration file config.json from cache at /User

## [MY CODE] dataset tokenize

In [9]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))

column_names = list(raw_datasets["train"].features)
text_column_name1 = 'premise'
text_column_name2 = 'hypothesis'

def tokenize_function(row):
    output = tokenizer(row[text_column_name1],row[text_column_name2])
    output['labels'] = row['label']
    return output

with training_args.main_process_first(desc="dataset map tokenization"):
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=args.num_workers,
        remove_columns=column_names
    )

Loading cached processed dataset at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-ef6c312c050d1a78.arrow


02/05/2025 21:46:20 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-ef6c312c050d1a78.arrow


Loading cached processed dataset at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-9cb0fa7ca7edd13d.arrow


02/05/2025 21:46:20 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-9cb0fa7ca7edd13d.arrow


Loading cached processed dataset at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-4ac0670ef11a03bc.arrow


02/05/2025 21:46:20 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-4ac0670ef11a03bc.arrow


Loading cached processed dataset at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-ef8de6694e303d1e.arrow


02/05/2025 21:46:20 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-ef8de6694e303d1e.arrow


Loading cached processed dataset at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-274285747ecab6f4.arrow


02/05/2025 21:46:20 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-274285747ecab6f4.arrow


In [10]:
print(tokenized_datasets['train'][0])
tokenized_datasets['train'].data.to_pandas().head()

{'input_ids': [101, 17158, 2135, 6949, 8301, 25057, 2038, 2048, 3937, 9646, 1011, 4031, 1998, 10505, 1012, 102, 4031, 1998, 10505, 2024, 2054, 2191, 6949, 8301, 25057, 2147, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': 1}


Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 17158, 2135, 6949, 8301, 25057, 2038, 20...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,"[101, 2017, 2113, 2076, 1996, 2161, 1998, 1045...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
2,"[101, 2028, 1997, 2256, 2193, 2097, 4287, 2041...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
3,"[101, 2129, 2079, 2017, 2113, 1029, 2035, 2023...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
4,"[101, 3398, 1045, 2425, 2017, 2054, 2295, 2065...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1


In [11]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9847
    })
})

## [MY CODE] 모델 학습
- metric으로 accuracy 추가
- 로컬 실행하기에는 데이터셋이 너무 커서 train의 5%만 사용하여 학습

In [12]:
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [13]:
# dataset이 너무 커서 5%만 사용
splits = tokenized_datasets['train'].train_test_split(test_size=0.95, train_size=0.05)
train_dataset = splits['train']
validation_dataset = tokenized_datasets['validation_matched']

# 토큰화가 진행되었기 때문에 padding만 해줄 collator 선택
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

checkpoint = None
last_checkpoint = get_last_checkpoint(training_args.output_dir)  # 만약 output_dir에 checkpoint가 남아있으면 이를 사용하고, 없으면 None이 return됩니다.
if training_args.resume_from_checkpoint is not None:  # output_dir이 아닌 다른 위치에서의 checkpoint를 resume_from_checkpoint로 지정할 수 있습니다.
    checkpoint = training_args.resume_from_checkpoint
else:  # 아니면 last_checkpoint로 checkpoint를 지정합니다.
    checkpoint = last_checkpoint

train_result = trainer.train(resume_from_checkpoint=checkpoint)

trainer.save_model()

metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

Caching indices mapping at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-9ea1ff0b47ea8161.arrow


02/05/2025 21:46:30 - INFO - datasets.arrow_dataset - Caching indices mapping at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-9ea1ff0b47ea8161.arrow


Caching indices mapping at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-ceb8c3069f319662.arrow


02/05/2025 21:46:30 - INFO - datasets.arrow_dataset - Caching indices mapping at /Users/jh/.cache/huggingface/datasets/nyu-mll___glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c/cache-ceb8c3069f319662.arrow


[INFO|trainer.py:2362] 2025-02-05 21:46:30,874 >> ***** Running training *****
[INFO|trainer.py:2363] 2025-02-05 21:46:30,874 >>   Num examples = 19,635
[INFO|trainer.py:2364] 2025-02-05 21:46:30,875 >>   Num Epochs = 5
[INFO|trainer.py:2365] 2025-02-05 21:46:30,875 >>   Instantaneous batch size per device = 32
[INFO|trainer.py:2368] 2025-02-05 21:46:30,876 >>   Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:2369] 2025-02-05 21:46:30,876 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2370] 2025-02-05 21:46:30,876 >>   Total optimization steps = 3,070
[INFO|trainer.py:2371] 2025-02-05 21:46:30,877 >>   Number of trainable parameters = 66,955,010
[INFO|integration_utils.py:811] 2025-02-05 21:46:30,878 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  0%|          | 0/3070 [00:00<?, ?it/s]

{'loss': 0.376, 'grad_norm': 2.679630994796753, 'learning_rate': 8.371335504885994e-05, 'epoch': 0.81}


[INFO|trainer.py:4203] 2025-02-05 21:51:05,222 >> 
***** Running Evaluation *****
[INFO|trainer.py:4205] 2025-02-05 21:51:05,223 >>   Num examples = 9815
[INFO|trainer.py:4208] 2025-02-05 21:51:05,224 >>   Batch size = 32


  0%|          | 0/307 [00:00<?, ?it/s]

[INFO|trainer.py:3887] 2025-02-05 21:52:05,571 >> Saving model checkpoint to /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-614
[INFO|configuration_utils.py:419] 2025-02-05 21:52:05,573 >> Configuration saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-614/config.json


{'eval_loss': 0.3100927174091339, 'eval_accuracy': 0.5331635252165053, 'eval_runtime': 60.3476, 'eval_samples_per_second': 162.641, 'eval_steps_per_second': 5.087, 'epoch': 1.0}


[INFO|modeling_utils.py:3042] 2025-02-05 21:52:05,843 >> Model weights saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-614/model.safetensors
[INFO|tokenization_utils_base.py:2485] 2025-02-05 21:52:05,844 >> tokenizer config file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-614/tokenizer_config.json
[INFO|tokenization_utils_base.py:2494] 2025-02-05 21:52:05,845 >> Special tokens file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-614/special_tokens_map.json


{'loss': 0.2447, 'grad_norm': 6.208848476409912, 'learning_rate': 6.742671009771987e-05, 'epoch': 1.63}


[INFO|trainer.py:4203] 2025-02-05 21:56:47,780 >> 
***** Running Evaluation *****
[INFO|trainer.py:4205] 2025-02-05 21:56:47,781 >>   Num examples = 9815
[INFO|trainer.py:4208] 2025-02-05 21:56:47,781 >>   Batch size = 32


  0%|          | 0/307 [00:00<?, ?it/s]

[INFO|trainer.py:3887] 2025-02-05 21:57:42,879 >> Saving model checkpoint to /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-1228
[INFO|configuration_utils.py:419] 2025-02-05 21:57:42,882 >> Configuration saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-1228/config.json


{'eval_loss': 0.345465749502182, 'eval_accuracy': 0.5203260315843097, 'eval_runtime': 55.0983, 'eval_samples_per_second': 178.136, 'eval_steps_per_second': 5.572, 'epoch': 2.0}


[INFO|modeling_utils.py:3042] 2025-02-05 21:57:43,108 >> Model weights saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-1228/model.safetensors
[INFO|tokenization_utils_base.py:2485] 2025-02-05 21:57:43,110 >> tokenizer config file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-1228/tokenizer_config.json
[INFO|tokenization_utils_base.py:2494] 2025-02-05 21:57:43,111 >> Special tokens file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-1228/special_tokens_map.json


{'loss': 0.1602, 'grad_norm': 17.073707580566406, 'learning_rate': 5.114006514657981e-05, 'epoch': 2.44}


[INFO|trainer.py:4203] 2025-02-05 22:02:13,045 >> 
***** Running Evaluation *****
[INFO|trainer.py:4205] 2025-02-05 22:02:13,046 >>   Num examples = 9815
[INFO|trainer.py:4208] 2025-02-05 22:02:13,046 >>   Batch size = 32


  0%|          | 0/307 [00:00<?, ?it/s]

[INFO|trainer.py:3887] 2025-02-05 22:03:07,354 >> Saving model checkpoint to /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-1842
[INFO|configuration_utils.py:419] 2025-02-05 22:03:07,356 >> Configuration saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-1842/config.json


{'eval_loss': 0.4236220717430115, 'eval_accuracy': 0.53428425878757, 'eval_runtime': 54.3076, 'eval_samples_per_second': 180.73, 'eval_steps_per_second': 5.653, 'epoch': 3.0}


[INFO|modeling_utils.py:3042] 2025-02-05 22:03:07,552 >> Model weights saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-1842/model.safetensors
[INFO|tokenization_utils_base.py:2485] 2025-02-05 22:03:07,554 >> tokenizer config file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-1842/tokenizer_config.json
[INFO|tokenization_utils_base.py:2494] 2025-02-05 22:03:07,555 >> Special tokens file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-1842/special_tokens_map.json


{'loss': 0.0883, 'grad_norm': 4.680767059326172, 'learning_rate': 3.485342019543974e-05, 'epoch': 3.26}


[INFO|trainer.py:4203] 2025-02-05 22:07:33,689 >> 
***** Running Evaluation *****
[INFO|trainer.py:4205] 2025-02-05 22:07:33,690 >>   Num examples = 9815
[INFO|trainer.py:4208] 2025-02-05 22:07:33,691 >>   Batch size = 32


  0%|          | 0/307 [00:00<?, ?it/s]

[INFO|trainer.py:3887] 2025-02-05 22:08:27,855 >> Saving model checkpoint to /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-2456
[INFO|configuration_utils.py:419] 2025-02-05 22:08:27,858 >> Configuration saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-2456/config.json


{'eval_loss': 0.602651059627533, 'eval_accuracy': 0.5360163015792155, 'eval_runtime': 54.1648, 'eval_samples_per_second': 181.206, 'eval_steps_per_second': 5.668, 'epoch': 4.0}


[INFO|modeling_utils.py:3042] 2025-02-05 22:08:28,061 >> Model weights saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-2456/model.safetensors
[INFO|tokenization_utils_base.py:2485] 2025-02-05 22:08:28,062 >> tokenizer config file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-2456/tokenizer_config.json
[INFO|tokenization_utils_base.py:2494] 2025-02-05 22:08:28,063 >> Special tokens file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-2456/special_tokens_map.json


{'loss': 0.0475, 'grad_norm': 0.02795957215130329, 'learning_rate': 1.8566775244299675e-05, 'epoch': 4.07}
{'loss': 0.0218, 'grad_norm': 0.016305916011333466, 'learning_rate': 2.280130293159609e-06, 'epoch': 4.89}


[INFO|trainer.py:3887] 2025-02-05 22:12:56,639 >> Saving model checkpoint to /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-3070
[INFO|configuration_utils.py:419] 2025-02-05 22:12:56,641 >> Configuration saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-3070/config.json
[INFO|modeling_utils.py:3042] 2025-02-05 22:12:56,830 >> Model weights saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-3070/model.safetensors
[INFO|tokenization_utils_base.py:2485] 2025-02-05 22:12:56,831 >> tokenizer config file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-3070/tokenizer_config.json
[INFO|tokenization_utils_base.py:2494] 2025-02-05 22:12:56,832 >> Special tokens file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-3070/special_tokens_map.json
[INFO|trainer.py:4203] 2025-02-05 22:12:57,750 >> 
***** Running Evaluation *****
[INFO|trainer.py:4205] 2025-02-05 22:12:57,751 >>   Nu

  0%|          | 0/307 [00:00<?, ?it/s]

[INFO|trainer.py:3887] 2025-02-05 22:13:52,093 >> Saving model checkpoint to /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-3070
[INFO|configuration_utils.py:419] 2025-02-05 22:13:52,095 >> Configuration saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-3070/config.json
[INFO|modeling_utils.py:3042] 2025-02-05 22:13:52,264 >> Model weights saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-3070/model.safetensors
[INFO|tokenization_utils_base.py:2485] 2025-02-05 22:13:52,266 >> tokenizer config file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-3070/tokenizer_config.json
[INFO|tokenization_utils_base.py:2494] 2025-02-05 22:13:52,266 >> Special tokens file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-3070/special_tokens_map.json


{'eval_loss': 0.7396935820579529, 'eval_accuracy': 0.5350993377483444, 'eval_runtime': 54.3419, 'eval_samples_per_second': 180.616, 'eval_steps_per_second': 5.649, 'epoch': 5.0}


[INFO|trainer.py:2636] 2025-02-05 22:13:52,939 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|trainer.py:2874] 2025-02-05 22:13:52,940 >> Loading best model from /Users/jh/workspaces/notebook/voyage/homework/my_model/checkpoint-614 (score: 0.3100927174091339).
[INFO|trainer.py:3887] 2025-02-05 22:13:53,377 >> Saving model checkpoint to /Users/jh/workspaces/notebook/voyage/homework/my_model
[INFO|configuration_utils.py:419] 2025-02-05 22:13:53,379 >> Configuration saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/config.json
[INFO|modeling_utils.py:3042] 2025-02-05 22:13:53,521 >> Model weights saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/model.safetensors
[INFO|tokenization_utils_base.py:2485] 2025-02-05 22:13:53,522 >> tokenizer config file saved in /Users/jh/workspaces/notebook/voyage/homework/my_model/tokenizer_config.json
[INFO|tokenization_utils_base.py:2494] 2025-02-05 22:13:53,523 >> Special tokens 

{'train_runtime': 1642.4867, 'train_samples_per_second': 59.772, 'train_steps_per_second': 1.869, 'train_loss': 0.1533355787833273, 'epoch': 5.0}
***** train metrics *****
  epoch                    =        5.0
  total_flos               =  2162261GF
  train_loss               =     0.1533
  train_runtime            = 0:27:22.48
  train_samples_per_second =     59.772
  train_steps_per_second   =      1.869


In [14]:
wandb.finish()

0,1
eval/accuracy,▇▁▇██
eval/loss,▁▂▃▆█
eval/runtime,█▂▁▁▁
eval/samples_per_second,▁▇███
eval/steps_per_second,▁▇███
train/epoch,▁▁▂▃▄▅▅▆▆███
train/global_step,▁▁▂▃▄▅▅▆▆███
train/grad_norm,▂▄█▃▁▁
train/learning_rate,█▇▅▄▂▁
train/loss,█▅▄▂▂▁

0,1
eval/accuracy,0.5351
eval/loss,0.73969
eval/runtime,54.3419
eval/samples_per_second,180.616
eval/steps_per_second,5.649
total_flos,2321710301850036.0
train/epoch,5.0
train/global_step,3070.0
train/grad_norm,0.01631
train/learning_rate,0.0
