## Week7 Advanced2 Homework
- corpus 사용하여 gemma 2b 학습

In [1]:
import os
import sys
import json
import torch
import wandb
import logging
import evaluate
import transformers
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

from trl import (
    SFTConfig,
    SFTTrainer,
    DataCollatorForCompletionOnlyLM
)

from transformers.trainer_utils import get_last_checkpoint

In [2]:
#device setting
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.backends.cuda.is_built():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

mps


## [MY CODE] WandB 접속 및 logger 설정

In [3]:
# wandb 접속
wandb.init(project='Hanghae99')
wandb.run.name = 'gemma-finetuning'

[34m[1mwandb[0m: Currently logged in as: [33mimsta[0m ([33mimsta-hub[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [4]:
logger = logging.getLogger()

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

## [MY CODE] Model Load (gemma-2b-it)

In [5]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",
    device_map=device,
    torch_dtype=torch.bfloat16
)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# sample test1 - 모델에 직접 입력
input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to(device)

outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))

<bos>Write me a poem about Machine Learning.

Machines, they weave a web,
Of algorithms, vast and deep.
They learn from


In [12]:
# sample test2 - chat template로 입력
chat = [
    { "role": "user", "content": "Write me a poem about Machine Learning." },
]
prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
outputs = model.generate(input_ids=inputs.to(device), max_new_tokens=150)

print(tokenizer.decode(outputs[0]))

<bos><start_of_turn>user
Write me a poem about Machine Learning.<end_of_turn>
<start_of_turn>model
Machines, with minds of steel,
Unleash the power of the steel.
Algorithms dance and churn,
Unveiling patterns, near and far.

Data, the fuel for their fire,
Igniting insights, setting them afire.
From images to speech, they learn and grow,
A symphony of data flow.

Supervised, unsupervised,
The algorithms guide the way.
Regression, classification, a trio,
Unveiling truths, both old and new.

Unsupervised, they find the core,
Clustering data, a task to adore.
Anomaly detection, a watchful eye,
Identifying deviations, setting things right.

Reinforcement, a learning curve,
Rewarding patterns


## [LOG] HF의 예제를 두 가지 유형으로 실행

## [MY CODE] Data Load (corpus)

In [7]:
with open("corpus.json", "r", encoding="utf-8") as f:
    corpus = json.load(f)

df = pd.DataFrame(corpus)

train_df, test_df = train_test_split(df, test_size = 0.2, random_state = 42)

ds_train = datasets.Dataset.from_pandas(train_df, preserve_index=False)
ds_test = datasets.Dataset.from_pandas(test_df, preserve_index=False)

data_files = datasets.DatasetDict({"train":ds_train,
                                   "test":ds_test})

data_files

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 76
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 20
    })
})

## [MY CODE] 학습 준비 - training arguments, data preprocess function, collator

In [8]:
training_args = SFTConfig(output_dir=os.path.join(os.getcwd(),'trained_model'),
                        learning_rate=1e-4,
                        per_device_train_batch_size=16,
                        per_device_eval_batch_size=16,
                        num_train_epochs=20,
                        weight_decay=0.01,
                        eval_strategy="epoch",
                        save_strategy="epoch",
                        load_best_model_at_end=True,
                        report_to = "wandb",
                        push_to_hub=False,
                        use_mps_device=True
                        )

# logger 설정
if training_args.should_log:
    transformers.utils.logging.set_verbosity_info()  # log level을 INFO로 변경

# log level: 10 DEBUG, 20 INFO, 30 WARNING, 40 ERROR, 50 CRITICAL
log_level = training_args.get_process_log_level()

# 우리가 가지고 있는 logger와 HuggingFace의 logger의 log level 설정
logger.setLevel(log_level)
transformers.utils.logging.set_verbosity(log_level)

# 기타 HuggingFace logger option들을 설정
transformers.utils.logging.enable_default_handler() # logger 기능 활성화
transformers.utils.logging.enable_explicit_format() # 포맷 설정: [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE



In [9]:
# Training arguments 확인
logger.info(f"Training/evaluation parameters {training_args}")

02/07/2025 22:44:48 - INFO - root - Training/evaluation parameters SFTConfig(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
chars_per_token=<CHARS_PER_TOKEN>,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
dataset_batch_size=1000,
dataset_kwargs=None,
dataset_num_proc=None,
dataset_text_field=text,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,


In [10]:
def formatting_prompts_func(src_df):
    output_texts = []
    for i in range(len(src_df['input'])):
        text = f"<bos><start_of_turn>user\n{src_df['input'][i]}<end_of_turn>\n<start_of_turn>model\n{src_df['output'][i]}<end_of_turn>"
        output_texts.append(text)
    return output_texts

In [11]:
tokenizer.padding_side = "right"

response_template = "<start_of_turn>model"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [12]:
trainer = SFTTrainer(
    model,
    train_dataset=ds_train,
    args=training_args,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    eval_dataset=ds_test
)
#trainer.train()

checkpoint = None
last_checkpoint = get_last_checkpoint(training_args.output_dir)  # 만약 output_dir에 checkpoint가 남아있으면 이를 사용하고, 없으면 None이 return됩니다.
if training_args.resume_from_checkpoint is not None:  # output_dir이 아닌 다른 위치에서의 checkpoint를 resume_from_checkpoint로 지정할 수 있습니다.
    checkpoint = training_args.resume_from_checkpoint
else:  # 아니면 last_checkpoint로 checkpoint를 지정합니다.
    checkpoint = last_checkpoint

train_result = trainer.train(resume_from_checkpoint=checkpoint)

trainer.save_model()

metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

[INFO|tokenization_utils_base.py:2030] 2025-02-07 22:44:56,066 >> loading file tokenizer.model from cache at /Users/jh/.cache/huggingface/hub/models--google--gemma-2b-it/snapshots/96988410cbdaeb8d5093d1ebdc5a8fb563e02bad/tokenizer.model
[INFO|tokenization_utils_base.py:2030] 2025-02-07 22:44:56,067 >> loading file tokenizer.json from cache at /Users/jh/.cache/huggingface/hub/models--google--gemma-2b-it/snapshots/96988410cbdaeb8d5093d1ebdc5a8fb563e02bad/tokenizer.json
[INFO|tokenization_utils_base.py:2030] 2025-02-07 22:44:56,068 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2030] 2025-02-07 22:44:56,069 >> loading file special_tokens_map.json from cache at /Users/jh/.cache/huggingface/hub/models--google--gemma-2b-it/snapshots/96988410cbdaeb8d5093d1ebdc5a8fb563e02bad/special_tokens_map.json
[INFO|tokenization_utils_base.py:2030] 2025-02-07 22:44:56,069 >> loading file tokenizer_config.json from cache at /Users/jh/.cache/huggingface/hub/models--goo

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

[INFO|trainer.py:2362] 2025-02-07 22:44:57,474 >> ***** Running training *****
[INFO|trainer.py:2363] 2025-02-07 22:44:57,474 >>   Num examples = 76
[INFO|trainer.py:2364] 2025-02-07 22:44:57,475 >>   Num Epochs = 20
[INFO|trainer.py:2365] 2025-02-07 22:44:57,475 >>   Instantaneous batch size per device = 16
[INFO|trainer.py:2368] 2025-02-07 22:44:57,476 >>   Total train batch size (w. parallel, distributed & accumulation) = 16
[INFO|trainer.py:2369] 2025-02-07 22:44:57,476 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2370] 2025-02-07 22:44:57,477 >>   Total optimization steps = 100
[INFO|trainer.py:2371] 2025-02-07 22:44:57,477 >>   Number of trainable parameters = 2,506,172,416
[INFO|integration_utils.py:811] 2025-02-07 22:44:57,479 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  0%|          | 0/100 [00:00<?, ?it/s]

Human: This is a part of document:
7 Table 4: Human assessments for the Jeopardy Question Generation Task. Factuality Speciﬁcity BART better 7.1% 16.8% RAG better 42.7% 37.4% Both good 11.7% 11.8% Both poor 17.7% 6.9% No majority 20.8% 20.1% Table 5: Ratio of distinct to total tri-grams for generation tasks. MSMARCO Jeopardy QGen Gold 89.6% 90.0% BART 70.7% 32.4% RAG-Token 77.8% 46.8% RAG-Seq. 83.5% 53.8% Table 6: Ablations on the dev set. As FEVER is a classiﬁcation task, both RAG models are equivalent. Model NQ TQA WQ CT Jeopardy-QGen MSMarco FVR-3 FVR-2 Exact Match B-1 QB-1 R-L B-1 Label Accuracy RAG-Token-BM25 29.7 41.5 32.1 33.1 17.5 22.3 55.5 48.4 75.1 91.6RAG-Sequence-BM25 31.8 44.1 36.6 33.8 11.1 19.5 56.5 46.9 RAG-Token-Frozen 37.8 50.1 37.1 51.1 16.7 21.7 55.9 49.4 72.9 89.4RAG-Sequence-Frozen 41.2 52.1 41.8 52.6 11.8 19.6 56.7 47.3 RAG-Token 43.5 54.8 46.5 51.9 17.9 22.6 56.2 49.4 74.5 90.6RAG-Sequence 44.0 55.8 44.9 53.4 15.3 21.5 57.2 47.5 between these dates and use a tem

RuntimeError: MPS backend out of memory (MPS allocated: 35.28 GB, other allocations: 520.73 MB, max allowed: 36.27 GB). Tried to allocate 512.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).