## Week8 Basic Homework
- CodeAlpaca-20k corpus를 사용하여 facebook/opt-350m을 LoRA로 fine-tuning

In [1]:
#!pip install scikit-learn numpy pandas scipy matplotlib tokenizers transformers datasets sacremoses sentencepiece importlib_metadata evaluate accelerate sacrebleu wandb trl peft huggingface

In [1]:
import os
import sys
import json
import torch
import wandb
import logging
import evaluate
import transformers
import numpy as np
import pandas as pd

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

from peft import (
    get_peft_config,
    get_peft_model,
    LoraConfig,
    TaskType
)

from trl import (
    SFTConfig,
    SFTTrainer,
    DataCollatorForCompletionOnlyLM
)

from transformers.trainer_utils import get_last_checkpoint

In [2]:
# Device selection: prefer Apple-Silicon MPS, then CUDA, and fall back to CPU.
# `torch.cuda.is_available()` checks that a usable GPU is present at runtime.
# `torch.backends.cuda.is_built()` only reports whether this PyTorch build was
# compiled with CUDA support, so it would pick "cuda" on machines without a GPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

cuda


In [3]:
from datasets import load_dataset

# Fetch the CodeAlpaca-20k instruction dataset from the Hugging Face Hub.
data = load_dataset("sahil2801/CodeAlpaca-20k")

# Convert the train split to a pandas DataFrame and preview the first rows.
train_split = data["train"]
df = train_split.to_pandas()
df.head()

Unnamed: 0,output,instruction,input
0,"arr = [2, 4, 6, 8, 10]",Create an array of length 5 which contains all...,
1,Height of triangle = opposite side length * si...,Formulate an equation to calculate the height ...,
2,"def replace(self, replace_with):\n new_stri...",Write a replace method for a string class whic...,"string = ""Hello World!""\nreplace_with = ""Greet..."
3,"arr = [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33...",Create an array of length 15 containing number...,
4,def find_num_distinct_states(matrix):\n sta...,Write a function to find the number of distinc...,"matrix = [[1, 0, 0],\n [1, 0, 1],\n ..."


## [MY CODE] Model Load (opt-350m)

In [4]:
# facebook/opt-350m: context window of up to 2k tokens
model_name = "facebook/opt-350m"

# Tokenizer and causal-LM weights come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights in bfloat16 and place them on the device selected earlier.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=device,
)

  return self.fget.__get__(instance, owner)()


In [5]:
# LoRA hyper-parameters.
lora_r: int = 256
lora_alpha: int = 32
lora_dropout: float = 0.1

# Collect the leaf attribute name (text after the last '.') of every nn.Linear
# in the model; these names become the LoRA target modules.
target_modules = {
    name.split('.')[-1]
    for name, module in model.named_modules()
    if isinstance(module, torch.nn.Linear)
}

# The output head is excluded from the LoRA targets (original note: needed for 16-bit).
target_modules.discard("lm_head")

target_modules = list(target_modules)

# Wrap the base model with trainable LoRA adapters and report the parameter counts.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    target_modules=target_modules,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 114,032,640 || all params: 445,229,056 || trainable%: 25.6121


## [MY CODE] 학습 준비 - training arguments, data preprocess function, collator

In [6]:
# Connect to Weights & Biases: start a run in the 'Week8' project,
# then give the run a readable display name.
wandb.init(project='Week8')
wandb.run.name = 'fb-lora256-finetuning'

# Root logger; its level is set from the training arguments in a later cell.
logger = logging.getLogger()

# Emit log records to stdout as "time - level - logger name - message".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

[34m[1mwandb[0m: Currently logged in as: [33mimsta[0m ([33mimsta-hub[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [7]:
# Training configuration: checkpoints go to ./week8_model/fb_256, evaluation and
# saving both happen once per epoch, and the best checkpoint is reloaded at the end.
training_args = SFTConfig(
    output_dir=os.path.join(os.getcwd(), 'week8_model/fb_256'),
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    max_seq_length=128,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    push_to_hub=False,
)

# Logger setup
hf_logging = transformers.utils.logging

if training_args.should_log:
    hf_logging.set_verbosity_info()  # switch the Hugging Face log level to INFO

# Log levels: 10 DEBUG, 20 INFO, 30 WARNING, 40 ERROR, 50 CRITICAL
log_level = training_args.get_process_log_level()

# Apply the same level to our own logger and to the Hugging Face logger.
logger.setLevel(log_level)
hf_logging.set_verbosity(log_level)

# Remaining Hugging Face logger options.
hf_logging.enable_default_handler()  # enable the library's default handler
hf_logging.enable_explicit_format()  # format: [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE

In [8]:
def formatting_prompts_func(example):
    """Build the SFT training text for each example: instruction, answer marker, output.

    Args:
        example: Mapping with 'instruction' and 'output' entries. Either a batch
            (each entry is a sequence of strings) or a single un-batched row
            (each entry is a plain string).

    Returns:
        list[str]: One string per example, laid out as the instruction, a newline,
        the "### Answer: " marker, a newline, and the output. A single un-batched
        row yields a one-element list.

    Note:
        Only 'instruction' and 'output' are used; any other column (such as the
        dataset's 'input' column) is not included in the prompt.
    """
    instructions = example['instruction']
    outputs = example['output']

    # A single row arrives as plain strings. Indexing those position by position
    # walks over characters and raises IndexError as soon as the output is shorter
    # than the instruction, so treat a lone row as a batch of one instead.
    # NOTE(review): some TRL versions appear to probe the formatting function with
    # one un-batched row to detect a list return - confirm for the installed version.
    if isinstance(instructions, str):
        instructions, outputs = [instructions], [outputs]

    # Indexing `outputs` by position keeps the original batch semantics
    # (IndexError if a batch has fewer outputs than instructions).
    return [
        f"{instruction}\n### Answer: \n{outputs[i]}"
        for i, instruction in enumerate(instructions)
    ]

In [9]:
# Hold out 20% of the rows for evaluation. The shuffle is seeded so the split is
# identical after every kernel restart; without a seed, each run draws a different
# split and runs cannot be compared or safely resumed.
train_test = data['train'].train_test_split(test_size=0.2, train_size=0.8, seed=42)
ds_train = train_test['train']
ds_test = train_test['test']

In [10]:
# Marker that separates the instruction from the answer in each formatted prompt.
# It must match the text emitted by formatting_prompts_func so the collator can find it.
response_template = "\n### Answer: "

# Completion-only collator keyed on the marker above
# (see the TRL docs for how it masks the labels).
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
)

In [11]:
# Build the supervised fine-tuning trainer around the LoRA-wrapped model.
trainer = SFTTrainer(
    model,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    args=training_args,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
)

# Pick the checkpoint to resume from:
#   1. an explicit `resume_from_checkpoint` in the training arguments (may point
#      somewhere other than output_dir), otherwise
#   2. the newest checkpoint left in output_dir, or None if there is none.
# The directory check avoids an error from get_last_checkpoint when output_dir
# does not exist yet.
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
    last_checkpoint = get_last_checkpoint(training_args.output_dir)

if training_args.resume_from_checkpoint is not None:
    checkpoint = training_args.resume_from_checkpoint
else:
    checkpoint = last_checkpoint

# Pass the selected checkpoint to train(); previously it was computed but never
# used, so training always restarted from scratch.
train_result = trainer.train(resume_from_checkpoint=checkpoint)

# Persist the final model (here: the LoRA adapter weights) to output_dir.
trainer.save_model()

# Log and save the training metrics, then save the trainer state.
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

[INFO|configuration_utils.py:696] 2025-02-14 00:33:47,224 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--opt-350m/snapshots/08ab08cc4b72ff5593870b5d527cf4230323703c/config.json
[INFO|configuration_utils.py:768] 2025-02-14 00:33:47,226 >> Model config OPTConfig {
  "_name_or_path": "facebook/opt-350m",
  "_remove_final_layer_norm": false,
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "OPTForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "do_layer_norm_before": false,
  "dropout": 0.1,
  "enable_bias": true,
  "eos_token_id": 2,
  "ffn_dim": 4096,
  "hidden_size": 1024,
  "init_std": 0.02,
  "layer_norm_elementwise_affine": true,
  "layerdrop": 0.0,
  "max_position_embeddings": 2048,
  "model_type": "opt",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "prefix": "</s>",
  "torch_dtype": "float16",
  "transformers_version": "4.48.3",
  "use_ca

Applying formatting function to train dataset:   0%|          | 0/16017 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/16017 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16017 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16017 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/4005 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/4005 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/4005 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/4005 [00:00<?, ? examples/s]

[INFO|trainer.py:512] 2025-02-14 00:33:54,653 >> You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
[INFO|trainer.py:917] 2025-02-14 00:33:54,872 >> The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: output, text, instruction, input. If output, text, instruction, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
[INFO|trainer.py:2369] 2025-02-14 00:33:54,903 >> ***** Running training *****
[INFO|trainer.py:2370] 2025-02-14 00:33:54,904 >>   Num examples = 16,017
[INFO|trainer.py:2371] 2025-02-14 00:33:54,904 >>   Num Epochs = 10
[INFO|trainer.py:2372] 2025-02-14 00:33:54,905 >>   Instantaneous batch size per device = 32
[INFO|trainer.py:2375] 2025-02-14 00:33:54,905 >>   Total train batch size (w. parallel, distributed & accumulation) = 32
[

Epoch,Training Loss,Validation Loss
1,1.5694,1.350678
2,1.33,1.245024
3,1.2228,1.187945
4,1.1489,1.149851
5,1.0962,1.123814
6,1.0547,1.106952
7,1.0211,1.094466
8,0.997,1.083668
9,0.9761,1.079183
10,0.9661,1.077241


[INFO|trainer.py:917] 2025-02-14 00:35:42,335 >> The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: output, text, instruction, input. If output, text, instruction, input are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
[INFO|trainer.py:4226] 2025-02-14 00:35:42,338 >> 
***** Running Evaluation *****
[INFO|trainer.py:4228] 2025-02-14 00:35:42,338 >>   Num examples = 4005
[INFO|trainer.py:4231] 2025-02-14 00:35:42,339 >>   Batch size = 32
[INFO|trainer.py:3910] 2025-02-14 00:35:52,368 >> Saving model checkpoint to /workspace/week8_model/fb_256/checkpoint-501
[INFO|configuration_utils.py:696] 2025-02-14 00:35:52,877 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--opt-350m/snapshots/08ab08cc4b72ff5593870b5d527cf4230323703c/config.json
[INFO|configuration_utils.py:768] 2025-02-14 00:35:52,878 >> Model config OPTC

***** train metrics *****
  total_flos               = 47817314GF
  train_loss               =     1.1379
  train_runtime            = 0:19:56.62
  train_samples_per_second =    133.852
  train_steps_per_second   =      4.187
