In [1]:
import os
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import bitsandbytes as bnb
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training)

from typing import Union
from dotenv import load_dotenv
import wandb

In [2]:
load_dotenv('.env')
api_key = os.getenv("API_KEY")
wandb_host = os.getenv("WANDB_HOST")

In [3]:
wandb.login(host = wandb_host)
use_wandb = True
wandb_run_name = 'Single_GPU_Optim'

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33magair08[0m ([33mblue_analytics[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
run_wandb = wandb.init(project="llm_ksh", name = wandb_run_name)

!pip install -q -U bitsandbytes
!pip install datasets -U
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install wandb
!pip install pandas
!pip install python-dotenv

In [6]:
my_hf_key = api_key
login(my_hf_key)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/user1/.cache/huggingface/token
Login successful


In [7]:
model_path = 'meta-llama/Llama-3.1-8B-Instruct'
data_path = 'DopeorNope/Ko-Optimize_Dataset'

In [8]:
data = load_dataset(data_path)

In [9]:
df = pd.DataFrame(data['train'])

In [10]:
df.head()

Unnamed: 0,input,instruction,output
0,,"∫[0,t] (GCP × SM × TA) dt = SA\n\n여기서\n∫ = 적분 ...","예, 주어진 공식을 다음과 같이 수정할 수 있습니다:\n\n∫[0,T] [(GCP ..."
1,귀하는 사람들이 정보를 찾도록 도와주는 AI 어시스턴트입니다. 사용자가 질문을 합니...,정답 -> 문장 A\n질문 -> 다음 두 문장 중 상식에 어긋나는 것은 어느 것입니...,질문 -> 다음 두 문장 중 상식에 어긋나는 것은 어느 것입니까?\n옵션:\n- 문...
2,,이산화탄소 배출을 줄이는 5가지 방법을 나열하세요.,"1. 재생 에너지원 사용: 태양열, 풍력, 수력, 지열 등 이산화탄소를 배출하지 않..."
3,귀하는 사람들이 정보를 찾도록 도와주는 AI 어시스턴트입니다. 사용자가 질문을 합니...,"많은 정치인들이 소농에 대해 이야기하는 것을 좋아하지만, 실제로 거의 모든 농장은 ...",서서히 해봅시다: 노화된 록스타의 손이 협조하지 않는 것은 건강 문제를 나타냅니다....
4,귀하는 항상 설명을 제공하는 도움이 되는 조수입니다. 5살짜리 아이에게 대답한다고 ...,다음 리뷰의 감상을 알려주세요: 솔직히 저는 사춘기 욕구 때문에 이 영화를 봤어요....,이 리뷰에는 긍정적인 감정이 담겨 있습니다. 영화를 보고 유머와 연기를 즐기며 즐거...


In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [12]:
# 토크나이저 세팅 : QLoRA시 pad 토큰을 eos로 설정하기
bos = tokenizer.bos_token_id
eos = tokenizer.eos_token_id

tokenizer.add_special_tokens({"pad_token" : "<|reserved_special_token_0>"})

tokenizer.pad_token_id = eos
tokenizer.padding_side = 'right'

In [13]:
cut_off_len = 4098
val_size = 0.005
train_on_inputs = False
add_eos_token = False

In [14]:
template = {
    "prompt_input": "아래는 문제를 설명하는 지시사항과, 구체적인 답변을 방식을 요구하는 입력이 함께 있는 문장입니다. 이 요청에 대해 적절하게 답변해주세요.\n###입력:{input}\n###지시사항:{instruction}\n###답변:",
    "prompt_no_input": "아래는 문제를 설명하는 지시사항입니다. 이 요청에 대해 적절하게 답변해주세요.\n###지시사항:{instruction}\n###답변"
}

In [15]:
def generate_prompt(
    instruction: str,
    input: Union[None, str] = None,
    label: Union[None, str] = None,
    verbose: bool = False
) -> str:
    if input:
        res = template["prompt_input"].format(instruction=instruction, input=input)
    else:
        res = template["prompt_no_input"].format(instruction=instruction)
    if label:
        res = f"{res}{label}"
    if verbose:
        print(res)

    return res

In [16]:
def tokenize(prompt, add_eos_token=True):
    result = tokenizer(prompt, truncation=True, max_length=cut_off_len, padding=False, return_tensors=None,)
    if (result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < cut_off_len
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)
    result["labels"] = result["input_ids"].copy()
    return result

In [17]:
def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(
        data_point["instruction"],
        data_point["input"],
        data_point["output"]
    )
    tokenized_full_prompt = tokenize(full_prompt)
    if not train_on_inputs:
        user_prompt = generate_prompt(data_point["instruction"], data_point["input"])
        tokenized_user_prompt = tokenize(user_prompt, add_eos_token=add_eos_token)
        user_prompt_len = len(tokenized_user_prompt["input_ids"])

        if add_eos_token:
            user_prompt_len -= 1

        tokenized_full_prompt["labels"] = [-100] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:]
    return tokenized_full_prompt

In [18]:
if val_size > 0:
    train_val = data["train"].train_test_split(test_size = val_size, shuffle=True, seed=42)
    train_data = (train_val["train"].shuffle().map(generate_and_tokenize_prompt))
    val_data = (train_val["test"].shuffle().map(generate_and_tokenize_prompt))
else:
    train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = None

Map:   0%|          | 0/9950 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

#### Model 

In [19]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16
)

In [20]:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config = quantization_config,
    torch_dtype = torch.bfloat16,
    device_map = {"": 0}
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [21]:
model = prepare_model_for_kbit_training(model)

In [22]:
config = LoraConfig(
    r = 16,
    lora_alpha = 16,
    target_modules = ['q_proj','k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM"    
)

In [23]:
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    return list(lora_module_names)

In [24]:
print('Trainable target module:', find_all_linear_names(model))

Trainable target module: ['up_proj', 'q_proj', 'gate_proj', 'o_proj', 'v_proj', 'down_proj', 'k_proj']


#### QLoRA Ready

In [25]:
model = get_peft_model(model, config)

In [26]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

In [27]:
print_trainable_parameters(model)

trainable params: 13631488 || all params: 2809401344 || trainable%: 0.4852097059436731


#### Hyper parameter setting

In [28]:
output_dir = './llama_singleGPU-v1'
num_epochs = 1
micro_batch_size = 1
gradient_accumulation_steps = 8
warmup_steps = 10
#warmup_steps = 100
learning_rate = 5e-8 
group_by_length = False
optimizer = 'paged_adamw_8bit'

# adam 활용시
beta1 = 0.9
beta2 = 0.95

lr_scheduler = 'cosine'
#lr_scheduler = 'cosine', 'linear', 'constant'
logging_steps = 1

use_fp16 = False
use_bf_16 = True
evaluation_strategy = 'steps'
eval_steps = 50
save_steps = 50
save_strategy = 'steps'

In [29]:
model.gradient_checkpointing_enable()

In [30]:
trainer = Trainer(
    model = model,
    train_dataset = train_data,
    eval_dataset = val_data,
    args = TrainingArguments(
    per_device_train_batch_size = micro_batch_size,
    per_device_eval_batch_size = micro_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    warmup_steps = warmup_steps,
    num_train_epochs = num_epochs,
    learning_rate = learning_rate,
    adam_beta1 = beta1,
    adam_beta2 = beta2,
    fp16 = use_fp16,
    bf16 = use_bf_16,
    logging_steps = logging_steps,
    optim = optimizer,
    evaluation_strategy = evaluation_strategy if val_size > 0 else "no",
    save_strategy = 'steps',
    eval_steps = eval_steps,
    save_steps = save_steps,
    lr_scheduler_type = lr_scheduler,
    output_dir = output_dir,
    #save_total_limit = 4,
    load_best_model_at_end = True if val_size > 0 else False,
    group_by_length = group_by_length,
    report_to = "wandb" if use_wandb else None,
    run_name = wandb_run_name if use_wandb else None,
    ),
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors='pt', padding=True
    ),
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
model.config.use_cache = False

trainer.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
50,3.7547,2.032317
100,2.6885,2.033165
150,4.5435,2.031544
200,4.649,2.031247


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


In [None]:
trainer.save_model()
tokenizer.save_pretrained(output_dir)