# 토크나이저 및 데이터 준비

## Text Data 준비

In [1]:
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 허깅페이스 로그인 방법
my_hf_key='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'
login(my_hf_key)

In [3]:
# 모델 레포지토리
model_path = "beomi/Llama-3-Open-Ko-8B"

# 데이터 path
data_path = "junn991/couple_chat"#"MarkrAI/KOpen-HQ-Hermes-2.5-60K"

In [4]:
# dataset 다운

data = load_dataset(data_path)

In [5]:
df = pd.DataFrame(data['train'])

In [7]:
# 데이터셋 구성확인
df.head()

Unnamed: 0,instruction,output,input
0,웅,쟈깅,
1,됀당!!!\n꺄앙,다행이넹,
2,선톡받운사람만풀려 ㅠ,언능 풀려야되는뎅!!!,
3,https://youtube.com/shorts/d73cS5dY0jM?feature...,울아기넹!!!,
4,이게나양!,아주쭈앙,


In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import bitsandbytes as bnb
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training)


In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [10]:
# 토크나이저 세팅: QLoRA시 pad 토큰을 eos로 설정해주기
bos = tokenizer.bos_token_id
eos = tokenizer.eos_token_id
pad = tokenizer.pad_token_id

tokenizer.pad_token_id = eos
tokenizer.padding_side = "right"
cut_off_len = 4098
val_size = 0.05
train_on_inputs = False
add_eos_token = False

In [11]:
template = {
    "prompt_input": "아래는 대화중 질문하는 지시사항과, 구체적인 답변을 방식을 요구하는 입력이 함께 있는 문장입니다. 이 요청에 대해 적절하게 답변해주세요.\n###입력:{input}\n###지시사항:{instruction}\n###답변:",
    "prompt_no_input": "아래는 대화중 질문하는 지시사항과, 이 요청에 대해 적절하게 답변해주세요.\n###지시사항:{instruction}\n###답변:"
}

In [12]:

from typing import Union

def generate_prompt(
    instruction: str,
    input: Union[None, str] = None,
    label: Union[None, str] = None,
    verbose: bool = False
) -> str:
    """
    주어진 instruction, input, label을 사용하여 프롬프트를 생성하는 함수.

    Parameters:
    - instruction (str): 문제 설명 또는 지시사항.
    - template (dict): 입력이 있는 경우와 없는 경우의 템플릿을 포함한 딕셔너리.
    - input (str or None): 문제에 대한 구체적인 입력 (옵션).
    - label (str or None): 정답 또는 응답 (옵션).
    - verbose (bool): 생성된 프롬프트를 출력할지 여부.

    Returns:
    - str: 완성된 프롬프트.
    """
    if input:
        res = template["prompt_input"].format(instruction=instruction, input=input)
    else:
        res = template["prompt_no_input"].format(instruction=instruction)

    if label:
        res = f"{res}{label}"

    if verbose:
        print(res)

    return res


In [13]:
def tokenize(prompt, add_eos_token=True):
   result = tokenizer(prompt,truncation=True,max_length=cut_off_len,padding=False,return_tensors=None,)
   if (result["input_ids"][-1] != tokenizer.eos_token_id
       and len(result["input_ids"]) < cut_off_len
       and add_eos_token
      ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

   result["labels"] = result["input_ids"].copy()
   return result

In [14]:
def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(
        data_point["instruction"],
        data_point["input"],
        data_point["output"]
        )
    tokenized_full_prompt = tokenize(full_prompt)
    if not train_on_inputs:
        user_prompt = generate_prompt(data_point["instruction"], data_point["input"])
        tokenized_user_prompt = tokenize(user_prompt, add_eos_token=add_eos_token)
        user_prompt_len = len(tokenized_user_prompt["input_ids"])

        if add_eos_token:
            user_prompt_len -= 1



        tokenized_full_prompt["labels"] = [-100] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:]

    return tokenized_full_prompt

In [15]:
if val_size > 0:
  train_val = data["train"].train_test_split(test_size=val_size, shuffle=True, seed=42)
  train_data = (train_val["train"].shuffle().map(generate_and_tokenize_prompt))
  val_data = (train_val["test"].shuffle().map(generate_and_tokenize_prompt))
else:
  train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
  val_data = None

Map: 100%|██████████| 1045/1045 [00:00<00:00, 1527.83 examples/s]
Map: 100%|██████████| 55/55 [00:00<00:00, 1425.37 examples/s]


# Model 준비

In [16]:
# Quantization config 준비

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
    )





In [17]:
# Model 로드 하기
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config = quantization_config,
    torch_dtype = torch.bfloat16,
    device_map = {"" : 0}
    )


Loading checkpoint shards: 100%|██████████| 6/6 [00:50<00:00,  8.34s/it]


In [18]:
model = prepare_model_for_kbit_training(model)

In [19]:
config = LoraConfig(
    r = 16,
    lora_alpha = 32, # 기본값 16
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM"
    )

In [20]:
def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
  return list(lora_module_names)

In [21]:
print('Trainable targer module:',find_all_linear_names(model))

Trainable targer module: ['up_proj', 'o_proj', 'q_proj', 'v_proj', 'down_proj', 'k_proj', 'gate_proj']


In [22]:
# QLoRA 준비
model = get_peft_model(model, config)

In [23]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [24]:
# 파라미터 수 체크
print_trainable_parameters(model)

trainable params: 13631488 || all params: 2809401344 || trainable%: 0.4852097059436731


In [30]:
#Hyper parameter setting

output_dir='./llama_singleGPU-couple-v1'


num_epochs = 3
micro_batch_size = 1
gradient_accumulation_steps = 16
warmup_steps = 10
learning_rate = 5e-6 # 5e-8, 기본값
group_by_length = False
optimizer = 'paged_adamw_8bit'

# adam 활용시
beta1 = 0.9
beta2 = 0.95

lr_scheduler = 'cosine'
logging_steps = 1

use_wandb = True
wandb_run_name = 'llama_singleGPU-couple-v1'

use_fp16 = False
use_bf_16 = True
evaluation_strategy = 'steps'
eval_steps = 10
save_steps = 10
save_strategy = 'steps'




In [31]:
model.gradient_checkpointing_enable()

In [32]:
trainer = Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=TrainingArguments(
    per_device_train_batch_size = micro_batch_size,
    per_device_eval_batch_size = micro_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    warmup_steps = warmup_steps,
    num_train_epochs = num_epochs,
    learning_rate = learning_rate,
    adam_beta1 = beta1, # adam 활용할때 사용
    adam_beta2 = beta2, # adam 활용할때 사용
    fp16 = use_fp16,
    bf16 = use_bf_16,
    logging_steps = logging_steps,
    optim = optimizer,
    evaluation_strategy = evaluation_strategy if val_size > 0 else "no",
    save_strategy="steps",  #스텝기준으로 save
    eval_steps = eval_steps if val_size > 0 else None,
    save_steps = save_steps,
    lr_scheduler_type=lr_scheduler,
    output_dir = output_dir,
    #save_total_limit = 4,
    load_best_model_at_end = True if val_size > 0 else False ,
    group_by_length=group_by_length,
    report_to="wandb" if use_wandb else None,
    run_name=wandb_run_name if use_wandb else None,
    ),
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )



In [33]:
model.config.use_cache = False


trainer.train()

Step,Training Loss,Validation Loss
10,3.4833,3.572029
20,3.6384,3.476855
30,3.1543,3.349677
40,3.4019,3.250666
50,3.1394,3.16818
60,3.4005,3.108689
70,3.0595,3.05115
80,3.065,3.002117
90,3.0147,2.964105
100,3.1337,2.929028


TrainOutput(global_step=195, training_loss=3.1617341879086616, metrics={'train_runtime': 4051.3252, 'train_samples_per_second': 0.774, 'train_steps_per_second': 0.048, 'total_flos': 8055080327577600.0, 'train_loss': 3.1617341879086616, 'epoch': 2.9645933014354067})

# 베이스모델이랑 합쳐서 허깅페이스 로드

In [7]:
from peft import PeftModel
 

base_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype = torch.bfloat16, token=my_hf_key)

merged_model = PeftModel.from_pretrained(base_model, "/home/eardream2/Jun/Fine_TT/llama_singleGPU-couple-v1/checkpoint-195")

merged_model = merged_model.merge_and_unload()

merged_model.push_to_hub("junn991/llama3-8b-1gpu-couple")
tokenizer.push_to_hub("junn991/llama3-8b-1gpu-couple")

Loading checkpoint shards: 100%|██████████| 6/6 [00:01<00:00,  4.29it/s]
model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

[A[A
[A


[A[A[A

model-00002-of-00004.safetensors:   0%|          | 1.23M/5.00G [00:00<07:36, 11.0MB/s]
[A

model-00002-of-00004.safetensors:   0%|          | 4.46M/5.00G [00:00<04:52, 17.1MB/s]
[A

model-00002-of-00004.safetensors:   0%|          | 11.4M/5.00G [00:00<03:59, 20.8MB/s]

[A[A
model-00002-of-00004.safetensors:   0%|          | 24.9M/5.00G [00:01<03:14, 25.5MB/s]

[A[A

model-00002-of-00004.safetensors:   1%|          | 29.1M/5.00G [00:01<04:45, 17.4MB/s]

[A[A
[A
[A

[A[A
model-00002-of-00004.safetensors:   1%|          | 32.2M/5.00G [00:02<06:18, 13.1MB/s]
model-00002-of-00004.safetensors:   1%|          | 55.6M/5.00G [00:02<02:43, 30.2MB/s]

[A[A
[A
model-00002-of-00004.safetensors:   1%|          | 60.1M/5.00G [00:02<03:53, 21.2MB/s]

[A[A
[A

model-00002-of-00004.safetensors:   1%|▏         | 6

NameError: name 'tokenizer' is not defined

qlora학습시 loss가 0이 나오면 학습이 안되는 것 = 러닝레이트가 높다는 것
-> 러닝레이트 낮추고, 배치 높여주기 = 만능키