In [1]:
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login

# 허깅페이스 로그인
my_hf_key='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'
login(my_hf_key)

  from .autonotebook import tqdm as notebook_tqdm


Gemma 2 9B-It 모델은 크기가 커서 16GB GPU 메모리를 사용하더라도 전체 모델을 로드할 수 없음. 


-> 그래서 4비트 양자화 방식으로 모델을 로드

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig

modelName = "google/gemma-2-9b-it"

bnbConfig = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(modelName)

model = AutoModelForCausalLM.from_pretrained(
    modelName,
    device_map = "auto",
    quantization_config=bnbConfig
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.84s/it]


In [3]:
from IPython.display import Markdown, display

system = "You are a Korean man in your 50s. You have a son named 준성, and you are currently having a casual conversation.Speak in Korean."
user = "I'm trying to decide on a dinner menu. Any recommendations?"

prompt = f"System: {system} \n User: {user} \n AI: "

inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_length=500, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

Markdown(text.split("AI:")[1])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


 
어떤 음식을 좋아하는지 좀 알려줘. 



In [4]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [5]:
wb_token = "a12d8c451336c1862c41662d081104856e34c503"

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Gemma-2-9b-it on Dad_son_talk Dataset2', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mgdkssud374[0m ([33mgdkssud374-gyeongsang-national-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/eardream2/.netrc


In [6]:
base_model = "google/gemma-2-9b-it"
dataset_name = "junn991/asdasd"
new_model = "Gemma-2-9b-it-chat-dad_v2"

In [7]:
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [8]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

In [9]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    # attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.89s/it]


In [10]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [11]:
# chat_template을 None으로 설정
# tokenizer.chat_template = None

In [12]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
# model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [13]:
#Importing the dataset
dataset = load_dataset(dataset_name, split="all")

def format_chat_template(row):
    row_json = [{"role": "system", "content": row["instruction"]},
               {"role": "user", "content": row["input"]},
               {"role": "assistant", "content": row["output"]}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc= 4,
)

dataset

Map (num_proc=4):   0%|          | 0/90 [00:00<?, ? examples/s]


TemplateError: System role not supported

In [30]:
dataset['text'][3]

'<|im_start|>system\n수액맞는다<|im_end|>\n<|im_start|>user\n<|im_end|>\n<|im_start|>assistant\n알았다<|im_end|>\n'

In [31]:
dataset = dataset.train_test_split(test_size=0.1)

In [34]:
# Setting Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

model.config.use_cache = False
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 81/81 [00:00<00:00, 5610.88 examples/s]


Step,Training Loss,Validation Loss
2,9.4969,9.405496
4,8.2551,7.853437
6,6.9745,6.789494
8,6.1065,5.861004
10,5.5188,5.440773


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=10, training_loss=7.19634747505188, metrics={'train_runtime': 74.0877, 'train_samples_per_second': 1.093, 'train_steps_per_second': 0.135, 'total_flos': 132509928026112.0, 'train_loss': 7.19634747505188, 'epoch': 0.9876543209876543})

In [32]:
import torch
torch.cuda.empty_cache()

In [35]:
wandb.finish()
model.config.use_cache = True

0,1
eval/loss,█▅▃▂▁
eval/runtime,▁█▂█▄
eval/samples_per_second,█▁▇▁▅
eval/steps_per_second,█▁▇▁▅
train/epoch,▁▂▂▃▃▃▄▅▅▆▆▆▇███
train/global_step,▁▂▂▃▃▃▄▅▅▆▆▆▇███
train/grad_norm,▄█▄▅▃▃▂▂▁▁
train/learning_rate,▂▃▃▄▅▆▆▇█▁
train/loss,▅█▇▆▄▄▂▂▁▁

0,1
eval/loss,5.44077
eval/runtime,2.3623
eval/samples_per_second,3.81
eval/steps_per_second,3.81
total_flos,132509928026112.0
train/epoch,0.98765
train/global_step,10.0
train/grad_norm,6.68685
train/learning_rate,0.0
train/loss,5.5188


In [37]:
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)

adapter_model.safetensors: 100%|██████████| 3.89G/3.89G [01:36<00:00, 40.2MB/s]


CommitInfo(commit_url='https://huggingface.co/junn991/Gemma-2-9b-it-chat-dad/commit/ad28ded1ad19617f9a4a54b36206d2b4cb7b8bfd', commit_message='Upload model', commit_description='', oid='ad28ded1ad19617f9a4a54b36206d2b4cb7b8bfd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/junn991/Gemma-2-9b-it-chat-dad', endpoint='https://huggingface.co', repo_type='model', repo_id='junn991/Gemma-2-9b-it-chat-dad'), pr_revision=None, pr_num=None)

In [38]:
base_model_url = "google/gemma-2-9b-it"
new_model_url = "/home/eardream2/Jun/Fine_TT/Gemma-2-9b-it-chat-dad/"

In [39]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format


# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model_url)

base_model_reload= AutoModelForCausalLM.from_pretrained(
    base_model_url,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cpu",
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.34s/it]


In [None]:
# chat_template을 None으로 설정
tokenizer.chat_template = None

In [46]:
base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)
model = PeftModel.from_pretrained(base_model_reload, new_model_url)

model = model.merge_and_unload()

In [47]:
model.save_pretrained("Gemma-2-9b-it-chat-dad")
tokenizer.save_pretrained("Gemma-2-9b-it-chat-dad")

('Gemma-2-9b-it-chat-dad/tokenizer_config.json',
 'Gemma-2-9b-it-chat-dad/special_tokens_map.json',
 'Gemma-2-9b-it-chat-dad/tokenizer.json')

In [48]:
model.push_to_hub("Gemma-2-9b-it-chat-dad", use_temp_dir=False)
tokenizer.push_to_hub("Gemma-2-9b-it-chat-dad", use_temp_dir=False)

model-00002-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

[A[A
[A


model-00002-of-00004.safetensors:   0%|          | 2.88M/4.95G [00:00<02:51, 28.8MB/s]
[A


[A[A[A
model-00002-of-00004.safetensors:   0%|          | 14.4M/4.95G [00:00<01:51, 44.4MB/s]


[A[A[A


model-00002-of-00004.safetensors:   0%|          | 18.9M/4.95G [00:00<03:20, 24.6MB/s]
[A
[A


model-00002-of-00004.safetensors:   1%|          | 27.5M/4.95G [00:00<02:37, 31.2MB/s]
[A


[A[A[A


[A[A[A
model-00002-of-00004.safetensors:   1%|          | 32.0M/4.95G [00:01<04:08, 19.8MB/s]
[A


model-00002-of-00004.safetensors:   1%|          | 46.6M/4.95G [00:01<02:20, 34.8MB/s]
[A


model-00002-of-00004.safetensors:   1%|          | 51.4M/4.95G [00:01<03:12, 25.5MB/s]
model-00002-of-00004.safetensors:   1%|          | 59.3M/4.95G [00:02<02:48, 29.1MB/s]
[A


model-00002-of-00004.safetensors:   1%|▏         | 63.5M/4.95G [00:02<02:37, 31.0MB/s]
[A
[A


[A[A[A
[A


model-00002-o

CommitInfo(commit_url='https://huggingface.co/junn991/Gemma-2-9b-it-chat-dad/commit/7c42895fb0214d43846f9ee5e1ea08d6121d9e42', commit_message='Upload tokenizer', commit_description='', oid='7c42895fb0214d43846f9ee5e1ea08d6121d9e42', pr_url=None, repo_url=RepoUrl('https://huggingface.co/junn991/Gemma-2-9b-it-chat-dad', endpoint='https://huggingface.co', repo_type='model', repo_id='junn991/Gemma-2-9b-it-chat-dad'), pr_revision=None, pr_num=None)