In [None]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U trl
%pip install -U wandb

In [None]:
%pip install -U datasets
%pip install -U peft
%pip install -U huggingface_hub



In [None]:
import os

import pandas as pd
import torch
import wandb
from datasets import Dataset, load_dataset
from google.colab import userdata
from huggingface_hub import login
from peft import (LoraConfig,
                  get_peft_model,
                  prepare_model_for_kbit_training,
                  PeftModel
                  )
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    pipeline,
    logging
)
from trl import SFTConfig, SFTTrainer, setup_chat_format

In [None]:
# Read the Hugging Face token from the Colab secret store once and reuse it,
# instead of querying the secret store separately for each consumer.
hf_token = userdata.get('HUGGINGFACE_TOKEN')

# Expose the token to the process environment and authenticate with the Hub.
os.environ["HUGGINGFACE_TOKEN"] = hf_token
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
base_model = "google/gemma-2-2b-it"

if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

print(attn_implementation)

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    """Collect the leaf names of every ``bnb.nn.Linear4bit`` layer in ``model``.

    Walks ``model.named_modules()``, keeps the last dotted component of each
    matching module's qualified name, and drops ``lm_head`` if present.

    Args:
        model: Object exposing ``named_modules()`` that yields ``(name, module)`` pairs.

    Returns:
        A list of unique leaf module names (order is not significant).
    """
    linear_4bit = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.rsplit('.', 1)[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_4bit)
    }
    # Original author's note: excluding lm_head is "needed for 16 bit".
    leaf_names.discard('lm_head')
    return list(leaf_names)

# Names of the 4-bit linear layers found in the loaded model; used below as the
# LoRA target modules. Printed for inspection.
modules = find_all_linear_names(model)
print(modules)

In [None]:
# LoRA config: rank-4 adapters (alpha 32, dropout 0.05, no bias terms) attached to
# every module name collected in `modules`, for a causal language-modelling task.
peft_config = LoraConfig(
    r=4,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
# Order matters: the chat format is applied to the base model and tokenizer first,
# then the result is wrapped with the LoRA adapters.
# NOTE(review): setup_chat_format presumably installs a chat template and its
# special tokens on both objects -- confirm against the TRL docs.
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [None]:
# 1. Load the CSV file (using Hugging Face `load_dataset`); split="all" returns
#    everything as a single dataset object.
dataset = load_dataset('csv', data_files='/content/carl_dataset.csv', split="all")

# 2. Exclude rows with a missing value (None / NaN) in the '챗봇' column
def filter_nan(row):
    """Return True when the row's '챗봇' value is present (neither None nor NaN)."""
    reply = row["챗봇"]
    if reply is None:
        return False
    return pd.notna(reply)

# Keep only the rows accepted by filter_nan.
dataset = dataset.filter(filter_nan)

# 3. Keep only the '유저' and '챗봇' columns (every other column is removed)
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in ["유저", "챗봇"]])

# 4. Add the instruction column
def add_instruction(row):
    """Attach the fixed system instruction to ``row`` under the "instruction" key.

    The row is modified in place and the same object is returned.
    """
    system_prompt = "당신은 심리상담가입니다. 환자의 발화에 적절한 답변을 해주세요."
    row["instruction"] = system_prompt
    return row

# Apply add_instruction to every row so each example carries the system instruction.
dataset = dataset.map(add_instruction)

# 5. Define the function that converts a row into the chat format
def format_chat_template(row):
    """Render one row as a chat transcript and store it under ``row["text"]``.

    Builds a system / user / assistant message list from the 'instruction',
    '유저' and '챗봇' fields, renders it with the module-level ``tokenizer``'s
    chat template, and returns the same row object.
    """
    role_to_column = (
        ("system", "instruction"),
        ("user", "유저"),
        ("assistant", "챗봇"),
    )
    messages = [
        {"role": role, "content": row[column]}
        for role, column in role_to_column
    ]

    # tokenize=False: keep the rendered template as a string, not token ids.
    row["text"] = tokenizer.apply_chat_template(messages, tokenize=False)
    return row

# 6. Convert the dataset by applying the chat formatting to every row.
#    The num_proc argument is commented out, so no worker count is passed to map().
dataset = dataset.map(
    format_chat_template,
    # num_proc=4,  # uncomment to request 4 worker processes
)

# 7. Inspect the result
dataset

In [None]:
# Sanity check: display the rendered chat text of the example at index 3.
dataset['text'][3]

In [None]:
# Hold out 10% of the rows for evaluation. The fixed seed makes the shuffle, and
# therefore the train/test membership, reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Setting hyperparameters

new_model = "gemma-2-2b-it-chat-carl-jung"

# The SFT-specific options (max_seq_length, dataset_text_field, packing) are set on
# SFTConfig rather than passed to SFTTrainer, which is what the trainer's deprecation
# warning asks for.
# NOTE(review): these option names match the TRL release that emitted that warning;
# re-check them if the installed TRL version changes.
training_arguments = SFTConfig(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    max_seq_length=512,
    dataset_text_field="text",
    packing=False,
    # report_to="wandb"
)

# Build the SFT trainer on the train/test splits, reading examples from the "text" column.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_arguments,
)

# The cache flag is switched off for training and switched back on in a later cell.
model.config.use_cache = False
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1408 [00:00<?, ? examples/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
141,0.5962,0.965517
282,0.7306,0.916965
423,0.8498,0.881949
564,0.7718,0.852576




TrainOutput(global_step=704, training_loss=0.9127722905779426, metrics={'train_runtime': 905.2702, 'train_samples_per_second': 1.555, 'train_steps_per_second': 0.778, 'total_flos': 1760768990820864.0, 'train_loss': 0.9127722905779426, 'epoch': 1.0})

In [None]:
# Close the Weights & Biases run, then re-enable the cache flag that was switched
# off before training.
wandb.finish()
model.config.use_cache = True

VBox(children=(Label(value='0.023 MB of 0.023 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▅▃▁
eval/runtime,▁▅██
eval/samples_per_second,█▄▁▁
eval/steps_per_second,█▄▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▇▇▇▇█
train/grad_norm,█▃▂▃▃▂▂▄▃▄▂▂▂▂▂▄▃▂▃▁▄▂▃▁▁▂▂▃▂▂▂▃▂▂▃▂▂▁▂▂
train/learning_rate,▃███▇▇▇▇▇▇▇▇▇▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▃▂▁▁▁
train/loss,█▄▄▂▅▂▄▂▃▂▂▄▃▁▄▄▃▄▃▃▂▃▂▃▂▃▃▂▁▁▃▂▂▂▂▄▁▃▂▃

0,1
eval/loss,0.85258
eval/runtime,28.7551
eval/samples_per_second,5.46
eval/steps_per_second,5.46
total_flos,1760768990820864.0
train/epoch,1.0
train/global_step,704.0
train/grad_norm,3.53409
train/learning_rate,0.0
train/loss,0.5887


In [None]:
# Save the trained model to the local `new_model` directory, then upload it to the
# Hugging Face Hub under the same name. use_temp_dir=False is passed to push_to_hub.
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)



adapter_model.safetensors:   0%|          | 0.00/2.38G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CometKing/Gemma-2-9b-it-chat-carl-jung/commit/d4fd0740d7d134463e6fb59771ccd438e38dcaaf', commit_message='Upload model', commit_description='', oid='d4fd0740d7d134463e6fb59771ccd438e38dcaaf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/CometKing/Gemma-2-9b-it-chat-carl-jung', endpoint='https://huggingface.co', repo_type='model', repo_id='CometKing/Gemma-2-9b-it-chat-carl-jung'), pr_revision=None, pr_num=None)

In [None]:
import shutil

# Folder under /content to copy
source_folder = '/content/wandb'

# Destination path inside Google Drive
destination_folder = '/content/drive/MyDrive/wandb'

# Copy the folder. dirs_exist_ok=True lets this cell be re-run: without it,
# copytree raises FileExistsError once the destination folder already exists.
shutil.copytree(source_folder, destination_folder, dirs_exist_ok=True)

print(f'폴더가 {destination_folder}에 성공적으로 저장되었습니다.')

폴더가 /content/drive/MyDrive/wandb에 성공적으로 저장되었습니다.


## Merging the Base Model with the Adapter

In [None]:
# Hub repository ids used for the merge: the fine-tuned adapter and the base model.
new_model_url = "CometKing/Gemma-2-2b-it-chat-carl-jung"
base_model_url = "google/gemma-2-2b-it"

In [None]:
# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model_url)

# The base model is reloaded without a quantization config, in float16, and placed
# on the CPU for the merge step that follows.
base_model_reload= AutoModelForCausalLM.from_pretrained(
    base_model_url,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cpu",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [None]:
# Apply the same chat-format setup that was used before training, so the reloaded
# base model and tokenizer go through the same preparation as the trained ones.
base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)
# Load the fine-tuned adapter from the Hub on top of the reloaded base model.
model = PeftModel.from_pretrained(base_model_reload, new_model_url)

# Merge the adapter into the base model; `model` is rebound to the returned object.
model = model.merge_and_unload()

adapter_config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]



adapter_model.safetensors:   0%|          | 0.00/2.38G [00:00<?, ?B/s]

In [None]:
# Save the merged model and its tokenizer side by side in one local directory.
model.save_pretrained("Gemma-2-2b-it-chat-carl-jung")
tokenizer.save_pretrained("Gemma-2-2b-it-chat-carl-jung")

('Gemma-2-2b-it-chat-carl-jung/tokenizer_config.json',
 'Gemma-2-2b-it-chat-carl-jung/special_tokens_map.json',
 'Gemma-2-2b-it-chat-carl-jung/tokenizer.model',
 'Gemma-2-2b-it-chat-carl-jung/added_tokens.json',
 'Gemma-2-2b-it-chat-carl-jung/tokenizer.json')

In [None]:
# Upload the merged model and the tokenizer to the same Hub repository.
model.push_to_hub("Gemma-2-2b-it-chat-carl-jung", use_temp_dir=False)
tokenizer.push_to_hub("Gemma-2-2b-it-chat-carl-jung", use_temp_dir=False)

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CometKing/Gemma-2-2b-it-chat-carl-jung/commit/0e80fdb6939460e513fe06250defc22a7edabdf0', commit_message='Upload tokenizer', commit_description='', oid='0e80fdb6939460e513fe06250defc22a7edabdf0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/CometKing/Gemma-2-2b-it-chat-carl-jung', endpoint='https://huggingface.co', repo_type='model', repo_id='CometKing/Gemma-2-2b-it-chat-carl-jung'), pr_revision=None, pr_num=None)