In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model_id = "zake7749/gemma-2-2b-it-chinese-kyara-dpo"

# Place the quantized weights on the GPU at load time via `device_map`.
# Calling `.to("cuda")` afterwards on a bitsandbytes-quantized model is
# unnecessary and is rejected by some transformers versions, so the device is
# chosen here instead of moving the model later.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    attn_implementation='eager',
    cache_implementation=None,
    use_cache=False,  # the KV cache is not used while training
)

# add_eos_token=True makes the tokenizer append EOS whenever it adds special tokens.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

In [2]:
from datasets import load_dataset

# Instruction prefix for the classical -> modern (vernacular) direction.
c2m_prompt = (
    "你是一個繁體中文的專家，專門將文言文翻譯成白話文。"
    "將下面的文言文翻譯成白話文："
)
# Instruction prefix for the modern (vernacular) -> classical direction.
m2c_prompt = (
    "你是一個繁體中文的專家，專門將白話文翻譯成文言文。"
    "將下面的白話文翻譯成文言文："
)

def c2m_get_prompt(example):
    """Render one row as a chat-formatted classical -> modern training string.

    Args:
        example: Mapping with a 'classical' entry (source text) and a 'modern'
            entry (reference translation).

    Returns:
        The string produced by the module-level tokenizer's chat template for
        a two-turn conversation: the user turn is the instruction prefix
        followed by the classical text, the assistant turn is the modern text.
    """
    # NOTE: c2m_prompt already ends with "：", so the rendered user turn
    # contains two consecutive full-width colons. Kept as-is on purpose.
    user_turn = {"role": "user", "content": f"{c2m_prompt}：{example['classical']}"}
    assistant_turn = {"role": "assistant", "content": str(example['modern'])}
    return tokenizer.apply_chat_template([user_turn, assistant_turn], tokenize=False)

def m2c_get_prompt(example):
    """Render one row as a chat-formatted modern -> classical training string.

    Args:
        example: Mapping with a 'modern' entry (source text) and a 'classical'
            entry (reference translation).

    Returns:
        The string produced by the module-level tokenizer's chat template for
        a two-turn conversation: the user turn is the instruction prefix
        followed by the modern text, the assistant turn is the classical text.
    """
    # NOTE: m2c_prompt already ends with "：", so the rendered user turn
    # contains two consecutive full-width colons. Kept as-is on purpose.
    user_turn = {"role": "user", "content": f"{m2c_prompt}：{example['modern']}"}
    assistant_turn = {"role": "assistant", "content": str(example['classical'])}
    return tokenizer.apply_chat_template([user_turn, assistant_turn], tokenize=False)

# Read and shuffle the parallel corpus once. Both translation directions are
# built from the same rows in the same (seeded) order, so there is no need to
# load and shuffle the CSV a second time.
raw_dataset = load_dataset('csv', data_files="datasetcht.csv", split="train").shuffle(seed=42)

# Classical -> modern examples: one rendered chat string per row.
c2m_text = [c2m_get_prompt(data_point) for data_point in raw_dataset]
c2m_dataset = raw_dataset.add_column("prompt", c2m_text)

# Modern -> classical examples over the same rows.
m2c_text = [m2c_get_prompt(data_point) for data_point in raw_dataset]
m2c_dataset = raw_dataset.add_column("prompt", m2c_text)


In [None]:
from datasets import concatenate_datasets
def _tokenize_prompts(batch):
    """Tokenize a batch of rendered prompts, collapsing a doubled leading BOS.

    The "prompt" strings come from `apply_chat_template(..., tokenize=False)`,
    whose output may already contain the BOS token as text. Tokenizing that
    text with special tokens enabled would then produce two BOS ids at the
    start of the sequence. When that happens the first one is dropped from
    every per-token field; everything else, including any EOS the tokenizer
    appends, is left untouched.

    Args:
        batch: Batched rows with a "prompt" list of strings.

    Returns:
        A mapping of tokenizer output fields to per-example lists.
    """
    encoded = tokenizer(batch["prompt"])
    bos_id = tokenizer.bos_token_id
    if bos_id is None:
        return encoded
    doubled = [ids[:2] == [bos_id, bos_id] for ids in encoded["input_ids"]]
    return {
        key: [seq[1:] if dup else seq for seq, dup in zip(seqs, doubled)]
        for key, seqs in encoded.items()
    }


# Stack both translation directions into one corpus, then tokenize it.
dataset = concatenate_datasets([c2m_dataset, m2c_dataset])
dataset = dataset.map(_tokenize_prompts, batched=True)

In [4]:
# Hold out 20% of the examples. The seed is fixed (same value as the shuffle
# seed used when loading) so a Restart & Run All reproduces the same split.
# NOTE(review): both translation directions are built from the same CSV rows,
# so a pair held out in one direction can still appear in training in the
# other direction — confirm whether that overlap is acceptable.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_data = dataset["train"]
test_data = dataset["test"]

In [5]:
def find_all_linear_names(peft_model, int4=False, int8=False):
    """Collect the leaf names of linear layers to use as LoRA targets.

    Args:
        peft_model: Module whose `named_modules()` are scanned.
        int4: Match `bitsandbytes.nn.Linear4bit` layers instead of
            `torch.nn.Linear`. Takes precedence over `int8`.
        int8: Match `bitsandbytes.nn.Linear8bitLt` layers instead of
            `torch.nn.Linear`.

    Returns:
        Sorted list of unique last path components (e.g. "q_proj") of every
        matching module. Modules whose qualified name contains "lm_head" or
        "output_layer" are skipped so the output head is not adapted.
    """
    if int4:
        import bitsandbytes as bnb
        linear_cls = bnb.nn.Linear4bit
    elif int8:
        import bitsandbytes as bnb
        linear_cls = bnb.nn.Linear8bitLt
    else:
        linear_cls = torch.nn.Linear

    # Substrings that mark the final projection layer, which is left out.
    excluded = ('lm_head', 'output_layer')
    leaf_names = {
        qualified.rsplit('.', 1)[-1]
        for qualified, layer in peft_model.named_modules()
        if isinstance(layer, linear_cls)
        and not any(tag in qualified for tag in excluded)
    }
    return sorted(leaf_names)

In [6]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model

# Let gradients flow from the inputs and turn on activation checkpointing
# on the base model before it is handed to PEFT.
model.enable_input_require_grads()
model.gradient_checkpointing_enable()

# PEFT's preparation step for training on top of a k-bit quantized model.
model = prepare_model_for_kbit_training(model)

# Every linear layer except the output head becomes a LoRA target.
modules = find_all_linear_names(model)

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=modules,
    r=64,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared base model with the LoRA adapters described above.
peft_model = get_peft_model(model, lora_config)

In [None]:
from trl import SFTConfig
import transformers
from trl import SFTTrainer

# `peft_model` has already been wrapped with the LoRA adapters, so no
# `peft_config` is passed here. Supplying both is redundant and, depending on
# the TRL version, is either ignored or causes the trainer to set up adapters
# a second time — which would leave `peft_model` out of step with what is
# actually trained.
training_args = SFTConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 * 4 = 8 sequences per optimizer step per device
    num_train_epochs=10,
    # max_steps=500,
    learning_rate=2e-4,
    output_dir="output1",
    optim="paged_adamw_32bit",
    save_strategy="steps",
    report_to="wandb",
    logging_steps=1,
    packing=False,
    gradient_checkpointing=True,
)

# NOTE(review): no evaluation strategy is configured above, so `eval_dataset`
# is presumably only used if evaluation is triggered explicitly — confirm.
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=train_data,
    eval_dataset=test_data,
    args=training_args,
    # Causal-LM collation (mlm=False): pads each batch of tokenized examples.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()

In [None]:
# Local directory the PEFT model is saved to.
save_directory = "./peft_model"
# Hub repository to publish to; replace with your own repository name.
repo_name = "huchiahsi/peft-model-repo-1900k"

# Save locally first, using the single path defined above rather than a
# second hard-coded literal.
peft_model.save_pretrained(save_directory)

# Upload the model to the Hugging Face Hub.
peft_model.push_to_hub(repo_name)