In [None]:
!pip install transformers==4.48.0 peft==0.14.0 trl==0.13.0 bitsandbytes==0.45.3 accelerate==1.2.1

In [None]:
import os
from google.colab import userdata

# Fetch the 'HF_TOKEN' value through google.colab.userdata and export it as an
# environment variable of this process; `hf_token` keeps the same value.
hf_token = os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [None]:
# 4-bit quantization settings passed to `from_pretrained` when the model is loaded
import torch
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                       # load the weights in 4-bit
    bnb_4bit_quant_type='nf4',               # 4-bit quantization type
    bnb_4bit_use_double_quant=True,          # enable double quantization
    bnb_4bit_compute_dtype=torch.bfloat16,   # dtype used for computation
)

In [None]:
# Load the merged (fine-tuned) model and quantize it to 4-bit while loading
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    # The repo id must not carry surrounding whitespace: a trailing space makes
    # it an invalid Hub repo id and differs from the id used for the tokenizer.
    "jsgoodlife0511/llama3.1-tuned-and-merged",
    quantization_config=quantization_config,
    device_map={"": 0},  # map the whole model ("" = root module) to device 0
)

In [None]:
# Tokenizer (same repo as the model so vocabulary and special tokens match)
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("jsgoodlife0511/llama3.1-tuned-and-merged")

# Preference batches are padded during DPO training, so a pad token is required.
# If the checkpoint does not define one, reuse EOS as padding; this is a no-op
# when the tokenizer already has a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# LoRA adapter configuration
from peft import LoraConfig

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,               # adapter rank
    lora_alpha=16,      # LoRA scaling numerator (alpha / r == 1 here)
    lora_dropout=0,
    bias="none",
    # Modules that receive adapters: attention projections and MLP projections.
    target_modules=[
        "q_proj", "v_proj", "k_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

In [None]:
from peft import get_peft_model

# Attach the LoRA adapters described by `peft_config` directly to the quantized
# base model (QLoRA). `model` is rebound to the wrapped model, so re-running this
# cell without reloading the base model would wrap it again.
# NOTE(review): PEFT's quantization guide suggests calling
# `prepare_model_for_kbit_training` first — confirm whether it is wanted here.
model = get_peft_model(model=model, peft_config=peft_config)

In [None]:
# Prompt/Response formatting
# End-of-sequence token string taken from the tokenizer loaded earlier.
EOS_TOKEN = tokenizer.eos_token

# The prompt format must be the same as the one used in part 1 (when the Alpaca dataset was used).
# The prompt format represents the "context structure" that the model has been trained on,
# so it needs to remain consistent during DPO training as well.
# The template has two `{}` slots: the first for the instruction, the second for the response.
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""


def formatting_prompts_func_for_dpo(examples):
    """Turn a batch of preference rows into prompt/chosen/rejected columns.

    Args:
        examples: Mapping holding the columns "question", "chosen" and
            "rejected", each a sequence of strings. Rows are paired by
            position; pairing stops at the shortest column.

    Returns:
        A dict with three lists of equal length:
        "prompt"   - each question rendered into `alpaca_prompt` with an
                     empty response slot,
        "chosen"   - each chosen answer followed by `EOS_TOKEN`,
        "rejected" - each rejected answer followed by `EOS_TOKEN`.
    """
    rows = list(zip(examples["question"], examples["chosen"], examples["rejected"]))

    return {
        "prompt": [alpaca_prompt.format(question, "") for question, _, _ in rows],
        "chosen": [chosen + EOS_TOKEN for _, chosen, _ in rows],
        "rejected": [rejected + EOS_TOKEN for _, _, rejected in rows],
    }

In [None]:
# Orca DPO-pairs dataset load
from datasets import load_dataset

# The dataset id has to be a valid Hub repo id. The Orca preference-pairs set
# published under the Intel organization provides the "question", "chosen" and
# "rejected" columns used below.
dataset = load_dataset("Intel/orca_dpo_pairs", split="train")


def _keep_example(example):
    """Return True for instruction-only rows with a short chosen answer.

    A row is kept when all of the following hold:
      * the question does not contain "question:" or "q:" (case-insensitive),
        i.e. it is a plain instruction rather than a Q/A-style prompt;
      * the rejected answer is at least 200 characters longer than the chosen one;
      * the chosen answer is at most 300 characters long.
    """
    question = example['question'].lower()
    if "question:" in question or 'q:' in question:
        return False
    chosen_len = len(example['chosen'])
    return (len(example['rejected']) - chosen_len) >= 200 and chosen_len <= 300


# Remove "question ..." rows; we use instruction-only data.
# The length limits steer the model toward shorter text than in DPO_training_1.
filtered_dataset = dataset.filter(_keep_example)
mapped_dataset = filtered_dataset.map(formatting_prompts_func_for_dpo, batched=True, remove_columns=filtered_dataset.column_names)
# Fixed seed so the 95/5 train/test split is reproducible.
split_dataset = mapped_dataset.train_test_split(test_size=0.05, seed=42)

train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

In [None]:
local_output_dir = "/content/dpo_output_2"
!mkdir {local_output_dir}

In [None]:
# Load the TensorBoard notebook extension and open it on the `runs` sub-directory
# of the training output directory.
# NOTE(review): presumably this is where the trainer writes its TensorBoard logs — confirm.
%load_ext tensorboard
%tensorboard --logdir '{local_output_dir}/runs'

In [None]:
from transformers import TrainingArguments
from trl import DPOConfig, DPOTrainer

# All hyper-parameters of the DPO run, grouped by concern.
training_args = DPOConfig(
    # --- output / logging ---
    output_dir=local_output_dir,
    report_to="tensorboard",
    logging_steps=1,
    # --- batching (4 samples x 4 accumulation steps = 16 per optimizer step) ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    # --- optimization ---
    optim="adamw_8bit",
    learning_rate=5e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_steps=50,
    max_steps=300,
    # --- evaluation / checkpointing: both every 10 steps ---
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    # --- DPO specific ---
    beta=0.1,
    max_length=1024,
    max_prompt_length=512,
)

# No separate reference model is passed.
# NOTE(review): with a PEFT-wrapped policy, TRL is expected to use the model with
# adapters disabled as the reference — confirm against the TRL documentation.
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
)
trainer.train()

In [None]:
# Copy to Google Drive: mount the Drive at /content/drive first
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Recursively copy the local output directory into /content/drive/MyDrive.
!cp -r {local_output_dir} /content/drive/MyDrive