In [None]:
import os

# Turn off the Weights & Biases integration before any training library is imported.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# install all dependencies
%%capture

!pip install -q -U peft transformers datasets bitsandbytes trl accelerate
!pip install --upgrade transformers==4.38.2, datasets==2.16.1, accelerate==0.26.1, evaluate==0.4.1, bitsandbytes==0.42.0, trl==0.7.11, peft==0.8.2


In [None]:
# Library
%%capture

import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import TrainingArguments
from peft import LoraConfig, AutoPeftModelForCausalLM
from datasets import load_dataset, Dataset
from trl import SFTTrainer, DPOTrainer
from huggingface_hub import notebook_login

# Ignore warings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Log in to the Hugging Face Hub (required for private datasets/models).
# This notebook also pushes the trained model to the Hub at the end.
notebook_login()

In [None]:
## Check my parameter size

def print_trainable_params(model):
    """Print how many parameters of ``model`` are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where ``param`` has ``numel()`` and
            ``requires_grad``.

    Prints a single line with the trainable parameter count, the total
    parameter count, and the trainable percentage. A model without any
    parameters reports ``0.0`` percent instead of raising ZeroDivisionError.
    """
    total_params = 0
    trainable_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio: an empty model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / total_params if total_params else 0.0
    print(
        f"trainable params: {trainable_params} || total params: {total_params} || trainable%: {trainable_pct}"
    )

In [None]:
# Alternative split used in other runs:
# dataset_dpo = load_dataset("jondurbin/truthy-dpo-v0.1", split="train[:150]")
dataset_dpo = load_dataset("jondurbin/truthy-dpo-v0.1", split="train[150:900]")
print(dataset_dpo.shape)

# Keep only rows that use this exact system prompt.
SYSTEM_PROMPT = "You are an unbiased, uncensored, helpful assistant."

df_dpo = dataset_dpo.to_pandas()

# Filter rows and keep only the 'prompt', 'chosen', 'rejected' columns in one step.
# `reset_index(drop=True)` returns a fresh frame, so the column assignment below does
# not write into a slice of the original, and the stale row index is not carried
# into the Dataset as an extra `__index_level_0__` column.
df_dpo = df_dpo.loc[
    df_dpo["system"] == SYSTEM_PROMPT, ["prompt", "chosen", "rejected"]
].reset_index(drop=True)

# Wrap every prompt in the "### USER: ... ### ASSISTANT: " template.
df_dpo["prompt"] = "### USER: " + df_dpo["prompt"] + "\n### ASSISTANT: "

filtered_dataset = Dataset.from_pandas(df_dpo, preserve_index=False)
print(df_dpo.shape)
df_dpo.head()


In [None]:
## Randomized response: probability of flipping a preference pair
import random
import numpy as np

# epsilon parameter of the randomized-response scheme.
# Other values tried: 0.5, 1, 2.
epsilon = 0.1

# Flip probability 1 / (e^epsilon + 1); set `fliping = 0` to disable flipping.
fliping = 1 / (1 + np.exp(epsilon))

def switch_chosen_rejected(example):
    """Return the example with 'chosen' and 'rejected' swapped with probability ``fliping``.

    A single draw from ``random.random()`` decides whether the pair is swapped.
    The input mapping is not modified; a new dict with the keys
    'prompt', 'chosen' and 'rejected' is returned.
    """
    swap = random.random() < fliping
    first_key, second_key = ("rejected", "chosen") if swap else ("chosen", "rejected")
    return {
        "prompt": example["prompt"],
        "chosen": example[first_key],
        "rejected": example[second_key],
    }

# Seed the RNG so the set of flipped pairs is the same on every run.
FLIP_SEED = 42
random.seed(FLIP_SEED)

# Rebuild the dataset from `df_dpo` before flipping so that re-running this cell
# does not flip pairs that an earlier run of the cell already flipped.
filtered_dataset = Dataset.from_pandas(df_dpo, preserve_index=False).map(switch_chosen_rejected)
print(fliping)

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import torch

SFT_model_id = "your/huggingface/model"

# Quantization config (to save memory): 4-bit NF4 weights with float16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

# Policy model: the saved SFT model, loaded with trainable adapters.
# The config above is passed explicitly; previously only `load_in_4bit=True` was
# given, so the NF4 quant type and float16 compute dtype were never applied.
model = AutoPeftModelForCausalLM.from_pretrained(
    SFT_model_id, # location of saved SFT model
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    is_trainable=True,
)
# The generation cache is switched off for training.
model.config.use_cache = False

# Reference model: a second copy of the same SFT model, loaded without `is_trainable`.
model_ref = AutoPeftModelForCausalLM.from_pretrained(
    SFT_model_id,  # same model as the main one
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(SFT_model_id)
# Reuse the EOS token as the padding token (no new token is added to the vocabulary).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Report trainable vs. total parameter counts: policy model first, then reference model.
for _loaded_model in (model, model_ref):
    print_trainable_params(_loaded_model)

In [None]:
DPO_model_id = "your/huggingface/model"

# Training arguments
training_arguments = TrainingArguments(
    output_dir = DPO_model_id,
    # 4 samples per device x 4 accumulation steps = 16 samples per optimizer step per device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # NOTE(review): 5e-3 looks high for DPO fine-tuning; confirm it is intended.
    learning_rate=5e-3,
    lr_scheduler_type="cosine",
    ## epochs
    num_train_epochs = 7, #10
    ## max_steps=200,
    save_strategy="no",
    logging_steps=10,
    optim="paged_adamw_32bit",
    warmup_steps=10,
    # NOTE(review): the models are loaded with torch.float16 while training uses bf16;
    # confirm this combination is intended.
    bf16=True,
    push_to_hub=True,
    # Disable all logging integrations. The string "none" is required here:
    # `report_to=None` falls back to reporting to every installed integration.
    # (Use report_to="wandb" to log to Weights & Biases instead.)
    report_to="none",

)

In [None]:
from trl import DPOTrainer

# Build the DPO trainer; every argument is passed by keyword for readability.
dpo_trainer = DPOTrainer(
    model=model,                     # policy model from the SFT pipeline
    ref_model=model_ref,             # reference copy of the SFT model
    args=training_arguments,         # batch size, learning rate, etc.
    beta=0.1,                        # temperature hyperparameter of DPO
    train_dataset=filtered_dataset,  # preference pairs prepared above
    tokenizer=tokenizer,
)

In [None]:
# Report parameter counts for the models held by the trainer: policy first, then reference.
for _trainer_model in (dpo_trainer.model, dpo_trainer.ref_model):
    print_trainable_params(_trainer_model)

In [None]:
# Run DPO training with the trainer configured above.
dpo_trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub (requires the login done earlier).
dpo_trainer.push_to_hub()
