In [None]:
import os

# Set the WANDB_DISABLED flag for this process so Weights & Biases logging
# stays off (presumably read by the training libraries imported later — confirm).
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# install all dependencies
%%capture

!pip install -q -U peft transformers datasets bitsandbytes trl accelerate
!pip install --upgrade transformers, datasets==2.16.1, accelerate==0.26.1, evaluate==0.4.1, bitsandbytes==0.42.0, trl, peft==0.8.2



In [None]:
# Library
%%capture


from huggingface_hub import hf_hub_download

import transformers
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import TrainingArguments
from peft import LoraConfig, AutoPeftModelForCausalLM
from datasets import load_dataset, Dataset
from trl import SFTTrainer, DPOTrainer
from huggingface_hub import notebook_login

# Ignore warings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Authenticate against the Hugging Face Hub with a personal access token
# (required for private datasets/models).
from huggingface_hub import CommitOperationAdd, HfApi, HfFolder, login

# Start the notebook login flow.
notebook_login()

In [None]:
# Hub repository id: used as the trainer's output_dir and as the final push target.
DPO_model_id = "your/huggingface/model"

## Randomized response
import random

import numpy as np

# Parameter of the randomized-response mechanism (presumably the privacy
# budget — confirm). Alternative settings left by the author: 1, 0.5, 0.1, 0.
epsilon = 2

# Probability with which a (chosen, rejected) pair gets swapped:
#   1 / (e^epsilon + 1)
# epsilon = 0 yields 0.5, and the probability tends to 0 as epsilon -> inf.
fliping = 1 / (1 + np.exp(epsilon))

In [None]:
## Check my parameter size

def print_trainable_params(model):
    """Print how many of ``model``'s parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides
            ``numel()`` and a boolean ``requires_grad`` attribute.

    Returns:
        None. One summary line is written to stdout with the trainable
        count, the total count and the trainable percentage.

    A model without any parameters is reported with ``trainable%: 0.0``
    instead of raising ``ZeroDivisionError``.
    """
    total_params = 0
    trainable_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    # Guard the division: an empty model has total_params == 0.
    trainable_pct = 100 * trainable_params / total_params if total_params else 0.0
    print(
        f"trainable params: {trainable_params} || total params: {total_params} || trainable%: {trainable_pct}"
    )

In [None]:
# Load rows 100..2099 of the train split (at most 2000 preference pairs).
dataset_dpo = load_dataset("reciprocate/alpaca-eval", split="train[100:2100]")

print(dataset_dpo.shape)

# Keep only the columns DPO needs and rename 'selected' to the 'chosen'
# column name used by the rest of the notebook.
df_dpo = (
    dataset_dpo.to_pandas()[["prompt", "selected", "rejected"]]
    .rename(columns={"selected": "chosen"})
)

# Wrap every prompt in the "### USER: ... ### ASSISTANT: " chat template.
df_dpo["prompt"] = df_dpo["prompt"].apply(lambda x: "### USER: " + x + "\n### ASSISTANT: ")
filtered_dataset = Dataset.from_pandas(df_dpo)
print(df_dpo.shape)

# Training partition: the first 2000 rows, or every row when the slice above
# returned fewer (a fixed range(2000) would raise on a shorter dataset).
filtered_dataset_d1 = filtered_dataset.select(range(min(2000, len(filtered_dataset))))


def switch_chosen_rejected(example):
    """Swap the 'chosen' and 'rejected' entries of ``example`` at random.

    One number is drawn from ``random.random()``; if it is below the
    module-level ``fliping`` probability the two responses trade places,
    otherwise they are returned as they are. The input mapping is not
    modified; a new dict with the keys 'prompt', 'chosen' and 'rejected'
    is returned.
    """
    swap = random.random() < fliping
    first, second = ("rejected", "chosen") if swap else ("chosen", "rejected")
    return {
        "prompt": example["prompt"],
        "chosen": example[first],
        "rejected": example[second],
    }

# Seed Python's RNG right before flipping so the same pairs are swapped on
# every run of this cell; without it the training set differs between runs.
FLIP_SEED = 42
random.seed(FLIP_SEED)
filtered_dataset_d1 = filtered_dataset_d1.map(switch_chosen_rejected)

# Show the flip probability that was applied.
print(fliping)


In [None]:
## Load my tokenizer (one instance serves both the policy and the reference model)
tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2-large')
if tokenizer.pad_token_id is None:
    # No pad token is configured, so reuse the end-of-sequence token for padding.
    tokenizer.pad_token_id = tokenizer.eos_token_id

## Download the fine-tuned checkpoint once and read its weights once
huggingface_filepath = hf_hub_download(repo_id="your/huggingface/model", filename="policy.pt")
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# repository you trust (consider weights_only=True if the file allows it).
# The weights are mapped to CPU: the freshly built models live on CPU anyway,
# so staging the checkpoint on the GPU would only cost GPU memory.
policy_state = torch.load(huggingface_filepath, map_location="cpu")['state']


def _build_gpt2_large_from_state(state_dict):
    """Create a gpt2-large causal LM and overwrite its weights with ``state_dict``."""
    loaded = transformers.AutoModelForCausalLM.from_pretrained('gpt2-large')
    loaded.load_state_dict(state_dict)
    return loaded


## load my model (the policy that gets trained)
model = _build_gpt2_large_from_state(policy_state)

## Self referencing: the reference model starts from the same checkpoint
model_ref = _build_gpt2_large_from_state(policy_state)

# Both models hold their own copy of the weights now; release the checkpoint.
del policy_state

In [None]:
from trl import DPOConfig, DPOTrainer

# Training arguments
training_arguments = DPOConfig(
    output_dir=DPO_model_id,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,   # effective batch of 16 examples per device
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    ## epochs (alternative: cap the run with max_steps=200)
    num_train_epochs=3,
    save_strategy="no",
    logging_steps=1,                 # log the loss at every optimizer step
    optim="paged_adamw_32bit",
    warmup_steps=10,
    bf16=True,
    # The string "none" switches all logging integrations off. The Python value
    # None does not: it falls back to the library default
    # (NOTE(review): confirm for the installed transformers version).
    report_to="none",
    push_to_hub=True,
)

# `beta` (the DPO temperature) is not set here, so the library default applies.
dpo_trainer = DPOTrainer(
    model=model,                        # base model from SFT pipeline
    ref_model=model_ref,                # typically a copy of the SFT trained base model
    train_dataset=filtered_dataset_d1,  # dataset prepared above
    processing_class=tokenizer,         # used in place of the older `tokenizer=` argument
    args=training_arguments,            # training arguments e.g. batch size, lr, etc.
)

In [None]:
# First-stage training: run the DPO trainer built in the previous cell.
# As the last expression of the cell, the value returned by train() is displayed.
dpo_trainer.train()

In [None]:
import matplotlib.pyplot as plt

# Only the log entries that carry a 'loss' key are training-loss records.
loss_logs = [log for log in dpo_trainer.state.log_history if 'loss' in log]
train_loss = [log['loss'] for log in loss_logs]
# Plot against the step stored in each record so the x axis really is the
# training step; fall back to the 1-based position if a record has no 'step'.
train_steps = [log.get('step', position) for position, log in enumerate(loss_logs, start=1)]
print(train_loss)

fig, ax = plt.subplots()
ax.plot(train_steps, train_loss)
ax.set_xlabel('Step')
ax.set_ylabel('Training Loss')
ax.set_title('Training Loss over Steps')
plt.show()

In [None]:
# Upload the trainer's model to the Hub repository named by DPO_model_id.
# Only the model is pushed by this call; the tokenizer is not uploaded here.
dpo_trainer.model.push_to_hub(DPO_model_id)