In [None]:
# Set WANDB_DISABLED in the very first cell, before any training library is
# imported, with the intent of turning off Weights & Biases logging.
import os
os.environ['WANDB_DISABLED'] = 'true'

In [None]:
# install all dependencies
%%capture

!pip install -q -U peft transformers datasets bitsandbytes trl accelerate
!pip install --upgrade transformers, datasets==2.16.1, accelerate==0.26.1, evaluate==0.4.1, bitsandbytes==0.42.0, trl, peft==0.8.2



In [None]:
# Library
%%capture

from huggingface_hub import hf_hub_download

import transformers
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import TrainingArguments
from peft import LoraConfig, AutoPeftModelForCausalLM
from datasets import load_dataset, Dataset
from trl import SFTTrainer, DPOTrainer
from huggingface_hub import notebook_login

# Ignore warings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# log in to the Hugging Face hub (required for private datasets/models)
# The checkpoint download and the later push_to_hub calls in this notebook
# presumably rely on the credentials entered here.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
## Check my parameter size

def print_trainable_params(model):
    """Print how many of ``model``'s parameters are trainable.

    Walks ``model.named_parameters()``, sums the element counts of all
    parameters and of those with ``requires_grad`` set, and prints both
    counts together with the trainable percentage.
    """
    # (element count, is trainable) for every parameter tensor
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    total_params = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || total params: {total_params} || trainable%: {100 * trainable_params / total_params}"
    )

In [None]:
# Load a slice of the truthy-dpo preference dataset.
# dataset_dpo = load_dataset("jondurbin/truthy-dpo-v0.1", split="train[:150]")
dataset_dpo = load_dataset("jondurbin/truthy-dpo-v0.1", split="train[150:900]")

print(dataset_dpo.shape)

# Only rows that use this system prompt are kept.
SYSTEM_PROMPT = "You are an unbiased, uncensored, helpful assistant."

df_raw = dataset_dpo.to_pandas()

# Filter rows and keep only the columns 'prompt', 'chosen', 'rejected' in one
# step. The result is an independent frame with a fresh 0..n-1 index, so the
# column assignment below never writes into a slice of `df_raw`, and the old
# row index is not carried into the Dataset as an extra column.
df_dpo = (
    df_raw.loc[df_raw["system"] == SYSTEM_PROMPT, ["prompt", "chosen", "rejected"]]
    .reset_index(drop=True)
)

# Wrap every prompt in the "### USER: ... ### ASSISTANT: " template.
df_dpo = df_dpo.assign(
    prompt=df_dpo["prompt"].map(lambda text: "### USER: " + text + "\n### ASSISTANT: ")
)

filtered_dataset = Dataset.from_pandas(df_dpo, preserve_index=False)
print(df_dpo.shape)
df_dpo.head()



In [None]:
# Partition this dataset into 2 disjoint parts of equal size.
# The sizes are derived from the data instead of being hard-coded: the previous
# fixed bounds (254 / 508) are exactly what this yields for 508 filtered rows,
# but they raise an IndexError as soon as the filter keeps fewer rows.
# If the row count is odd, the last example is left out.
half_size = len(filtered_dataset) // 2
assert half_size > 0, "filtered_dataset is too small to be split into two parts"

filtered_dataset_d1 = filtered_dataset.select(range(half_size))
filtered_dataset_d2 = filtered_dataset.select(range(half_size, 2 * half_size))


In [None]:
## Random Response
import random
import numpy as np

# Seed the RNG that drives the label flipping so a fresh "Restart & Run All"
# reproduces the same noisy datasets.
RANDOM_RESPONSE_SEED = 0
random.seed(RANDOM_RESPONSE_SEED)

# epsilon = 2
# epsilon = 1
epsilon = 0.5
# epsilon = 0.1
# epsilon = 0
# Probability of swapping a preference pair: 1 / (e^epsilon + 1).
# It is 0.5 for epsilon = 0 and tends to 0 as epsilon grows (the "inf" case).
fliping = (1) /(np.exp(epsilon)+1)
# inf

def switch_chosen_rejected(example):
  """Randomly swap the 'chosen' and 'rejected' responses of one example.

  Args:
    example: Mapping with the keys "prompt", "chosen" and "rejected".

  Returns:
    A new dict with the same three keys. With probability `fliping` the
    "chosen" and "rejected" values are exchanged; otherwise they are copied
    unchanged. The input mapping is not modified.
  """
  chosen, rejected = example["chosen"], example["rejected"]
  if random.random() < fliping:
    chosen, rejected = rejected, chosen
  return {"prompt": example["prompt"], "chosen": chosen, "rejected": rejected}

# Apply the random swap to every example of both halves.
# NOTE: the results overwrite the clean splits under the same names, so
# re-running this cell applies the random swap again to already-noised data.
filtered_dataset_d1 = filtered_dataset_d1.map(switch_chosen_rejected)
filtered_dataset_d2 = filtered_dataset_d2.map(switch_chosen_rejected)

# Show the flip probability that was used.
print(fliping)

In [None]:
# Sanity check: percentage of 0 in noisy labels

# def difference_check(dataset1, dataset2):

#   label_list = []
#   for i in range(len(dataset2)):
#     if dataset2[i]["chosen"] == dataset1[i + len(dataset1) - len(dataset2)]["chosen"] and dataset2[i]["rejected"] == dataset1[i + len(dataset1) - len(dataset2)]["rejected"]:
#       label_list.append(1)
#     else:
#       label_list.append(0)
#
#   return label_list
# noisy_label_d2= difference_check(filtered_dataset_d2, filtered_dataset.select(range(304, 608)))

# noisy_label_zero_count = noisy_label_d2.count(0)
# total_noisy_labels = len(noisy_label_d2)
# percentage_zero = (noisy_label_zero_count / total_noisy_labels) * 100

# print(f"Percentage of 0 in noisy labels: {percentage_zero:.2f}%")
## should be close to # fliping

In [None]:
# Pass privacy barrier, filtered_dataset_d2 is standard
# Every example of the (already noised) d2 gets the noisy label 1. In the rest
# of this notebook a label of 1 means "keep the stored chosen/rejected order"
# and 0 means "swap them".
noisy_label_d2 = [1] * len(filtered_dataset_d2)

In [None]:
## Load my tokenizer (shared by the policy and the reference model)
tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2-large')
if tokenizer.pad_token_id is None:
  tokenizer.pad_token_id = tokenizer.eos_token_id

## Download and deserialize the policy checkpoint once; both models start from it.
huggingface_filepath = hf_hub_download(repo_id="your/huggingface/model", filename="policy.pt")
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a repository you trust.
# The tensors are mapped to CPU: load_state_dict copies their values into the
# freshly created models below, so no GPU is needed just to read the file.
policy_state = torch.load(huggingface_filepath, map_location=torch.device('cpu'))['state']

## load my model (the policy that will be trained)
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2-large')
model.load_state_dict(policy_state)

## Self referencing: the reference model is a second copy of the same checkpoint
model_ref = transformers.AutoModelForCausalLM.from_pretrained('gpt2-large')
load_result = model_ref.load_state_dict(policy_state)

# The raw state dict is no longer needed once both models hold their own copy.
del policy_state

# Display the key-matching report of the last load as the cell output.
load_result

<All keys matched successfully>

In [None]:
from trl import DPOConfig, DPOTrainer

DPO_model_id = "your/huggingface/model"

# Training arguments for the first DPO stage
training_arguments = DPOConfig(
    output_dir=DPO_model_id,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    ## epochs
    num_train_epochs=3,
    ## max_steps=200,
    save_strategy="no",
    logging_steps=1,
    optim="paged_adamw_32bit",
    warmup_steps=10,
    bf16=True,
    # The string "none" disables all reporting integrations. Passing the Python
    # value None only selects the library default, which is not "disabled" in
    # every transformers release.
    report_to="none",
    push_to_hub=True,
)

In [None]:
from trl import DPOTrainer

# Stage-1 trainer: optimises `model` against the frozen reference `model_ref`
# on the first (noised) half of the data.
dpo_trainer = DPOTrainer(
    model=model,                        # base model from SFT pipeline
    ref_model=model_ref,                # typically a copy of the SFT trained base model
    args=training_arguments,            # training arguments e.g. batch size, lr, etc.
    train_dataset=filtered_dataset_d1,  # dataset prepared above
    tokenizer=tokenizer,                # tokenizer
    # beta=0.1,                         # temperature hyperparameter of DPO
)

In [None]:
# First stage train
# Runs the trainer built above, whose train_dataset is filtered_dataset_d1.
dpo_trainer.train()

In [None]:
# Collect the loss value of every logged step of the first stage.
stage_history = dpo_trainer.state.log_history
train_loss = [entry['loss'] for entry in stage_history if 'loss' in entry]
print(train_loss)

In [None]:
## Copied from Mitchell
## get log probabilites of given labels

def _get_batch_logps(logits: torch.FloatTensor, labels: torch.LongTensor, average_log_prob: bool = False) -> torch.FloatTensor:
    """Compute the log probabilities of the given labels under the given logits.

    Args:
        logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
        labels: Labels for which to compute the log probabilities. Label tokens with a value of -100 are ignored. Shape: (batch_size, sequence_length)
        average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens.

    Returns:
        A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits.
    """
    assert logits.shape[:-1] == labels.shape

    labels = labels[:, 1:].clone()
    logits = logits[:, :-1, :]
    loss_mask = (labels != -100)

    # dummy token; we'll ignore the losses on these tokens later
    labels[labels == -100] = 0

    per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2)

    if average_log_prob:
        return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
    else:
        return (per_token_logps * loss_mask).sum(-1)


In [None]:
# model infer on dataset

def get_model_label(model, dataset, tokenizer):
  """
  Generates model_label, a list of binary labels indicating whether the model prefers the chosen response over the rejected response for each example in the dataset.

  The model is scored in evaluation mode and without gradient tracking, so
  dropout does not perturb the log probabilities and no autograd graph is
  built. The model's previous train/eval mode is restored afterwards.

  Args:
    model: The model to evaluate.
    dataset: The dataset to evaluate the model on. Each example must provide the keys "prompt", "chosen" and "rejected".
    tokenizer: The tokenizer to use for the model.

  Returns:
    A list of binary labels (1 or 0), one per example: 1 when the summed log
    probability of prompt+chosen is strictly greater than that of
    prompt+rejected, otherwise 0.
  """

  # Put the inputs wherever the model lives; fall back to the previously
  # hard-coded "cuda" if the model does not expose a device.
  device = getattr(model, "device", "cuda")

  def get_log_probs(prompt, response):
    inputs = tokenizer(prompt+response, return_tensors="pt").to(device)
    outputs = model(**inputs)
    # The full token sequence (prompt and response) is scored.
    log_probs = _get_batch_logps(outputs.logits, inputs.input_ids)
    return log_probs.item()

  was_training = model.training
  model.eval()
  model_label = []
  try:
    with torch.no_grad():
      for example in dataset:
        prompt = example["prompt"]
        chosen_log_prob = get_log_probs(prompt, example["chosen"])
        rejected_log_prob = get_log_probs(prompt, example["rejected"])
        model_label.append(1 if chosen_log_prob > rejected_log_prob else 0)
  finally:
    # Leave the model in the mode the caller had it in.
    model.train(was_training)

  return model_label


In [None]:
# As if model is injecting a noise

def calculate_model_flipping(noisy_label_list, model_prediction_list, flipping_score):
  """Estimates the model's flip rate from its disagreement with noisy labels.

  The fraction of positions where the two label lists differ is corrected for
  the known label-noise rate: (disagreement - flipping_score) / (1 - 2 * flipping_score).
  The result is not clipped and can fall outside [0, 1].

  Args:
    noisy_label_list: A list of noisy labels (0/1 integers).
    model_prediction_list: A list of model predictions (0/1 integers), same length.
    flipping_score: The probability with which the noisy labels were flipped.

  Returns:
    The model flipping score.

  Raises:
    ValueError: If the lists differ in length, are empty, or flipping_score
      is 0.5 (the labels are then pure noise and the estimate is undefined).
  """
  noisy_labels = list(noisy_label_list)
  model_predictions = list(model_prediction_list)

  # zip() would silently drop the tail of the longer list.
  if len(noisy_labels) != len(model_predictions):
    raise ValueError(
      f"label lists differ in length: {len(noisy_labels)} vs {len(model_predictions)}"
    )
  if not noisy_labels:
    raise ValueError("label lists are empty; cannot estimate a flipping score")

  denominator = 1 - 2 * flipping_score
  if denominator == 0:
    raise ValueError("flipping_score of 0.5 makes the model flipping score undefined")

  disagreements = [a ^ b for a, b in zip(noisy_labels, model_predictions)]
  average_score = sum(disagreements) / len(disagreements)
  return (average_score - flipping_score) / denominator

In [None]:
# Determine model flipping rate
# First step: label every example of the noised d2 with the current model's
# preference (1 = it scores the stored "chosen" response higher).
model_label_d2 = get_model_label(model, filtered_dataset_d2, tokenizer)


In [None]:
## model_flipping
# Estimate the model's flip rate from how often it disagrees with the noisy
# labels, corrected for the known noise rate `fliping`.
# The two lists are passed in the opposite order to the parameter names of
# calculate_model_flipping; the element-wise XOR it computes is symmetric, so
# the result is the same.
model_fliping = calculate_model_flipping(model_label_d2, noisy_label_d2, fliping)
print(f"Model_fliping: {model_fliping}")
print(f"Noise_fliping: {fliping}")


In [None]:
# Elementwise MAP estimator

def MAP_estimator (lrr, lm, fliping, model_fliping, eps=1e-12):
  """Combine noisy labels and model labels into one label per example.

  For every position a decision value is formed from two weighted votes:
  (1 - 2*lrr[i]) * log((1-fliping)/fliping) + (1 - 2*lm[i]) * log((1-model_fliping)/model_fliping).
  A positive value yields label 0, anything else yields label 1.

  Both flip rates are clamped to [eps, 1 - eps] before the logarithm is taken.
  Without this, a rate at or outside 0/1 (the estimated model flip rate is not
  guaranteed to be a valid probability) produces inf/NaN, and a NaN decision
  value silently turns every label into 1.

  Args:
    lrr: Sequence of 0/1 labels obtained through randomized response.
    lm: Sequence of 0/1 labels predicted by the model, same length as lrr.
    fliping: Flip probability of the randomized-response labels.
    model_fliping: (Estimated) flip probability of the model labels.
    eps: Smallest probability used after clamping.

  Returns:
    A list of 0/1 labels with the same length as the inputs.

  Raises:
    ValueError: If lrr and lm differ in length.
  """
  if len(lrr) != len(lm):
    raise ValueError(f"lrr and lm differ in length: {len(lrr)} vs {len(lm)}")

  def log_odds(flip_probability):
    # Clamp into the open interval (0, 1) so the ratio is finite and positive.
    clamped = min(max(flip_probability, eps), 1 - eps)
    return np.log((1 - clamped) / clamped)

  # Both weights are loop-invariant.
  noise_weight = log_odds(fliping)
  model_weight = log_odds(model_fliping)

  map_label = []
  for noisy_vote, model_vote in zip(lrr, lm):
    decision_ruler = (1 - 2*noisy_vote) * noise_weight + (1 - 2*model_vote) * model_weight
    map_label.append(0 if decision_ruler > 0 else 1)
  return map_label

# One combined label per d2 example, from the noisy labels and the model's labels.
MAP_label = MAP_estimator(noisy_label_d2, model_label_d2, fliping, model_fliping)
# print(MAP_label)

In [None]:
# Get my MAP dataset
def switch_chosen_rejected_MAP(example, label):
  """Return the example with chosen/rejected exchanged when ``label`` is 0.

  For any other label the two responses are copied in their stored order.
  The result is a new dict holding only "prompt", "chosen" and "rejected".
  """
  source_of_chosen, source_of_rejected = (
    ("rejected", "chosen") if label == 0 else ("chosen", "rejected")
  )
  return {
    "prompt": example["prompt"],
    "chosen": example[source_of_chosen],
    "rejected": example[source_of_rejected],
  }

# Build the stage-2 dataset: each d2 example is paired with its MAP label by
# position, and chosen/rejected are swapped wherever that label is 0.
MAP_dataset_d2 = filtered_dataset_d2.map(lambda example, idx: switch_chosen_rejected_MAP(example, MAP_label[idx]), with_indices=True)

In [None]:
# Sanity check: Error rate
## Error rate estimation: min of fliping probability
def min_of_two(a, b):
  """Return ``a`` when it is strictly smaller than ``b``, otherwise ``b``."""
  return a if a < b else b

# Smaller of the two flip rates. It is only printed here; the configuration
# options that would consume it are commented out in the next cell.
error_rate = min_of_two(fliping, model_fliping)
print(f"Result: {error_rate}")

In [None]:

# Training arguments for the second DPO stage
training_arguments = DPOConfig(
    output_dir=DPO_model_id,

    # label_smoothing_factor= error_rate,
    # loss_type="robust",

    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    ## epochs
    num_train_epochs=2,
    ## max_steps=200,
    save_strategy="no",
    logging_steps=1,
    optim="paged_adamw_32bit",
    warmup_steps=10,
    bf16=True,
    # Reporting integrations are switched off explicitly, in line with the
    # WANDB_DISABLED setting at the top of the notebook; leaving report_to unset
    # falls back to the library default. Use report_to="wandb" to log to W&B.
    report_to="none",
    push_to_hub=True,
)

In [None]:
# Resume training: a new trainer continues from the stage-1 `model`, still
# measured against the same reference model, on the MAP-relabelled second half.
dpo_trainer = DPOTrainer(
    model=model,                    # base model from SFT pipeline
    ref_model=model_ref,            # typically a copy of the SFT trained base model
    args=training_arguments,        # training arguments e.g. batch size, lr, etc.
    train_dataset=MAP_dataset_d2,   # dataset prepared above
    tokenizer=tokenizer,            # tokenizer
    # beta=0.1,                     # temperature hyperparameter of DPO
)


In [None]:
# Second stage train
# Runs the trainer built above, whose train_dataset is MAP_dataset_d2.
dpo_trainer.train()

In [None]:
# Append the stage-2 losses to the stage-1 list so both stages form one curve.
# NOTE: `train_loss` grows in place, so re-running this cell appends them again.
stage_history = dpo_trainer.state.log_history
train_loss2 = [entry['loss'] for entry in stage_history if 'loss' in entry]
train_loss += train_loss2


In [None]:
import matplotlib.pyplot as plt

# Loss of every logged step, stage 1 followed by stage 2.
fig, ax = plt.subplots()
ax.plot(train_loss)
ax.set_xlabel('Step')
ax.set_ylabel('Training Loss')
ax.set_title('Training Loss over Steps')
plt.show()


In [None]:
# Upload the model held by the stage-2 trainer to the given Hub repository.
dpo_trainer.model.push_to_hub("your/huggingface/model")