## Install required libraries

In [None]:
!pip install -q bitsandbytes datasets==2.21.0 peft==0.10.0 trl==0.8.6 distilabel==1.3.2

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/527.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/199.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.0/345.0 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
from torch import autocast
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

## Model, classifier and dataset paths

In [None]:
# Hugging Face Hub IDs of the generation models being compared to each other
SFT_MODEL_PATH = "ishasinha1/Qwen3-0.6B-SFT-Safety"
RLHF_MODEL_PATH = "ishasinha1/Qwen3-0.6B-DPO-Safety"
INSTRUCT_MODEL_PATH = "Qwen/Qwen3-0.6B"

# Evaluation models: a bias-type text classifier and a DeBERTa reward model,
# both used further down to score the generated responses
BIAS_TYPE_CLASSIFIER_PATH = "maximuspowers/bias-type-classifier"
DEBERTA_REWARD_MODEL_PATH = "OpenAssistant/reward-model-deberta-v3-large"

# Dataset from which the test prompts are drawn
TEST_DATASET_PATH = "ethical-spectacle/biased-corpus"

## Prepare test data

In [None]:
# Select the single bias category to evaluate.
# Any bias column of the test dataset can be used instead:
# https://huggingface.co/datasets/ethical-spectacle/biased-corpus

# BIAS_TYPE = "racial"
BIAS_TYPE = "gender"

dataset = load_dataset(TEST_DATASET_PATH)

# Walk the 'train' split and keep the text of every example whose column for
# the chosen bias type is set to 1
TEST_DATA = []
for row in dataset['train']:
    if row[BIAS_TYPE] == 1:
        TEST_DATA.append(row["biased_text"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.30M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/37507 [00:00<?, ? examples/s]

In [None]:
import numpy as np
rng = np.random.default_rng(seed=42)

# Randomly select up to 500 test data samples for the chosen bias type.
# seed = 42 ensures reproducibility.
# Sampling without replacement fails with a ValueError when more samples are
# requested than exist, so the request is capped at the number of matching
# prompts (a bias type may have fewer than 500 examples).
TEST_DATA_SAMPLE = rng.choice(TEST_DATA, size=min(500, len(TEST_DATA)), replace=False)

## Load tokenizer and model using 4-bit NF4 Quantization

In [None]:
from getpass import getpass
# Prompt for the token at run time so it is not hard-coded in the notebook;
# load_model passes it to the from_pretrained calls.
huggingface_token = getpass("Enter your Huggingface token:")

Enter your Huggingface token:··········


In [None]:
def load_model(model_name):
  """Load the tokenizer and the 4-bit NF4-quantized causal LM for `model_name`.

  Args:
    model_name: model identifier passed to both `from_pretrained` calls.

  Returns:
    A `(tokenizer, model)` tuple.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name, token=huggingface_token)
  # Reuse the end-of-sequence token as the padding token
  tokenizer.pad_token = tokenizer.eos_token

  # 4-bit NF4 weights with double quantization; compute dtype is bfloat16
  nf4_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type='nf4',
  )
  model = AutoModelForCausalLM.from_pretrained(
      model_name,
      token=huggingface_token,
      quantization_config=nf4_config,
  )
  return tokenizer, model

## Run inference on test data

In [None]:
# Choose the model to evaluate: keep exactly one of the load_model lines below
# uncommented, using one of the model paths defined above.
# NOTE: the JSON file names used later in this notebook are always prefixed with
# 'dpo_model_', whichever model is selected here.


# tokenizer, model = load_model(INSTRUCT_MODEL_PATH)
# tokenizer, model = load_model(SFT_MODEL_PATH)
tokenizer, model = load_model(RLHF_MODEL_PATH)

def run_inference(prompt):
  """Generate a sampled reply to `prompt` with the global `model` and `tokenizer`.

  The prompt becomes the user turn of a chat with a fixed system message,
  rendered through the tokenizer's chat template with thinking disabled.

  Args:
    prompt: user message text.

  Returns:
    The decoded text of the first generated sequence. No `skip_special_tokens`
    option is passed when decoding; later cells strip markers such as
    `<|im_end|>` to isolate the assistant's answer.
  """
  chat = [
      {"role": "system", "content": 'You are a helpful assistant'},
      {"role": "user", "content": prompt},
  ]
  chat_text = tokenizer.apply_chat_template(
      chat,
      tokenize=False,
      add_generation_prompt=True,
      enable_thinking=False,
  )
  encoded = tokenizer([chat_text], return_tensors="pt").to("cuda")

  generation_kwargs = dict(
      attention_mask=encoded["attention_mask"],
      use_cache=True,
      do_sample=True,
      tokenizer=tokenizer,
      max_length=512,
  )
  # Run generation under bfloat16 autocast on the GPU
  with autocast("cuda", torch.bfloat16):
    generated = model.generate(encoded["input_ids"], **generation_kwargs)

  decoded = tokenizer.batch_decode(generated)
  return decoded[0]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/496 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

adapter_config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/9.20M [00:00<?, ?B/s]

In [None]:
# Extract model responses for each model for the test_data
from tqdm import tqdm

# One generation per sampled prompt. Results are appended as they are produced,
# in the same order as TEST_DATA_SAMPLE.
MODEL_RESPONSES = []
for test_prompt in tqdm(TEST_DATA_SAMPLE):
  MODEL_RESPONSES.append(run_inference(test_prompt))

100%|██████████| 500/500 [1:33:16<00:00, 11.19s/it]


## Clean and save model responses

In [None]:
# Extracts and stores only the assistant's answer from each decoded generation.
# Marker that opens the assistant turn in the decoded chat transcript.
ASSISTANT_TURN_MARKER = "<|im_start|>assistant"

ASSISTANT_MODEL_RESPONSES = []
for response in MODEL_RESPONSES:
  # Discard the echoed system/user turns first. Without this step, a transcript
  # that lacks "</think>" would be cut at the FIRST "<|im_end|>" (the end of the
  # system message) and the system prompt would be stored as the "response".
  if ASSISTANT_TURN_MARKER in response:
    response = response.split(ASSISTANT_TURN_MARKER, 1)[-1]
  # Drop everything up to and including the closing think tag
  if "</think>" in response:
    response = response.split("</think>", 1)[-1]
  # Drop the end-of-turn token and anything after it
  if "<|im_end|>" in response:
    response = response.split("<|im_end|>", 1)[0]
  response = response.strip()
  ASSISTANT_MODEL_RESPONSES.append(response)

In [None]:
# Save prompts and model responses as JSON files
import json

# Pair each sampled prompt with its cleaned response, preserving order
data = [
    {'prompt': test_prompt, 'response': cleaned}
    for test_prompt, cleaned in zip(TEST_DATA_SAMPLE, ASSISTANT_MODEL_RESPONSES)
]

# Serialize the records to a JSON file named after the evaluated bias type
with open(f'dpo_model_{BIAS_TYPE}_responses.json', 'w') as out_file:
    out_file.write(json.dumps(data, indent=4))

# Evaluation

## Get reward model scores for helpfulness

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm

# Code modified from documentation: https://huggingface.co/OpenAssistant/reward-model-deberta-v3-large
# The reward-model tokenizer gets its own name: rebinding the global `tokenizer`
# would silently break `run_inference`, which reads that global at call time.
rank_model = AutoModelForSequenceClassification.from_pretrained(DEBERTA_REWARD_MODEL_PATH)
rm_tokenizer = AutoTokenizer.from_pretrained(DEBERTA_REWARD_MODEL_PATH)

for qa in tqdm(data):
  # Encode the prompt and the response together as one sequence pair
  inputs = rm_tokenizer(qa['prompt'], qa['response'], return_tensors='pt')
  # Scoring only: skip autograd bookkeeping for the forward pass
  with torch.no_grad():
    logits = rank_model(**inputs).logits
  # The logit of the single example in the batch is stored as the reward score
  qa['deberta-rm-score'] = logits[0].cpu().item()


config.json:   0%|          | 0.00/991 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]


  0%|          | 0/500 [00:00<?, ?it/s][A
  0%|          | 1/500 [00:01<09:58,  1.20s/it][A
  0%|          | 2/500 [00:01<06:05,  1.36it/s][A
  1%|          | 3/500 [00:02<04:52,  1.70it/s][A
  1%|          | 4/500 [00:02<04:08,  2.00it/s][A
  1%|          | 5/500 [00:02<03:57,  2.09it/s][A
  1%|          | 6/500 [00:03<03:59,  2.06it/s][A
  1%|▏         | 7/500 [00:03<03:44,  2.20it/s][A
  2%|▏         | 8/500 [00:04<03:25,  2.39it/s][A
  2%|▏         | 9/500 [00:04<03:58,  2.06it/s][A
  2%|▏         | 10/500 [00:05<03:42,  2.20it/s][A
  2%|▏         | 11/500 [00:05<03:23,  2.40it/s][A
  2%|▏         | 12/500 [00:06<03:51,  2.11it/s][A
  3%|▎         | 13/500 [00:06<03:40,  2.21it/s][A
  3%|▎         | 14/500 [00:06<03:26,  2.36it/s][A
  3%|▎         | 15/500 [00:07<03:26,  2.35it/s][A
  3%|▎         | 16/500 [00:07<03:14,  2.49it/s][A
  3%|▎         | 17/500 [00:08<03:37,  2.22it/s][A
  4%|▎         | 18/500 [00:08<03:23,  2.37it/s][A
  4%|▍         | 19/500 [00:0

## Bias Type Classifier: Get BIAS_TYPE Bias Confidence $\in [0, 1]$

In [None]:
from transformers import pipeline
from tqdm import tqdm

# Text-classification pipeline for the bias-type classifier; top_k=None is passed
# so that scores for the individual labels are available in the result.
classifier = pipeline("text-classification", model=BIAS_TYPE_CLASSIFIER_PATH, top_k=None)

for ex in tqdm(data):
  # Add truncation=True to handle long sequences that exceed the model's maximum input length.
  result = classifier(ex['response'], truncation=True)
  # Look up the confidence for the bias type under evaluation. A missing label is
  # reported with the labels that are available instead of a bare StopIteration.
  bias_score = next((item['score'] for item in result[0] if item['label'] == BIAS_TYPE), None)
  if bias_score is None:
    available_labels = sorted(item['label'] for item in result[0])
    raise KeyError(f"Classifier has no label {BIAS_TYPE!r}; available labels: {available_labels}")
  ex['bias-type-score'] = bias_score

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0
  0%|          | 1/500 [00:00<01:11,  7.01it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 500/500 [00:04<00:00, 105.13it/s]


In [None]:
# Persist the prompts, responses and their evaluation scores as JSON
scores_path = f'dpo_model_{BIAS_TYPE}_responses_scores.json'
with open(scores_path, 'w') as out_file:
    out_file.write(json.dumps(data, indent=4))

## Extract responses and scores from JSON File for Evaluation

In [None]:
# Saving and extracting files allows for asynchronous generation of responses / scores

import json
# Read the scored records back from the JSON file for the current bias type
filename = f'dpo_model_{BIAS_TYPE}_responses_scores.json'
with open(filename, 'r') as scores_file:
  data = json.loads(scores_file.read())

In [None]:
# Collect the two per-example scores into parallel lists
rm_scores, bias_scores = [], []
for record in data:
  rm_scores.append(record['deberta-rm-score'])
  bias_scores.append(record['bias-type-score'])

In [None]:
# Compute and print mean rewards / bias scores
import numpy as np

# Average each score list, then report both means
mean_rm_score, mean_bias_score = (np.mean(scores) for scores in (rm_scores, bias_scores))
print(f"Mean RM Score: {mean_rm_score}")
print(f"Mean Bias Score: {mean_bias_score}")

Mean RM Score: -1.7689379951655864
Mean Bias Score: 0.7022276511195087
