## Evaluate responses using the StrongREJECT package

In [1]:
! pip install -U bitsandbytes peft datasets
# !pip install git+https://github.com/dsbowen/strong_reject@main


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Shell command: print the driver's view of the available GPUs.
!nvidia-smi

import torch
import os
# Expose only GPU index 0 to CUDA in this process.
# NOTE(review): this is assigned after `import torch`; presumably still effective
# because CUDA has not been initialised at this point — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Report the installed PyTorch version and whether CUDA is usable from this kernel.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda if torch.cuda.is_available() else 'Not available'}")

Wed May 14 11:28:16 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01              Driver Version: 565.57.01      CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A40                     On  |   00000000:D2:00.0 Off |                    0 |
|  0%   28C    P8             22W /  300W |       1MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Required: Prepare the dataset

First, we need a dataset with two columns:

* `forbidden_prompt` is a harmful prompt that a model should be forbidden from answering
* `response` is a model's response to the forbidden prompt

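We start with a simple script that reads a CSV file with "prompt" and "response" columns and builds three Hugging Face `Dataset` objects: the full responses, the chain of thought (the text before `</think>`), and the final output (the text after `</think>`). Rows whose response contains no `</think>` tag are discarded.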

In [3]:
import csv
from tqdm import tqdm
from datasets import Dataset

# Read the CSV file into parallel lists (one entry per kept row).
prompts = []
responses = []  # Full responses (reasoning + final output)
cot_responses = []  # Chain of thought responses
output_responses = []  # Final outputs
discarded_count = 0  # Count discarded rows

csv_path = "../dataset/strongreject_reasoning_template.csv"
# newline='' is required by the csv module so that quoted fields containing
# line breaks (multi-line model responses) are read back unchanged.
with open(csv_path, 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in tqdm(reader, desc="Processing rows"):

        # Rows whose response has no </think> tag cannot be split; skip them.
        if "</think>" not in row["response"]:
            discarded_count += 1
            continue

        prompts.append(row["prompt"])
        responses.append(row['response'])

        # Split response at the first </think> tag only
        parts = row["response"].split("</think>", 1)

        # Chain of thought: everything before </think>
        cot_responses.append(parts[0].strip())
        # Final output: everything after </think>
        output_responses.append(parts[1].strip())

# Full-response dataset
dataset = Dataset.from_dict({
    "forbidden_prompt": prompts,
    "response": responses
})

# Create chain of thought dataset
cot_dataset = Dataset.from_dict({
    "forbidden_prompt": prompts,
    "response": cot_responses
})

# Create output dataset
output_dataset = Dataset.from_dict({
    "forbidden_prompt": prompts,
    "response": output_responses
})

# Display dataset info
print(f"Original dataset: {len(prompts) + discarded_count} rows")
print(f"Discarded rows (no </think> tag): {discarded_count}")
print(f"Processed datasets: {len(prompts)} rows")
print("\nChain of Thought Dataset:")
print(cot_dataset)
print("\nOutput Dataset:")
print(output_dataset)
print("\nDataset:")
print(dataset)

Processing rows: 313it [00:00, 6566.38it/s]

Original dataset: 313 rows
Discarded rows (no </think> tag): 0
Processed datasets: 313 rows

Chain of Thought Dataset:
Dataset({
    features: ['forbidden_prompt', 'response'],
    num_rows: 313
})

Output Dataset:
Dataset({
    features: ['forbidden_prompt', 'response'],
    num_rows: 313
})

Dataset:
Dataset({
    features: ['forbidden_prompt', 'response'],
    num_rows: 313
})





In [4]:
import os
from strong_reject.evaluate import evaluate_dataset

# Score `output_dataset` (the text after </think>) with the "strongreject_finetuned"
# evaluator; the printed result below shows added 'score' and 'evaluator' columns.
eval_dataset = evaluate_dataset(output_dataset, ["strongreject_finetuned"])
print(eval_dataset)

Map:   0%|          | 0/313 [00:00<?, ? examples/s]

adapter_config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/39.3M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Dataset({
    features: ['forbidden_prompt', 'response', 'score', 'evaluator'],
    num_rows: 313
})


### Filtering based on the whole score

In [None]:
# from transformers import AutoTokenizer

# def filter_csv(eval_dataset, cot_dataset, lower_threshold=None, upper_threshold=None):
#     """
#     Filter rows with score < lower_threshold OR score > upper_threshold and add to filtered_data.
    
#     Args:
#         dataset: A Hugging Face dataset object
#         lower_threshold: Score should be below this value (if not None)
#         upper_threshold: Score should be above this value (if not None)
#     """
#     model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
    
#     if lower_threshold is None and upper_threshold is None:
#         raise ValueError("At least one threshold (lower or upper) must be specified")
    
#     # Filter rows based on provided thresholds
#     filtered_data = []
#     token_lengths = []
#     for i, row in enumerate(eval_dataset):
#         # Check if score is below lower_threshold OR above upper_threshold
#         include_row = False
        
#         if lower_threshold is not None and row["score"] < lower_threshold:
#             include_row = True
        
#         if upper_threshold is not None and row["score"] > upper_threshold:
#             include_row = True
            
#         if include_row:

#             filtered_data.append({"forbidden_prompt": row["forbidden_prompt"], "response": row["response"]})
#             tokens = tokenizer.encode(cot_dataset["response"][i])
#             token_lengths.append(len(tokens))
    
#     smallest_token_length = min(token_lengths)
    
#     return filtered_data, smallest_token_length

# def write_to_csv(filtered_data, output_file=None):

#     # Write to CSV
#     with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
#         fieldnames = ['forbidden_prompt', 'response']
#         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
#         writer.writeheader()
#         for row in filtered_data:
#             writer.writerow(row)
    
#     print(f"Successfully exported {len(filtered_data)} rows to {output_file}")

# # Uncomment and modify the example that fits your needs
# filtered_data, smallest_token_length = filter_csv(eval_dataset, cot_dataset, lower_threshold=0.1)
# # 106 the size of the non-cautious dataset
# write_to_csv(filtered_data[:106], output_file="../dataset/filtered_low_strongreject.csv")
# print("\nfiltered_low_advbench.csv")
# print(smallest_token_length)

# # filtered_data, smallest_token_length = filter_csv(eval_dataset, cot_dataset, upper_threshold=0.85)
# # write_to_csv(filtered_data, output_file="../dataset/filtered_high_advbench.csv")
# # print("\nfiltered_high_advbench.csv")
# # print(smallest_token_length)

### Filtering based only on the output score

In [6]:
from transformers import AutoTokenizer

def filter_csv(eval_dataset, dataset, cot_dataset, lower_threshold=None, upper_threshold=None):
    """
    Select rows with score < lower_threshold OR score > upper_threshold.

    Rows are selected by the "score" found in ``eval_dataset``. For every selected
    row, the exported "response" is read from the same position in ``dataset`` and
    the token length is measured on the same position in ``cot_dataset``, so the
    three inputs are expected to be index-aligned.

    Args:
        eval_dataset: Iterable of rows providing "forbidden_prompt" and "score"
        dataset: Object whose "response" column supplies the exported response text
        cot_dataset: Object whose "response" column supplies the text to tokenize
        lower_threshold: Score should be below this value (if not None)
        upper_threshold: Score should be above this value (if not None)

    Returns:
        A ``(filtered_data, smallest_token_length)`` tuple. ``filtered_data`` is a
        list of {"forbidden_prompt", "response"} dicts and ``smallest_token_length``
        is the minimum tokenized length among the selected ``cot_dataset`` responses.
        When no row matches, a message is printed and ``([], None)`` is returned so
        that callers unpacking the tuple keep working.

    Raises:
        ValueError: If both thresholds are None.
    """
    # Validate before doing any expensive work (tokenizer download/load).
    if lower_threshold is None and upper_threshold is None:
        raise ValueError("At least one threshold (lower or upper) must be specified")

    # Fetch each column once instead of once per selected row.
    full_responses = dataset["response"]
    cot_texts = cot_dataset["response"]

    # Filter rows based on provided thresholds
    filtered_data = []
    selected_cot_texts = []
    for i, row in enumerate(eval_dataset):
        score = row["score"]
        below_lower = lower_threshold is not None and score < lower_threshold
        above_upper = upper_threshold is not None and score > upper_threshold

        if below_lower or above_upper:
            filtered_data.append({"forbidden_prompt": row["forbidden_prompt"], "response": full_responses[i]})
            selected_cot_texts.append(cot_texts[i])

    # Must be checked before min(): min() of an empty sequence raises ValueError.
    if not filtered_data:
        print("No rows found matching the specified criteria")
        return [], None

    model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    smallest_token_length = min(len(tokenizer.encode(text)) for text in selected_cot_texts)

    return filtered_data, smallest_token_length

def write_to_csv(filtered_data, output_file=None):
    """
    Write the filtered rows to ``output_file`` as a two-column CSV.

    Args:
        filtered_data: Sequence of dicts with "forbidden_prompt" and "response" keys
        output_file: Destination path; the file is created or overwritten

    A header row is written first, followed by one line per dict, and a
    confirmation message with the row count is printed afterwards.
    """
    columns = ['forbidden_prompt', 'response']

    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(filtered_data)

    row_count = len(filtered_data)
    print(f"Successfully exported {row_count} rows to {output_file}")

# Select the rows whose score in `eval_dataset` is below 0.03 and export them.
# The exported "response" is the full response taken from `dataset`, while the
# reported token length is measured on the matching `cot_dataset` responses.
filtered_data, smallest_token_length = filter_csv(eval_dataset, dataset, cot_dataset, lower_threshold=0.03)
# NOTE(review): 106 is the size of the non-cautious dataset, but all matching rows
# are written here (no truncation to 106) — confirm this is intended.
write_to_csv(filtered_data, output_file="../dataset/cautious_eval.csv")
print("\ncautious_eval.csv")
print(smallest_token_length)

# filtered_data, smallest_token_length = filter_csv(eval_dataset, dataset, cot_dataset, upper_threshold=0.85)
# write_to_csv(filtered_data, output_file="../dataset/filtered_high_advbench.csv")
# print("\nfiltered_high_advbench.csv")
# print(smallest_token_length)

Successfully exported 116 rows to ../dataset/cautious_eval.csv

cautious_eval.csv
218
