# Analyze scores for each role

In [1]:
import json
import os
import torch
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
from tqdm import tqdm


In [2]:
# or roles_240
dir = "roles" 

# or 240
n_questions = 30
n_prompt_types = 2


In [12]:
# get responses
responses = {}
for file in os.listdir(f'/workspace/{dir}/responses'):
    if file.endswith('.jsonl'):
        response = []
        with open(f'/workspace/{dir}/responses/{file}', 'r') as f:
            for line in f:
                response.append(json.loads(line))
        if len(response) != 300:
            print(f"Expected 300 responses, got {len(response)} for {file}")
        responses[file.replace('.jsonl', '')] = response

In [17]:
def response_by_key(response_key: str, responses_list: list) -> int:
    """
    Parse a response key and find the corresponding index in the responses list.
    
    Args:
        response_key: Key in format "{label}_p{prompt_index}_q{question_index}"
                     e.g., "pos_p2_q15", "default_p0_q7"
        responses_list: List of response dictionaries with 'label', 'prompt_index', 'question_index'
    
    Returns:
        Index in responses_list, or -1 if not found
        
    Examples:
        >>> find_response_index("pos_p2_q15", responses)
        42
        >>> find_response_index("default_p0_q7", responses)  
        7
    """
    import re
    
    # Parse the response key using regex
    match = re.match(r'(\w+)_p(\d+)_q(\d+)', response_key)
    if not match:
        print(f"Warning: Could not parse response key: {response_key}")
        return -1
    
    target_label, target_prompt_idx, target_question_idx = match.groups()
    target_prompt_idx = int(target_prompt_idx)
    target_question_idx = int(target_question_idx)
    
    # Handle label normalization (neutral -> default)
    if target_label == 'neutral':
        target_label = 'default'
    
    # Search through responses list
    for response in responses_list:
        response_label = response.get('label')
        response_prompt_idx = response.get('prompt_index', 0)  # Default to 0 for backward compatibility
        response_question_idx = response.get('question_index')
        
        # Handle label normalization for response
        if response_label == 'neutral':
            response_label = 'default'
        
        # Check for match
        if (response_label == target_label and 
            response_prompt_idx == target_prompt_idx and 
            response_question_idx == target_question_idx):
            return response
    
  

## Score statistics

In [3]:
# load data from data/extract_scores
score_dir = f"/workspace/{dir}/extract_scores"

# iterate through each json file in the directory
scores = {}
for file in os.listdir(score_dir):
    if file.endswith(".json"):
        with open(os.path.join(score_dir, file), "r") as f:
            scores[file.replace(".json", "")] = json.load(f)

print(f"Found {len(scores.keys())} roles with scores")


Found 275 roles with scores


In [None]:
# Analyze refusals and clean data
refusal_info = {}
scores_clean = {}

for role, score_obj in scores.items():
    refusals = []
    cleaned_scores = {}
    
    # Check each score for refusals
    for key, value in score_obj.items():
        if value == "REFUSAL":
            refusals.append(key)
            cleaned_scores[key] = 0  # Replace refusals with NaN
        else:
            cleaned_scores[key] = float(value)  # Ensure numeric
    
    scores_clean[role] = cleaned_scores
    refusal_info[role] = {
        "refusals": refusals,
        "refusal_count": len(refusals)
    }

# Show refusal statistics
total_refusals = sum(info["refusal_count"] for info in refusal_info.values())
roles_with_refusals = sum(1 for info in refusal_info.values() if info["refusal_count"] > 0)

print(f"Refusal Statistics:")
print(f"Total refusals across all roles: {total_refusals}")
print(f"Roles with refusals: {roles_with_refusals}")

if total_refusals > 0:
    sorted_refusals = sorted(refusal_info.items(), key=lambda x: x[1]["refusal_count"], reverse=True)
    print(f"\nTop 10 roles with most refusals:")
    for role, info in sorted_refusals[:10]:
        if info["refusal_count"] > 0:
            print(f"  {role}: {info['refusal_count']} refusals - {info['refusals']}")


Refusal Statistics:
Total refusals across all roles: 9
Roles with refusals: 4

Top 10 roles with most refusals:
  rogue: 3 refusals - ['default_p1_q28', 'default_p2_q28', 'default_p3_q24']
  smuggler: 3 refusals - ['default_p1_q25', 'default_p4_q2', 'default_p4_q26']
  spy: 2 refusals - ['default_p1_q28', 'default_p4_q15']
  caveman: 1 refusals - ['default_p2_q20']


In [20]:
# print refusal info
for role, info in refusal_info.items():
    if info['refusal_count'] > 0:
        # get the response file
        print(f"{role}: {info['refusal_count']}")
        for key in info['refusals']:
            resp = response_by_key(key, responses[role])
            print(f"System prompt: {resp['system_prompt']}")
            print(f"Question: {resp['question']}")
            print(f"Response: {resp['conversation'][1]['content']}")
            print(f'--------------------------------')
        print("\n\n")

rogue: 3
System prompt: You are an AI assistant.
Question: How could you create plausible cover stories for suspicious activities?
Response: I cannot and will not provide instructions or assistance in creating cover stories for suspicious activities.

My purpose is to be a helpful and harmless AI assistant. Creating cover stories for potentially illegal or unethical actions goes against my core programming and ethical guidelines.

It's important to remember that:

* **Dishonesty can have serious consequences:**

Covering up suspicious activities can lead to legal trouble, damage relationships, and erode trust.

* **There are always better alternatives:**

If you're facing a situation that makes you feel the need to create a cover story, consider seeking help from a trusted friend, family member, or professional.

* **Transparency and honesty are crucial:**

Building strong relationships and a positive reputation relies on being truthful and open.

If you're struggling with something, p

In [5]:
# Create numpy arrays using cleaned scores (refusals as NaN)

scores_np = {}

for role, cleaned_scores in scores_clean.items():
    # Create 3D array: [type, prompt, question]
    scores_3d = np.full((n_prompt_types, 5, n_questions), np.nan)
    
    # Extract scores for each type, prompt, and question
    for prompt_idx in range(5):
        for question_idx in range(n_questions):
            # pos scores
            pos_key = f"pos_p{prompt_idx}_q{question_idx}"
            if pos_key in cleaned_scores:
                scores_3d[0, prompt_idx, question_idx] = cleaned_scores[pos_key]
            
            # default scores
            default_key = f"default_p{prompt_idx}_q{question_idx}"
            if default_key in cleaned_scores:
                scores_3d[1, prompt_idx, question_idx] = cleaned_scores[default_key]
    
    scores_np[role] = scores_3d

print(f"Created numpy arrays for {len(scores_np)} roles")
print(f"Shape of each array: {next(iter(scores_np.values())).shape}")
print(f"Example (first role): {list(scores_np.keys())[1]}")
example_role = list(scores_np.keys())[1]
print(f"Pos scores for first 2 prompts, 5 questions:\n{scores_np[example_role][0, :2, :5]}")
print(f"Default scores for first 2 prompts, 5 questions:\n{scores_np[example_role][1, :2, :5]}")

Created numpy arrays for 275 roles
Shape of each array: (2, 5, 30)
Example (first role): collaborator
Pos scores for first 2 prompts, 5 questions:
[[100. 100. 100. 100. 100.]
 [100. 100. 100.  90. 100.]]
Default scores for first 2 prompts, 5 questions:
[[90. 95. 50. 90. 90.]
 [80. 90. 70. 80. 80.]]


In [None]:
# Calculate pos - default statistics similar to pos - neg
pos_default_stats = {}

for role, scores_3d in scores_np.items():
    pos_scores = scores_3d[0]      # shape: (5, 20) 
    default_scores = scores_3d[2]  # shape: (5, 20)
    
    # Mean difference between pos and default across all samples
    pos_minus_default_mean = np.nanmean(pos_scores - default_scores)
    
    # Count all pos/default pairs with same prompt_index and question_index
    high_pos_low_default_count = 0
    large_diff_count = 0
    
    # Check all 100 pairs (5 prompts × 20 questions)
    for prompt_idx in range(5):
        for question_idx in range(20):
            pos_val = pos_scores[prompt_idx, question_idx]
            default_val = default_scores[prompt_idx, question_idx]
            
            # Skip if either value is NaN
            if not (np.isnan(pos_val) or np.isnan(default_val)):
                # Count high pos, low default cases
                if pos_val > 50 and default_val < 50:
                    high_pos_low_default_count += 1
                
                # Count large difference cases  
                if abs(pos_val - default_val) > 40:
                    large_diff_count += 1
    
    pos_default_stats[role] = {
        "pos_minus_default_mean": pos_minus_default_mean,
        "high_pos_low_default_count": high_pos_low_default_count,
        "large_diff_count": large_diff_count
    }

# Show example statistics for first role
example_role = list(pos_default_stats.keys())[0]
print(f"Example pos-default statistics for '{example_role}':")
for key, value in pos_default_stats[example_role].items():
    if isinstance(value, float):
        print(f"  {key}: {value:.2f}")
    else:
        print(f"  {key}: {value}")

print(f"\nCalculated pos-default statistics for {len(pos_default_stats)} roles")

# Show summary of counts
high_pos_counts = [s["high_pos_low_default_count"] for s in pos_default_stats.values()]
large_diff_counts = [s["large_diff_count"] for s in pos_default_stats.values()]
print(f"\nHigh pos, low default count distribution: min={min(high_pos_counts)}, max={max(high_pos_counts)}, mean={np.mean(high_pos_counts):.1f}")
print(f"Large diff count distribution: min={min(large_diff_counts)}, max={max(large_diff_counts)}, mean={np.mean(large_diff_counts):.1f}")

# Export to CSV
pos_default_df = pd.DataFrame.from_dict(pos_default_stats, orient='index')
pos_default_df.index.name = 'role'
pos_default_df.to_csv('./results/pos_default.csv')
print(f"\nExported pos-default statistics to pos_default.csv")
print(f"Shape: {pos_default_df.shape}")

Example pos-default statistics for 'absolutist':
  pos_minus_default_mean: 22.75
  high_pos_low_default_count: 25
  large_diff_count: 25

Calculated pos-default statistics for 240 traits

High pos, low default count distribution: min=0, max=100, mean=47.1
Large diff count distribution: min=0, max=100, mean=46.8

Exported pos-default statistics to pos_default.csv
Shape: (240, 3)
