In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json
import re

In [2]:
# Load the full answers CSV
df = pd.read_csv('/Users/johnathansun/Documents/InstructedToBias/Predictions/certainty/all_permutations/Olmo-3-7B-Instruct/few_shot_0/gen_pred/format_False_task_False/_[1, 2, 3]_three_probs,two_probs_full_answers.csv')

df = df.T
df.columns = df.iloc[0]
df = df.iloc[1:]

for x in ['Probabilities Target is prize with certainty', 'Probabilities Target is risky too']:
    df[x] = df[x].apply(lambda x: float(x) if pd.notna(x) and x != '' else np.nan)

# Load the original Treatment and Control JSON data
with open('/Users/johnathansun/Documents/InstructedToBias/Data/certainty/all_permutations/t_[1, 2, 3]_three_probs,two_probs_Treatment.json') as f:
    treatment_data = json.load(f)

with open('/Users/johnathansun/Documents/InstructedToBias/Data/certainty/all_permutations/t_[1, 2, 3]_three_probs,two_probs_Control.json') as f:
    control_data = json.load(f)

print(f"Treatment samples: {len(treatment_data)}")
print(f"Control samples: {len(control_data)}")

Treatment samples: 504
Control samples: 336


In [3]:
# Create matching key function (excludes options_b_template_id since Treatment has more variants)
def get_match_key(sample):
    st = sample['subtemplates']
    return (
        sample['template'],
        st['bias_type_index'],
        st['vals_index'],
        st['options_text_template_id'],
        st['options_a_template_id'],
        st['permutation_index']
    )

# Build dictionaries indexed by match key
treatment_by_key = {}
for idx, sample in treatment_data.items():
    key = get_match_key(sample)
    if key not in treatment_by_key:
        treatment_by_key[key] = []
    treatment_by_key[key].append((idx, sample))

control_by_key = {}
for idx, sample in control_data.items():
    key = get_match_key(sample)
    if key not in control_by_key:
        control_by_key[key] = []
    control_by_key[key].append((idx, sample))

print(f"Unique match keys in Treatment: {len(treatment_by_key)}")
print(f"Unique match keys in Control: {len(control_by_key)}")
print(f"Overlapping keys: {len(set(treatment_by_key.keys()) & set(control_by_key.keys()))}")

Unique match keys in Treatment: 168
Unique match keys in Control: 168
Overlapping keys: 168


In [4]:
# Create paired samples dataframe
# For each control sample, find matching treatment samples and pair them
pairs = []

for key in control_by_key:
    if key not in treatment_by_key:
        continue
    
    for ctrl_idx, ctrl_sample in control_by_key[key]:
        # Take first matching treatment sample (with options_b_template_id=1 to match control's template_id=1)
        for treat_idx, treat_sample in treatment_by_key[key]:
            # Only pair with treatment samples that have options_b_template_id=1 for tightest comparison
            if treat_sample['subtemplates']['options_b_template_id'] == ctrl_sample['subtemplates']['options_b_template_id']:
                pairs.append({
                    'match_key': key,
                    'control_idx': int(ctrl_idx),
                    'treatment_idx': int(treat_idx),
                    'control_text': ctrl_sample['text'],
                    'treatment_text': treat_sample['text'],
                    'template': ctrl_sample['template'],
                    'bias_type_index': ctrl_sample['subtemplates']['bias_type_index'],
                    'vals_index': ctrl_sample['subtemplates']['vals_index'],
                    'permutation_index': ctrl_sample['subtemplates']['permutation_index'],
                })
                break  # Only take first match

pairs_df = pd.DataFrame(pairs)
print(f"Total paired samples: {len(pairs_df)}")

Total paired samples: 336


In [5]:
# Add model predictions to pairs
# The full_answers CSV has Treatment answers in row "Target is prize with certainty"
# and Control answers in row "Target is risky too"
# The column index corresponds to the sample index in the original data

# Reset df index to get numeric indices
df_reset = df.reset_index()
df_reset['orig_idx'] = df_reset['index'].astype(int)

# Map predictions to pairs
pairs_df['treatment_answer'] = pairs_df['treatment_idx'].apply(
    lambda idx: df_reset.loc[df_reset['orig_idx'] == idx, 'Target is prize with certainty'].values[0] 
    if idx in df_reset['orig_idx'].values else None
)
pairs_df['control_answer'] = pairs_df['control_idx'].apply(
    lambda idx: df_reset.loc[df_reset['orig_idx'] == idx, 'Target is risky too'].values[0] 
    if idx in df_reset['orig_idx'].values else None
)

# Also get probabilities
pairs_df['treatment_prob'] = pairs_df['treatment_idx'].apply(
    lambda idx: df_reset.loc[df_reset['orig_idx'] == idx, 'Probabilities Target is prize with certainty'].values[0] 
    if idx in df_reset['orig_idx'].values else None
)
pairs_df['control_prob'] = pairs_df['control_idx'].apply(
    lambda idx: df_reset.loc[df_reset['orig_idx'] == idx, 'Probabilities Target is risky too'].values[0] 
    if idx in df_reset['orig_idx'].values else None
)

# Add target position info
# permutation_index=1 means target is Option B (second), permutation_index=2 means target is Option A (first)
pairs_df['target_position'] = pairs_df['permutation_index'].apply(
    lambda x: 'Option B (second)' if x == 1 else 'Option A (first)'
)

print(pairs_df[['control_answer', 'treatment_answer']].value_counts())

control_answer         treatment_answer     
better_expected_value  better_expected_value    136
target                 target                    91
better_expected_value  target                    86
target                 better_expected_value     22
Name: count, dtype: int64


In [6]:
# Filter to pairs where:
# - Control: chose "better_expected_value" (the rational, higher EV choice)
# - Treatment: chose "target" (the certain but lower EV option)
# These are the "certainty effect" cases where certainty flipped the preference

certainty_effect_pairs = pairs_df[
    (pairs_df['control_answer'] == 'better_expected_value') & 
    (pairs_df['treatment_answer'] == 'target')
].copy()

print(f"Pairs showing certainty effect (chose higher EV in control, target in treatment): {len(certainty_effect_pairs)}")
print(f"Out of {len(pairs_df)} total pairs ({100*len(certainty_effect_pairs)/len(pairs_df):.1f}%)")

# Also compute the reverse (anti-certainty effect) and consistent pairs
anti_certainty = pairs_df[
    (pairs_df['control_answer'] == 'target') & 
    (pairs_df['treatment_answer'] == 'better_expected_value')
]
consistent_higher_ev = pairs_df[
    (pairs_df['control_answer'] == 'better_expected_value') & 
    (pairs_df['treatment_answer'] == 'better_expected_value')
]
consistent_target = pairs_df[
    (pairs_df['control_answer'] == 'target') & 
    (pairs_df['treatment_answer'] == 'target')
]

# Handle NaN values
valid_pairs = pairs_df.dropna(subset=['control_answer', 'treatment_answer'])
invalid_pairs = len(pairs_df) - len(valid_pairs)

print(f"\nBreakdown of all pairs (n={len(valid_pairs)} valid, {invalid_pairs} missing):")
print(f"  Certainty effect (EV→target): {len(certainty_effect_pairs)} ({100*len(certainty_effect_pairs)/len(valid_pairs):.1f}%)")
print(f"  Anti-certainty (target→EV): {len(anti_certainty)} ({100*len(anti_certainty)/len(valid_pairs):.1f}%)")
print(f"  Consistent higher EV: {len(consistent_higher_ev)} ({100*len(consistent_higher_ev)/len(valid_pairs):.1f}%)")
print(f"  Consistent target: {len(consistent_target)} ({100*len(consistent_target)/len(valid_pairs):.1f}%)")

Pairs showing certainty effect (chose higher EV in control, target in treatment): 86
Out of 336 total pairs (25.6%)

Breakdown of all pairs (n=335 valid, 1 missing):
  Certainty effect (EV→target): 86 (25.7%)
  Anti-certainty (target→EV): 22 (6.6%)
  Consistent higher EV: 136 (40.6%)
  Consistent target: 91 (27.2%)


In [7]:
# Display example pairs showing the certainty effect
print("="*80)
print("EXAMPLE PAIRS SHOWING CERTAINTY EFFECT")
print("(Model chose higher EV in Control, but switched to Target in Treatment)")
print("="*80)

for i, (_, row) in enumerate(certainty_effect_pairs.head(3).iterrows()):
    print(f"\n--- Pair {i+1} ---")
    print(f"Match key: template={row['template']}, bias_type={row['bias_type_index']}, vals={row['vals_index']}, perm={row['permutation_index']}")
    print(f"\nCONTROL (both risky) - Model chose: {row['control_answer']}")
    print(row['control_text'])
    print(f"\nTREATMENT (one certain) - Model chose: {row['treatment_answer']}")
    print(row['treatment_text'])
    print("-"*80)

EXAMPLE PAIRS SHOWING CERTAINTY EFFECT
(Model chose higher EV in Control, but switched to Target in Treatment)

--- Pair 1 ---
Match key: template=1, bias_type=1, vals=1, perm=1

CONTROL (both risky) - Model chose: better_expected_value
Choose between:
Option A - $2500 with a 33% chance, $0 with a 67% chance.
Option B - $2400 with a 34% chance, $0 with a 66% chance.
What is your choice?
Answer:

TREATMENT (one certain) - Model chose: target
Choose between:
Option A - $2500 with a 33% chance, $2400 with a 66% chance, $0 with a 1% chance.
Option B - $2400 with certainty.
What is your choice?
Answer:
--------------------------------------------------------------------------------

--- Pair 2 ---
Match key: template=1, bias_type=1, vals=1, perm=2

CONTROL (both risky) - Model chose: better_expected_value
Choose between:
Option A - 34% to win $2400, 66% to win $0.
Option B - $2500 with a 33% chance, $0 with a 67% chance.
What is your choice?
Answer:

TREATMENT (one certain) - Model chose: t

In [8]:
# Explore the certainty effect pairs by different factors
print("Certainty effect breakdown by bias_type_index:")
print(certainty_effect_pairs.groupby('bias_type_index').size())
print(f"\n(1 = Common Consequence / three_probs, 2 = Common Ratio / two_probs)")

print("\n\nCertainty effect breakdown by template:")
print(certainty_effect_pairs.groupby('template').size())

print("\n\nCertainty effect breakdown by vals_index:")
print(certainty_effect_pairs.groupby('vals_index').size())

print("\n\nCertainty effect breakdown by target position:")
print(certainty_effect_pairs.groupby('permutation_index').size())
print("(1 = target in Option B/second, 2 = target in Option A/first)")

Certainty effect breakdown by bias_type_index:
bias_type_index
1    49
2    37
dtype: int64

(1 = Common Consequence / three_probs, 2 = Common Ratio / two_probs)


Certainty effect breakdown by template:
template
1    36
2    26
3    24
dtype: int64


Certainty effect breakdown by vals_index:
vals_index
1    13
2    11
3     7
4    12
5    14
6    16
7    13
dtype: int64


Certainty effect breakdown by target position:
permutation_index
1    53
2    33
dtype: int64
(1 = target in Option B/second, 2 = target in Option A/first)


In [9]:
# Breakdown of all pair outcomes by target position
print("="*80)
print("BREAKDOWN BY TARGET POSITION")
print("="*80)

for pos in pairs_df['target_position'].unique():
    subset = pairs_df[pairs_df['target_position'] == pos]
    ce = subset[(subset['control_answer'] == 'better_expected_value') & (subset['treatment_answer'] == 'target')]
    anti = subset[(subset['control_answer'] == 'target') & (subset['treatment_answer'] == 'better_expected_value')]
    cons_ev = subset[(subset['control_answer'] == 'better_expected_value') & (subset['treatment_answer'] == 'better_expected_value')]
    cons_tgt = subset[(subset['control_answer'] == 'target') & (subset['treatment_answer'] == 'target')]
    
    print(f"\n{pos} (n={len(subset)}):")
    print(f"  Certainty effect (EV→target): {len(ce)} ({100*len(ce)/len(subset):.1f}%)")
    print(f"  Anti-certainty (target→EV):   {len(anti)} ({100*len(anti)/len(subset):.1f}%)")
    print(f"  Consistent higher EV:         {len(cons_ev)} ({100*len(cons_ev)/len(subset):.1f}%)")
    print(f"  Consistent target:            {len(cons_tgt)} ({100*len(cons_tgt)/len(subset):.1f}%)")

BREAKDOWN BY TARGET POSITION

Option B (second) (n=168):
  Certainty effect (EV→target): 53 (31.5%)
  Anti-certainty (target→EV):   13 (7.7%)
  Consistent higher EV:         15 (8.9%)
  Consistent target:            87 (51.8%)

Option A (first) (n=168):
  Certainty effect (EV→target): 33 (19.6%)
  Anti-certainty (target→EV):   9 (5.4%)
  Consistent higher EV:         121 (72.0%)
  Consistent target:            4 (2.4%)


In [10]:
# Check for position bias: how often does the model choose Option A vs Option B regardless of content?
# In Treatment and Control separately

print("="*80)
print("POSITION BIAS ANALYSIS")
print("="*80)

# For Control: when target is Option A (first), choosing 'target' means choosing first position
# when target is Option B (second), choosing 'target' means choosing second position

pairs_df['control_chose_first'] = (
    ((pairs_df['permutation_index'] == 2) & (pairs_df['control_answer'] == 'target')) |
    ((pairs_df['permutation_index'] == 1) & (pairs_df['control_answer'] == 'better_expected_value'))
)

pairs_df['treatment_chose_first'] = (
    ((pairs_df['permutation_index'] == 2) & (pairs_df['treatment_answer'] == 'target')) |
    ((pairs_df['permutation_index'] == 1) & (pairs_df['treatment_answer'] == 'better_expected_value'))
)

print(f"\nControl condition:")
print(f"  Chose Option A (first):  {pairs_df['control_chose_first'].sum()} ({100*pairs_df['control_chose_first'].mean():.1f}%)")
print(f"  Chose Option B (second): {(~pairs_df['control_chose_first']).sum()} ({100*(~pairs_df['control_chose_first']).mean():.1f}%)")

print(f"\nTreatment condition:")
print(f"  Chose Option A (first):  {pairs_df['treatment_chose_first'].sum()} ({100*pairs_df['treatment_chose_first'].mean():.1f}%)")
print(f"  Chose Option B (second): {(~pairs_df['treatment_chose_first']).sum()} ({100*(~pairs_df['treatment_chose_first']).mean():.1f}%)")

POSITION BIAS ANALYSIS

Control condition:
  Chose Option A (first):  81 (24.1%)
  Chose Option B (second): 255 (75.9%)

Treatment condition:
  Chose Option A (first):  65 (19.3%)
  Chose Option B (second): 271 (80.7%)


In [11]:
# Crosstab: target choice rate by position and condition
print("="*80)
print("TARGET CHOICE RATE BY POSITION")
print("="*80)

print("\nControl condition - P(choose target):")
for pos in ['Option A (first)', 'Option B (second)']:
    subset = pairs_df[pairs_df['target_position'] == pos]
    rate = (subset['control_answer'] == 'target').mean()
    print(f"  Target in {pos}: {100*rate:.1f}%")

print("\nTreatment condition - P(choose target):")
for pos in ['Option A (first)', 'Option B (second)']:
    subset = pairs_df[pairs_df['target_position'] == pos]
    rate = (subset['treatment_answer'] == 'target').mean()
    print(f"  Target in {pos}: {100*rate:.1f}%")

print("\n\nCertainty effect by position (Treatment - Control target rate):")
for pos in ['Option A (first)', 'Option B (second)']:
    subset = pairs_df[pairs_df['target_position'] == pos]
    ctrl_rate = (subset['control_answer'] == 'target').mean()
    treat_rate = (subset['treatment_answer'] == 'target').mean()
    print(f"  Target in {pos}: {100*(treat_rate - ctrl_rate):.1f}pp")

TARGET CHOICE RATE BY POSITION

Control condition - P(choose target):
  Target in Option A (first): 7.7%
  Target in Option B (second): 59.5%

Treatment condition - P(choose target):
  Target in Option A (first): 22.0%
  Target in Option B (second): 83.3%


Certainty effect by position (Treatment - Control target rate):
  Target in Option A (first): 14.3pp
  Target in Option B (second): 23.8pp


In [12]:
# The certainty_effect_pairs dataframe is now available for further analysis
# You can access:
#   - certainty_effect_pairs: pairs where model switched from higher EV to target
#   - pairs_df: all paired samples with predictions
#   - treatment_data, control_data: original JSON data

certainty_effect_pairs[['control_idx', 'treatment_idx', 'template', 'bias_type_index', 
                        'vals_index', 'target_position', 'control_answer', 'treatment_answer']].head(10)

Unnamed: 0,control_idx,treatment_idx,template,bias_type_index,vals_index,target_position,control_answer,treatment_answer
0,0,0,1,1,1,Option B (second),better_expected_value,target
3,3,3,1,1,1,Option A (first),better_expected_value,target
5,6,8,1,1,1,Option B (second),better_expected_value,target
6,5,7,1,1,1,Option A (first),better_expected_value,target
7,7,9,1,1,1,Option A (first),better_expected_value,target
10,9,13,1,1,2,Option A (first),better_expected_value,target
12,12,18,1,1,2,Option B (second),better_expected_value,target
14,13,19,1,1,2,Option A (first),better_expected_value,target
15,15,21,1,1,2,Option A (first),better_expected_value,target
20,20,30,1,1,3,Option B (second),better_expected_value,target


In [14]:
certainty_effect_pairs.to_csv('Question_Subsets/certainty_effect_pairs.csv')