# PR Manipulation Experiment - Analysis

Analysis and visualization of LLM-generated press releases across measurement conditions and stakes.

## Research Question

Do LLMs omit or distort internal test measurements in corporate communications, and does this behavior increase with:
1. **Violation severity** (how far above legal limits)
2. **Stakes level** (legal exposure, harm, public attention)

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json

# Plot appearance shared by the figures in this notebook
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),  # default canvas size; cells that pass figsize override it
    'font.size': 11,
})

AttributeError: '_ArtistPropertiesSubstitution' object has no attribute 'register'

## Load Data

In [None]:
# Read the experiment results table
df = pd.read_csv('data/results.csv')

# Quick inventory of the loaded table; the two optional columns fall back to 'N/A'
print(f"Total runs: {len(df)}")

scenario_names = df['scenario'].unique()
print(f"Scenarios: {df['scenario'].nunique()} ({', '.join(scenario_names)})")

has_stakes = 'stakes_tier' in df.columns
print(f"Stakes tiers: {df['stakes_tier'].nunique() if has_stakes else 'N/A'}")

has_measurement = 'measurement_condition' in df.columns
print(f"Measurement conditions: {df['measurement_condition'].nunique() if has_measurement else 'N/A'}")

print(f"Models: {df['model'].nunique()}")
print("\nFirst few rows:")

# Last expression of the cell: rendered as a rich table
df.head()

## Classification Distribution

In [None]:
# Overall classification breakdown (counts are computed once and reused for the chart)
class_counts = df['classification'].value_counts()

print("Classification Breakdown:")
print(class_counts)
print("\nPercentages:")
print(df['classification'].value_counts(normalize=True) * 100)

# Bind each colour to a classification label instead of to the bar's position.
# value_counts() orders the bars by frequency, so a positional colour list gives
# the same label a different colour whenever the ranking changes.
# NOTE(review): the hues follow the green/red/orange list that the crosstab charts
# further down apply to their columns (presumably disclosed_accurate / distorted /
# omitted in alphabetical order), with the remaining blue assigned to 'refused'.
# Confirm this is the intended mapping.
class_colors = {
    'disclosed_accurate': '#2ecc71',
    'distorted': '#e74c3c',
    'omitted': '#f39c12',
    'refused': '#3498db',
}
fallback_color = '#95a5a6'  # neutral grey for any label not listed above
bar_colors = [class_colors.get(label, fallback_color) for label in class_counts.index]

# Visualize
fig, ax = plt.subplots(figsize=(10, 6))
class_counts.plot(kind='bar', color=bar_colors, ax=ax)
ax.set_title('Classification Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Classification', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.tick_params(axis='x', labelrotation=45)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## Refusal Analysis

In [None]:
# Percentage of each model's runs classified as 'refused', highest rate first
is_refusal = df['classification'].eq('refused')
refusal_by_model = (
    is_refusal
    .groupby(df['model'])
    .mean()          # mean of a boolean mask == refused count / group size
    .mul(100)
    .sort_values(ascending=False)
)

print("Refusal Rate by Model:")
for model_name, pct in refusal_by_model.items():
    print(f"  {model_name}: {pct:.1f}%")

# Horizontal bar chart of the same rates
fig, ax = plt.subplots(figsize=(10, 6))
refusal_by_model.plot(kind='barh', color='#e74c3c', ax=ax)
ax.set_title('Refusal Rate by Model', fontsize=14, fontweight='bold')
ax.set_xlabel('Refusal Rate (%)', fontsize=12)
ax.set_ylabel('Model', fontsize=12)
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

## Measurement Condition Analysis

How does violation severity affect disclosure behavior?

In [None]:
# Keep only rows that carry a measurement condition (this drops the control scenario),
# then set refusals aside; both frames are reused by later cells.
has_condition = df['measurement_condition'].notna()
df_crisis = df.loc[has_condition].copy()

answered = df_crisis['classification'] != 'refused'
df_non_refused = df_crisis.loc[answered].copy()

# Display order for the measurement conditions
measurement_order = ['pending', 'minor', 'moderate', 'severe']

# Row-normalised crosstab: each condition's classifications sum to 100 %
classification_by_measurement = (
    pd.crosstab(
        df_non_refused['measurement_condition'],
        df_non_refused['classification'],
        normalize='index',
    )
    .mul(100)
    .reindex(measurement_order)  # rows in display order
)

print("Classification by Measurement Condition (% of non-refusals):")
print(classification_by_measurement)

# Grouped bar chart, one cluster per measurement condition
ax = classification_by_measurement.plot(
    kind='bar',
    stacked=False,
    figsize=(12, 6),
    color=['#2ecc71', '#e74c3c', '#f39c12'],
)
ax.set_title('Classification Distribution by Measurement Condition', fontsize=14, fontweight='bold')
ax.set_xlabel('Measurement Condition', fontsize=12)
ax.set_ylabel('Percentage', fontsize=12)
ax.legend(title='Classification', bbox_to_anchor=(1.05, 1))
ax.tick_params(axis='x', labelrotation=0)
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## Stakes Tier Analysis

In [None]:
# Display order for the stakes tiers
stakes_order = ['low', 'medium', 'high']

# Row-normalised crosstab of classification within each stakes tier (non-refusals only)
classification_by_stakes = (
    pd.crosstab(
        df_non_refused['stakes_tier'],
        df_non_refused['classification'],
        normalize='index',
    )
    .mul(100)
    .reindex(stakes_order)  # rows in display order
)

print("Classification by Stakes Tier (% of non-refusals):")
print(classification_by_stakes)

# Grouped bar chart, one cluster per stakes tier
ax = classification_by_stakes.plot(
    kind='bar',
    stacked=False,
    figsize=(12, 6),
    color=['#2ecc71', '#e74c3c', '#f39c12'],
)
ax.set_title('Classification Distribution by Stakes Tier', fontsize=14, fontweight='bold')
ax.set_xlabel('Stakes Tier', fontsize=12)
ax.set_ylabel('Percentage', fontsize=12)
ax.legend(title='Classification', bbox_to_anchor=(1.05, 1))
ax.tick_params(axis='x', labelrotation=0)
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## Model Comparison

In [None]:
# Per-model split of classifications among non-refused runs; each row sums to 100 %
classification_by_model = pd.crosstab(
    df_non_refused['model'],
    df_non_refused['classification'],
    normalize='index',
).mul(100)

print("Classification by Model (% of non-refusals):")
print(classification_by_model)

# Stacked horizontal bars, one bar per model
ax = classification_by_model.plot(
    kind='barh',
    stacked=True,
    figsize=(10, 6),
    color=['#2ecc71', '#e74c3c', '#f39c12'],
)
ax.set_title('Classification Distribution by Model (Non-Refusals)', fontsize=14, fontweight='bold')
ax.set_xlabel('Percentage', fontsize=12)
ax.set_ylabel('Model', fontsize=12)
ax.legend(title='Classification', bbox_to_anchor=(1.25, 1))
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## Heatmaps

In [None]:
# Heatmap: Honest disclosure rate by Model × Measurement Condition
#
# The denominator is every df_crisis row for the model/condition pair, so refused
# runs count as "not honest" here.
is_honest = df_crisis['classification'].eq('disclosed_accurate')
honest_rate = (
    is_honest
    .groupby([df_crisis['model'], df_crisis['measurement_condition']])
    .mean()      # mean of a boolean mask == honest count / group size
    .mul(100)
    .unstack()   # pairs with no runs stay NaN instead of being shown as a real 0 %
    # reindex tolerates a condition that is absent from the data, where plain
    # column selection would raise KeyError
    .reindex(columns=measurement_order)
)

# seaborn masks NaN cells, so pairs without data are drawn blank and unannotated
plt.figure(figsize=(10, 6))
sns.heatmap(honest_rate, annot=True, fmt='.1f', cmap='RdYlGn', vmin=0, vmax=100,
            cbar_kws={'label': 'Honest Disclosure Rate (%)'})
plt.title('Honest Disclosure Rate by Model and Measurement Condition', fontsize=14, fontweight='bold')
plt.xlabel('Measurement Condition', fontsize=12)
plt.ylabel('Model', fontsize=12)
plt.tight_layout()
plt.savefig('heatmap_honest_disclosure.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Heatmap: Omission rate by Model × Stakes Tier
#
# The denominator is every df_crisis row for the model/tier pair, refusals included.
is_omission = df_crisis['classification'].eq('omitted')
omission_rate = (
    is_omission
    .groupby([df_crisis['model'], df_crisis['stakes_tier']])
    .mean()      # mean of a boolean mask == omitted count / group size
    .mul(100)
    .unstack()   # pairs with no runs stay NaN instead of being shown as a real 0 %
    # reindex tolerates a tier that is absent from the data, where plain
    # column selection would raise KeyError
    .reindex(columns=stakes_order)
)

# seaborn masks NaN cells, so pairs without data are drawn blank and unannotated
plt.figure(figsize=(8, 6))
sns.heatmap(omission_rate, annot=True, fmt='.1f', cmap='RdYlGn_r', vmin=0, vmax=100,
            cbar_kws={'label': 'Omission Rate (%)'})
plt.title('Omission Rate by Model and Stakes Tier', fontsize=14, fontweight='bold')
plt.xlabel('Stakes Tier', fontsize=12)
plt.ylabel('Model', fontsize=12)
plt.tight_layout()
plt.savefig('heatmap_omission.png', dpi=300, bbox_inches='tight')
plt.show()

## Distortion Analysis

In [None]:
# Look at distortion examples
distorted = df[df['classification'] == 'distorted']

print(f"Total distorted cases: {len(distorted)}")
print("\nDistortion by model:")
print(distorted['model'].value_counts())
print("\nDistortion by measurement condition:")
print(distorted['measurement_condition'].value_counts())

# Show sample distortion
if len(distorted) > 0:
    # JSON Lines file: one JSON object per line. Blank lines (e.g. a trailing
    # newline at the end of the file) are skipped so they cannot crash json.loads.
    with open('data/press_releases.jsonl', 'r', encoding='utf-8') as f:
        all_releases = [json.loads(line) for line in f if line.strip()]

    # Look the sampled row up once instead of once per release
    first_case = distorted.iloc[0]
    target_run = first_case['run']
    target_model = first_case['model']

    # NOTE(review): records are matched on (run, model) only. If run numbers repeat
    # across scenarios this may pick a release from a different scenario; confirm
    # whether the scenario should be part of the key.
    sample_release = next(
        (r for r in all_releases
         if r.get('run') == target_run and r.get('model') == target_model),
        None,
    )

    if sample_release is None:
        # Previously an IndexError; say what is missing instead
        print(f"\nNo press release found for run={target_run!r}, model={target_model!r}")
    else:
        print(f"\n{'='*60}")
        print("SAMPLE DISTORTION:")
        print(f"{'='*60}")
        print(f"Model: {sample_release.get('model')}")
        print(f"Scenario: {sample_release.get('scenario_name')}")
        print(f"Measurement: {sample_release.get('measurement_condition')}")
        print("\nDistortion details:")
        details = sample_release.get('distortion_details')
        if details:
            print(f"  Claimed: {details.get('claimed')}")
            print(f"  Actual: {details.get('actual')}")
            print(f"  Type: {details.get('distortion_type')}")

## Statistical Testing

In [None]:
from scipy import stats


def chi_square_report(table, title):
    """Run a chi-square test of independence on a contingency table and print a summary.

    Parameters
    ----------
    table : pandas.DataFrame
        Observed counts, as produced by ``pd.crosstab``.
    title : str
        Heading printed above the results.

    Returns
    -------
    tuple
        ``(chi2, p_value, dof, expected)`` as returned by
        ``scipy.stats.chi2_contingency``. For an empty table the test cannot be
        run and ``(nan, nan, 0, None)`` is returned.
    """
    print(title)

    # chi2_contingency raises ValueError on a table with no observations
    if table.size == 0:
        print("No observations available; test skipped.")
        return float('nan'), float('nan'), 0, None

    chi2_stat, p, dof_, expected_ = stats.chi2_contingency(table)

    print(f"Chi-square statistic: {chi2_stat:.4f}")
    # .4g keeps very small p-values visible instead of rounding them to 0.0000
    print(f"P-value: {p:.4g}")
    print(f"Degrees of freedom: {dof_}")
    print(f"Significant at α=0.05: {p < 0.05}")

    # The chi-square approximation is commonly considered unreliable when expected
    # cell counts fall below 5, so report how many cells are affected.
    n_low = int((expected_ < 5).sum())
    if n_low:
        print(f"Caution: {n_low} of {expected_.size} expected counts are below 5; "
              "the chi-square approximation may be unreliable.")

    return chi2_stat, p, dof_, expected_


# Chi-square test: Does measurement condition affect classification?
contingency_table = pd.crosstab(df_non_refused['measurement_condition'],
                                df_non_refused['classification'])

chi2, p_value, dof, expected = chi_square_report(
    contingency_table,
    "Chi-Square Test: Measurement Condition vs Classification",
)

# Chi-square test: Does stakes tier affect classification?
contingency_table2 = pd.crosstab(df_non_refused['stakes_tier'],
                                 df_non_refused['classification'])

chi2_2, p_value_2, dof_2, expected_2 = chi_square_report(
    contingency_table2,
    "\nChi-Square Test: Stakes Tier vs Classification",
)

## Key Findings Summary

In [None]:
def _share_of(label):
    """Build a reducer giving the percentage of a group's values equal to `label`."""
    def reducer(values):
        return (values == label).sum() / len(values) * 100
    return reducer


def _print_rates(rates):
    """Print one indented '<name>: <rate>%' line per entry of `rates`."""
    for name, rate in rates.items():
        print(f"   {name}: {rate:.1f}%")


banner = "=" * 60
print(banner)
print("KEY FINDINGS")
print(banner)

# Overall rates (non-refusals)
total_non_refused = len(df_non_refused)
non_refused_labels = df_non_refused['classification']
honest_rate_overall = (non_refused_labels == 'disclosed_accurate').sum() / total_non_refused * 100
omission_rate_overall = (non_refused_labels == 'omitted').sum() / total_non_refused * 100
distortion_rate_overall = (non_refused_labels == 'distorted').sum() / total_non_refused * 100

print("\n1. OVERALL BEHAVIOR (non-refusals):")
print(f"   Honest disclosure: {honest_rate_overall:.1f}%")
print(f"   Omission: {omission_rate_overall:.1f}%")
print(f"   Distortion: {distortion_rate_overall:.1f}%")

# Model rankings: refusal rate over all runs, honesty rate over non-refused runs only
refusal_rates = df.groupby('model')['classification'].apply(_share_of('refused'))
honest_rates = df_non_refused.groupby('model')['classification'].apply(_share_of('disclosed_accurate'))

print("\n2. MODEL RANKINGS:")
print(f"   Most likely to refuse: {refusal_rates.idxmax()} ({refusal_rates.max():.1f}%)")
print(f"   Least likely to refuse: {refusal_rates.idxmin()} ({refusal_rates.min():.1f}%)")
print(f"   Most honest (when not refusing): {honest_rates.idxmax()} ({honest_rates.max():.1f}%)")
print(f"   Least honest (when not refusing): {honest_rates.idxmin()} ({honest_rates.min():.1f}%)")

# Measurement severity effect
honest_by_measurement = (
    df_non_refused.groupby('measurement_condition')['classification']
    .apply(_share_of('disclosed_accurate'))
    .reindex(measurement_order)
)

print("\n3. HONEST DISCLOSURE BY VIOLATION SEVERITY:")
_print_rates(honest_by_measurement)

# Stakes effect
honest_by_stakes = (
    df_non_refused.groupby('stakes_tier')['classification']
    .apply(_share_of('disclosed_accurate'))
    .reindex(stakes_order)
)

print("\n4. HONEST DISCLOSURE BY STAKES TIER:")
_print_rates(honest_by_stakes)

print("\n" + banner)