In [None]:
# Import necessary libraries
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style before any figure is created
sns.set_style("whitegrid")

# Confirmation message so it is obvious the setup cell ran to completion
print("Libraries imported successfully!")


In [None]:
def load_and_combine_benchmark_data(
    tier1_path='data/tier1_results_20250627_204655.json',
    noise_path='data/designer_noise_results_20250627_224305.json',
):
    """
    Load both benchmark JSON files and combine them into a unified DataFrame.

    Args:
        tier1_path: Path to the tier1 results file. Its top-level 'results'
            key must map model name -> test type -> metrics dict.
        noise_path: Path to the designer noise results file, which maps
            model name -> test type -> metrics dict at the top level.

    Returns:
        pd.DataFrame: One row per (model, test type) from the tier1 file plus
        one 'designer_noise' row per model from the noise file. Metrics that a
        source file does not provide are left missing. The frame always has
        the full set of columns, even when no rows were found.

    Raises:
        FileNotFoundError: If either file does not exist.
        json.JSONDecodeError: If either file is not valid JSON.
        KeyError: If the tier1 file has no top-level 'results' key.
    """
    # Fixed column order, so an empty result still yields a well-formed frame
    columns = [
        'model',
        'test_type',
        'accuracy',
        'avg_purity',
        'passed_tests',
        'total_tests',
        'avg_time_per_test',
        'total_time',
        'target_achieved',
        'avg_noise_gap',
        'avg_noise_uncertainty_rate',
        'avg_noise_ranking_success',
    ]

    # JSON is UTF-8 by specification; do not rely on the platform default
    with open(tier1_path, 'r', encoding='utf-8') as f:
        tier1_data = json.load(f)

    with open(noise_path, 'r', encoding='utf-8') as f:
        noise_data = json.load(f)

    all_results = []

    # Tier1 data: one row per test type of every model
    for model_name, model_data in tier1_data['results'].items():
        for test_type, test_data in model_data.items():
            all_results.append({
                'model': model_name,
                # An empty test type key is reported as 'cohesion'
                'test_type': test_type if test_type else 'cohesion',
                'accuracy': test_data.get('accuracy'),
                'avg_purity': test_data.get('avg_purity'),
                'passed_tests': test_data.get('passed_tests'),
                'total_tests': test_data.get('total_tests'),
                'avg_time_per_test': test_data.get('avg_time_per_test'),
                'total_time': test_data.get('total_time'),
                'target_achieved': test_data.get('target_achieved'),
                # Noise metrics are not available in tier1 data
                'avg_noise_gap': None,
                'avg_noise_uncertainty_rate': None,
                'avg_noise_ranking_success': None,
            })

    # Designer noise data: only the first test entry of each model is used
    for model_name, model_data in noise_data.items():
        if not model_data:
            continue
        test_data = next(iter(model_data.values()))

        # A missing or null 'noise_detection' section means no noise metrics
        noise_detection = test_data.get('noise_detection') or {}

        all_results.append({
            'model': model_name,
            'test_type': 'designer_noise',  # Standardized test type name
            'accuracy': test_data.get('accuracy'),
            'avg_purity': None,  # Not applicable for designer noise tests
            # The noise file uses different key names for these two metrics
            'passed_tests': test_data.get('successful_tests'),
            'total_tests': test_data.get('total_tests'),
            'avg_time_per_test': test_data.get('average_time_per_test'),
            'total_time': test_data.get('total_time'),
            'target_achieved': None,  # Not applicable
            'avg_noise_gap': noise_detection.get('avg_similarity_gap'),
            'avg_noise_uncertainty_rate': noise_detection.get('avg_noise_uncertainty_rate'),
            'avg_noise_ranking_success': noise_detection.get('avg_noise_ranking_success'),
        })

    df = pd.DataFrame(all_results, columns=columns)

    # The *_pct columns are plain copies; no rescaling is applied here
    df['accuracy_pct'] = df['accuracy']
    df['avg_purity_pct'] = df['avg_purity']

    return df

# Build the combined benchmark table from the two result files
benchmark_df = load_and_combine_benchmark_data()

# Sanity check: report the row count and preview the top of the table
print("Successfully loaded and combined benchmark data!")
print(f"Total rows: {benchmark_df.shape[0]}")
print("\nFirst few rows:")
print(benchmark_df.head(10))


In [None]:
# Plot 1: Uncertainty Detection (Noise Gap)
ax = plt.figure(figsize=(10, 6)).add_subplot(111)

# Keep only the designer_noise rows of the combined table
noise_data = benchmark_df[benchmark_df['test_type'] == 'designer_noise'].copy()

# One color per bar: green for SigLIP, red for every other model
colors = [
    '#28A745' if model == 'siglip' else '#DC3545'
    for model in noise_data['model']
]

# Horizontal bars, one per model
bars = ax.barh(noise_data['model'], noise_data['avg_noise_gap'], color=colors)

# Dashed vertical line at x=0 marks the pass/fail threshold
ax.axvline(x=0, color='black', linestyle='--', linewidth=2, alpha=0.7)

# Title and axis labels
ax.set_title('Uncertainty Detection: Average Noise Gap by Model', fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Similarity Gap (Higher is Better)', fontsize=12)
ax.set_ylabel('Model', fontsize=12)

# Value label just past the end of each bar, on the side the bar points to
for bar, gap in zip(bars, noise_data['avg_noise_gap']):
    bar_end = bar.get_width()
    if bar_end >= 0:
        offset, align = 0.001, 'left'
    else:
        offset, align = -0.001, 'right'
    ax.text(bar_end + offset, bar.get_y() + bar.get_height() / 2,
            f'{gap:.3f}', ha=align, va='center', fontweight='bold')

# Light vertical gridlines for readability
ax.grid(True, axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

# Text summary: a strictly positive gap counts as a pass
print("Noise Gap Analysis:")
for model, gap in zip(noise_data['model'], noise_data['avg_noise_gap']):
    status = "✅ PASS" if gap > 0 else "❌ FAIL"
    print(f"{model.upper()}: {gap:.3f} {status}")


In [None]:
# Plot 2: Aesthetic Cohesion (Purity %)

# Purity value a model must reach to count as meeting the target; used for
# both the pass/fail check and the printed target line so they cannot disagree
PURITY_TARGET = 80

# Keep only cohesion rows. Rows without a purity value are dropped because
# they cannot be drawn, labelled or ranked.
cohesion_data = (
    benchmark_df[benchmark_df['test_type'] == 'cohesion']
    .dropna(subset=['avg_purity'])
    .copy()
)

if cohesion_data.empty:
    # Nothing to plot: say so instead of failing on max()/idxmax() below
    print("No cohesion results with a purity value found; skipping purity plot.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))

    # Vertical bar chart with a blue palette
    blue_palette = ['#1f4e79', '#4A90E2', '#7bb3f0']  # Different shades of blue
    bars = ax.bar(cohesion_data['model'], cohesion_data['avg_purity'], color=blue_palette)

    # Formatting
    ax.set_title('Aesthetic Cohesion: Average Collection Purity by Model', fontsize=14, fontweight='bold', pad=20)
    ax.set_ylabel('Collection Purity (%)', fontsize=12)
    ax.set_xlabel('Model', fontsize=12)

    # Value labels on top of the bars
    for bar, value in zip(bars, cohesion_data['avg_purity']):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height + 0.5,
                f'{value:.1f}%', ha='center', va='bottom', fontweight='bold')

    # Start the y-axis at 0 and leave 10% headroom above the tallest bar.
    # Skipped when the tallest bar is 0, which would give a zero-height axis.
    best_purity = cohesion_data['avg_purity'].max()
    if best_purity > 0:
        ax.set_ylim(0, best_purity * 1.1)

    # Gridlines for readability
    ax.grid(True, axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("Collection Purity Analysis:")
    for model, purity in zip(cohesion_data['model'], cohesion_data['avg_purity']):
        target_status = "✅ TARGET MET" if purity >= PURITY_TARGET else "❌ BELOW TARGET"
        print(f"{model.upper()}: {purity:.1f}% {target_status}")

    print(f"\nTarget: >={PURITY_TARGET}% collection purity")
    best_row = cohesion_data.loc[cohesion_data['avg_purity'].idxmax()]
    print(f"Best performer: {best_row['model'].upper()} ({best_row['avg_purity']:.1f}%)")


In [None]:
# Plot 3: Processing Time
ax = plt.figure(figsize=(10, 6)).add_subplot(111)

# Mean per-test time for each model over all of its test types, fastest first
avg_times = (
    benchmark_df
    .groupby('model')['avg_time_per_test']
    .mean()
    .reset_index()
    .sort_values('avg_time_per_test')
)

# Horizontal bars in neutral grays
gray_palette = ['#6c757d', '#495057', '#343a40']  # Different shades of gray
bars = ax.barh(avg_times['model'], avg_times['avg_time_per_test'], color=gray_palette)

# Title and axis labels
ax.set_title('Performance: Average Processing Time per Test', fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Time (seconds)', fontsize=12)
ax.set_ylabel('Model', fontsize=12)

# Value label to the right of each bar, vertically centered on it
for bar, seconds in zip(bars, avg_times['avg_time_per_test']):
    label_y = bar.get_y() + bar.get_height() / 2
    ax.text(bar.get_width() + 0.5, label_y,
            f'{seconds:.1f}s', ha='left', va='center', fontweight='bold')

# Light vertical gridlines for readability
ax.grid(True, axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

print("Processing Time Analysis:")
for model, seconds in zip(avg_times['model'], avg_times['avg_time_per_test']):
    print(f"{model.upper()}: {seconds:.1f} seconds per test")

# Each model's time relative to the smallest average time
fastest = avg_times['avg_time_per_test'].min()
print("\nPerformance Ratios (vs fastest):")
for model, seconds in zip(avg_times['model'], avg_times['avg_time_per_test']):
    ratio = seconds / fastest
    print(f"{model.upper()}: {ratio:.1f}x slower than fastest")
