# Prompting Results Analysis

This notebook analyzes the results from attribute prompting experiments.

In [None]:
import json
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
import os
import glob

# Folder holding the JSON result files; every *.json file in it is loaded.
results_folder = "../results/paligemma2"  # Update this path

# glob.glob() returns matches in arbitrary (filesystem-dependent) order. Sort
# them so the order of `dataframes` -- and of every printout derived from it --
# is the same on every run and every machine.
json_files = sorted(glob.glob(os.path.join(results_folder, "*.json")))
print(f"Found {len(json_files)} JSON files")

# One DataFrame per file, keyed by the file name without its .json extension.
dataframes = {}
for file_path in json_files:
    file_name = os.path.basename(file_path)
    print(f"Loading {file_name}...")

    # JSON text is UTF-8; be explicit instead of relying on the locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        file_data = json.load(f)

    # presumably each file holds a list of per-example records -- TODO confirm
    df = pd.DataFrame(file_data)

    # Remember which file every row came from
    df['source_file'] = file_name

    file_key = os.path.splitext(file_name)[0]
    dataframes[file_key] = df

    print(f"  -> {len(df)} examples loaded into DataFrame '{file_key}'")

print(f"\nCreated {len(dataframes)} separate DataFrames")

# Per-file overview of the loaded DataFrames
for file_key, df in dataframes.items():
    print(f"{file_key}: {len(df)} examples")

In [None]:
def _categorize_response(response):
    """Map one raw response to 'true', 'false' or 'no_prediction'."""
    # Missing or non-text responses (None, NaN, numbers) are model failures,
    # not crashes: they cannot be parsed as an answer.
    if not isinstance(response, str):
        return 'no_prediction'
    normalized = response.strip().lower()
    return normalized if normalized in ('true', 'false') else 'no_prediction'


def analyze_responses_comprehensive(df, file_key):
    """Comprehensive analysis including model failures.

    Every value in ``df['response']`` is classified, after stripping
    whitespace and lower-casing, as ``'true'``, ``'false'`` or
    ``'no_prediction'`` (anything else, including non-string values).

    Side effect: ``df`` is modified in place -- the columns
    ``response_category`` and ``predicted_conservative`` are added or
    overwritten.

    Args:
        df: DataFrame with a ``response`` column and a ``label`` column.
            'true' is scored as 1 and 'false' as 0 against ``label``
            (presumably 0/1 integers -- confirm against the result files).
        file_key: Identifier of the result file. Currently unused; kept so
            existing callers keep working.

    Returns:
        Tuple ``(results, valid_df)``.

        ``results`` is a dict with the keys ``total_samples``,
        ``valid_predictions``, ``no_predictions``, ``valid_accuracy``
        (accuracy over true/false responses only), ``conservative_accuracy``
        (unparseable responses counted as wrong), ``no_prediction_rate`` and
        ``response_breakdown`` (category -> count). The three ratios are 0.0
        when there is nothing to average over.

        ``valid_df`` is a copy of the rows whose response was true/false,
        with an extra integer ``predicted`` column.
    """
    df['response_category'] = df['response'].map(_categorize_response)
    category = df['response_category']

    total_samples = len(df)
    no_predictions = int((category == 'no_prediction').sum())

    # Only valid predictions
    valid_df = df[category.isin(['true', 'false'])].copy()
    valid_df['predicted'] = (valid_df['response_category'] == 'true').astype(int)

    if len(valid_df) > 0:
        valid_accuracy = (valid_df['predicted'] == valid_df['label']).mean()
    else:
        valid_accuracy = 0.0

    # Conservative prediction: an unparseable response is given the opposite
    # of the true label so that it always counts as an error.
    df['predicted_conservative'] = np.where(
        category == 'true',
        1,
        np.where(category == 'false', 0, 1 - df['label']),
    )

    if total_samples > 0:
        conservative_accuracy = (df['predicted_conservative'] == df['label']).mean()
        no_prediction_rate = no_predictions / total_samples
    else:
        # Empty input: avoid NaN / ZeroDivisionError and report zeros.
        conservative_accuracy = 0.0
        no_prediction_rate = 0.0

    results = {
        'total_samples': total_samples,
        'valid_predictions': len(valid_df),
        'no_predictions': no_predictions,
        'valid_accuracy': valid_accuracy,
        'conservative_accuracy': conservative_accuracy,
        'no_prediction_rate': no_prediction_rate,
        'response_breakdown': category.value_counts().to_dict()
    }

    return results, valid_df

# Run the comprehensive analysis on every loaded result file, keeping both the
# summary dict and the valid-prediction subset under the file's key.
analysis_results = {}
valid_dataframes = {}

for file_key, df in dataframes.items():
    print(f"\n{file_key}:")
    results, valid_df = analyze_responses_comprehensive(df, file_key)

    analysis_results[file_key] = results
    valid_dataframes[file_key] = valid_df

    # Per-file report: counts first, then the two accuracy variants
    summary_lines = (
        f"  Total samples: {results['total_samples']:,}",
        f"  Valid predictions: {results['valid_predictions']:,} ({results['valid_predictions']/results['total_samples']*100:.1f}%)",
        f"  No predictions: {results['no_predictions']:,} ({results['no_prediction_rate']*100:.1f}%)",
        f"  Valid accuracy: {results['valid_accuracy']:.3f}",
        f"  Conservative accuracy: {results['conservative_accuracy']:.3f}",
    )
    for summary_line in summary_lines:
        print(summary_line)

In [None]:
# Path to the taxonomy JSON file. Its values are used as the categories by
# which attributes are grouped and colour-coded in the F1 plot. The path is
# relative, so it is resolved against the kernel's working directory.
TAXONOMY_FILE = "../dataset/mcrae-x-things-taxonomy-simp.json"

def load_taxonomy(taxonomy_file):
    """Load a taxonomy from a JSON file.

    Args:
        taxonomy_file: Path of the JSON file to read.

    Returns:
        The decoded JSON content (callers treat it as a dict).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; pass the encoding explicitly so non-ASCII names load
    # correctly even where the platform's default encoding is different.
    with open(taxonomy_file, "r", encoding="utf-8") as f:
        return json.load(f)

# Read the taxonomy and report its size and the distinct categories it uses.
taxonomy = load_taxonomy(TAXONOMY_FILE)
print(f"Loaded taxonomy with {len(taxonomy)} attributes")
taxonomy_categories = sorted(set(taxonomy.values()))
print(f"Categories: {taxonomy_categories}")

# Categories listed here are presented first, in exactly this order.
fixed_category_order = [
    "an_animal", "a_bird", "a_food",
    "a_vehicle", "beh_-_flies", "has_legs",
]

# Build three parallel lists (attribute, F1 score, category) for the F1 plot.
# Categories from `fixed_category_order` come first, in that order, followed by
# any other category in alphabetical order. Within a category the attributes
# are sorted by F1 score, best first.
# NOTE(review): `f1_results` is not defined in the cells above. It must be
# created by another cell before this one runs, and is read here as
# {attribute: {'f1_score': ...}}.


def _attrs_sorted_by_f1(category, f1_results, taxonomy):
    """Return the (attribute, F1) pairs of `category`, highest F1 first."""
    pairs = [
        (attr, f1_results[attr]['f1_score'])
        for attr in f1_results.keys()
        if taxonomy.get(attr) == category
    ]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs


# Categories that occur in the results but are not part of the fixed order.
remaining_categories = set(taxonomy.get(attr) for attr in f1_results.keys()) - set(fixed_category_order)

# Attributes missing from the taxonomy map to None. Drop None (and any other
# empty category) *before* sorting: sorted() cannot compare None with str.
category_sequence = list(fixed_category_order) + sorted(
    category for category in remaining_categories if category
)

attributes_with_categories = []
f1_scores_ordered = []
categories_ordered = []

for category in category_sequence:
    # The score variable is deliberately not called `f1_score`: that would
    # rebind the sklearn.metrics.f1_score function imported at the top.
    for attr, attr_f1 in _attrs_sorted_by_f1(category, f1_results, taxonomy):
        attributes_with_categories.append(attr)
        f1_scores_ordered.append(attr_f1)
        categories_ordered.append(category)

# Distinct categories in display order: first the fixed-order categories that
# actually occur, then every other category in order of first appearance.
unique_categories = [cat for cat in fixed_category_order if cat in categories_ordered]
for cat in categories_ordered:
    if cat not in unique_categories:
        unique_categories.append(cat)

# One Set3 colour per category, sampled evenly across the colormap
palette = plt.cm.Set3(np.linspace(0, 1, len(unique_categories)))
category_colors = {cat: color for cat, color in zip(unique_categories, palette)}

# One bar per attribute, coloured by its category; heights are F1 in percent
plt.figure(figsize=(20, 10))
x_pos = range(len(attributes_with_categories))
bar_colors = [category_colors[cat] for cat in categories_ordered]
f1_percentages = [score * 100 for score in f1_scores_ordered]

bars = plt.bar(
    x_pos,
    f1_percentages,
    color=bar_colors,
    alpha=0.7,
    edgecolor="black",
    linewidth=0.5,
)

# Value label just above each bar; the gap is 1% of the tallest bar
top_score = max(f1_scores_ordered)
label_gap = top_score * 100 * 0.01
for bar, score in zip(bars, f1_scores_ordered):
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + label_gap,
        f"{score * 100:.1f}%",
        ha="center",
        va="bottom",
        fontsize=10,
        fontweight="bold",
    )

# Record the bar indices of every category and draw a vertical separator
# wherever the category changes from one bar to the next
cat_positions = {}
previous_cat = None
for idx, cat in enumerate(categories_ordered):
    cat_positions.setdefault(cat, []).append(idx)

    category_changed = previous_cat is not None and cat != previous_cat
    if category_changed:
        plt.axvline(x=idx - 0.5, color="gray", linestyle="-", linewidth=2, alpha=0.6)
    previous_cat = cat

# Category name centred over its group of bars, near the top of the plot
y_max = top_score * 100 * 1.15
for cat, positions in cat_positions.items():
    center_pos = (min(positions) + max(positions)) / 2
    label_box = {
        "boxstyle": "round,pad=0.3",
        "facecolor": category_colors[cat],
        "alpha": 0.3,
        "edgecolor": "black",
    }
    plt.text(
        center_pos,
        y_max * 0.92,
        cat,
        ha="center",
        va="center",
        fontweight="bold",
        fontsize=12,
        bbox=label_box,
    )

# Axis labels, title and grid
plt.xlabel("Attributes (grouped by category)", fontsize=12)
plt.ylabel("F1 Score (%)", fontsize=12)
plt.title("F1 Scores by Attribute - Prompting Results", fontsize=20)
plt.grid(True, alpha=0.3, axis="y")

# Attribute names as tick labels; fixed 0-110 range leaves headroom above 100%
plt.xticks(x_pos, attributes_with_categories, rotation=45, ha="right", fontsize=14, fontweight='bold')
plt.ylim(0, 110)

plt.tight_layout()
plt.show()

# Overall F1 statistics across all plotted attributes
print(f"\nF1 Score Summary:")
print(f"Mean F1: {np.mean(f1_scores_ordered):.3f}")
print(f"Std F1:  {np.std(f1_scores_ordered):.3f}")

worst_attr = attributes_with_categories[np.argmin(f1_scores_ordered)]
print(f"Min F1:  {np.min(f1_scores_ordered):.3f} ({worst_attr})")

best_attr = attributes_with_categories[np.argmax(f1_scores_ordered)]
print(f"Max F1:  {np.max(f1_scores_ordered):.3f} ({best_attr})")

# Mean / min / max / count per category, in display order
print(f"\nF1 Score by Category:")
print("-" * 50)
for cat in unique_categories:
    cat_scores = [
        score
        for score, score_cat in zip(f1_scores_ordered, categories_ordered)
        if score_cat == cat
    ]
    if not cat_scores:
        continue
    print(f"{cat:<20}: Mean={np.mean(cat_scores):.3f}, Min={np.min(cat_scores):.3f}, Max={np.max(cat_scores):.3f}, n={len(cat_scores)}")