# Prompting Results Analysis

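This notebook analyzes the results of attribute prompting experiments: it loads the JSON result files, categorizes each model response as true, false, or no prediction, and reports accuracy and F1 scores per attribute.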

In [None]:
import glob
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [None]:
# Load all JSON files from a folder
results_folder = "../results/paligemma2"  # Update this path

# Get all JSON files in the folder.
# glob returns paths in arbitrary filesystem order; sorting keeps the order of
# `dataframes` (and everything printed or plotted from it) stable across runs.
json_files = sorted(glob.glob(os.path.join(results_folder, "*.json")))
print(f"Found {len(json_files)} JSON files")

# Load each file into a separate DataFrame, keyed by file name without extension
dataframes = {}
for file_path in json_files:
    file_name = os.path.basename(file_path)
    print(f"Loading {file_name}...")

    # Explicit encoding: the default used by open() depends on the platform locale
    with open(file_path, 'r', encoding='utf-8') as f:
        file_data = json.load(f)

    # Create DataFrame for this file
    df = pd.DataFrame(file_data)

    # Add source file column for reference
    df['source_file'] = file_name

    # Store DataFrame with filename as key (without .json extension)
    file_key = os.path.splitext(file_name)[0]
    dataframes[file_key] = df

    print(f"  -> {len(df)} examples loaded into DataFrame '{file_key}'")

print(f"\nCreated {len(dataframes)} separate DataFrames")

# Summary: number of examples per loaded DataFrame
for file_key, df in dataframes.items():
    print(f"{file_key}: {len(df)} examples")

In [None]:
def _categorize_response(response):
    """Map one raw model response to 'true', 'false', or 'no_prediction'."""
    # Missing or non-string responses (None, NaN, numbers) cannot be parsed as
    # an answer, so they count as model failures instead of raising.
    if not isinstance(response, str):
        return 'no_prediction'
    normalized = response.strip().lower()
    return normalized if normalized in ('true', 'false') else 'no_prediction'


def analyze_responses_comprehensive(df, file_key):
    """Comprehensive analysis including model failures.

    Args:
        df: DataFrame with a 'response' column (raw model output) and a
            'label' column that is compared against 0/1 predictions.
        file_key: Identifier of the analyzed file. Currently unused; kept so
            existing callers keep working.

    Returns:
        A tuple ``(results, valid_df)``. ``results`` is a dict with the keys
        'total_samples', 'valid_predictions', 'no_predictions',
        'valid_accuracy', 'conservative_accuracy', 'no_prediction_rate' and
        'response_breakdown'. ``valid_df`` is a copy of the rows whose
        response was 'true' or 'false', with an integer 'predicted' column.

    Side effects:
        For a non-empty ``df``, the columns 'response_category' and
        'predicted_conservative' are added to ``df`` in place. An empty
        ``df`` is left untouched and an empty copy is returned as ``valid_df``.
    """
    total_samples = len(df)

    # Nothing to score: return zeroed metrics instead of dividing by zero below.
    if total_samples == 0:
        empty_results = {
            'total_samples': 0,
            'valid_predictions': 0,
            'no_predictions': 0,
            'valid_accuracy': 0.0,
            'conservative_accuracy': 0.0,
            'no_prediction_rate': 0.0,
            'response_breakdown': {}
        }
        return empty_results, df.copy()

    # Categorize all responses
    df['response_category'] = df['response'].apply(_categorize_response)

    # Only valid predictions (created before 'predicted_conservative' is added,
    # so that column is not part of valid_df)
    valid_df = df[df['response_category'].isin(['true', 'false'])].copy()
    valid_df['predicted'] = (valid_df['response_category'] == 'true').astype(int)

    if len(valid_df) > 0:
        valid_accuracy = (valid_df['predicted'] == valid_df['label']).mean()
    else:
        valid_accuracy = 0.0

    # Conservative accuracy (treat no_prediction as wrong)
    def _conservative_prediction(row):
        if row['response_category'] == 'true':
            return 1
        if row['response_category'] == 'false':
            return 0
        return 1 - row['label']  # Opposite of true label, i.e. always wrong

    df['predicted_conservative'] = df.apply(_conservative_prediction, axis=1)
    conservative_accuracy = (df['predicted_conservative'] == df['label']).mean()

    # Response breakdown
    response_counts = df['response_category'].value_counts()
    no_predictions = total_samples - len(valid_df)

    results = {
        'total_samples': total_samples,
        'valid_predictions': len(valid_df),
        'no_predictions': no_predictions,
        'valid_accuracy': valid_accuracy,
        'conservative_accuracy': conservative_accuracy,
        'no_prediction_rate': no_predictions / total_samples,
        'response_breakdown': response_counts.to_dict()
    }

    return results, valid_df

# Analyze all dataframes, keeping the metrics and the valid-only rows per file
analysis_results = {}
valid_dataframes = {}

for file_key, df in dataframes.items():
    print(f"\n{file_key}:")
    results, valid_df = analyze_responses_comprehensive(df, file_key)

    # Guard the percentage so a file without samples cannot divide by zero
    total_samples = results['total_samples']
    valid_pct = results['valid_predictions'] / total_samples * 100 if total_samples else 0.0

    print(f"  Total samples: {total_samples:,}")
    print(f"  Valid predictions: {results['valid_predictions']:,} ({valid_pct:.1f}%)")
    print(f"  No predictions: {results['no_predictions']:,} ({results['no_prediction_rate']*100:.1f}%)")
    print(f"  Valid accuracy: {results['valid_accuracy']:.3f}")
    print(f"  Conservative accuracy: {results['conservative_accuracy']:.3f}")

    analysis_results[file_key] = results
    valid_dataframes[file_key] = valid_df

In [None]:
# Create F1 score visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Extract data for plotting
attributes = list(f1_results.keys())
f1_scores = [f1_results[attr]['f1_score'] for attr in attributes]
accuracies = [f1_results[attr]['accuracy'] for attr in attributes]

# Plot 1: F1 Scores
bars1 = ax1.bar(attributes, f1_scores, color='skyblue', alpha=0.7)
ax1.set_title('F1 Scores by Attribute', fontsize=14, fontweight='bold')
ax1.set_ylabel('F1 Score', fontsize=12)
ax1.set_xlabel('Attribute', fontsize=12)
ax1.tick_params(axis='x', rotation=45)
ax1.set_ylim(0, 1)
ax1.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar, score in zip(bars1, f1_scores):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
             f'{score:.3f}', ha='center', va='bottom', fontweight='bold')

# Plot 2: F1 vs Accuracy comparison
bars2 = ax2.bar(range(len(attributes)), f1_scores, width=0.4, label='F1 Score', 
                color='skyblue', alpha=0.7)
bars3 = ax2.bar([x + 0.4 for x in range(len(attributes))], accuracies, width=0.4, 
                label='Accuracy', color='lightcoral', alpha=0.7)

ax2.set_title('F1 Score vs Accuracy Comparison', fontsize=14, fontweight='bold')
ax2.set_ylabel('Score', fontsize=12)
ax2.set_xlabel('Attribute', fontsize=12)
ax2.set_xticks([x + 0.2 for x in range(len(attributes))])
ax2.set_xticklabels(attributes, rotation=45)
ax2.set_ylim(0, 1)
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Print summary statistics
print(f"\nF1 Score Summary:")
print(f"Mean F1: {np.mean(f1_scores):.3f}")
print(f"Std F1:  {np.std(f1_scores):.3f}")
print(f"Min F1:  {np.min(f1_scores):.3f} ({attributes[np.argmin(f1_scores)]})")
print(f"Max F1:  {np.max(f1_scores):.3f} ({attributes[np.argmax(f1_scores)]})")

In [None]:
# Calculate F1 scores for all attributes
f1_results = {}

for file_key, valid_df in valid_dataframes.items():
    if len(valid_df) > 0:
        # Calculate F1 score
        f1 = f1_score(valid_df['label'], valid_df['predicted'])
        accuracy = accuracy_score(valid_df['label'], valid_df['predicted'])
        
        # Clean attribute name (remove filename prefix)
        if 'hf_inference_results_' in file_key:
            attribute_name = file_key.replace('hf_inference_results_', '')
        else:
            attribute_name = file_key
        
        f1_results[attribute_name] = {
            'f1_score': f1,
            'accuracy': accuracy,
            'valid_samples': len(valid_df),
            'total_samples': analysis_results[file_key]['total_samples']
        }

# Display F1 results
print("F1 Scores by Attribute:")
print("-" * 50)
for attr, metrics in f1_results.items():
    print(f"{attr:<20} F1: {metrics['f1_score']:.3f}  Acc: {metrics['accuracy']:.3f}  ({metrics['valid_samples']}/{metrics['total_samples']} samples)")

f1_results