# Evaluation Results Analysis

This notebook analyzes model evaluation results to understand:
- Where models are making mistakes
- Answer distribution patterns
- Most common confusions
- Per-class accuracy

In [None]:
import json
import re
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots created below.
sns.set_style("whitegrid")

## Configuration

In [None]:
# Path to the JSON results file to analyse. The load cell opens this path, and
# the final cell derives the name of the saved analysis summary from it.
RESULTS_FILE = "./output/results_paligemma_Controlled_Images_B_None_fouroption_False.json"

# Template for comparing several models (label -> results path). It is left
# commented out: the cells visible in this notebook only read RESULTS_FILE.
# RESULTS_FILES = {
#     "PaliGemma": "./output/results_paligemma_Controlled_Images_B_None_fouroption_False.json",
#     "Qwen2.5-VL": "./output/results_qwen_vllm_Controlled_Images_B_None_fouroption_False.json",
# }

## Load Results

In [None]:
def extract_spatial_answer(generation):
    """
    Extract spatial relation from model generation.

    The text is lowercased and stripped, then searched for each known
    relation as a standalone token. Surrounding punctuation such as
    "left.", "(right)" or "**below**" does not prevent a match, but a
    relation embedded in a longer word or hyphenated compound does
    ("frontal", or the "front" inside "in-front").

    When several relations occur in the text, the one listed first in
    ``spatial_relations`` wins, regardless of its position in the text.

    Args:
        generation: Raw model output string. ``None`` is treated as an
            empty generation; other non-string values are converted
            with ``str()``.

    Returns:
        Extracted spatial relation (lowercase). If no relation is found,
        the first 20 characters of the normalised generation, or
        'unknown' when the generation is empty.
    """
    if generation is None:
        return 'unknown'

    gen_lower = str(generation).lower().strip()
    if not gen_lower:
        return 'unknown'

    # List of possible spatial relations (order defines match priority)
    spatial_relations = [
        'left', 'right', 'above', 'below', 'top', 'bottom',
        'on', 'under', 'front', 'behind', 'in-front'
    ]

    for relation in spatial_relations:
        # A relation counts only when it is not glued to a word character or
        # a hyphen on either side. This keeps "in-front" from being reported
        # as "front" while still accepting punctuation-delimited answers.
        pattern = rf'(?<![\w-]){re.escape(relation)}(?![\w-])'
        if re.search(pattern, gen_lower):
            return relation

    # No known relation found: return the (truncated) normalised generation
    return gen_lower[:20]


# Load results
# JSON is read as UTF-8 explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(RESULTS_FILE, 'r', encoding='utf-8') as f:
    results = json.load(f)

print(f"Loaded {len(results)} results")
if results:
    # Show one record so the reader can see which fields are available.
    print("\nFirst result example:")
    print(json.dumps(results[0], indent=2))
else:
    # An empty results file would otherwise raise IndexError on results[0].
    print("\nResults file contains no entries.")

## Extract and Clean Predictions

In [None]:
# Extract predictions and ground truth
raw_generations = [result['Generation'] for result in results]
predicted_answers = [extract_spatial_answer(raw_gen) for raw_gen in raw_generations]

# 'Golden' is either a plain string or an indexable container whose first
# element is used as the label. It is lowercased AND stripped, matching the
# normalisation applied to the generation, so that stray whitespace in the
# reference cannot turn a correct prediction into a mismatch.
golden_answers = []
for result in results:
    gold = result['Golden']
    if not isinstance(gold, str):
        gold = gold[0]
    golden_answers.append(gold.lower().strip())

# A prediction is correct only on an exact match with the normalised label.
correct_predictions = [
    pred == gold for pred, gold in zip(predicted_answers, golden_answers)
]

# Create DataFrame for easier analysis
df = pd.DataFrame({
    'prompt': [r['Prompt'] for r in results],
    'raw_generation': raw_generations,
    'predicted': predicted_answers,
    'golden': golden_answers,
    'correct': correct_predictions
})

print(f"\nDataFrame shape: {df.shape}")
print("\nFirst few rows:")
df.head()

## Overall Statistics

In [None]:
# Headline numbers: sample count, number of exact matches, accuracy in percent.
total = len(df)
correct = df['correct'].sum()
accuracy = 100 * correct / total

# Assemble the report once and emit it with a single print call.
rule = "=" * 60
report_lines = [
    rule,
    "OVERALL STATISTICS",
    rule,
    f"Total samples: {total}",
    f"Correct predictions: {correct}",
    f"Accuracy: {accuracy:.2f}%",
    rule,
]
print("\n".join(report_lines))

## Per-Class Accuracy

Which spatial relations is the model struggling with?

In [None]:
# Per golden label: number of hits, number of samples, and hit rate.
per_class = df.groupby('golden').agg({'correct': ['sum', 'count', 'mean']}).round(4)
per_class.columns = ['correct', 'total', 'accuracy']
per_class['accuracy'] *= 100  # fraction -> percent
per_class = per_class.sort_values('accuracy', ascending=False)

print("\nPer-Class Accuracy:")
print(per_class)

# Bar chart of the per-class accuracy, best class first
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(per_class.index, per_class['accuracy'], color='steelblue', alpha=0.8)
ax.set_title(f'Per-Class Accuracy\nOverall Accuracy: {accuracy:.2f}%', fontsize=14)
ax.set_xlabel('Spatial Relation (Golden Answer)', fontsize=12)
ax.set_ylabel('Accuracy (%)', fontsize=12)
ax.set_ylim(0, 100)
# Dashed reference line at the overall accuracy
ax.axhline(y=accuracy, color='r', linestyle='--', alpha=0.5, label=f'Overall: {accuracy:.1f}%')
plt.xticks(rotation=45, ha='right')

# Annotate every bar with its sample count, slightly above the bar top
for bar, n_samples in zip(bars, per_class['total']):
    x_center = bar.get_x() + bar.get_width()/2.
    ax.text(x_center, bar.get_height() + 2,
            f'n={int(n_samples)}', ha='center', va='bottom', fontsize=10)

ax.legend()
plt.tight_layout()
plt.show()

## Answer Distribution

Compare the golden and predicted answer distributions.

In [None]:
# Count how often each answer occurs among the references and the predictions
golden_dist = Counter(golden_answers)
predicted_dist = Counter(predicted_answers)

# (title, counts, number of samples, bar colour) for each of the two views
distribution_views = [
    ('Golden Answer Distribution', golden_dist, len(golden_answers), 'green'),
    ('Predicted Answer Distribution', predicted_dist, len(predicted_answers), 'orange'),
]

# Text summary, most frequent answer first
for view_title, dist, n_samples, _ in distribution_views:
    print(f"\n{view_title}:")
    for answer, count in dist.most_common():
        pct = 100 * count / n_samples
        print(f"  {answer:15s}: {count:4d} ({pct:5.1f}%)")

# Side-by-side bar charts: golden on the left, predicted on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
for axis, (view_title, dist, _, bar_color) in zip((ax1, ax2), distribution_views):
    axis.bar(dist.keys(), dist.values(), color=bar_color, alpha=0.6)
    axis.set_xlabel('Spatial Relation', fontsize=12)
    axis.set_ylabel('Count', fontsize=12)
    axis.set_title(view_title, fontsize=14)
    plt.setp(axis.xaxis.get_majorticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

## Confusion Matrix

What does the model predict when it's wrong?

In [None]:
# Rows are golden answers, columns are predicted answers; both axes share the
# sorted union of all labels seen on either side.
all_labels = sorted(set(golden_answers) | set(predicted_answers))
label_to_idx = {label: position for position, label in enumerate(all_labels)}
n_labels = len(all_labels)

confusion = np.zeros((n_labels, n_labels))
for gold, pred in zip(golden_answers, predicted_answers):
    confusion[label_to_idx[gold], label_to_idx[pred]] += 1

# Row-normalise to percentages; the small epsilon avoids dividing by zero for
# labels that only ever appear as predictions (all-zero rows).
row_totals = confusion.sum(axis=1, keepdims=True)
confusion_norm = confusion / (row_totals + 1e-10) * 100

# (matrix, annotation format, colour map, colour-bar label, title):
# percentages first, then absolute counts.
heatmap_specs = [
    (confusion_norm, '.1f', 'YlOrRd', 'Percentage (%)',
     'Confusion Matrix (% of each golden answer)'),
    (confusion, '.0f', 'Blues', 'Count',
     'Confusion Matrix (Absolute Counts)'),
]

for matrix, annot_fmt, color_map, cbar_label, title in heatmap_specs:
    fig, ax = plt.subplots(figsize=(14, 12))
    sns.heatmap(matrix, annot=True, fmt=annot_fmt, cmap=color_map,
                xticklabels=all_labels, yticklabels=all_labels,
                cbar_kws={'label': cbar_label}, ax=ax)
    ax.set_xlabel('Predicted Answer', fontsize=12)
    ax.set_ylabel('Golden Answer', fontsize=12)
    ax.set_title(title, fontsize=14)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

## Most Common Confusion Patterns

In [None]:
# For every golden label, list the (up to) five most frequent predictions
print("\nMost Common Confusion Patterns:")
print("="*60)

for gold in sorted(set(golden_answers)):
    gold_mask = df['golden'] == gold
    n_gold = sum(gold_mask)  # number of samples carrying this golden label
    pred_counts = Counter(df.loc[gold_mask, 'predicted'])

    print(f"\nWhen golden answer is '{gold}' (n={n_gold}):")
    for pred, count in pred_counts.most_common(5):
        pct = 100 * count / n_gold
        if pred == gold:
            marker = "✓"
        else:
            marker = "✗"
        print(f"  {marker} {pred:15s}: {count:4d} ({pct:5.1f}%)")

## Explore Specific Errors

In [None]:
# Keep an independent copy of the rows whose prediction missed the golden label
is_wrong = ~df['correct']
incorrect_df = df.loc[is_wrong].copy()
print(f"\nTotal incorrect predictions: {len(incorrect_df)}")

# Display the first errors; the trailing expression is rendered as the cell output
print("\nFirst 10 errors:")
error_columns = ['prompt', 'raw_generation', 'predicted', 'golden']
incorrect_df[error_columns].head(10)

## Analyze Specific Confusion Pair

In [None]:
# The (golden, predicted) pair to inspect; edit both values to look at a
# different confusion.
GOLD_ANSWER = 'left'  # Change this to investigate different confusions
PRED_ANSWER = 'right'  # Change this to investigate different confusions

# Rows where the reference is GOLD_ANSWER and the model answered PRED_ANSWER
confusion_mask = df['golden'].eq(GOLD_ANSWER) & df['predicted'].eq(PRED_ANSWER)
confusion_cases = df[confusion_mask]

print(f"\nCases where golden='{GOLD_ANSWER}' but predicted='{PRED_ANSWER}': {len(confusion_cases)}")
print("\nExamples:")
separator = "-" * 80
for case in confusion_cases.head(5).itertuples(index=False):
    print(f"\nPrompt: {case.prompt}")
    print(f"Generated: {case.raw_generation}")
    print(f"Predicted: {case.predicted} | Golden: {case.golden}")
    print(separator)

## Raw Generation Analysis

In [None]:
# Look at the raw generations to understand model behavior.
# The sample is seeded so that re-running the notebook shows the same examples;
# change SAMPLE_SEED to look at a different draw.
SAMPLE_SEED = 42
N_EXAMPLES = 5

# Split once instead of re-filtering the frame for every use
correct_df = df[df['correct']]
wrong_df = df[~df['correct']]

print("\nSample raw generations (correct predictions):")
# min(...) keeps sample() from failing when fewer than N_EXAMPLES rows exist
correct_sample = correct_df.sample(min(N_EXAMPLES, len(correct_df)), random_state=SAMPLE_SEED)
for idx, row in correct_sample.iterrows():
    print(f"\nGolden: {row['golden']}")
    print(f"Raw: {row['raw_generation']}")
    print("-" * 60)

print("\n\nSample raw generations (incorrect predictions):")
wrong_sample = wrong_df.sample(min(N_EXAMPLES, len(wrong_df)), random_state=SAMPLE_SEED)
for idx, row in wrong_sample.iterrows():
    print(f"\nGolden: {row['golden']} | Predicted: {row['predicted']}")
    print(f"Raw: {row['raw_generation']}")
    print("-" * 60)

## Save Analysis Summary

In [None]:
# Save summary to JSON
def _to_builtin(value):
    """Fallback for json.dump: turn NumPy scalars into plain Python values."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


summary = {
    'overall': {
        'total': total,
        'correct': int(correct),
        'accuracy': float(accuracy)
    },
    'per_class_accuracy': per_class.to_dict('index'),
    'golden_distribution': dict(golden_dist),
    'predicted_distribution': dict(predicted_dist)
}

# Derive the output path by swapping only a trailing '.json' for
# '_analysis.json'. A plain str.replace would rewrite every '.json' in the
# path, and would return the input path unchanged when the file has another
# extension, so the summary would overwrite the results file.
json_suffix = '.json'
if RESULTS_FILE.endswith(json_suffix):
    output_stem = RESULTS_FILE[:-len(json_suffix)]
else:
    output_stem = RESULTS_FILE
output_file = output_stem + '_analysis.json'

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, default=_to_builtin)

print(f"\nAnalysis saved to: {output_file}")