# Winoground Results Analysis

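This notebook loads every Winoground evaluation result (one JSON file per model), collects the scores into a pandas DataFrame, and exports them as a summary CSV, a pivot table of group scores, and a LaTeX table.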

In [None]:
import pandas as pd
import json
import glob
import os
from pathlib import Path

In [None]:
# Input and output locations (relative paths, resolved against the working directory)
output_csv = "../results/winoground_summary.csv"  # summary CSV written by this notebook
results_dir = "../results/winoground/"  # directory scanned for per-model result JSON files

In [None]:
# Collect the result files: every *.json located directly inside results_dir.
# glob returns matches in arbitrary, OS-dependent order, so sort them to keep
# the row order of the DataFrame and exported files reproducible across runs.
json_files = sorted(glob.glob(os.path.join(results_dir, "*.json")))
print(f"Found {len(json_files)} result files:")
for file in json_files:
    print(f"  - {os.path.basename(file)}")

In [None]:
# Process each file and create rows for DataFrame.
# Every file contributes one 'Overall' row plus one row per entry in 'by_tag'.
rows = []

for json_file in json_files:
    # The file name without its extension is used as the model name.
    model_name = Path(json_file).stem

    # JSON is UTF-8 by specification; pass the encoding explicitly so decoding
    # does not depend on the platform's locale default.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Navigate to winoground results within experiments.
    # A file without this structure raises KeyError rather than being skipped.
    winoground_data = data['experiments']['winoground']

    # Overall results. Scores are multiplied by 100 (presumably stored as
    # fractions and reported as percentages -- confirm against the evaluator).
    # The overall count comes from the top-level 'total_examples' field.
    overall = winoground_data['overall']
    rows.append({
        'model': model_name,
        'tag': 'Overall',
        'text_score': overall['text'] * 100,
        'image_score': overall['image'] * 100,
        'group_score': overall['group'] * 100,
        'count': winoground_data['total_examples']
    })

    # Tag-specific results; each tag entry carries its own 'count'.
    for tag, tag_data in winoground_data['by_tag'].items():
        rows.append({
            'model': model_name,
            'tag': tag,
            'text_score': tag_data['text'] * 100,
            'image_score': tag_data['image'] * 100,
            'group_score': tag_data['group'] * 100,
            'count': tag_data['count']
        })

print(f"Created {len(rows)} rows from {len(json_files)} files")

In [None]:
# Turn the collected row dicts into a table (dict keys become the columns)
df = pd.DataFrame(rows)
print(f"DataFrame shape: {df.shape}")
print("\nColumns:", list(df.columns))
print("\nFirst few rows:")
# Trailing expression: the notebook renders the first ten rows as the cell output
df.iloc[:10]

In [None]:
# Quick overview of what was loaded: distinct models and distinct tags
print("Models found:", df['model'].unique())
print("\nTags found:", df['tag'].unique())
print("\nSample counts by tag:")
# Per-tag example counts are read from the rows of the first model in the table
reference_model = df['model'].iloc[0]
tag_counts = df.loc[df['model'] == reference_model].set_index('tag')['count']
print(tag_counts)

In [None]:
# Save to CSV: write the long-format table (one row per model/tag pair) to
# output_csv. index=False keeps the DataFrame's row index out of the file.
df.to_csv(output_csv, index=False)
print(f"Results saved to: {output_csv}")
# The shape reported here is that of the in-memory DataFrame; the file is not re-read.
print(f"CSV shape: {df.shape}")

In [None]:
# Optional: Create a pivot table for easier viewing
# Pivot by model and tag for group scores: rows are tags, columns are models,
# and each cell holds the group_score of that model for that tag.
# NOTE: DataFrame.pivot raises ValueError if any (tag, model) pair occurs more
# than once, e.g. when two result files share the same stem.
pivot_group = df.pivot(index='tag', columns='model', values='group_score')
print("\nGroup Scores by Model and Tag:")
# `display` is not imported in the visible cells; it is the rich-output helper
# that IPython/Jupyter makes available. Rounding affects only the shown copy.
display(pivot_group.round(2))

In [None]:
# Save pivot table as well.
# The destination is held in one variable so the file that is written and the
# path reported in the message cannot drift apart.
pivot_csv = "../results/winoground_pivot_group.csv"
pivot_group.to_csv(pivot_csv)
print(f"Pivot table saved to: {pivot_csv}")

# Generate LaTeX table
def generate_latex_table(df):
    """Render per-model Winoground scores as a LaTeX ``table*`` environment.

    The table has one row per model and, for every tag in a fixed order, three
    numeric columns (text, image and group score) formatted with one decimal.
    The generated source uses booktabs-style rules, siunitx ``S`` columns and
    a resizebox wrapper, so the including document must load those packages.

    Args:
        df: DataFrame with the columns ``model``, ``tag``, ``text_score``,
            ``image_score`` and ``group_score``. Models are listed in order of
            first appearance. If a (model, tag) pair occurs several times,
            the first matching row is used.

    Returns:
        The LaTeX source as a single string with newline-separated lines.
    """
    # Fixed tag order so that every model row has the same column layout.
    tags = ['Overall', 'Non Minimal', 'Unusual Image', 'Visually Difficult',
            'Unusual Text', 'Ambiguously Correct', 'Complex Reasoning', 'NoTag']
    metrics = ['text_score', 'image_score', 'group_score']
    metric_labels = ["{Text}", "{Image}", "{Group}"]
    models = df['model'].unique()

    # Derive all column counts from the tag/metric lists so the preamble,
    # header spans and rules stay consistent if the lists ever change.
    n_metrics = len(metrics)
    n_score_cols = len(tags) * n_metrics
    last_col = n_score_cols + 1  # column 1 holds the model name

    latex_lines = [
        "\\begin{table*}[t]",
        "\\centering",
        "\\resizebox{\\textwidth}{!}{%",
        "\\begin{tabular}{@{} l",
        f"    *{{{n_score_cols}}}{{S[table-format=2.1]}} "
        f"% {len(tags)} tags x {n_metrics} metrics each",
        "    @{}}",
        "\\toprule",
        # Main header spanning all score columns
        f"& \\multicolumn{{{n_score_cols}}}{{c}}"
        "{\\textbf{Winoground Results by Tag Category}} \\\\",
        f"\\cmidrule(lr){{2-{last_col}}}",
    ]

    # Tag headers: one multicolumn cell per tag plus a partial rule beneath it.
    header_cells = [""]  # empty cell above the model-name column
    cmidrules = []
    for i, tag in enumerate(tags):
        start_col = 2 + i * n_metrics
        end_col = start_col + n_metrics - 1
        header_cells.append(
            f"\\multicolumn{{{n_metrics}}}{{c}}{{\\textbf{{{tag}}}}}"
        )
        cmidrules.append(f"\\cmidrule(lr){{{start_col}-{end_col}}}")

    # Cells must be separated by '&'; joining with a plain space would put
    # every \multicolumn into the first cell and break compilation.
    latex_lines.append(" & ".join(header_cells) + " \\\\")
    latex_lines.append(" ".join(cmidrules))

    # Metric headers (braced so the S columns treat them as text)
    metric_cells = ["\\textbf{Model}"] + metric_labels * len(tags)
    latex_lines.append(" & ".join(metric_cells) + " \\\\")
    latex_lines.append("\\midrule")

    # Data rows
    for model in models:
        model_data = df[df['model'] == model]
        # Escape underscores, which are special characters in LaTeX text.
        row_cells = [str(model).replace('_', '\\_')]

        for tag in tags:
            tag_rows = model_data[model_data['tag'] == tag]
            if tag_rows.empty:
                # Non-numeric content in an S column has to be wrapped in
                # braces, otherwise siunitx tries to parse it as a number.
                row_cells.extend(["{-}"] * n_metrics)
            else:
                first = tag_rows.iloc[0]
                row_cells.extend(f"{first[metric]:.1f}" for metric in metrics)

        latex_lines.append(" & ".join(row_cells) + " \\\\")

    # End table; the lone "}" closes the resizebox opened above.
    latex_lines.extend([
        "\\bottomrule",
        "\\end{tabular}",
        "}",
        "\\end{table*}",
    ])

    return "\n".join(latex_lines)

# Generate and display LaTeX table
latex_table = generate_latex_table(df)
banner = "=" * 80
print("\n" + banner)
print("LATEX TABLE:")
print(banner)
print(latex_table)
print(banner)

# Save LaTeX table to file.
# The encoding is given explicitly so that model names containing non-ASCII
# characters are written the same way regardless of the platform's locale.
latex_path = "../results/winoground_table.tex"
with open(latex_path, "w", encoding="utf-8") as f:
    f.write(latex_table)
print(f"\nLaTeX table saved to: {latex_path}")