# FDT Evaluation Results Analysis

This notebook analyzes evaluation results from chat and cogex models, comparing their choices against CDT/FDT/EDT recommendations.

In [1]:
import json
import pandas as pd
from pathlib import Path
from glob import glob

## Load FDT Dataset

Load the ground truth dataset with CDT/FDT/EDT recommendations for each problem and schema.

In [2]:
DATA_DIR = Path("../data")

# Load fdt.json
with open(DATA_DIR / "fdt.json") as f:
    fdt_data = json.load(f)

# Create lookup dict: {problem_id: {"A": {"CDT": x, "FDT": y, "EDT": z}, ...}}
ground_truth = {}
for problem in fdt_data["problems"]:
    pid = problem["id"]
    ground_truth[pid] = {}
    for schema_key in ["A", "B", "C"]:
        schema = problem["schema"][schema_key]
        ground_truth[pid][schema_key] = {
            "CDT": schema["CDT"],
            "FDT": schema["FDT"],
            "EDT": schema["EDT"]
        }

print(f"Loaded {len(ground_truth)} problems from fdt.json")
print(f"Problem IDs range: {min(ground_truth.keys())} to {max(ground_truth.keys())}")

Loaded 999 problems from fdt.json
Problem IDs range: 0 to 998


## Load Result Files

Load all chat-*.json and cogex-*.json result files.

In [3]:
# Find all result files
chat_files = sorted(glob(str(DATA_DIR / "chat-*.json")))
cogex_files = sorted(glob(str(DATA_DIR / "cogex-*.json")))
all_result_files = chat_files + cogex_files

print(f"Found {len(chat_files)} chat result files:")
for f in chat_files:
    print(f"  - {Path(f).name}")

print(f"\nFound {len(cogex_files)} cogex result files:")
for f in cogex_files:
    print(f"  - {Path(f).name}")

Found 3 chat result files:
  - chat-llama2-13b.json
  - chat-llama2-7b.json
  - chat-qwen3-8b.json

Found 3 cogex result files:
  - cogex-llama2-13b.json
  - cogex-llama2-7b.json
  - cogex-qwen3-8b.json


In [4]:
# Load all result files
results = {}
for filepath in all_result_files:
    filename = Path(filepath).stem  # e.g., "chat-qwen3-8b"
    with open(filepath) as f:
        results[filename] = json.load(f)
    print(f"Loaded {filename}: {len(results[filename])} problems")

print(f"\nTotal models loaded: {len(results)}")

Loaded chat-llama2-13b: 999 problems
Loaded chat-llama2-7b: 999 problems
Loaded chat-qwen3-8b: 999 problems
Loaded cogex-llama2-13b: 999 problems
Loaded cogex-llama2-7b: 999 problems
Loaded cogex-qwen3-8b: 999 problems

Total models loaded: 6


## Check for Missing Problems

Report which problems are missing from each result file.

In [5]:
all_problem_ids = set(ground_truth.keys())

for model_name, model_results in results.items():
    # Convert string keys to int for comparison
    result_ids = set(int(k) for k in model_results.keys())
    missing_ids = all_problem_ids - result_ids
    
    if missing_ids:
        print(f"\n{model_name}: MISSING {len(missing_ids)} problems")
        if len(missing_ids) <= 20:
            print(f"  Missing IDs: {sorted(missing_ids)}")
        else:
            print(f"  First 20 missing: {sorted(missing_ids)[:20]}...")
    else:
        print(f"{model_name}: Complete ({len(result_ids)} problems)")

chat-llama2-13b: Complete (999 problems)
chat-llama2-7b: Complete (999 problems)
chat-qwen3-8b: Complete (999 problems)
cogex-llama2-13b: Complete (999 problems)
cogex-llama2-7b: Complete (999 problems)
cogex-qwen3-8b: Complete (999 problems)


## Create Hierarchical DataFrames

For each model, create a DataFrame where:
- Index: problem_id
- Columns: MultiIndex (schema, theory) e.g., ("A", "CDT"), ("A", "FDT"), etc.
- Values: 1 if model agrees, 0 if disagrees, -1 for parse failures

In [6]:
def create_agreement_df(model_results: dict, ground_truth: dict) -> pd.DataFrame:
    """
    Create a DataFrame comparing model choices to CDT/FDT/EDT recommendations.
    
    Args:
        model_results: Dict mapping problem_id (str) -> {"A": {"choice": int}, "B": ..., "C": ...}
        ground_truth: Dict mapping problem_id (int) -> {"A": {"CDT": int, "FDT": int, "EDT": int}, ...}
    
    Returns:
        DataFrame with MultiIndex columns (schema, theory) and values 1/0/-1
    """
    schemas = ["A", "B", "C"]
    theories = ["CDT", "FDT", "EDT"]
    
    # Create MultiIndex columns
    columns = pd.MultiIndex.from_product([schemas, theories], names=["schema", "theory"])
    
    rows = []
    row_indices = []
    
    for pid in sorted(ground_truth.keys()):
        pid_str = str(pid)
        
        # Check if this problem exists in results
        if pid_str not in model_results:
            # Problem missing from results - skip it
            continue
        
        row_data = []
        for schema in schemas:
            model_choice = model_results[pid_str][schema]["choice"]
            
            for theory in theories:
                gt_recommendation = ground_truth[pid][schema][theory]
                
                if model_choice == -1:
                    # Parse failure - set to -1
                    row_data.append(-1)
                elif model_choice == gt_recommendation:
                    # Agreement
                    row_data.append(1)
                else:
                    # Disagreement
                    row_data.append(0)
        
        rows.append(row_data)
        row_indices.append(pid)
    
    df = pd.DataFrame(rows, index=row_indices, columns=columns)
    df.index.name = "problem_id"
    return df

In [7]:
# Create DataFrames for all models
model_dfs = {}
for model_name, model_results in results.items():
    model_dfs[model_name] = create_agreement_df(model_results, ground_truth)
    print(f"{model_name}: {len(model_dfs[model_name])} rows")

chat-llama2-13b: 999 rows
chat-llama2-7b: 999 rows
chat-qwen3-8b: 999 rows
cogex-llama2-13b: 999 rows
cogex-llama2-7b: 999 rows
cogex-qwen3-8b: 999 rows


In [8]:
# Display sample from first model
first_model = list(model_dfs.keys())[0]
print(f"Sample from {first_model}:")
model_dfs[first_model].head(10)

Sample from chat-llama2-13b:


schema,A,A,A,B,B,B,C,C,C
theory,CDT,FDT,EDT,CDT,FDT,EDT,CDT,FDT,EDT
problem_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0,0,1,1,0,0,1,1,1,1
1,0,1,1,0,0,1,1,1,1
2,0,1,1,1,1,0,1,1,1
3,0,1,1,0,0,1,1,1,1
4,0,1,1,0,0,1,0,0,0
5,0,1,1,0,0,1,1,1,1
6,1,0,0,1,1,0,1,1,1
7,0,1,1,1,1,0,1,1,1
8,0,1,1,0,0,1,0,0,0
9,1,0,0,0,0,1,1,1,1


## Summary Statistics

Calculate agreement rates per decision theory per schema for each model.

In [9]:
def calculate_agreement_rates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate agreement rates, excluding parse failures (-1).
    
    Returns DataFrame with agreement rates per (schema, theory).
    """
    rates = {}
    for col in df.columns:
        valid_mask = df[col] != -1
        valid_count = valid_mask.sum()
        if valid_count > 0:
            agreement_rate = (df[col] == 1).sum() / valid_count
        else:
            agreement_rate = float('nan')
        rates[col] = {
            "agreement_rate": agreement_rate,
            "valid_count": valid_count,
            "parse_failures": (df[col] == -1).sum()
        }
    return pd.DataFrame(rates).T

In [10]:
# Calculate and display summary for each model
for model_name, df in model_dfs.items():
    print(f"\n{'='*60}")
    print(f"Model: {model_name}")
    print(f"{'='*60}")
    
    summary = calculate_agreement_rates(df)
    
    # Format for display
    summary["agreement_rate"] = summary["agreement_rate"].apply(lambda x: f"{x:.1%}" if pd.notna(x) else "N/A")
    summary["valid_count"] = summary["valid_count"].astype(int)
    summary["parse_failures"] = summary["parse_failures"].astype(int)
    
    display(summary)


Model: chat-llama2-13b


Unnamed: 0,Unnamed: 1,agreement_rate,valid_count,parse_failures
A,CDT,35.6%,999,0
A,FDT,64.4%,999,0
A,EDT,64.4%,999,0
B,CDT,50.2%,999,0
B,FDT,50.2%,999,0
B,EDT,49.8%,999,0
C,CDT,56.8%,999,0
C,FDT,56.8%,999,0
C,EDT,56.8%,999,0



Model: chat-llama2-7b


Unnamed: 0,Unnamed: 1,agreement_rate,valid_count,parse_failures
A,CDT,46.5%,999,0
A,FDT,53.5%,999,0
A,EDT,53.5%,999,0
B,CDT,51.3%,999,0
B,FDT,51.3%,999,0
B,EDT,48.7%,999,0
C,CDT,62.0%,999,0
C,FDT,62.0%,999,0
C,EDT,62.0%,999,0



Model: chat-qwen3-8b


Unnamed: 0,Unnamed: 1,agreement_rate,valid_count,parse_failures
A,CDT,68.9%,999,0
A,FDT,31.1%,999,0
A,EDT,31.1%,999,0
B,CDT,81.8%,999,0
B,FDT,81.8%,999,0
B,EDT,18.2%,999,0
C,CDT,88.6%,999,0
C,FDT,88.6%,999,0
C,EDT,88.6%,999,0



Model: cogex-llama2-13b


Unnamed: 0,Unnamed: 1,agreement_rate,valid_count,parse_failures
A,CDT,42.0%,999,0
A,FDT,58.0%,999,0
A,EDT,58.0%,999,0
B,CDT,58.2%,998,1
B,FDT,58.2%,998,1
B,EDT,41.8%,998,1
C,CDT,59.6%,999,0
C,FDT,59.6%,999,0
C,EDT,59.6%,999,0



Model: cogex-llama2-7b


Unnamed: 0,Unnamed: 1,agreement_rate,valid_count,parse_failures
A,CDT,34.0%,995,4
A,FDT,66.0%,995,4
A,EDT,66.0%,995,4
B,CDT,48.0%,996,3
B,FDT,48.0%,996,3
B,EDT,52.0%,996,3
C,CDT,50.9%,998,1
C,FDT,50.9%,998,1
C,EDT,50.9%,998,1



Model: cogex-qwen3-8b


Unnamed: 0,Unnamed: 1,agreement_rate,valid_count,parse_failures
A,CDT,91.0%,999,0
A,FDT,9.0%,999,0
A,EDT,9.0%,999,0
B,CDT,92.5%,999,0
B,FDT,92.5%,999,0
B,EDT,7.5%,999,0
C,CDT,95.7%,999,0
C,FDT,95.7%,999,0
C,EDT,95.7%,999,0


In [11]:
# Create a comparison table across all models
comparison_data = []
for model_name, df in model_dfs.items():
    row = {"model": model_name}
    for schema in ["A", "B", "C"]:
        for theory in ["CDT", "FDT", "EDT"]:
            col = (schema, theory)
            valid_mask = df[col] != -1
            valid_count = valid_mask.sum()
            if valid_count > 0:
                rate = (df[col] == 1).sum() / valid_count
                row[f"{schema}_{theory}"] = rate
            else:
                row[f"{schema}_{theory}"] = float('nan')
    comparison_data.append(row)

comparison_df = pd.DataFrame(comparison_data).set_index("model")
comparison_df

Unnamed: 0_level_0,A_CDT,A_FDT,A_EDT,B_CDT,B_FDT,B_EDT,C_CDT,C_FDT,C_EDT
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
chat-llama2-13b,0.356356,0.643644,0.643644,0.501502,0.501502,0.498498,0.567568,0.567568,0.567568
chat-llama2-7b,0.465465,0.534535,0.534535,0.512513,0.512513,0.487487,0.61962,0.61962,0.61962
chat-qwen3-8b,0.688689,0.311311,0.311311,0.817818,0.817818,0.182182,0.885886,0.885886,0.885886
cogex-llama2-13b,0.42042,0.57958,0.57958,0.582164,0.582164,0.417836,0.595596,0.595596,0.595596
cogex-llama2-7b,0.339698,0.660302,0.660302,0.47992,0.47992,0.52008,0.509018,0.509018,0.509018
cogex-qwen3-8b,0.90991,0.09009,0.09009,0.924925,0.924925,0.075075,0.956957,0.956957,0.956957


In [12]:
# Format as percentages for easier reading
comparison_df_pct = comparison_df.applymap(lambda x: f"{x:.1%}" if pd.notna(x) else "N/A")
comparison_df_pct

  comparison_df_pct = comparison_df.applymap(lambda x: f"{x:.1%}" if pd.notna(x) else "N/A")


Unnamed: 0_level_0,A_CDT,A_FDT,A_EDT,B_CDT,B_FDT,B_EDT,C_CDT,C_FDT,C_EDT
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
chat-llama2-13b,35.6%,64.4%,64.4%,50.2%,50.2%,49.8%,56.8%,56.8%,56.8%
chat-llama2-7b,46.5%,53.5%,53.5%,51.3%,51.3%,48.7%,62.0%,62.0%,62.0%
chat-qwen3-8b,68.9%,31.1%,31.1%,81.8%,81.8%,18.2%,88.6%,88.6%,88.6%
cogex-llama2-13b,42.0%,58.0%,58.0%,58.2%,58.2%,41.8%,59.6%,59.6%,59.6%
cogex-llama2-7b,34.0%,66.0%,66.0%,48.0%,48.0%,52.0%,50.9%,50.9%,50.9%
cogex-qwen3-8b,91.0%,9.0%,9.0%,92.5%,92.5%,7.5%,95.7%,95.7%,95.7%
