# Full Results Tables
Generate the Language Fidelity and Task Accuracy tables, together with an error-detection report over the raw response files.

In [None]:
import json
import glob
import pandas as pd
from pathlib import Path
from collections import defaultdict

# Model identifier -> display name used as the row label in the tables.
MODELS = dict([
    ('gpt-5', 'GPT-5'),
    ('gemini-3-pro', 'Gemini 3 Pro'),
    ('claude-opus-4.5', 'Claude Opus 4.5'),
    ('deepseek-v3.1', 'DeepSeek-V3.1'),
    ('command-r-plus', 'Command R+'),
])

# Non-English language codes covered by the conditions below.
LANGS = ['de', 'zh', 'es', 'ar']

# Condition names, in table column order: the English baseline, one baseline
# per language, then the en->X conditions, then the X->en conditions.
CONDITIONS = [
    'baseline_en',
    *(f'baseline_{lang}' for lang in LANGS),
    *(f'en_to_{lang}' for lang in LANGS),
    *(f'{lang}_to_en' for lang in LANGS),
]

## 1. Load Layer 1 (Language Fidelity) from Summary JSON

In [None]:
def load_layer1_json(model, condition):
    """Load the latest Layer 1 summary JSON for a model/condition pair.

    Searches ``results/layer1/{model}/language_summary_{condition}_*.json``
    relative to the current working directory. When several files match, the
    one whose path sorts last is used.

    Args:
        model: Model directory name under ``results/layer1``.
        condition: Condition name embedded in the file name.

    Returns:
        The parsed JSON content, or ``None`` when no file matches.
    """
    pattern = f'results/layer1/{model}/language_summary_{condition}_*.json'
    files = sorted(glob.glob(pattern))
    if not files:
        return None
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which may not be UTF-8 (e.g. on Windows).
    with open(files[-1], encoding='utf-8') as f:
        return json.load(f)

# Smoke test: show the Layer 1 summary for one model/condition, if present.
test = load_layer1_json('gpt-5', 'baseline_en')
if not test:
    print("No Layer 1 data found for gpt-5 baseline_en")
else:
    print("Sample Layer 1 JSON:", json.dumps(test, indent=2), sep="\n")

## 2. Load Layer 2 (Task Accuracy) from Evaluated JSONL

In [None]:
def load_layer2_jsonl(model, condition):
    """Load the latest Layer 2 evaluated JSONL file and tally pass/fail counts.

    Tries ``results/layer2/{model}/evaluated_{condition}_*.jsonl`` first and
    then ``results/layer2/{model}/evaluated_{condition}.jsonl``. Within the
    first pattern that matches, the file whose path sorts last is read.

    Args:
        model: Model directory name under ``results/layer2``.
        condition: Condition name embedded in the file name.

    Returns:
        ``None`` when no file matches; otherwise a dict with the keys
        ``total`` (well-formed records), ``passed``, ``failed``, ``error``
        (kept for compatibility; never incremented here) and ``parse_error``
        (lines that are not a JSON object). A record counts as passed when
        its ``evaluation`` object has a truthy ``passed``; any other record,
        including one with a missing or null ``evaluation``, counts as failed.
    """
    patterns = [
        f'results/layer2/{model}/evaluated_{condition}_*.jsonl',
        f'results/layer2/{model}/evaluated_{condition}.jsonl',
    ]

    for pattern in patterns:
        files = sorted(glob.glob(pattern))
        if not files:
            continue
        stats = {'total': 0, 'passed': 0, 'failed': 0, 'error': 0, 'parse_error': 0}
        with open(files[-1], encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) are not records.
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    stats['parse_error'] += 1
                    continue
                if not isinstance(data, dict):
                    # Valid JSON but not an object: cannot be a record.
                    stats['parse_error'] += 1
                    continue
                stats['total'] += 1
                evaluation = data.get('evaluation')
                # "evaluation" may be missing, null, or not an object.
                if isinstance(evaluation, dict) and evaluation.get('passed'):
                    stats['passed'] += 1
                else:
                    stats['failed'] += 1
        return stats
    return None

# Smoke test: show the Layer 2 tallies for one model/condition, if present.
test = load_layer2_jsonl('gpt-5', 'baseline_en')
if not test:
    print("No Layer 2 data found")
else:
    print("Sample Layer 2 stats:", json.dumps(test, indent=2), sep="\n")

## 3. Check Response Files for Errors

In [None]:
def check_response_errors(model, condition):
    """Tally success, API-error, and empty-response counts in a response file.

    Tries ``results/responses/{model}/responses_{condition}_*.jsonl`` first
    and then ``results/responses/{model}/responses_{condition}.jsonl``. Within
    the first pattern that matches, the file whose path sorts last is read.

    Args:
        model: Model directory name under ``results/responses``.
        condition: Condition name embedded in the file name.

    Returns:
        ``None`` when no file matches; otherwise a dict with the keys
        ``total`` (well-formed records), ``success`` (truthy ``success``
        field), ``api_error`` (all other records), ``empty_response``
        (successful records whose ``response`` is missing, falsy, or a
        whitespace-only string; these are also counted in ``success``) and
        ``parse_error`` (lines that are not a JSON object).
    """
    patterns = [
        f'results/responses/{model}/responses_{condition}_*.jsonl',
        f'results/responses/{model}/responses_{condition}.jsonl',
    ]

    for pattern in patterns:
        files = sorted(glob.glob(pattern))
        if not files:
            continue
        stats = {
            'total': 0,
            'success': 0,
            'api_error': 0,
            'empty_response': 0,
            'parse_error': 0,
        }
        with open(files[-1], encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) are not records.
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    stats['parse_error'] += 1
                    continue
                if not isinstance(data, dict):
                    # Valid JSON but not an object: cannot be a record.
                    stats['parse_error'] += 1
                    continue
                stats['total'] += 1
                if not data.get('success'):
                    stats['api_error'] += 1
                    continue
                stats['success'] += 1
                response = data.get('response')
                # Only strings can be whitespace-only; a non-string payload
                # (e.g. a dict) is empty only when it is falsy.
                if not response or (isinstance(response, str) and not response.strip()):
                    stats['empty_response'] += 1
        return stats
    return None

# Smoke test: show the response-file tallies for one model/condition.
test = check_response_errors('gpt-5', 'baseline_en')
if test:
    print("Sample response stats:")
    print(json.dumps(test, indent=2))
else:
    # Report the missing-data case explicitly, as the other smoke tests do,
    # rather than printing nothing.
    print("No response data found for gpt-5 baseline_en")

## 4. Build Language Fidelity Table

In [None]:
# One row per model, one column per condition. Each cell is the summary's
# 'fidelity_rate' value taken as-is, or None when no summary was loaded.
fidelity_data = []
for model_id, model_name in MODELS.items():
    summaries = {cond: load_layer1_json(model_id, cond) for cond in CONDITIONS}
    row = {'Model': model_name}
    row.update(
        (cond, summary.get('fidelity_rate') if summary else None)
        for cond, summary in summaries.items()
    )
    fidelity_data.append(row)

df_fidelity = pd.DataFrame(fidelity_data).set_index('Model')
print(
    "LANGUAGE FIDELITY (%)",
    "=" * 100,
    df_fidelity.round(1).to_string(),
    sep="\n",
)

## 5. Build Task Accuracy Table

In [None]:
# One row per model, one column per condition. Each cell is the percentage of
# evaluated records that passed, or None when there is nothing to divide by.
accuracy_data = []
for model_id, model_name in MODELS.items():
    per_condition = {}
    for cond in CONDITIONS:
        stats = load_layer2_jsonl(model_id, cond)
        if not stats or stats['total'] <= 0:
            per_condition[cond] = None
            continue
        pass_fraction = stats['passed'] / stats['total']
        per_condition[cond] = pass_fraction * 100
    accuracy_data.append({'Model': model_name, **per_condition})

df_accuracy = pd.DataFrame(accuracy_data).set_index('Model')
print(
    "TASK ACCURACY (%)",
    "=" * 100,
    df_accuracy.round(1).to_string(),
    sep="\n",
)

## 6. Error Detection Report

In [None]:
# Per-model report of problems found in the raw response files. Conditions
# with no response file are listed explicitly, so that missing data is never
# reported as "No errors found".
print("ERROR DETECTION REPORT")
print("=" * 100)

for model_id, model_name in MODELS.items():
    print(f"\n{model_name}:")
    has_errors = False
    missing = []
    for cond in CONDITIONS:
        stats = check_response_errors(model_id, cond)
        if not stats:
            # No response file matched for this model/condition.
            missing.append(cond)
            continue
        errors = []
        if stats['api_error'] > 0:
            errors.append(f"API errors: {stats['api_error']}")
        if stats['empty_response'] > 0:
            errors.append(f"Empty: {stats['empty_response']}")
        if stats['parse_error'] > 0:
            errors.append(f"Parse errors: {stats['parse_error']}")
        if errors:
            print(f"  {cond}: {', '.join(errors)}")
            has_errors = True
    if missing:
        print(f"  Missing response files: {', '.join(missing)}")
    if not has_errors and not missing:
        print("  No errors found")

## 7. LaTeX Output

In [None]:
def fmt(val):
    """Format a table cell: one decimal place, or '--' for None/NaN."""
    is_missing = val is None or pd.isna(val)
    return '--' if is_missing else f'{val:.1f}'

def _latex_column_label(cond):
    """Return the column heading for a condition name.

    'baseline_xx' becomes 'Base XX' and 'xx_to_yy' becomes 'XX->YY'. Any other
    name is returned with its underscores escaped for LaTeX.
    """
    prefix = 'baseline_'
    if cond.startswith(prefix):
        return f"Base {cond[len(prefix):].upper()}"
    src, sep, dst = cond.partition('_to_')
    if sep:
        return f"{src.upper()}->{dst.upper()}"
    return cond.replace('_', '\\_')


def _print_latex_table(title, df):
    """Print a title, then a header row and one row per model as LaTeX rows."""
    print(title)
    # Derive the header from CONDITIONS so it cannot drift from the data
    # columns, and end it with a row terminator like every data row.
    header = ['Model'] + [_latex_column_label(c) for c in CONDITIONS]
    print(f"{' & '.join(header)} \\\\")
    for model_name, row in df.iterrows():
        vals = [fmt(row.get(c)) for c in CONDITIONS]
        print(f"{model_name} & {' & '.join(vals)} \\\\")


_print_latex_table("LaTeX: LANGUAGE FIDELITY", df_fidelity)
_print_latex_table("\nLaTeX: TASK ACCURACY", df_accuracy)