# Full Results Tables
Build the Language Fidelity and Task Accuracy tables from the saved result files, print them as plain text, and generate the corresponding LaTeX table rows.

In [None]:
import json
import glob
import pandas as pd
from pathlib import Path

# Model identifiers (as used in the results paths) -> names shown in tables.
# Insertion order is the row order of every table built from this mapping.
MODELS = {
    "gpt-5": "GPT-5",
    "gemini-3-pro": "Gemini 3 Pro",
    "claude-opus-4.5": "Claude Opus 4.5",
    "deepseek-v3.1": "DeepSeek-V3.1",
    "command-r-plus": "Command R+",
}

# Non-English languages covered by the experiments.
LANGS = ["de", "zh", "es", "ar"]

# Every condition name: the English baseline, then per-language baselines,
# then EN->X and X->EN, each group in LANGS order.
CONDITIONS = [
    "baseline_en",
    *(f"baseline_{lang}" for lang in LANGS),
    *(f"en_to_{lang}" for lang in LANGS),
    *(f"{lang}_to_en" for lang in LANGS),
]

In [None]:
def load_layer1_summary(model, condition):
    """Load the language-fidelity rate for one model/condition.

    Searches ``results/layer1/{model}/`` (relative to the current working
    directory) for ``language_summary_{condition}_*.json`` first and then for
    the un-suffixed ``language_summary_{condition}.json``. When a pattern
    matches several files, the one whose name sorts last is read.

    Args:
        model: Model directory name under ``results/layer1/``.
        condition: Condition name, e.g. ``'baseline_en'`` or ``'en_to_de'``.

    Returns:
        The ``fidelity_rate`` value stored in the summary, or ``None`` when
        no summary file is found or the file has no ``fidelity_rate`` entry.
    """
    # Suffixed files take precedence over the plain, un-suffixed name.
    patterns = [
        f'results/layer1/{model}/language_summary_{condition}_*.json',
        f'results/layer1/{model}/language_summary_{condition}.json',
    ]
    for pattern in patterns:
        files = sorted(glob.glob(pattern))
        if not files:
            continue
        with open(files[-1], encoding='utf-8') as f:
            data = json.load(f)
        # fidelity_rate is already in percentage (0-100).
        # A missing key is reported as None (rendered as '--'), not as a
        # made-up 0% score that would look like a real measurement.
        return data.get('fidelity_rate')
    return None

def load_layer2_summary(model, condition):
    """Load the task-accuracy percentage for one model/condition.

    Two sources under ``results/layer2/{model}/`` (relative to the current
    working directory) are tried in order:

    1. A summary JSON (``summary_{condition}_*.json``, then
       ``summary_{condition}.json``) holding ``total``/``passed`` either at
       the top level or inside a ``stats`` object.
    2. An evaluated JSONL file (``evaluated_{condition}_*.jsonl``, then
       ``evaluated_{condition}.jsonl``) where each line is a JSON object; a
       record counts as passed when its ``evaluation.passed`` is truthy.

    When a pattern matches several files, the one whose name sorts last is
    read.

    Args:
        model: Model directory name under ``results/layer2/``.
        condition: Condition name, e.g. ``'baseline_en'`` or ``'en_to_de'``.

    Returns:
        ``passed / total * 100`` as a float, or ``None`` when no result file
        is found or the chosen file contains no countable samples.
    """
    # Try summary json patterns
    patterns = [
        f'results/layer2/{model}/summary_{condition}_*.json',
        f'results/layer2/{model}/summary_{condition}.json',
    ]
    for pattern in patterns:
        files = sorted(glob.glob(pattern))
        if files:
            with open(files[-1], encoding='utf-8') as f:
                data = json.load(f)
            # `or {}` tolerates an explicit `"stats": null`.
            stats = data.get('stats') or {}
            total = data.get('total', stats.get('total', 0))
            passed = data.get('passed', stats.get('passed', 0))
            # No samples means "no data", not a measured 0% accuracy.
            return (passed / total) * 100 if total else None

    # Try evaluated file patterns
    eval_patterns = [
        f'results/layer2/{model}/evaluated_{condition}_*.jsonl',
        f'results/layer2/{model}/evaluated_{condition}.jsonl',
    ]
    for pattern in eval_patterns:
        files = sorted(glob.glob(pattern))
        if files:
            passed = 0
            total = 0
            with open(files[-1], encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError:
                        # Best effort: ignore lines that are not valid JSON.
                        continue
                    if not isinstance(record, dict):
                        # Valid JSON but not a result record (list, number...).
                        continue
                    total += 1
                    evaluation = record.get('evaluation')
                    # A missing or null evaluation counts as not passed.
                    if isinstance(evaluation, dict) and evaluation.get('passed', False):
                        passed += 1
            return (passed / total) * 100 if total else None
    return None

In [None]:
# Language Fidelity table: one row per model, one column per condition.
print("=" * 80, "LANGUAGE FIDELITY (%)", "=" * 80, sep="\n")

fidelity_data = []
for model_id, model_name in MODELS.items():
    # Column label -> condition name, in display order:
    # Base EN, Base X, EN->X, X->EN.
    columns = {'Base EN': 'baseline_en'}
    columns.update({f'Base {lang.upper()}': f'baseline_{lang}' for lang in LANGS})
    columns.update({f'EN->{lang.upper()}': f'en_to_{lang}' for lang in LANGS})
    columns.update({f'{lang.upper()}->EN': f'{lang}_to_en' for lang in LANGS})

    row = {'Model': model_name}
    for label, condition in columns.items():
        row[label] = load_layer1_summary(model_id, condition)
    fidelity_data.append(row)

df_fidelity = pd.DataFrame(fidelity_data).set_index('Model')
print(df_fidelity.round(1).to_string())

In [None]:
# Task Accuracy table: same layout as the fidelity table, but filled from the
# layer-2 results.
rule = "=" * 80
print(rule)
print("TASK ACCURACY (%)")
print(rule)

accuracy_data = []
for model_id, model_name in MODELS.items():
    # (column label, condition) pairs in display order.
    labelled_conditions = (
        [('Base EN', 'baseline_en')]
        + [(f'Base {lang.upper()}', f'baseline_{lang}') for lang in LANGS]
        + [(f'EN->{lang.upper()}', f'en_to_{lang}') for lang in LANGS]
        + [(f'{lang.upper()}->EN', f'{lang}_to_en') for lang in LANGS]
    )
    row = {'Model': model_name}
    row.update(
        (label, load_layer2_summary(model_id, condition))
        for label, condition in labelled_conditions
    )
    accuracy_data.append(row)

df_accuracy = pd.DataFrame(accuracy_data).set_index('Model')
print(df_accuracy.round(1).to_string())

In [None]:
# Banner for the Language Fidelity LaTeX rows (blank line, rule, title, rule).
print("\n" + "=" * 80, "LaTeX: LANGUAGE FIDELITY", "=" * 80, sep="\n")

def fmt(val):
    """Format a table cell: one decimal place, or '--' when the value is missing.

    A value is treated as missing when it is ``None`` or ``pd.isna`` reports
    it as NA (e.g. NaN).
    """
    missing = val is None or pd.isna(val)
    return '--' if missing else f'{val:.1f}'

# One LaTeX row per model: Base EN first, then the Base X, EN->X and X->EN
# groups, each listing its languages in LANGS order (DE ZH ES AR).
column_groups = [
    [f'Base {l.upper()}' for l in LANGS],
    [f'EN->{l.upper()}' for l in LANGS],
    [f'{l.upper()}->EN' for l in LANGS],
]
for model, row in df_fidelity.iterrows():
    cells = [fmt(row.get('Base EN'))]
    for labels in column_groups:
        cells.append(' & '.join(fmt(row.get(label)) for label in labels))
    print(f"{model} & {' & '.join(cells)} \\\\")

In [None]:
# LaTeX rows for the Task Accuracy table.
print("\n" + "=" * 80, "LaTeX: TASK ACCURACY", "=" * 80, sep="\n")

for model, row in df_accuracy.iterrows():
    base_en = fmt(row.get('Base EN'))

    # One ' & '-joined cell group per column family, languages in LANGS
    # order (DE ZH ES AR): Base X, EN->X, X->EN.
    groups = []
    for template in ('Base {}', 'EN->{}', '{}->EN'):
        values = [fmt(row.get(template.format(lang.upper()))) for lang in LANGS]
        groups.append(' & '.join(values))

    print(' & '.join([f'{model}', base_en, *groups]) + ' \\\\')

In [None]:
# Summary statistics: the X->EN columns of both tables, rounded to one decimal.
print("\n" + "=" * 80, "SUMMARY STATISTICS", "=" * 80, sep="\n")

print("\nLanguage Fidelity - X->EN (critical condition):")
x_to_en_cols = [lang.upper() + '->EN' for lang in LANGS]
fidelity_x_to_en = df_fidelity.loc[:, x_to_en_cols]
print(fidelity_x_to_en.round(1))

print("\nTask Accuracy - X->EN:")
accuracy_x_to_en = df_accuracy.loc[:, x_to_en_cols]
print(accuracy_x_to_en.round(1))