# Full Results Tables
Generate tables for Language Fidelity and Task Accuracy with error detection

In [1]:
import os
import json
import glob
import pandas as pd
from pathlib import Path

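# Change to project root (currently disabled: the paths below are relative to the current working directory, e.g. '../results')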
#os.chdir(Path(__file__).parent.parent if '__file__' in dir() else Path.cwd().parent)
#print(f"Working directory: {os.getcwd()}")

# Result-directory name of each model -> display name used in the printed tables.
MODELS = dict([
    ('gpt-5', 'GPT-5'),
    ('gemini-3-pro', 'Gemini 3 Pro'),
    ('claude-opus-4.5', 'Claude Opus 4.5'),
    ('deepseek-v3.1', 'DeepSeek-V3.1'),
    ('command-r-plus', 'Command R+'),
])

# Language codes combined with English in the condition names below.
LANGS = ['de', 'zh', 'es', 'ar']

# All conditions in table-column order: the English baseline, the per-language
# baselines, the `en_to_<lang>` conditions, then the `<lang>_to_en` conditions.
CONDITIONS = [
    'baseline_en',
    *(f'baseline_{lang}' for lang in LANGS),
    *(f'en_to_{lang}' for lang in LANGS),
    *(f'{lang}_to_en' for lang in LANGS),
]

# Verify paths exist (relative to the current working directory)
print(f"Layer1 exists: {os.path.exists('../results/layer1')}")
print(f"Layer2 exists: {os.path.exists('../results/layer2')}")

Layer1 exists: True
Layer2 exists: True


## 1. Load Layer 1 (Language Fidelity) from JSONL

In [2]:
def load_layer1_jsonl(model, condition):
    """Load the Layer 1 evaluation JSONL for one model/condition and tally it.

    Files are looked up as
    ``../results/layer1/{model}/language_eval_{condition}_*.jsonl`` relative to
    the current working directory. Paths containing ``_run2_`` or ``_run3_``
    (variance runs) are ignored, and of the remaining matches the
    lexicographically last path is read (presumably the newest suffix -
    TODO confirm the naming scheme).

    Args:
        model: Model directory name under ``../results/layer1``.
        condition: Condition name embedded in the file name.

    Returns:
        ``None`` when no file matches. Otherwise a dict with the counters
        ``total``, ``match``, ``mismatch``, ``mixed``, ``error`` (records whose
        ``match_status`` is none of the three known values) and
        ``parse_error`` (lines that are not a JSON object), plus
        ``fidelity_rate`` = ``match / total * 100``, or ``None`` when
        ``total`` is 0. Blank lines are skipped and counted nowhere.
    """
    pattern = f'../results/layer1/{model}/language_eval_{condition}_*.jsonl'
    # Exclude variance runs (run2, run3) - only use primary results
    files = [
        path for path in sorted(glob.glob(pattern))
        if '_run2_' not in path and '_run3_' not in path
    ]
    if not files:
        return None

    stats = {'total': 0, 'match': 0, 'mismatch': 0, 'mixed': 0, 'error': 0, 'parse_error': 0}
    # Explicit UTF-8: the default encoding of open() depends on the locale,
    # which breaks on non-ASCII content on some platforms.
    with open(files[-1], encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) is not a malformed record.
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                stats['parse_error'] += 1
                continue
            if not isinstance(data, dict):
                # Valid JSON but not a record object; .get() would fail on it.
                stats['parse_error'] += 1
                continue

            stats['total'] += 1
            status = data.get('match_status', '')
            if status in ('match', 'mismatch', 'mixed'):
                stats[status] += 1
            else:
                stats['error'] += 1

    if stats['total'] > 0:
        stats['fidelity_rate'] = (stats['match'] / stats['total']) * 100
    else:
        stats['fidelity_rate'] = None
    return stats

# Test loading
test = load_layer1_jsonl('gpt-5', 'baseline_en')
if test:
    print("Sample Layer 1 stats (from JSONL):")
    print(f"  total: {test['total']}, match: {test['match']}, mismatch: {test['mismatch']}")
    # fidelity_rate is None when the file held no records; formatting None
    # with ':.1f' would raise a TypeError.
    if test['fidelity_rate'] is not None:
        print(f"  fidelity_rate: {test['fidelity_rate']:.1f}%")
    else:
        print("  fidelity_rate: n/a")
else:
    print("No Layer 1 data found for gpt-5 baseline_en")
    # Use the same '../results' root as the loader; without the '../' prefix
    # this diagnostic always listed nothing.
    print("Available files:", glob.glob('../results/layer1/gpt-5/*.jsonl')[:3])

Sample Layer 1 stats (from JSONL):
  total: 182, match: 182, mismatch: 0
  fidelity_rate: 100.0%


## 2. Load Layer 2 (Task Accuracy) from Evaluated JSONL

In [3]:
def load_layer2_jsonl(model, condition):
    """Load the Layer 2 evaluated JSONL for one model/condition and tally it.

    Two glob patterns are tried in order under ``../results/layer2/{model}/``:
    ``evaluated_{condition}_*.jsonl`` and then ``evaluated_{condition}.jsonl``.
    Paths containing ``_run2_`` or ``_run3_`` (variance runs) are ignored. The
    first pattern with any remaining match wins, and its lexicographically
    last path is read.

    Args:
        model: Model directory name under ``../results/layer2``.
        condition: Condition name embedded in the file name.

    Returns:
        ``None`` when neither pattern matches. Otherwise a dict of counters:
        ``total`` (JSON-object records), ``passed`` (truthy
        ``evaluation['passed']``), ``failed`` (evaluation present but not
        passed), ``error`` (``evaluation`` missing or not an object) and
        ``parse_error`` (lines that are not a JSON object). Blank lines are
        skipped. ``passed + failed + error == total``.
    """
    patterns = [
        f'../results/layer2/{model}/evaluated_{condition}_*.jsonl',
        f'../results/layer2/{model}/evaluated_{condition}.jsonl',
    ]

    for pattern in patterns:
        # Exclude variance runs (run2, run3) - only use primary results
        files = [
            path for path in sorted(glob.glob(pattern))
            if '_run2_' not in path and '_run3_' not in path
        ]
        if not files:
            continue

        stats = {'total': 0, 'passed': 0, 'failed': 0, 'error': 0, 'parse_error': 0}
        # Explicit UTF-8: the default encoding of open() depends on the locale.
        with open(files[-1], encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    # A blank line is not a malformed record.
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    stats['parse_error'] += 1
                    continue
                if not isinstance(data, dict):
                    stats['parse_error'] += 1
                    continue

                stats['total'] += 1
                eval_result = data.get('evaluation')
                if not isinstance(eval_result, dict):
                    # No usable evaluation (missing or null): count it as an
                    # error rather than a task failure, and avoid calling
                    # .get() on None.
                    stats['error'] += 1
                elif eval_result.get('passed'):
                    stats['passed'] += 1
                else:
                    stats['failed'] += 1
        return stats
    return None

# Test loading
test = load_layer2_jsonl('gpt-5', 'baseline_en')
if test:
    print("Sample Layer 2 stats:")
    print(f"  {test}")
else:
    print("No Layer 2 data found")
    # Use the same '../results' root as the loader; without the '../' prefix
    # this diagnostic always listed nothing.
    print("Available files:", glob.glob('../results/layer2/gpt-5/*.jsonl')[:3])

Sample Layer 2 stats:
  {'total': 182, 'passed': 104, 'failed': 78, 'error': 0, 'parse_error': 0}


## 3. Check Response Files for Errors

In [4]:
def check_response_errors(model, condition):
    """Tally successes and failure modes in the response JSONL of one model/condition.

    Two glob patterns are tried in order under ``../results/responses/{model}/``:
    ``responses_{condition}_*.jsonl`` and then ``responses_{condition}.jsonl``.
    Paths containing ``_run2_`` or ``_run3_`` (variance runs) are ignored. The
    first pattern with any remaining match wins, and its lexicographically
    last path is read.

    Args:
        model: Model directory name under ``../results/responses``.
        condition: Condition name embedded in the file name.

    Returns:
        ``None`` when neither pattern matches. Otherwise a dict of counters:
        ``total`` (JSON-object records), ``success`` (truthy ``success``),
        ``api_error`` (falsy ``success``), ``empty_response`` (successful
        records whose ``response`` is falsy or whitespace-only; these are also
        counted in ``success``) and ``parse_error`` (lines that are not a JSON
        object). Blank lines are skipped and not reported as parse errors.
    """
    patterns = [
        f'../results/responses/{model}/responses_{condition}_*.jsonl',
        f'../results/responses/{model}/responses_{condition}.jsonl',
    ]

    for pattern in patterns:
        # Exclude variance runs (run2, run3) - only use primary results
        files = [
            path for path in sorted(glob.glob(pattern))
            if '_run2_' not in path and '_run3_' not in path
        ]
        if not files:
            continue

        stats = {
            'total': 0,
            'success': 0,
            'api_error': 0,
            'empty_response': 0,
            'parse_error': 0
        }
        # Explicit UTF-8: the default encoding of open() depends on the locale.
        with open(files[-1], encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    # A blank line is not a malformed record.
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    stats['parse_error'] += 1
                    continue
                if not isinstance(data, dict):
                    stats['parse_error'] += 1
                    continue

                stats['total'] += 1
                if not data.get('success'):
                    stats['api_error'] += 1
                    continue

                stats['success'] += 1
                response = data.get('response')
                # Only strings are stripped, so a truthy non-string payload
                # does not raise AttributeError.
                if not response or (isinstance(response, str) and not response.strip()):
                    stats['empty_response'] += 1
        return stats
    return None

# Smoke test: tally one model/condition and show the raw counters
test = check_response_errors('gpt-5', 'baseline_en')
if not test:
    print("No response files found")
else:
    print("Sample response stats:")
    print(f"  {test}")

Sample response stats:
  {'total': 182, 'success': 182, 'api_error': 0, 'empty_response': 0, 'parse_error': 0}


## 4. Build Language Fidelity Table

In [5]:
# One row per model, one column per condition; a cell is the fidelity rate (%)
# or None when no Layer 1 file exists for that model/condition.
fidelity_data = []
for model_id, model_name in MODELS.items():
    row = {'Model': model_name}
    for cond in CONDITIONS:
        # Named `layer1_stats`, not `stats`: a later cell binds the notebook-level
        # name `stats` to `scipy.stats`, and re-running this cell must not
        # overwrite that module with a dict.
        layer1_stats = load_layer1_jsonl(model_id, cond)
        row[cond] = layer1_stats['fidelity_rate'] if layer1_stats else None
    fidelity_data.append(row)

df_fidelity = pd.DataFrame(fidelity_data).set_index('Model')
print("LANGUAGE FIDELITY (%)")
print("=" * 120)
print(df_fidelity.round(1).to_string())

LANGUAGE FIDELITY (%)
                 baseline_en  baseline_de  baseline_zh  baseline_es  baseline_ar  en_to_de  en_to_zh  en_to_es  en_to_ar  de_to_en  zh_to_en  es_to_en  ar_to_en
Model                                                                                                                                                           
GPT-5                  100.0         98.9        100.0        100.0         99.5      97.8      99.5      99.5      97.8      94.0      95.6      94.5      96.2
Gemini 3 Pro           100.0         98.9        100.0        100.0         99.5      98.3      98.9      98.4      97.8      78.6      72.5      74.7      69.2
Claude Opus 4.5        100.0         98.9        100.0        100.0         99.5      96.7      94.0      97.3      96.7      10.4       9.9       6.0       4.4
DeepSeek-V3.1          100.0         98.9         98.4        100.0         98.9      93.4      73.1      95.1      91.8      41.8      60.4      41.2      64.3
Command R+  

## 5. Build Task Accuracy Table

In [6]:
# One row per model, one column per condition; a cell is passed / total * 100
# or None when no Layer 2 file exists (or it holds no records).
accuracy_data = []
for model_id, model_name in MODELS.items():
    row = {'Model': model_name}
    for cond in CONDITIONS:
        # Named `layer2_stats`, not `stats`: a later cell binds the notebook-level
        # name `stats` to `scipy.stats`, and re-running this cell must not
        # overwrite that module with a dict.
        layer2_stats = load_layer2_jsonl(model_id, cond)
        if layer2_stats and layer2_stats['total'] > 0:
            row[cond] = (layer2_stats['passed'] / layer2_stats['total']) * 100
        else:
            row[cond] = None
    accuracy_data.append(row)

df_accuracy = pd.DataFrame(accuracy_data).set_index('Model')
print("TASK ACCURACY (%)")
print("=" * 120)
print(df_accuracy.round(1).to_string())

TASK ACCURACY (%)
                 baseline_en  baseline_de  baseline_zh  baseline_es  baseline_ar  en_to_de  en_to_zh  en_to_es  en_to_ar  de_to_en  zh_to_en  es_to_en  ar_to_en
Model                                                                                                                                                           
GPT-5                   57.1         58.2         57.7         57.7         61.0      57.1      59.9      59.3      60.4      55.5      50.5      49.5      54.4
Gemini 3 Pro            71.4         66.5         72.0         71.4         70.3      73.6      70.3      68.7      70.9      66.5      68.7      72.0      74.2
Claude Opus 4.5         54.4         45.1         48.9         52.7         47.3      49.5      46.7      50.5      48.9      48.4      47.8      52.7      50.5
DeepSeek-V3.1           50.0         39.0         39.0         45.1         37.9      40.1      44.5      44.0      42.9      38.5      37.4      37.9      36.8
Command R+      

## 6. Error Detection Report

In [7]:
print("ERROR DETECTION REPORT")
print("=" * 80)

# For every model, list the conditions whose response file shows API errors,
# empty responses or unparseable lines.
for model_id, model_name in MODELS.items():
    print(f"\n{model_name}:")
    has_errors = False
    files_checked = 0
    for cond in CONDITIONS:
        # Named `response_stats`, not `stats`: a later cell binds the
        # notebook-level name `stats` to `scipy.stats`, and re-running this
        # cell must not overwrite that module with a dict.
        response_stats = check_response_errors(model_id, cond)
        if not response_stats:
            continue
        files_checked += 1

        errors = []
        if response_stats['api_error'] > 0:
            errors.append(f"API errors: {response_stats['api_error']}")
        if response_stats['empty_response'] > 0:
            errors.append(f"Empty: {response_stats['empty_response']}")
        if response_stats['parse_error'] > 0:
            errors.append(f"Parse errors: {response_stats['parse_error']}")
        if errors:
            print(f"  {cond}: {', '.join(errors)}")
            has_errors = True

    if files_checked == 0:
        # Nothing was inspected, so "No errors found" would be misleading.
        print("  No response files found")
    elif not has_errors:
        print("  No errors found")

ERROR DETECTION REPORT

GPT-5:
  No errors found

Gemini 3 Pro:
  baseline_en: Empty: 2
  baseline_de: Empty: 1
  baseline_es: Empty: 1
  en_to_de: Empty: 2
  en_to_zh: Empty: 1
  en_to_ar: Empty: 1

Claude Opus 4.5:


  baseline_de: Empty: 1


  baseline_es: Empty: 1
  en_to_de: Empty: 1

DeepSeek-V3.1:
  No errors found

Command R+:


  No errors found


## 7. Conversation Length Effect (X→EN Fidelity)

In [8]:
import numpy as np
from scipy import stats

# New turn ranges: Short (3-5), Medium (7-9), Long (11+)
def categorize_length_new(turns):
    """Map a conversation's turn count to a length category.

    Args:
        turns: Number of turns, or ``None`` when the count is unknown.

    Returns:
        ``'Short'`` for 3-5 turns, ``'Medium'`` for 7-9, ``'Long'`` for 11 or
        more, and ``None`` for everything else (fewer than 3 turns, exactly 6
        or 10 turns, or an unknown count).
    """
    if turns is None:
        # A missing turn count cannot be bucketed; comparing None with an int
        # would raise a TypeError.
        return None
    if 3 <= turns <= 5:
        return 'Short'
    if 7 <= turns <= 9:
        return 'Medium'
    if turns >= 11:
        return 'Long'
    return None  # Excluded: fewer than 3 turns, or exactly 6 or 10

# Load baseline data with turn counts
baseline_data = {}
# Explicit UTF-8: the default encoding of open() depends on the locale.
with open('../data/experiments/baseline_en.jsonl', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline); json.loads('') raises.
            continue
        item = json.loads(line)
        baseline_data[item['QUESTION_ID']] = {
            'turns': len(item['CONVERSATION'])
        }

# Check turn distribution
from collections import Counter
turn_counts = [v['turns'] for v in baseline_data.values()]
print("Turn Distribution in Dataset:")
print("=" * 50)
for t, count in sorted(Counter(turn_counts).items()):
    cat = categorize_length_new(t)
    cat_str = f"({cat})" if cat else "(excluded)"
    print(f"  {t:2d} turns: {count:3d} {cat_str}")

# Count by new categories, using categorize_length_new so the totals cannot
# drift from the category boundaries defined there.
category_counts = Counter(categorize_length_new(n_turns) for n_turns in turn_counts)
short = category_counts['Short']
medium = category_counts['Medium']
long = category_counts['Long']
print(f"\nNew categories:")
print(f"  Short (3-5 turns):   n={short}")
print(f"  Medium (7-9 turns):  n={medium}")
print(f"  Long (11+ turns):    n={long}")

Turn Distribution in Dataset:
   3 turns:   7 (Short)
   5 turns:  37 (Short)
   7 turns:  68 (Medium)
   9 turns:  31 (Medium)
  11 turns:  18 (Long)
  13 turns:  11 (Long)
  15 turns:   8 (Long)
  19 turns:   2 (Long)

New categories:
  Short (3-5 turns):   n=44
  Medium (7-9 turns):  n=99
  Long (11+ turns):    n=39


In [9]:
def load_responses_with_turns(model, condition):
    """Load the successful responses of one model/condition with their turn counts.

    Two glob patterns are tried in order under ``../results/responses/{model}/``:
    ``responses_{condition}_*.jsonl`` and then ``responses_{condition}.jsonl``.
    Paths containing ``_run2_`` or ``_run3_`` (variance runs) are ignored. The
    first pattern with any remaining match wins, and its lexicographically
    last path is read.

    Blank lines and lines that are not a JSON object are skipped instead of
    aborting the load; ``check_response_errors`` reads the same files and
    counts such lines.

    Args:
        model: Model directory name under ``../results/responses``.
        condition: Condition name embedded in the file name.

    Returns:
        ``None`` when neither pattern matches. Otherwise a list with one
        ``{'question_id': ..., 'turn_count': ...}`` dict per record whose
        ``success`` field is truthy; either value is ``None`` when the record
        lacks that key.
    """
    patterns = [
        f'../results/responses/{model}/responses_{condition}_*.jsonl',
        f'../results/responses/{model}/responses_{condition}.jsonl',
    ]
    for pattern in patterns:
        # Exclude variance runs (run2, run3) - only use primary results
        files = [
            path for path in sorted(glob.glob(pattern))
            if '_run2_' not in path and '_run3_' not in path
        ]
        if not files:
            continue

        data = []
        # Explicit UTF-8: the default encoding of open() depends on the locale.
        with open(files[-1], encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    item = json.loads(line)
                except json.JSONDecodeError:
                    # One malformed line must not abort the whole analysis.
                    continue
                if not isinstance(item, dict) or not item.get('success'):
                    continue
                data.append({
                    'question_id': item.get('question_id'),
                    'turn_count': item.get('turn_count'),
                })
        return data
    return None

def load_language_eval_by_qid(model, condition):
    """Load the Layer 1 language evaluation of one model/condition, keyed by question ID.

    Files are looked up as
    ``../results/layer1/{model}/language_eval_{condition}_*.jsonl``. Paths
    containing ``_run2_`` or ``_run3_`` (variance runs) are ignored, and the
    lexicographically last remaining path is read. Blank lines and lines that
    are not a JSON object are skipped instead of aborting the load.

    Args:
        model: Model directory name under ``../results/layer1``.
        condition: Condition name embedded in the file name.

    Returns:
        ``None`` when no file matches. Otherwise a dict mapping each record's
        ``question_id`` to ``True`` when its ``match_status`` equals
        ``'match'`` and ``False`` otherwise. A later record with the same ID
        overwrites an earlier one.
    """
    pattern = f'../results/layer1/{model}/language_eval_{condition}_*.jsonl'
    # Exclude variance runs (run2, run3) - only use primary results
    files = [
        path for path in sorted(glob.glob(pattern))
        if '_run2_' not in path and '_run3_' not in path
    ]
    if not files:
        return None

    data = {}
    # Explicit UTF-8: the default encoding of open() depends on the locale.
    with open(files[-1], encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                # One malformed line must not abort the whole analysis.
                continue
            if not isinstance(item, dict):
                continue
            data[item.get('question_id')] = item.get('match_status') == 'match'
    return data

def compute_fidelity_by_length_new(model):
    """Tally language-match counts per conversation-length category for one model.

    Every ``{lang}_to_en`` condition (one per entry of ``LANGS``) is pooled.
    A response is counted only if its turn count falls into a category and its
    question ID appears in the language evaluation; conditions for which
    either loader returns nothing are skipped.

    Returns:
        ``{'Short': {...}, 'Medium': {...}, 'Long': {...}}`` where each inner
        dict holds the ``match`` and ``total`` counts.
    """
    results = {
        bucket: {'match': 0, 'total': 0}
        for bucket in ('Short', 'Medium', 'Long')
    }

    for lang in LANGS:
        condition = f"{lang}_to_en"

        responses = load_responses_with_turns(model, condition)
        lang_eval = load_language_eval_by_qid(model, condition)

        if responses and lang_eval:
            for resp in responses:
                qid = resp['question_id']
                category = categorize_length_new(resp['turn_count'])

                # Skip excluded lengths and responses without an evaluation.
                if not category or qid not in lang_eval:
                    continue

                tally = results[category]
                tally['total'] += 1
                if lang_eval[qid]:
                    tally['match'] += 1

    return results

def chi_square_test(results):
    """Run a chi-square test of independence between length category and fidelity.

    Args:
        results: Dict with the keys ``'Short'``, ``'Medium'`` and ``'Long'``,
            each mapping to a dict with integer ``match`` and ``total`` counts.

    Returns:
        ``(chi2, p)`` as floats when the test ran. ``(None, None)`` when a
        category has ``total == 0`` or SciPy rejects the table with a
        ``ValueError`` (raised when an expected frequency is zero, e.g. no
        category has any match). ``(None, "Low counts")`` when a category has
        fewer than 5 observations.
    """
    # Imported locally instead of relying on the notebook-level name `stats`:
    # other cells bind `stats` to plain dicts, which shadows `scipy.stats` when
    # cells are re-run out of order.
    from scipy.stats import chi2_contingency

    observed = []
    for cat in ['Short', 'Medium', 'Long']:
        match = results[cat]['match']
        total = results[cat]['total']
        if total == 0:
            return None, None
        observed.append([match, total - match])

    # After the transpose: rows are match / non-match, columns are the categories.
    observed = np.array(observed).T

    # Check if test is valid: every category (column) needs at least 5 observations.
    if np.any(observed.sum(axis=0) < 5):
        return None, "Low counts"

    try:
        chi2, p, dof, expected = chi2_contingency(observed)
    except ValueError:
        # Only the documented failure mode is treated as "no result"; any
        # other exception is a real bug and must surface.
        return None, None
    return float(chi2), float(p)

# Per-model match/total tallies by conversation-length category
length_results = {
    model_id: compute_fidelity_by_length_new(model_id)
    for model_id in MODELS
}

print("X->EN Fidelity by Conversation Length (New Ranges)")
print("=" * 80)
print(f"{'Model':<20} {'Short (3-5)':>15} {'Med (7-9)':>15} {'Long (11+)':>15} {'p-value':>12}")
print("-" * 80)

for model_id, model_name in MODELS.items():
    res = length_results[model_id]
    chi2, p = chi_square_test(res)

    # One percentage cell per category; "--" when the category is empty.
    row = [
        f"{res[cat]['match'] / res[cat]['total'] * 100:.1f}%" if res[cat]['total'] > 0 else "--"
        for cat in ['Short', 'Medium', 'Long']
    ]
    short_cell, medium_cell, long_cell = row

    # p is a float only when the test actually ran; otherwise it is None or a
    # short reason string.
    if not isinstance(p, float):
        p_str = "---"
    elif p < 0.001:
        p_str = "<0.001"
    else:
        p_str = f"{p:.2f}"

    print(f"{model_name:<20} {short_cell:>15} {medium_cell:>15} {long_cell:>15} {p_str:>12}")

X->EN Fidelity by Conversation Length (New Ranges)
Model                    Short (3-5)       Med (7-9)      Long (11+)      p-value
--------------------------------------------------------------------------------
GPT-5                          97.2%           93.9%           95.5%         0.25
Gemini 3 Pro                   82.4%           75.0%           60.9%       <0.001
Claude Opus 4.5                11.9%            5.8%            7.7%         0.04
DeepSeek-V3.1                  55.1%           54.0%           42.9%         0.04
Command R+                      0.0%            0.5%            2.6%         0.02
