# Cross-Lingual Transfer (X→Y) Analysis

Analyzes language switching between non-English languages to examine whether English has a privileged role.

Test pairs:
- ZH→DE (Chinese context, German query)
- DE→ZH (German context, Chinese query)
- ES→AR (Spanish context, Arabic query)
- AR→ES (Arabic context, Spanish query)

In [1]:
import os
import json
import glob
import pandas as pd
from pathlib import Path

# Change to project root.
# The root is resolved once and cached in PROJECT_ROOT. In a notebook kernel
# `__file__` is undefined, so the root is derived from the current working
# directory; recomputing it on every re-run of this cell would climb one
# directory higher each time.
if 'PROJECT_ROOT' not in globals():
    if '__file__' in globals():
        PROJECT_ROOT = Path(__file__).resolve().parent.parent
    else:
        PROJECT_ROOT = Path.cwd().parent
os.chdir(PROJECT_ROOT)
print(f"Working directory: {os.getcwd()}")

# Result directory name under results/cross-lingual/ -> display name used in tables.
MODELS = {
    'gpt-5': 'GPT-5',
    'claude-opus-4.5': 'Claude 4.5',
}

# Cross-lingual pairs as (context_lang, query_lang).
# The expected response language is always the query language (second item).
CROSS_LINGUAL_PAIRS = [
    ('zh', 'de'),  # ZH→DE: Chinese context, German query, expect German response
    ('de', 'zh'),  # DE→ZH: German context, Chinese query, expect Chinese response
    ('es', 'ar'),  # ES→AR: Spanish context, Arabic query, expect Arabic response
    ('ar', 'es'),  # AR→ES: Arabic context, Spanish query, expect Spanish response
]

print(f"Models: {list(MODELS.keys())}")
print(f"Cross-lingual pairs: {CROSS_LINGUAL_PAIRS}")

Working directory: /Users/kyuheekim/codeswitching-apertus
Models: ['gpt-5', 'claude-opus-4.5']
Cross-lingual pairs: [('zh', 'de'), ('de', 'zh'), ('es', 'ar'), ('ar', 'es')]


## 1. Load Task Accuracy (Layer 2)

In [2]:
def load_task_accuracy(model, from_lang, to_lang):
    """Load task accuracy from evaluated JSONL file.

    Looks under ``results/cross-lingual/<model>/`` for
    ``evaluated_<from>_to_<to>_*.jsonl`` first and then for the un-suffixed
    ``evaluated_<from>_to_<to>.jsonl``. When several files match a pattern,
    the lexicographically last one is read.

    Args:
        model: Name of the model's result directory.
        from_lang: Context language code (first half of the condition name).
        to_lang: Query language code (second half of the condition name).

    Returns:
        A dict with the counters ``total``, ``passed``, ``failed`` and
        ``error``, or ``None`` when no file matches either pattern.
        ``total`` counts records that parsed to a JSON object; lines that do
        not parse, or parse to something other than an object, are counted in
        ``error`` only. Blank lines are ignored.
    """
    condition = f"{from_lang}_to_{to_lang}"

    # Try cross-lingual directory first
    patterns = [
        f'results/cross-lingual/{model}/evaluated_{condition}_*.jsonl',
        f'results/cross-lingual/{model}/evaluated_{condition}.jsonl',
    ]

    for pattern in patterns:
        files = sorted(glob.glob(pattern))
        if not files:
            continue

        stats = {'total': 0, 'passed': 0, 'failed': 0, 'error': 0}
        # Explicit UTF-8 so reading does not depend on the platform's locale
        # default encoding.
        with open(files[-1], encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line is not a record.
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    stats['error'] += 1
                    continue
                if not isinstance(data, dict):
                    stats['error'] += 1
                    continue

                stats['total'] += 1
                # `evaluation` may be null or malformed; that is a failed
                # record, not a reason to abort the whole load.
                evaluation = data.get('evaluation')
                if isinstance(evaluation, dict) and evaluation.get('passed'):
                    stats['passed'] += 1
                else:
                    stats['failed'] += 1
        return stats
    return None

# Test loading
test = load_task_accuracy('gpt-5', 'zh', 'de')
# A result file can exist yet contain no countable records (total == 0);
# treat that like missing data instead of dividing by zero.
if test and test['total'] > 0:
    print(f"GPT-5 ZH→DE: {test['passed']}/{test['total']} = {test['passed']/test['total']*100:.1f}%")
else:
    print("No data found")

GPT-5 ZH→DE: 98/182 = 53.8%


## 2. Load Language Fidelity (Layer 1)

For cross-lingual pairs, the expected response language is the query language (the target of the switch).
- ZH→DE: expect German (de)
- DE→ZH: expect Chinese (zh)
- ES→AR: expect Arabic (ar)
- AR→ES: expect Spanish (es)

In [3]:
def _expected_language_of(path):
    """Return ``expected_language`` of the first parseable JSON object in *path*, or None."""
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # A corrupt leading line must not hide an otherwise valid file.
                continue
            if isinstance(record, dict):
                return record.get('expected_language')
    return None


def _count_match_statuses(path):
    """Tally the ``match_status`` values of every record in the JSONL file *path*."""
    stats = {'total': 0, 'match': 0, 'mismatch': 0, 'error': 0}
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a record.
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                stats['error'] += 1
                continue
            if not isinstance(data, dict):
                stats['error'] += 1
                continue

            stats['total'] += 1
            status = data.get('match_status', '')
            if status in ('match', 'mismatch'):
                stats[status] += 1
            else:
                # Any other status is counted as an error but still
                # contributes to ``total``.
                stats['error'] += 1
    return stats


def load_language_fidelity(model, from_lang, to_lang):
    """Load language fidelity from language_eval JSONL file.

    Expected response language is the query language (to_lang).

    Candidate files under ``results/cross-lingual/<model>/`` are matched by
    ``language_eval_<from>_to_<to>_*.jsonl`` first and then by
    ``language_eval_<to>_*.jsonl``. A candidate is accepted only if its first
    parseable record has ``expected_language == to_lang``. Within a pattern
    the candidates are tried newest-first (reverse lexicographic order), in
    line with the loaders that read ``files[-1]``.

    Args:
        model: Name of the model's result directory.
        from_lang: Context language code.
        to_lang: Query language code, i.e. the expected response language.

    Returns:
        A dict with the counters ``total``, ``match``, ``mismatch`` and
        ``error``, or ``None`` when no accepted file is found. Unparsable or
        non-object lines count towards ``error`` only; records with any other
        ``match_status`` count towards both ``total`` and ``error``.
    """
    condition = f"{from_lang}_to_{to_lang}"
    expected_lang = to_lang  # Response should be in query language

    # Try to find language eval file with correct expected language
    patterns = [
        f'results/cross-lingual/{model}/language_eval_{condition}_*.jsonl',
        f'results/cross-lingual/{model}/language_eval_{expected_lang}_*.jsonl',
    ]

    for pattern in patterns:
        for f_path in sorted(glob.glob(pattern), reverse=True):
            if _expected_language_of(f_path) == expected_lang:
                return _count_match_statuses(f_path)

    return None

# Smoke test: report GPT-5 language fidelity for every cross-lingual pair.
for from_l, to_l in CROSS_LINGUAL_PAIRS:
    stats = load_language_fidelity('gpt-5', from_l, to_l)
    if not stats:
        print(f"GPT-5 {from_l.upper()}→{to_l.upper()}: No data (need to run language eval)")
        continue

    # A file without any counted record reports 0 instead of dividing by zero.
    if stats['total'] > 0:
        fidelity = stats['match'] / stats['total'] * 100
    else:
        fidelity = 0
    print(f"GPT-5 {from_l.upper()}→{to_l.upper()}: {fidelity:.1f}%")

GPT-5 ZH→DE: 96.2%
GPT-5 DE→ZH: 97.3%
GPT-5 ES→AR: 96.2%
GPT-5 AR→ES: 98.4%


## 3. Build Results Tables

In [4]:
# Task accuracy table: one row per model, one column per X→Y pair,
# each cell holding the percentage of passed records.
accuracy_data = []
for model_id, model_name in MODELS.items():
    row = {'Model': model_name}
    for from_l, to_l in CROSS_LINGUAL_PAIRS:
        stats = load_task_accuracy(model_id, from_l, to_l)
        has_records = bool(stats) and stats['total'] > 0
        # None marks a pair with no result file or no counted records.
        row[f"{from_l.upper()}→{to_l.upper()}"] = (
            stats['passed'] / stats['total'] * 100 if has_records else None
        )
    accuracy_data.append(row)

df_accuracy = pd.DataFrame(accuracy_data)
df_accuracy = df_accuracy.set_index('Model')
print("TASK ACCURACY (%)")
print("=" * 80)
print(df_accuracy.round(1).to_string())

TASK ACCURACY (%)
            ZH→DE  DE→ZH  ES→AR  AR→ES
Model                                 
GPT-5        53.8   52.2   56.6   51.6
Claude 4.5   47.3   46.2   51.1   48.9


In [5]:
# Language fidelity table: one row per model, one column per X→Y pair,
# each cell holding the percentage of records whose status is 'match'.
fidelity_data = []
for model_id, model_name in MODELS.items():
    row = {'Model': model_name}
    for from_l, to_l in CROSS_LINGUAL_PAIRS:
        stats = load_language_fidelity(model_id, from_l, to_l)
        has_records = bool(stats) and stats['total'] > 0
        # None marks a pair with no accepted eval file or no counted records.
        row[f"{from_l.upper()}→{to_l.upper()}"] = (
            stats['match'] / stats['total'] * 100 if has_records else None
        )
    fidelity_data.append(row)

df_fidelity = pd.DataFrame(fidelity_data)
df_fidelity = df_fidelity.set_index('Model')
print("LANGUAGE FIDELITY (%)")
print("=" * 80)
print(df_fidelity.round(1).to_string())

LANGUAGE FIDELITY (%)
            ZH→DE  DE→ZH  ES→AR  AR→ES
Model                                 
GPT-5        96.2   97.3   96.2   98.4
Claude 4.5   64.3   35.2   81.3   19.2


## 4. Sanity Check: Errors and Empty Responses

In [6]:
def check_response_errors(model, from_lang, to_lang):
    """Check response files for API errors and empty responses.

    Reads the lexicographically last file matching
    ``results/cross-lingual/<model>/responses_<from>_to_<to>_*.jsonl``.

    Args:
        model: Name of the model's result directory.
        from_lang: Context language code.
        to_lang: Query language code.

    Returns:
        ``None`` when no response file matches, otherwise a dict with:

        * ``total`` - records that parsed to a JSON object
        * ``success`` - records with a truthy ``success`` field
        * ``api_error`` - records without a truthy ``success`` field
        * ``empty_response`` - successful records whose ``response`` is
          missing, falsy, or a whitespace-only string (these are also
          included in ``success``)
        * ``parse_error`` - non-blank lines that are not a JSON object

        Blank lines are ignored.
    """
    condition = f"{from_lang}_to_{to_lang}"
    pattern = f'results/cross-lingual/{model}/responses_{condition}_*.jsonl'
    files = sorted(glob.glob(pattern))

    if not files:
        return None

    stats = {
        'total': 0,
        'success': 0,
        'api_error': 0,
        'empty_response': 0,
        'parse_error': 0
    }

    # Explicit UTF-8 so reading does not depend on the platform's locale
    # default encoding.
    with open(files[-1], encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A trailing blank line is not a corrupt record and must not
                # trip the sanity check.
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                stats['parse_error'] += 1
                continue
            if not isinstance(data, dict):
                stats['parse_error'] += 1
                continue

            stats['total'] += 1
            if not data.get('success'):
                stats['api_error'] += 1
                continue

            stats['success'] += 1
            response = data.get('response')
            # Only strings can be whitespace-only; a truthy non-string payload
            # is treated as a non-empty response.
            if not response or (isinstance(response, str) and not response.strip()):
                stats['empty_response'] += 1

    return stats

# Run sanity check: one line per model/condition, listing every non-zero
# error counter or an OK summary when the response file is clean.
print("SANITY CHECK: Response Errors")
print("=" * 80)

# (counter key in the stats dict, label shown in the report)
error_labels = (
    ('api_error', 'API errors'),
    ('empty_response', 'Empty'),
    ('parse_error', 'Parse errors'),
)

all_clean = True
for model_id, model_name in MODELS.items():
    print(f"\n{model_name}:")
    for from_l, to_l in CROSS_LINGUAL_PAIRS:
        condition = f"{from_l}_to_{to_l}"
        stats = check_response_errors(model_id, from_l, to_l)
        if not stats:
            # A missing response file also counts as "not clean".
            print(f"  {condition}: NO FILE")
            all_clean = False
            continue

        errors = [
            f"{label}: {stats[key]}"
            for key, label in error_labels
            if stats[key] > 0
        ]
        if errors:
            print(f"  {condition}: {', '.join(errors)}")
            all_clean = False
        else:
            print(f"  {condition}: ✓ {stats['success']}/{stats['total']} OK")

if all_clean:
    print("\n✓ All response files are clean (no errors)")

SANITY CHECK: Response Errors

GPT-5:
  zh_to_de: ✓ 182/182 OK
  de_to_zh: ✓ 182/182 OK
  es_to_ar: ✓ 182/182 OK
  ar_to_es: ✓ 182/182 OK

Claude 4.5:
  zh_to_de: Empty: 1
  de_to_zh: ✓ 182/182 OK
  es_to_ar: ✓ 182/182 OK
  ar_to_es: ✓ 182/182 OK


## 5. LaTeX Output

In [7]:
def fmt(val):
    """Format a table cell: one decimal place, or '--' when the value is None/NaN."""
    is_missing = val is None or pd.isna(val)
    return '--' if is_missing else f'{val:.1f}'

# Column labels in the same order as CROSS_LINGUAL_PAIRS.
cols = [f"{f.upper()}→{t.upper()}" for f, t in CROSS_LINGUAL_PAIRS]

# Emit one LaTeX table row ("label & v1 & v2 ... \\") per model for each table.
# The second title carries a leading newline to separate the two sections.
latex_sections = (
    ("LaTeX: LANGUAGE FIDELITY (X→Y)", df_fidelity),
    ("\nLaTeX: TASK ACCURACY (X→Y)", df_accuracy),
)
for section_title, frame in latex_sections:
    print(section_title)
    print("="*80)
    for _, row in frame.iterrows():
        vals = []
        for c in cols:
            vals.append(fmt(row.get(c)))
        print(f"{row.name} & {' & '.join(vals)} \\\\")

LaTeX: LANGUAGE FIDELITY (X→Y)
GPT-5 & 96.2 & 97.3 & 96.2 & 98.4 \\
Claude 4.5 & 64.3 & 35.2 & 81.3 & 19.2 \\

LaTeX: TASK ACCURACY (X→Y)
GPT-5 & 53.8 & 52.2 & 56.6 & 51.6 \\
Claude 4.5 & 47.3 & 46.2 & 51.1 & 48.9 \\


## 6. Run Language Evaluation for Missing Conditions

The cell below lists every condition that has a response file but no matching language evaluation, so that the missing evaluations can be run.

In [8]:
# Find conditions that need language evaluation: a response file exists but
# no language-eval result could be loaded for it.
missing_evals = []

for model_id in MODELS:
    for from_l, to_l in CROSS_LINGUAL_PAIRS:
        condition = f"{from_l}_to_{to_l}"
        expected_lang = to_l

        # Check if response file exists. glob gives no ordering guarantee, so
        # sort to make [-1] the lexicographically last file, as in the loaders.
        pattern = f'results/cross-lingual/{model_id}/responses_{condition}_*.jsonl'
        response_files = sorted(glob.glob(pattern))

        if not response_files:
            continue

        # Check if correct language eval exists
        stats = load_language_fidelity(model_id, from_l, to_l)
        if not stats:
            missing_evals.append({
                'model': model_id,
                'response_file': response_files[-1],
                'expected_lang': expected_lang,
                'condition': condition,
            })

if missing_evals:
    print("Missing language evaluations:")
    for item in missing_evals:
        print(f"  {item['model']}: {item['condition']} (expect {item['expected_lang']})")
        print(f"    Response file: {item['response_file']}")
else:
    print("All language evaluations are complete!")

All language evaluations are complete!
