# System Prompt Ablation Analysis

Tests whether an explicit language instruction in the system prompt improves language fidelity and task accuracy on the language-switching conditions (EN→X and X→EN).

**Settings:**
- None: No system prompt
- Explicit: "Always respond in the same language the user uses in their most recent message."

**Models:** GPT-5, Claude Opus 4.5, Command R+

**Conditions:** EN→X and X→EN for DE, ZH, ES, AR

In [1]:
import os
import json
import glob
import pandas as pd
from pathlib import Path

# Change to project root
#os.chdir(Path(__file__).parent.parent if '__file__' in dir() else Path.cwd().parent)
#print(f"Working directory: {os.getcwd()}")

# Model identifiers (as they appear in result paths) mapped to display names.
MODELS = dict([
    ('gpt-5', 'GPT-5'),
    ('claude-opus-4.5', 'Claude Opus 4.5'),
    ('command-r-plus', 'Command R+'),
])

# Non-English languages paired with English in each switching condition.
LANGS = ['de', 'zh', 'es', 'ar']

# Switch directions: 'en_to' is EN→X, 'to_en' is X→EN.
DIRECTIONS = ['en_to', 'to_en']

print("Models: " + str(list(MODELS)))
print("Languages: " + str(LANGS))

Models: ['gpt-5', 'claude-opus-4.5', 'command-r-plus']
Languages: ['de', 'zh', 'es', 'ar']


## 1. Sanity Check: Errors and Empty Responses

In [2]:
def check_response_errors(model, condition, with_sysprompt=True):
    """Count API errors, empty responses and unparseable lines in a response file.

    Globs the response JSONL files for ``model``/``condition``, drops variance
    runs (paths containing ``_run2_`` or ``_run3_``) and scans the
    lexicographically last remaining file.

    Args:
        model: Model directory name, e.g. ``'gpt-5'``.
        condition: Condition slug, e.g. ``'en_to_de'`` or ``'de_to_en'``.
        with_sysprompt: If true, read the system-prompt ablation responses;
            otherwise read the baseline responses.

    Returns:
        ``None`` when no primary response file matches. Otherwise a dict with
        the counters ``total`` (parsed records), ``success``, ``api_error``
        (records without a truthy ``success``), ``empty_response`` (successful
        records whose ``response`` is missing or whitespace-only; these are
        also included in ``success``) and ``parse_error`` (non-blank lines that
        are not a JSON object; not included in ``total``).
    """
    if with_sysprompt:
        pattern = f'../results/sysprompt-ablation/{model}/responses/responses_{condition}_sysprompt_*.jsonl'
    else:
        pattern = f'../results/responses/{model}/responses_{condition}_*.jsonl'

    # Exclude variance runs (run2, run3) - only use primary results
    files = sorted(
        path for path in glob.glob(pattern)
        if '_run2_' not in path and '_run3_' not in path
    )
    if not files:
        return None

    stats = {
        'total': 0,
        'success': 0,
        'api_error': 0,
        'empty_response': 0,
        'parse_error': 0
    }

    # Responses are multilingual, so decode as UTF-8 explicitly rather than
    # relying on the platform's default encoding.
    with open(files[-1], encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank/trailing lines are not records and not parse errors.
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                stats['parse_error'] += 1
                continue
            if not isinstance(data, dict):
                # Valid JSON but not a record object (e.g. a list or null).
                stats['parse_error'] += 1
                continue

            stats['total'] += 1
            if not data.get('success'):
                stats['api_error'] += 1
                continue

            stats['success'] += 1
            response = data.get('response')
            # Only strings can be whitespace-only; guard so other types do not crash.
            if not response or (isinstance(response, str) and not response.strip()):
                stats['empty_response'] += 1

    return stats

# Report response-file problems for every model / language / direction
# recorded with the explicit system prompt.
print("SANITY CHECK: Sysprompt Ablation Response Errors")
print("=" * 80)

# Counter key -> label shown when that counter is non-zero (report order).
problem_labels = [
    ('api_error', 'API errors'),
    ('empty_response', 'Empty'),
    ('parse_error', 'Parse errors'),
]

all_clean = True
for model_id, model_name in MODELS.items():
    print(f"\n{model_name} (with explicit prompt):")
    for lang in LANGS:
        # EN→X first, then X→EN, for each language.
        for condition in (f"en_to_{lang}", f"{lang}_to_en"):
            stats = check_response_errors(model_id, condition, with_sysprompt=True)
            if not stats:
                print(f"  {condition}: NO FILE")
                all_clean = False
                continue

            errors = [
                f"{label}: {stats[key]}"
                for key, label in problem_labels
                if stats[key] > 0
            ]
            if not errors:
                print(f"  {condition}: OK {stats['success']}/{stats['total']}")
                continue

            print(f"  {condition}: {', '.join(errors)}")
            all_clean = False

if all_clean:
    print("\nAll sysprompt response files are clean (no errors)")

SANITY CHECK: Sysprompt Ablation Response Errors

GPT-5 (with explicit prompt):
  en_to_de: OK 182/182
  de_to_en: OK 182/182
  en_to_zh: OK 182/182
  zh_to_en: OK 182/182
  en_to_es: OK 182/182
  es_to_en: OK 182/182
  en_to_ar: OK 182/182
  ar_to_en: OK 182/182

Claude Opus 4.5 (with explicit prompt):
  en_to_de: Empty: 1
  de_to_en: OK 182/182
  en_to_zh: OK 182/182
  zh_to_en: OK 182/182
  en_to_es: OK 182/182
  es_to_en: OK 182/182
  en_to_ar: OK 182/182
  ar_to_en: OK 182/182

Command R+ (with explicit prompt):
  en_to_de: OK 182/182
  de_to_en: OK 182/182
  en_to_zh: OK 182/182
  zh_to_en: OK 182/182
  en_to_es: OK 182/182
  es_to_en: OK 182/182
  en_to_ar: OK 182/182
  ar_to_en: OK 182/182


## 2. Load Language Fidelity Results

In [3]:
def load_language_fidelity(model, condition, with_sysprompt=True):
    """Load the language fidelity rate from the latest primary summary file.

    Globs the Layer 1 ``language_summary_*.json`` files for
    ``model``/``condition``, drops variance runs (paths containing ``_run2_``
    or ``_run3_``) and reads the lexicographically last remaining file.

    Args:
        model: Model directory name, e.g. ``'gpt-5'``.
        condition: Condition slug, e.g. ``'en_to_de'`` or ``'de_to_en'``.
        with_sysprompt: If true, read the system-prompt ablation results;
            otherwise read the baseline results.

    Returns:
        ``(fidelity_rate, stats)`` taken from the summary JSON;
        ``fidelity_rate`` is ``None`` if the key is absent and ``stats``
        defaults to ``{}``. ``(None, None)`` when no primary summary file
        exists. Callers format the rate with a ``%`` suffix, so it is
        presumably already a percentage — confirm against the summary writer.
    """
    if with_sysprompt:
        pattern = f'../results/sysprompt-ablation/{model}/layer1/language_summary_{condition}_*.json'
    else:
        pattern = f'../results/layer1/{model}/language_summary_{condition}_*.json'

    # Exclude variance runs (run2, run3) - only use primary results
    candidates = sorted(
        path for path in glob.glob(pattern)
        if '_run2_' not in path and '_run3_' not in path
    )
    if not candidates:
        return None, None

    # Decode as UTF-8 explicitly instead of relying on the platform default.
    with open(candidates[-1], encoding='utf-8') as f:
        data = json.load(f)
    return data.get('fidelity_rate'), data.get('stats', {})

# Smoke test: load one known condition and print it as a percentage.
fidelity, stats = load_language_fidelity('gpt-5', 'en_to_de', with_sysprompt=True)
# Compare against None explicitly: a fidelity of 0.0 is valid data, not "missing".
if fidelity is not None:
    print(f"GPT-5 EN->DE (with prompt): {fidelity:.1f}%")
else:
    print("No data found")

GPT-5 EN->DE (with prompt): 98.4%


## 3. Check Missing Evaluations

In [4]:
print("Evaluation Status (Sysprompt Ablation):")
print("=" * 80)

# Conditions that have a response file but no Layer 1 (language fidelity) result.
missing = []
for model_id, model_name in MODELS.items():
    print(f"\n{model_name}:")
    for lang in LANGS:
        for direction in DIRECTIONS:
            if direction == 'en_to':
                condition = f"en_to_{lang}"
                display = f"EN->{lang.upper()}"
            else:
                condition = f"{lang}_to_en"
                display = f"{lang.upper()}->EN"

            # Check if response exists. glob returns matches in arbitrary order,
            # so sort to make resp_files[-1] deterministic, and skip variance
            # runs (run2, run3) the same way the loaders do.
            resp_pattern = f'../results/sysprompt-ablation/{model_id}/responses/responses_{condition}_sysprompt_*.jsonl'
            resp_files = sorted(
                path for path in glob.glob(resp_pattern)
                if '_run2_' not in path and '_run3_' not in path
            )

            # Check layer1
            fidelity, _ = load_language_fidelity(model_id, condition, with_sysprompt=True)
            # Test against None: a fidelity of 0.0 is a completed evaluation.
            has_layer1 = fidelity is not None

            if resp_files:
                layer1_status = f"OK ({fidelity:.1f}%)" if has_layer1 else "MISSING"
                print(f"  {display}: Layer1={layer1_status}")
                if not has_layer1:
                    missing.append({
                        'model': model_id,
                        'condition': condition,
                        'resp_file': resp_files[-1]
                    })
            else:
                print(f"  {display}: NO RESPONSES")

if missing:
    print(f"\nMissing {len(missing)} evaluations")
else:
    print("\nAll evaluations complete")

Evaluation Status (Sysprompt Ablation):



GPT-5:
  EN->DE: Layer1=OK (98.4%)
  DE->EN: Layer1=OK (94.0%)
  EN->ZH: Layer1=OK (98.9%)
  ZH->EN: Layer1=OK (93.4%)
  EN->ES: Layer1=OK (99.5%)
  ES->EN: Layer1=OK (94.0%)
  EN->AR: Layer1=OK (98.9%)
  AR->EN: Layer1=OK (94.5%)

Claude Opus 4.5:
  EN->DE: Layer1=OK (96.7%)
  DE->EN: Layer1=OK (9.9%)
  EN->ZH: Layer1=OK (94.5%)
  ZH->EN: Layer1=OK (9.9%)
  EN->ES: Layer1=OK (97.3%)
  ES->EN: Layer1=OK (5.5%)
  EN->AR: Layer1=OK (96.2%)
  AR->EN: Layer1=OK (3.3%)

Command R+:
  EN->DE: Layer1=OK (91.8%)
  DE->EN: Layer1=OK (1.1%)
  EN->ZH: Layer1=OK (87.4%)
  ZH->EN: Layer1=OK (0.5%)
  EN->ES: Layer1=OK (96.7%)
  ES->EN: Layer1=OK (0.5%)
  EN->AR: Layer1=OK (83.0%)
  AR->EN: Layer1=OK (0.5%)

All evaluations complete


## 4. Build Comparison Table

In [5]:
# Assemble one row per (model, direction, prompt setting) with one fidelity
# column per language, then print the table as plain text.
results = []

for model_id, model_name in MODELS.items():
    for direction in ('EN→X', 'X→EN'):
        english_first = direction == 'EN→X'
        for prompt_type in ('None', 'Explicit'):
            use_sysprompt = prompt_type == 'Explicit'
            row = {'Model': model_name, 'Condition': direction, 'Prompt': prompt_type}

            for lang in LANGS:
                condition = f"en_to_{lang}" if english_first else f"{lang}_to_en"
                # Only the rate is tabulated; the accompanying stats dict is ignored.
                row[lang.upper()] = load_language_fidelity(
                    model_id, condition, with_sysprompt=use_sysprompt
                )[0]

            results.append(row)

df = pd.DataFrame(results)
print("SYSTEM PROMPT ABLATION - Language Fidelity (%)")
print("=" * 90)
print(df.to_string(index=False))

SYSTEM PROMPT ABLATION - Language Fidelity (%)
          Model Condition   Prompt        DE        ZH        ES        AR
          GPT-5      EN→X     None 97.802198 99.450549 99.450549 97.802198
          GPT-5      EN→X Explicit 98.351648 98.901099 99.450549 98.901099
          GPT-5      X→EN     None 93.956044 95.604396 94.505495 96.153846
          GPT-5      X→EN Explicit 93.956044 93.406593 93.956044 94.505495
Claude Opus 4.5      EN→X     None 96.685083 93.956044 97.252747 96.703297
Claude Opus 4.5      EN→X Explicit 96.685083 94.505495 97.252747 96.153846
Claude Opus 4.5      X→EN     None 10.439560  9.890110  6.043956  4.395604
Claude Opus 4.5      X→EN Explicit  9.890110  9.890110  5.494505  3.296703
     Command R+      EN→X     None 91.758242 89.010989 95.604396 80.769231
     Command R+      EN→X Explicit 91.758242 87.362637 96.703297 82.967033
     Command R+      X→EN     None  1.098901  1.098901  0.549451  0.549451
     Command R+      X→EN Explicit  1.098901  0.549451  0.549451  0.549451

## 5. Compute Delta (Explicit - None)

In [6]:
# Per model and direction, print the fidelity change (explicit prompt minus
# no prompt) for each language; "--" marks a pair with a missing value.
print("IMPROVEMENT FROM EXPLICIT PROMPT (Explicit - None)")
print("=" * 80)

for model_id, model_name in MODELS.items():
    print(f"\n{model_name}:")
    for direction in ('EN→X', 'X→EN'):
        row = []
        for lang in LANGS:
            condition = f"en_to_{lang}" if direction == 'EN→X' else f"{lang}_to_en"

            fidelity_none = load_language_fidelity(model_id, condition, with_sysprompt=False)[0]
            fidelity_explicit = load_language_fidelity(model_id, condition, with_sysprompt=True)[0]

            if fidelity_none is None or fidelity_explicit is None:
                row.append("--")
                continue

            delta = fidelity_explicit - fidelity_none
            # Negative numbers already carry their sign; add "+" for the rest.
            row.append(("+" if delta >= 0 else "") + f"{delta:.1f}")

        print(f"  {direction}: DE={row[0]:>6}, ZH={row[1]:>6}, ES={row[2]:>6}, AR={row[3]:>6}")

IMPROVEMENT FROM EXPLICIT PROMPT (Explicit - None)

GPT-5:
  EN→X: DE=  +0.5, ZH=  -0.5, ES=  +0.0, AR=  +1.1
  X→EN: DE=  +0.0, ZH=  -2.2, ES=  -0.5, AR=  -1.6

Claude Opus 4.5:
  EN→X: DE=  +0.0, ZH=  +0.5, ES=  +0.0, AR=  -0.5
  X→EN: DE=  -0.5, ZH=  +0.0, ES=  -0.5, AR=  -1.1

Command R+:
  EN→X: DE=  +0.0, ZH=  -1.6, ES=  +1.1, AR=  +2.2
  X→EN: DE=  +0.0, ZH=  -0.5, ES=  +0.0, AR=  +0.0


## 6. Load Task Accuracy Results (Layer 2)

In [7]:
def load_task_accuracy(model, condition, with_sysprompt=True):
    """Load the Layer 2 task accuracy from a summary file or an evaluated file.

    Prefers the latest primary ``summary_*.json``; if none exists, recomputes
    the pass rate from the latest primary ``evaluated_*.jsonl``. Variance runs
    (paths containing ``_run2_`` or ``_run3_``) are ignored in both cases, and
    "latest" means lexicographically last.

    Args:
        model: Model directory name, e.g. ``'gpt-5'``.
        condition: Condition slug, e.g. ``'en_to_de'`` or ``'de_to_en'``.
        with_sysprompt: If true, read the system-prompt ablation results;
            otherwise read the baseline results.

    Returns:
        From a summary file: ``(pass_rate, stats)`` as stored in the JSON
        (``pass_rate`` is ``None`` if the key is absent, ``stats`` defaults to
        ``{}``). From an evaluated file: ``(percentage, {'total', 'passed'})``
        where ``percentage`` is ``passed / total * 100``. ``(None, None)`` when
        neither file exists or the evaluated file contains no records.
    """
    if with_sysprompt:
        layer2_dir = f'../results/sysprompt-ablation/{model}/layer2'
    else:
        layer2_dir = f'../results/layer2/{model}'
    summary_pattern = f'{layer2_dir}/summary_{condition}_*.json'
    eval_pattern = f'{layer2_dir}/evaluated_{condition}_*.jsonl'

    def _latest_primary(pattern):
        """Return the last sorted match that is not a variance run, or None."""
        # Exclude variance runs (run2, run3) - only use primary results
        matches = sorted(
            path for path in glob.glob(pattern)
            if '_run2_' not in path and '_run3_' not in path
        )
        return matches[-1] if matches else None

    # Try summary first
    summary_file = _latest_primary(summary_pattern)
    if summary_file is not None:
        with open(summary_file, encoding='utf-8') as f:
            data = json.load(f)
        return data.get('pass_rate'), data.get('stats', {})

    # Fall back to evaluated file
    eval_file = _latest_primary(eval_pattern)
    if eval_file is not None:
        total, passed = 0, 0
        with open(eval_file, encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Blank/trailing lines are not records.
                    continue
                record = json.loads(line)
                total += 1
                # `or {}` also covers an explicit `"evaluation": null`.
                if (record.get('evaluation') or {}).get('passed'):
                    passed += 1
        if total > 0:
            return (passed / total) * 100, {'total': total, 'passed': passed}
    return None, None

# Smoke test: load one known condition with and without the explicit prompt.
# Compare against None explicitly: an accuracy of 0.0 is valid data, not "missing".
accuracy, stats = load_task_accuracy('gpt-5', 'en_to_de', with_sysprompt=True)
print(f"GPT-5 EN->DE Task Accuracy (Explicit): {accuracy:.1f}%" if accuracy is not None else "No data")
accuracy, stats = load_task_accuracy('gpt-5', 'en_to_de', with_sysprompt=False)
print(f"GPT-5 EN->DE Task Accuracy (None): {accuracy:.1f}%" if accuracy is not None else "No data")

GPT-5 EN->DE Task Accuracy (Explicit): 58.2%
GPT-5 EN->DE Task Accuracy (None): 57.1%


## 7. Check Missing Layer 2 Evaluations

In [8]:
print("Layer 2 Evaluation Status (Sysprompt Ablation):")
print("=" * 80)

# Conditions that have a response file but no Layer 2 (task accuracy) result.
missing_layer2 = []
for model_id, model_name in MODELS.items():
    print(f"\n{model_name}:")
    for lang in LANGS:
        for direction in DIRECTIONS:
            if direction == 'en_to':
                condition = f"en_to_{lang}"
                display = f"EN->{lang.upper()}"
            else:
                condition = f"{lang}_to_en"
                display = f"{lang.upper()}->EN"

            # Check if response exists. glob returns matches in arbitrary order,
            # so sort to make resp_files[-1] deterministic, and skip variance
            # runs (run2, run3) the same way the loaders do.
            resp_pattern = f'../results/sysprompt-ablation/{model_id}/responses/responses_{condition}_sysprompt_*.jsonl'
            resp_files = sorted(
                path for path in glob.glob(resp_pattern)
                if '_run2_' not in path and '_run3_' not in path
            )

            # Check layer2
            accuracy, _ = load_task_accuracy(model_id, condition, with_sysprompt=True)
            # Test against None: an accuracy of 0.0 is a completed evaluation.
            has_layer2 = accuracy is not None

            if resp_files:
                layer2_status = f"OK ({accuracy:.1f}%)" if has_layer2 else "MISSING"
                print(f"  {display}: Layer2={layer2_status}")
                if not has_layer2:
                    missing_layer2.append({
                        'model': model_id,
                        'condition': condition,
                        'resp_file': resp_files[-1]
                    })
            else:
                print(f"  {display}: NO RESPONSES")

if missing_layer2:
    print(f"\nMissing {len(missing_layer2)} Layer 2 evaluations")
else:
    print("\nAll Layer 2 evaluations complete")

Layer 2 Evaluation Status (Sysprompt Ablation):

GPT-5:
  EN->DE: Layer2=OK (58.2%)
  DE->EN: Layer2=OK (54.9%)
  EN->ZH: Layer2=OK (59.9%)
  ZH->EN: Layer2=OK (57.7%)
  EN->ES: Layer2=OK (58.8%)
  ES->EN: Layer2=OK (53.8%)
  EN->AR: Layer2=OK (56.0%)
  AR->EN: Layer2=OK (53.8%)

Claude Opus 4.5:
  EN->DE: Layer2=OK (48.4%)
  DE->EN: Layer2=OK (48.4%)
  EN->ZH: Layer2=OK (52.7%)
  ZH->EN: Layer2=OK (51.1%)
  EN->ES: Layer2=OK (55.5%)
  ES->EN: Layer2=OK (51.6%)
  EN->AR: Layer2=OK (54.4%)
  AR->EN: Layer2=OK (48.9%)

Command R+:
  EN->DE: Layer2=OK (14.8%)
  DE->EN: Layer2=OK (11.5%)
  EN->ZH: Layer2=OK (17.6%)
  ZH->EN: Layer2=OK (12.6%)
  EN->ES: Layer2=OK (14.8%)
  ES->EN: Layer2=OK (12.6%)
  EN->AR: Layer2=OK (15.4%)
  AR->EN: Layer2=OK (11.0%)

All Layer 2 evaluations complete


## 8. Task Accuracy Comparison Table

In [9]:
# Assemble one row per (model, direction, prompt setting) with one task
# accuracy column per language, then print the table as plain text.
acc_results = []

for model_id, model_name in MODELS.items():
    for direction in ('EN→X', 'X→EN'):
        english_first = direction == 'EN→X'
        for prompt_type in ('None', 'Explicit'):
            use_sysprompt = prompt_type == 'Explicit'
            row = {'Model': model_name, 'Condition': direction, 'Prompt': prompt_type}

            for lang in LANGS:
                condition = f"en_to_{lang}" if english_first else f"{lang}_to_en"
                # Only the rate is tabulated; the accompanying stats dict is ignored.
                row[lang.upper()] = load_task_accuracy(
                    model_id, condition, with_sysprompt=use_sysprompt
                )[0]

            acc_results.append(row)

df_acc = pd.DataFrame(acc_results)
print("SYSTEM PROMPT ABLATION - Task Accuracy (%)")
print("=" * 90)
print(df_acc.to_string(index=False))

SYSTEM PROMPT ABLATION - Task Accuracy (%)
          Model Condition   Prompt        DE        ZH        ES        AR
          GPT-5      EN→X     None 57.142857 59.890110 59.340659 60.439560
          GPT-5      EN→X Explicit 58.241758 59.890110 58.791209 56.043956
          GPT-5      X→EN     None 55.494505 50.549451 49.450549 54.395604
          GPT-5      X→EN Explicit 54.945055 57.692308 53.846154 53.846154
Claude Opus 4.5      EN→X     None 49.450549 46.703297 50.549451 48.901099
Claude Opus 4.5      EN→X Explicit 48.351648 52.747253 55.494505 54.395604
Claude Opus 4.5      X→EN     None 48.351648 47.802198 52.747253 50.549451
Claude Opus 4.5      X→EN Explicit 48.351648 51.098901 51.648352 48.901099
     Command R+      EN→X     None 15.384615 13.186813 15.384615 15.934066
     Command R+      EN→X Explicit 14.835165 17.582418 14.835165 15.384615
     Command R+      X→EN     None 12.087912 10.989011 11.538462 10.989011
     Command R+      X→EN Explicit 11.538462 12.637363 12.637363 10.989011

## 9. Task Accuracy Delta (Explicit - None)

In [10]:
# Per model and direction, print the task accuracy change (explicit prompt
# minus no prompt) for each language; "--" marks a pair with a missing value.
print("TASK ACCURACY IMPROVEMENT FROM EXPLICIT PROMPT (Explicit - None)")
print("=" * 80)

for model_id, model_name in MODELS.items():
    print(f"\n{model_name}:")
    for direction in ('EN→X', 'X→EN'):
        row = []
        for lang in LANGS:
            condition = f"en_to_{lang}" if direction == 'EN→X' else f"{lang}_to_en"

            acc_none = load_task_accuracy(model_id, condition, with_sysprompt=False)[0]
            acc_explicit = load_task_accuracy(model_id, condition, with_sysprompt=True)[0]

            if acc_none is None or acc_explicit is None:
                row.append("--")
                continue

            delta = acc_explicit - acc_none
            # Negative numbers already carry their sign; add "+" for the rest.
            row.append(("+" if delta >= 0 else "") + f"{delta:.1f}")

        print(f"  {direction}: DE={row[0]:>6}, ZH={row[1]:>6}, ES={row[2]:>6}, AR={row[3]:>6}")

TASK ACCURACY IMPROVEMENT FROM EXPLICIT PROMPT (Explicit - None)

GPT-5:


  EN→X: DE=  +1.1, ZH=  +0.0, ES=  -0.5, AR=  -4.4


  X→EN: DE=  -0.5, ZH=  +7.1, ES=  +4.4, AR=  -0.5

Claude Opus 4.5:


  EN→X: DE=  -1.1, ZH=  +6.0, ES=  +4.9, AR=  +5.5


  X→EN: DE=  +0.0, ZH=  +3.3, ES=  -1.1, AR=  -1.6

Command R+:
  EN→X: DE=  -0.5, ZH=  +4.4, ES=  -0.5, AR=  -0.5
  X→EN: DE=  -0.5, ZH=  +1.6, ES=  +1.1, AR=  +0.0
