# Conversation Length Effect Analysis

This notebook analyzes whether context-anchoring intensifies as conversations get longer.

**Length categories:**
- Short: 3-5 turns (n=44)
- Medium: 7-9 turns (n=99)
- Long: 11+ turns (n=39)

**Focus:** X→EN fidelity (where context-anchoring is most visible)

In [None]:
import os
import json
import glob
import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats

# Change to project root
# When __file__ is defined (script execution) the target is the parent of the
# directory containing this file; otherwise (e.g. a notebook kernel) it is the
# parent of the current working directory.
# NOTE(review): the fallback is relative to the *current* directory, so
# re-running this cell moves one level further up each time - confirm that
# the cell is only ever executed once per kernel session.
os.chdir(Path(__file__).parent.parent if '__file__' in dir() else Path.cwd().parent)
print(f"Working directory: {os.getcwd()}")

# Model identifier -> display name. The identifier is passed to the loader
# functions, which use it as a directory name under results/; the display
# name is what the printed tables show.
MODELS = {
    'gpt-5': 'GPT-5',
    'gemini-3-pro': 'Gemini 3 Pro',
    'claude-opus-4.5': 'Claude Opus 4.5',
    'deepseek-v3.1': 'DeepSeek-V3.1',
    'command-r-plus': 'Command R+',
}

# Language codes; each one is turned into a "<lang>_to_en" condition name.
LANGS = ['de', 'zh', 'es', 'ar']

def categorize_length(turns):
    """Map a conversation's turn count to a length category.

    Args:
        turns: Number of turns in the conversation, or None when the count
            is not available.

    Returns:
        'Short' for 3-5 turns, 'Medium' for 7-9 turns, 'Long' for 11 or more
        turns, and None for every other value (including None). Callers
        treat None as "exclude this sample".
    """
    # Records read with item.get('turn_count') carry None when the field is
    # missing; comparing None with an int raises TypeError, so report such
    # samples as uncategorized instead of aborting the whole analysis.
    if turns is None:
        return None
    if 3 <= turns <= 5:
        return 'Short'
    if 7 <= turns <= 9:
        return 'Medium'
    if turns >= 11:
        return 'Long'
    return None  # Everything else (e.g. 2, 6 or 10 turns) is excluded

# Echo the configured model identifiers as a quick sanity check.
print(f"Models: {list(MODELS.keys())}")

## 1. Dataset Turn Composition

In [None]:
# Analyze turn distribution in baseline dataset
from collections import Counter


def _pct(count, total):
    """Return count as a percentage of total, or 0.0 when total is zero."""
    return count / total * 100 if total else 0.0


def _category_counts(turn_counts):
    """Return (short, medium, long) sample counts for a list of turn counts.

    Uses categorize_length so the bins cannot drift from the ones used in
    the fidelity analysis.
    """
    tally = Counter(categorize_length(t) for t in turn_counts)
    return tally['Short'], tally['Medium'], tally['Long']


# Axes that get their own breakdown below.
_AXES = ['INFERENCE_MEMORY', 'INSTRUCTION_RETENTION']

# Explicit encoding keeps the read independent of the platform locale;
# blank lines are skipped because json.loads('') raises.
with open('data/experiments/baseline_en.jsonl', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]
turns = [len(d['CONVERSATION']) for d in data]
axes = [d['AXIS'] for d in data]

print("Turn Distribution in Dataset:")
print("=" * 40)
for t, count in sorted(Counter(turns).items()):
    pct = _pct(count, len(turns))
    cat = categorize_length(t)
    cat_str = f"({cat})" if cat else "(excluded)"
    print(f"  {t:2d} turns: {count:3d} ({pct:5.1f}%) {cat_str}")

print(f"\nTotal samples: {len(turns)}")
print(f"\nBy category:")
short, medium, long = _category_counts(turns)
# _pct guards the division so an empty dataset prints 0.0% instead of crashing.
print(f"  Short (3-5 turns):   {short:3d} ({_pct(short, len(turns)):.1f}%)")
print(f"  Medium (7-9 turns):  {medium:3d} ({_pct(medium, len(turns)):.1f}%)")
print(f"  Long (11+ turns):    {long:3d} ({_pct(long, len(turns)):.1f}%)")

print(f"\nBy axis:")
for axis, count in Counter(axes).items():
    print(f"  {axis}: {count}")

# Turn distribution by axis
print(f"\n" + "=" * 60)
print("Turn Distribution BY AXIS:")
print("=" * 60)

for axis in _AXES:
    axis_data = [d for d in data if d['AXIS'] == axis]
    axis_turns = [len(d['CONVERSATION']) for d in axis_data]
    n_axis = len(axis_turns)

    print(f"\n{axis} (n={len(axis_data)}):")
    print(f"  {'Turns':<10} {'Count':>8} {'Pct':>8}")
    print("-" * 30)
    for t, count in sorted(Counter(axis_turns).items()):
        pct = _pct(count, n_axis)
        print(f"  {t:<10} {count:>8} {pct:>7.1f}%")

    # By category (an axis with no samples prints zeros rather than raising)
    short, medium, long = _category_counts(axis_turns)
    print(f"\n  Category breakdown:")
    print(f"    Short (3-5):   {short:3d} ({_pct(short, n_axis):.1f}%)")
    print(f"    Medium (7-9):  {medium:3d} ({_pct(medium, n_axis):.1f}%)")
    print(f"    Long (11+):    {long:3d} ({_pct(long, n_axis):.1f}%)")

# LaTeX output for turn by axis
print(f"\n" + "=" * 60)
print("LaTeX: Turn Composition by Axis")
print("=" * 60)
print("\\begin{tabular}{@{}lcccc@{}}")
print("\\toprule")
print("\\textbf{Axis} & \\textbf{n} & \\textbf{Short} & \\textbf{Medium} & \\textbf{Long} \\\\")
print("\\midrule")

for axis in _AXES:
    axis_data = [d for d in data if d['AXIS'] == axis]
    axis_turns = [len(d['CONVERSATION']) for d in axis_data]

    short, medium, long = _category_counts(axis_turns)

    # Underscores must be escaped in LaTeX text mode.
    axis_display = axis.replace('_', '\\_')
    print(f"{axis_display} & {len(axis_data)} & {short} & {medium} & {long} \\\\")

print("\\bottomrule")
print("\\end{tabular}")

## 2. Load Language Eval Results with Turn Counts

In [None]:
def load_responses_with_turns(model, condition):
    """Load the successful responses of the primary run for one condition.

    Searches ``results/responses/<model>/responses_<condition>_*.jsonl``
    relative to the current working directory, drops variance runs (paths
    containing ``_run2_`` or ``_run3_``) and reads the path that sorts last.

    Args:
        model: Model identifier, used as the directory name.
        condition: Condition name embedded in the file name.

    Returns:
        A list of dicts with keys 'question_id', 'turn_count' and
        'response', one per record whose 'success' field is truthy, or None
        when no matching file exists. 'question_id' and 'turn_count' are
        None when absent from a record; 'response' defaults to ''.
    """
    pattern = f'results/responses/{model}/responses_{condition}_*.jsonl'
    # Exclude variance runs (run2, run3) - only use primary results
    primary_files = sorted(
        path for path in glob.glob(pattern)
        if '_run2_' not in path and '_run3_' not in path
    )
    if not primary_files:
        return None

    records = []
    # NOTE(review): taking the last sorted path presumably selects the most
    # recent run via a sortable suffix in the file name - confirm.
    with open(primary_files[-1], encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # blank lines would make json.loads raise
            item = json.loads(line)
            if item.get('success'):
                records.append({
                    'question_id': item.get('question_id'),
                    'turn_count': item.get('turn_count'),
                    'response': item.get('response', ''),
                })
    return records

def load_language_eval(model, condition):
    """Load the language-evaluation verdicts of the primary run.

    Searches ``results/layer1/<model>/language_eval_<condition>_*.jsonl``
    relative to the current working directory, drops variance runs (paths
    containing ``_run2_`` or ``_run3_``) and reads the path that sorts last.

    Args:
        model: Model identifier, used as the directory name.
        condition: Condition name embedded in the file name.

    Returns:
        A dict mapping each record's 'question_id' to True when its
        'match_status' equals 'match' and False otherwise, or None when no
        matching file exists. A later record with the same question id
        overwrites an earlier one.
    """
    pattern = f'results/layer1/{model}/language_eval_{condition}_*.jsonl'
    # Exclude variance runs (run2, run3) - only use primary results
    primary_files = sorted(
        path for path in glob.glob(pattern)
        if '_run2_' not in path and '_run3_' not in path
    )
    if not primary_files:
        return None

    verdicts = {}
    with open(primary_files[-1], encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # blank lines would make json.loads raise
            item = json.loads(line)
            verdicts[item.get('question_id')] = item.get('match_status') == 'match'
    return verdicts

# Smoke test: load one model/condition pair and report how much was found.
responses = load_responses_with_turns('gpt-5', 'de_to_en')
lang_eval = load_language_eval('gpt-5', 'de_to_en')
if not (responses and lang_eval):
    # Either loader returned None or an empty container.
    print("Data not found")
else:
    print(f"Loaded {len(responses)} responses, {len(lang_eval)} eval results")

## 3. Compute Fidelity by Length for X→EN

In [None]:
def compute_fidelity_by_length(model):
    """Tally X->EN language matches per length category, pooled over LANGS.

    For every language in LANGS the "<lang>_to_en" responses and language
    verdicts are loaded; languages for which either is missing or empty are
    skipped. A response is counted only when its turn count falls into a
    length category and its question id has a verdict.

    Args:
        model: Model identifier passed through to the loaders.

    Returns:
        A dict keyed by 'Short', 'Medium' and 'Long', each holding a dict
        with integer 'match' and 'total' counters.
    """
    results = {
        name: {'match': 0, 'total': 0}
        for name in ('Short', 'Medium', 'Long')
    }

    for lang in LANGS:
        condition = f"{lang}_to_en"
        responses = load_responses_with_turns(model, condition)
        lang_eval = load_language_eval(model, condition)
        if not (responses and lang_eval):
            continue

        for resp in responses:
            qid = resp['question_id']
            category = categorize_length(resp['turn_count'])
            # Skip excluded lengths and responses without a verdict.
            if not category or qid not in lang_eval:
                continue

            bucket = results[category]
            bucket['total'] += 1
            if lang_eval[qid]:
                bucket['match'] += 1

    return results

# Compute for all models
all_results = {
    model_id: compute_fidelity_by_length(model_id) for model_id in MODELS
}

# One row per model: fidelity percentage plus raw match/total per category.
print("X->EN Fidelity by Conversation Length")
print("=" * 70)
print(f"{'Model':<20} {'Short (3-5)':>15} {'Medium (7-9)':>15} {'Long (11+)':>15}")
print("-" * 70)

for model_id, model_name in MODELS.items():
    res = all_results[model_id]
    row = []
    for cat in ['Short', 'Medium', 'Long']:
        match, total = res[cat]['match'], res[cat]['total']
        if total > 0:
            pct = match / total * 100
            row.append(f"{pct:.1f}% ({match}/{total})")
        else:
            # No samples in this category for this model.
            row.append("--")
    short_cell, medium_cell, long_cell = row
    print(f"{model_name:<20} {short_cell:>15} {medium_cell:>15} {long_cell:>15}")

## 4. Chi-Square Tests for Length Effect

In [None]:
def chi_square_test(results):
    """Run a chi-square test of independence between length and fidelity.

    Args:
        results: Mapping with keys 'Short', 'Medium' and 'Long', each a dict
            holding integer 'match' and 'total' counts.

    Returns:
        A ``(chi2, p)`` tuple with the test statistic and p-value on
        success; ``(None, "Low counts")`` when any length category has fewer
        than 5 samples; ``(None, None)`` when a category has no samples or
        SciPy rejects the table with ValueError.
    """
    # One [match, mismatch] pair per length category.
    per_category = []
    for cat in ['Short', 'Medium', 'Long']:
        match = results[cat]['match']
        total = results[cat]['total']
        if total == 0:
            return None, None
        per_category.append([match, total - match])

    # Transpose to a 2xN table: row 0 = matches, row 1 = mismatches.
    observed = np.array(per_category).T

    # Column sums are the per-category sample sizes, so this guards on
    # sample size only.
    # NOTE(review): this is not the usual "expected cell count >= 5" rule;
    # tightening it would change which models get a p-value, so it is left
    # as is.
    if np.any(observed.sum(axis=0) < 5):
        return None, "Low counts"

    try:
        chi2, p, _dof, _expected = stats.chi2_contingency(observed)
    except ValueError:
        # Raised by SciPy when the table is unusable, e.g. a row of zeros
        # (every sample matched) gives zero expected frequencies.
        return None, None
    return chi2, p

# Per-model chi-square summary; rows without a usable test show dashes.
print("Chi-Square Tests for Length Effect (X→EN)")
print("=" * 60)
print(f"{'Model':<25} {'χ²':>10} {'p-value':>15} {'Significant?':>15}")
print("-" * 60)

for model_id, model_name in MODELS.items():
    res = all_results[model_id]
    chi2, p = chi_square_test(res)

    if chi2 is None or p is None:
        # chi_square_test signals a too-small category via p == "Low counts".
        placeholder = 'Low counts' if p == "Low counts" else '--'
        print(f"{model_name:<25} {'--':>10} {placeholder:>15} {'--':>15}")
        continue

    if p < 0.001:
        sig = "Yes (p<0.001)"
        p_str = "<0.001"
    elif p < 0.05:
        sig = "Yes (p<0.05)"
        p_str = f"{p:.3f}"
    else:
        sig = "No"
        p_str = f"{p:.3f}"
    print(f"{model_name:<25} {chi2:>10.2f} {p_str:>15} {sig:>15}")

## 5. Detailed Breakdown by Language

In [None]:
def compute_fidelity_by_length_and_lang(model, lang):
    """Tally language matches per length category for one "<lang>_to_en" run.

    Args:
        model: Model identifier passed through to the loaders.
        lang: Language code used to build the condition name.

    Returns:
        A dict keyed by 'Short', 'Medium' and 'Long', each holding a dict
        with integer 'match' and 'total' counters. All counters stay at
        zero when the responses or the verdicts are missing or empty.
    """
    results = {
        name: {'match': 0, 'total': 0}
        for name in ('Short', 'Medium', 'Long')
    }

    condition = f"{lang}_to_en"
    responses = load_responses_with_turns(model, condition)
    lang_eval = load_language_eval(model, condition)

    if responses and lang_eval:
        for resp in responses:
            qid = resp['question_id']
            category = categorize_length(resp['turn_count'])
            # Skip excluded lengths and responses without a verdict.
            if not category or qid not in lang_eval:
                continue

            bucket = results[category]
            bucket['total'] += 1
            if lang_eval[qid]:
                bucket['match'] += 1

    return results

# Show breakdown for a specific model
print("Detailed Breakdown: Claude Opus 4.5 (X->EN)")
print("=" * 60)

for lang in LANGS:
    res = compute_fidelity_by_length_and_lang('claude-opus-4.5', lang)
    print(f"\n{lang.upper()}->EN:")
    for cat in ['Short', 'Medium', 'Long']:
        total = res[cat]['total']
        if total <= 0:
            # Nothing counted for this category.
            print(f"  {cat}: --")
            continue
        match = res[cat]['match']
        pct = match / total * 100
        print(f"  {cat}: {pct:.1f}% ({match}/{total})")

## 6. LaTeX Output

In [None]:
# Emit one LaTeX table row per model: three fidelity cells plus the p-value.
print("LaTeX: X->EN Fidelity by Conversation Length")
print("="*80)

for model_id, model_name in MODELS.items():
    res = all_results[model_id]
    chi2, p = chi_square_test(res)

    vals = []
    for cat in ['Short', 'Medium', 'Long']:
        total = res[cat]['total']
        if total > 0:
            pct = res[cat]['match'] / total * 100
            cell = f"{pct:.1f}\\%"
        else:
            cell = "--"
        vals.append(cell)

    # p is a float only when the test ran; None and "Low counts" both fall
    # through to the placeholder.
    if not isinstance(p, float):
        p_str = "---"
    elif p < 0.001:
        p_str = "$<$0.001"
    else:
        p_str = f"{p:.2f}"

    short_val, medium_val, long_val = vals
    print(f"{model_name} & {short_val} & {medium_val} & {long_val} & {p_str} \\\\")

## 7. Axis Breakdown (INFERENCE_MEMORY vs INSTRUCTION_RETENTION)

In [None]:
# Load baseline data with axis info
# Maps QUESTION_ID -> {'axis': <AXIS value>, 'turns': <len(CONVERSATION)>}.
baseline_data = {}
# Explicit encoding keeps the read independent of the platform locale.
with open('data/experiments/baseline_en.jsonl', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # blank lines would make json.loads raise
        item = json.loads(line)
        baseline_data[item['QUESTION_ID']] = {
            'axis': item['AXIS'],
            'turns': len(item['CONVERSATION']),
        }

def compute_fidelity_by_axis_and_length(model):
    """Tally X->EN language matches per axis and length category.

    Pools all languages in LANGS. A response is counted only when its
    question id appears in both baseline_data and the language verdicts,
    its axis is one of the two tracked axes, and its turn count falls into
    a length category.

    Args:
        model: Model identifier passed through to the loaders.

    Returns:
        A dict keyed by 'INFERENCE_MEMORY' and 'INSTRUCTION_RETENTION'; each
        value is a dict keyed by 'Short', 'Medium' and 'Long' holding
        integer 'match' and 'total' counters.
    """
    results = {
        axis: {
            category: {'match': 0, 'total': 0}
            for category in ('Short', 'Medium', 'Long')
        }
        for axis in ('INFERENCE_MEMORY', 'INSTRUCTION_RETENTION')
    }

    for lang in LANGS:
        condition = f"{lang}_to_en"

        responses = load_responses_with_turns(model, condition)
        lang_eval = load_language_eval(model, condition)

        if not responses or not lang_eval:
            continue

        for resp in responses:
            qid = resp['question_id']
            if qid not in baseline_data or qid not in lang_eval:
                continue

            axis = baseline_data[qid]['axis']
            # Samples from any other axis are outside this breakdown;
            # indexing results with them would raise KeyError.
            if axis not in results:
                continue

            # The turn count comes from the response record, not from
            # baseline_data[qid]['turns'].
            category = categorize_length(resp['turn_count'])
            if category is None:
                continue

            bucket = results[axis][category]
            bucket['total'] += 1
            if lang_eval[qid]:
                bucket['match'] += 1

    return results

# Compute axis breakdown for all models
axis_results = {
    model_id: compute_fidelity_by_axis_and_length(model_id)
    for model_id in MODELS.keys()
}

# Display results: one small table per model, one row per axis.
print("X->EN Fidelity by Axis and Conversation Length")
print("=" * 90)

for model_id, model_name in MODELS.items():
    print(f"\n{model_name}:")
    print(f"  {'Axis':<25} {'Short (3-5)':>15} {'Medium (7-9)':>15} {'Long (11+)':>15}")
    print("-" * 75)

    for axis in ['INFERENCE_MEMORY', 'INSTRUCTION_RETENTION']:
        res = axis_results[model_id][axis]
        row = []
        for cat in ['Short', 'Medium', 'Long']:
            match, total = res[cat]['match'], res[cat]['total']
            if total > 0:
                pct = match / total * 100
                row.append(f"{pct:.1f}% ({match}/{total})")
            else:
                # No samples for this axis/category combination.
                row.append("--")
        short_cell, medium_cell, long_cell = row
        print(f"  {axis:<25} {short_cell:>15} {medium_cell:>15} {long_cell:>15}")