# Cross-Run Consistency Analysis

This notebook analyzes the reproducibility/consistency of model responses across 3 independent runs.

**Condition**: `es_to_en` (Spanish context → English query)  
**Questions**: 182 per run  
**Models**: GPT-5, Gemini 3 Pro

## Metrics Analyzed:
- **Layer 1 (Language Fidelity)**: Does the model respond in the correct language?
- **Layer 2 (Task Accuracy)**: Does the model complete the task correctly?

In [1]:
import json
from pathlib import Path
from collections import defaultdict
import pandas as pd
import numpy as np

# For nice table display
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [2]:
# File paths for the 3 runs
RESULTS_DIR = Path("../results")

# Layer 2: Task Accuracy files
LAYER2_FILES = {
    "GPT-5": {
        "run1": RESULTS_DIR / "layer2/gpt-5/evaluated_es_to_en_20251219_203007.jsonl",
        "run2": RESULTS_DIR / "layer2/gpt-5/evaluated_es_to_en_run2_20251220_005346.jsonl",
        "run3": RESULTS_DIR / "layer2/gpt-5/evaluated_es_to_en_run3_20251220_005409.jsonl",
    },
    "Gemini 3 Pro": {
        "run1": RESULTS_DIR / "layer2/gemini-3-pro/evaluated_es_to_en_20251219_202518.jsonl",
        "run2": RESULTS_DIR / "layer2/gemini-3-pro/evaluated_es_to_en_run2_20251220_005112.jsonl",
        "run3": RESULTS_DIR / "layer2/gemini-3-pro/evaluated_es_to_en_run3_20251220_005133.jsonl",
    },
}

# Layer 1: Language Fidelity files
LAYER1_FILES = {
    "GPT-5": {
        "run1": RESULTS_DIR / "layer1/gpt-5/language_eval_es_to_en_20251219_213343.jsonl",
        "run2": RESULTS_DIR / "layer1/gpt-5/language_eval_es_to_en_run2_20251220_111816.jsonl",
        "run3": RESULTS_DIR / "layer1/gpt-5/language_eval_es_to_en_run3_20251220_111833.jsonl",
    },
    "Gemini 3 Pro": {
        "run1": RESULTS_DIR / "layer1/gemini-3-pro/language_eval_es_to_en_20251219_213507.jsonl",
        "run2": RESULTS_DIR / "layer1/gemini-3-pro/language_eval_es_to_en_run2_20251220_003921.jsonl",
        "run3": RESULTS_DIR / "layer1/gemini-3-pro/language_eval_es_to_en_run3_20251220_003926.jsonl",
    },
}

In [3]:
def load_layer2_evaluations(filepath):
    """Load Layer 2 (Task Accuracy) evaluations."""
    results = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                record = json.loads(line)
                qid = record.get("question_id")
                if qid:
                    eval_data = record.get("evaluation", {})
                    results[qid] = {
                        "verdict": eval_data.get("verdict"),
                        "passed": eval_data.get("passed"),
                    }
    return results


def load_layer1_evaluations(filepath):
    """Load Layer 1 (Language Fidelity) evaluations."""
    results = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                record = json.loads(line)
                qid = record.get("question_id")
                if qid:
                    results[qid] = {
                        "match_status": record.get("match_status"),
                        "detected_language": record.get("detected_language"),
                        "expected_language": record.get("expected_language"),
                        "is_match": record.get("match_status") == "match",
                    }
    return results

In [4]:
def calculate_rate(results, key="passed"):
    """Calculate success rate."""
    if not results:
        return 0.0
    passed = sum(1 for r in results.values() if r.get(key))
    return passed / len(results) * 100


def calculate_pairwise_agreement(results1, results2, key="passed"):
    """Calculate pairwise agreement rate."""
    common_ids = set(results1.keys()) & set(results2.keys())
    if not common_ids:
        return 0.0
    agreements = sum(1 for qid in common_ids 
                     if results1[qid].get(key) == results2[qid].get(key))
    return agreements / len(common_ids) * 100


def calculate_three_way_agreement(r1, r2, r3, key="passed"):
    """Calculate three-way agreement rate."""
    common_ids = set(r1.keys()) & set(r2.keys()) & set(r3.keys())
    if not common_ids:
        return 0.0
    all_agree = sum(1 for qid in common_ids 
                    if r1[qid].get(key) == r2[qid].get(key) == r3[qid].get(key))
    return all_agree / len(common_ids) * 100


def cohen_kappa(results1, results2, key="passed"):
    """Calculate Cohen's Kappa."""
    common_ids = set(results1.keys()) & set(results2.keys())
    if not common_ids:
        return 0.0
    
    n11 = sum(1 for qid in common_ids if results1[qid].get(key) and results2[qid].get(key))
    n00 = sum(1 for qid in common_ids if not results1[qid].get(key) and not results2[qid].get(key))
    n10 = sum(1 for qid in common_ids if results1[qid].get(key) and not results2[qid].get(key))
    n01 = sum(1 for qid in common_ids if not results1[qid].get(key) and results2[qid].get(key))
    
    n = len(common_ids)
    p_o = (n11 + n00) / n
    p_yes = ((n11 + n10) / n) * ((n11 + n01) / n)
    p_no = ((n00 + n01) / n) * ((n00 + n10) / n)
    p_e = p_yes + p_no
    
    if p_e == 1:
        return 1.0
    return (p_o - p_e) / (1 - p_e)

## Layer 2: Task Accuracy Consistency Analysis

In [5]:
# Load all Layer 2 data
layer2_data = {}
for model_name, run_files in LAYER2_FILES.items():
    layer2_data[model_name] = {}
    for run_name, filepath in run_files.items():
        if filepath.exists():
            layer2_data[model_name][run_name] = load_layer2_evaluations(filepath)
            print(f"{model_name} {run_name}: {len(layer2_data[model_name][run_name])} records")
        else:
            print(f"{model_name} {run_name}: FILE NOT FOUND")

GPT-5 run1: 182 records
GPT-5 run2: 182 records
GPT-5 run3: 182 records
Gemini 3 Pro run1: 182 records
Gemini 3 Pro run2: 182 records
Gemini 3 Pro run3: 182 records


In [6]:
# Calculate Layer 2 metrics for each model
layer2_results = []

for model_name, runs_data in layer2_data.items():
    if len(runs_data) < 3:
        continue
    
    # Pass rates
    rates = [calculate_rate(runs_data[f"run{i}"]) for i in range(1, 4)]
    mean_rate = np.mean(rates)
    std_rate = np.std(rates)
    cv = (std_rate / mean_rate * 100) if mean_rate > 0 else 0
    
    # Pairwise agreement
    pairs = [("run1", "run2"), ("run1", "run3"), ("run2", "run3")]
    agreements = [calculate_pairwise_agreement(runs_data[r1], runs_data[r2]) for r1, r2 in pairs]
    kappas = [cohen_kappa(runs_data[r1], runs_data[r2]) for r1, r2 in pairs]
    
    # Three-way agreement
    three_way = calculate_three_way_agreement(runs_data["run1"], runs_data["run2"], runs_data["run3"])
    
    layer2_results.append({
        "Model": model_name,
        "Run 1 (%)": rates[0],
        "Run 2 (%)": rates[1],
        "Run 3 (%)": rates[2],
        "Mean (%)": mean_rate,
        "Std Dev": std_rate,
        "CV (%)": cv,
        "Pairwise Agr. (%)": np.mean(agreements),
        "Cohen's κ": np.mean(kappas),
        "3-Way Agr. (%)": three_way,
    })

layer2_df = pd.DataFrame(layer2_results)
print("\n" + "="*80)
print("TABLE 1: Layer 2 (Task Accuracy) Consistency Across 3 Runs")
print("="*80)
layer2_df


TABLE 1: Layer 2 (Task Accuracy) Consistency Across 3 Runs


Unnamed: 0,Model,Run 1 (%),Run 2 (%),Run 3 (%),Mean (%),Std Dev,CV (%),Pairwise Agr. (%),Cohen's κ,3-Way Agr. (%)
0,GPT-5,49.450549,50.549451,50.549451,50.18315,0.518027,1.032273,77.655678,0.55313,66.483516
1,Gemini 3 Pro,71.978022,67.032967,74.175824,71.062271,2.987089,4.203481,84.981685,0.636406,77.472527


In [7]:
# Nice formatted display
layer2_display = layer2_df.copy()
layer2_display = layer2_display.round(2)

# Add mean row
mean_row = layer2_display.select_dtypes(include=[np.number]).mean()
mean_row["Model"] = "Mean"
layer2_display = pd.concat([layer2_display, pd.DataFrame([mean_row])], ignore_index=True)

layer2_display.style.set_caption("Layer 2 (Task Accuracy) Consistency").format(precision=2)

Unnamed: 0,Model,Run 1 (%),Run 2 (%),Run 3 (%),Mean (%),Std Dev,CV (%),Pairwise Agr. (%),Cohen's κ,3-Way Agr. (%)
0,GPT-5,49.45,50.55,50.55,50.18,0.52,1.03,77.66,0.55,66.48
1,Gemini 3 Pro,71.98,67.03,74.18,71.06,2.99,4.2,84.98,0.64,77.47
2,Mean,60.72,58.79,62.37,60.62,1.76,2.62,81.32,0.59,71.97


## Layer 1: Language Fidelity Consistency Analysis

In [8]:
# Load all Layer 1 data
layer1_data = {}
for model_name, run_files in LAYER1_FILES.items():
    layer1_data[model_name] = {}
    for run_name, filepath in run_files.items():
        if filepath.exists():
            layer1_data[model_name][run_name] = load_layer1_evaluations(filepath)
            print(f"{model_name} {run_name}: {len(layer1_data[model_name][run_name])} records")
        else:
            print(f"{model_name} {run_name}: FILE NOT FOUND")

GPT-5 run1: 182 records
GPT-5 run2: 182 records
GPT-5 run3: 182 records
Gemini 3 Pro run1: 182 records
Gemini 3 Pro run2: 182 records
Gemini 3 Pro run3: 182 records


In [9]:
# Calculate Layer 1 metrics (only for models with 3 runs)
layer1_results = []

for model_name, runs_data in layer1_data.items():
    num_runs = len(runs_data)
    
    # Fidelity rates
    rates = [calculate_rate(runs_data[f"run{i}"], key="is_match") for i in range(1, num_runs + 1)]
    
    if num_runs >= 3:
        mean_rate = np.mean(rates)
        std_rate = np.std(rates)
        cv = (std_rate / mean_rate * 100) if mean_rate > 0 else 0
        
        # Pairwise agreement
        pairs = [("run1", "run2"), ("run1", "run3"), ("run2", "run3")]
        agreements = [calculate_pairwise_agreement(runs_data[r1], runs_data[r2], key="is_match") for r1, r2 in pairs]
        kappas = [cohen_kappa(runs_data[r1], runs_data[r2], key="is_match") for r1, r2 in pairs]
        
        # Three-way agreement
        three_way = calculate_three_way_agreement(
            runs_data["run1"], runs_data["run2"], runs_data["run3"], key="is_match"
        )
        
        layer1_results.append({
            "Model": model_name,
            "Run 1 (%)": rates[0],
            "Run 2 (%)": rates[1],
            "Run 3 (%)": rates[2],
            "Mean (%)": mean_rate,
            "Std Dev": std_rate,
            "CV (%)": cv,
            "Pairwise Agr. (%)": np.mean(agreements),
            "Cohen's κ": np.mean(kappas),
            "3-Way Agr. (%)": three_way,
        })
    else:
        # Only 1 run available
        layer1_results.append({
            "Model": model_name,
            "Run 1 (%)": rates[0] if len(rates) > 0 else None,
            "Run 2 (%)": rates[1] if len(rates) > 1 else None,
            "Run 3 (%)": rates[2] if len(rates) > 2 else None,
            "Mean (%)": rates[0] if len(rates) == 1 else np.mean(rates),
            "Std Dev": None,
            "CV (%)": None,
            "Pairwise Agr. (%)": None,
            "Cohen's κ": None,
            "3-Way Agr. (%)": None,
        })

layer1_df = pd.DataFrame(layer1_results)
print("\n" + "="*80)
print("TABLE 2: Layer 1 (Language Fidelity) Consistency Across Runs")
print("="*80)
layer1_df


TABLE 2: Layer 1 (Language Fidelity) Consistency Across Runs


Unnamed: 0,Model,Run 1 (%),Run 2 (%),Run 3 (%),Mean (%),Std Dev,CV (%),Pairwise Agr. (%),Cohen's κ,3-Way Agr. (%)
0,GPT-5,94.505495,94.505495,95.604396,94.871795,0.518027,0.546028,97.435897,0.734763,96.153846
1,Gemini 3 Pro,74.725275,74.175824,74.725275,74.542125,0.259013,0.347473,86.446886,0.64286,79.67033


## Combined Summary Table

In [10]:
# Create combined summary
summary_data = []

for model in ["GPT-5", "Gemini 3 Pro"]:
    row = {"Model": model}
    
    # Layer 2 metrics
    l2 = layer2_df[layer2_df["Model"] == model]
    if not l2.empty:
        row["L2 Mean (%)"] = l2["Mean (%)"].values[0]
        row["L2 Std Dev"] = l2["Std Dev"].values[0]
        row["L2 3-Way Agr. (%)"] = l2["3-Way Agr. (%)"].values[0]
        row["L2 κ"] = l2["Cohen's κ"].values[0]
    
    # Layer 1 metrics
    l1 = layer1_df[layer1_df["Model"] == model]
    if not l1.empty:
        row["L1 Mean (%)"] = l1["Mean (%)"].values[0]
        row["L1 Std Dev"] = l1["Std Dev"].values[0]
        row["L1 3-Way Agr. (%)"] = l1["3-Way Agr. (%)"].values[0]
        row["L1 κ"] = l1["Cohen's κ"].values[0]
    
    summary_data.append(row)

summary_df = pd.DataFrame(summary_data)
print("\n" + "="*80)
print("TABLE 3: Combined Layer 1 & Layer 2 Consistency Summary")
print("="*80)
summary_df.round(2)


TABLE 3: Combined Layer 1 & Layer 2 Consistency Summary


Unnamed: 0,Model,L2 Mean (%),L2 Std Dev,L2 3-Way Agr. (%),L2 κ,L1 Mean (%),L1 Std Dev,L1 3-Way Agr. (%),L1 κ
0,GPT-5,50.18,0.52,66.48,0.55,94.87,0.52,96.15,0.73
1,Gemini 3 Pro,71.06,2.99,77.47,0.64,74.54,0.26,79.67,0.64


## Response Pattern Analysis

In [11]:
def analyze_patterns(runs_data, key="passed"):
    """Analyze response patterns across runs."""
    common_ids = set.intersection(*[set(r.keys()) for r in runs_data.values()])
    
    patterns = defaultdict(int)
    for qid in common_ids:
        vals = tuple(runs_data[run][qid].get(key) for run in sorted(runs_data.keys()))
        patterns[vals] += 1
    
    # Aggregate patterns
    pattern_map = {
        (True, True, True): "All PASS (3/3)",
        (False, False, False): "All FAIL (3/3)",
    }
    for p in [(True, True, False), (True, False, True), (False, True, True)]:
        pattern_map[p] = "2 PASS, 1 FAIL"
    for p in [(False, False, True), (False, True, False), (True, False, False)]:
        pattern_map[p] = "1 PASS, 2 FAIL"
    
    aggregated = defaultdict(int)
    for pattern, count in patterns.items():
        label = pattern_map.get(pattern, str(pattern))
        aggregated[label] += count
    
    return dict(aggregated), len(common_ids)


# Layer 2 patterns
print("\n" + "="*80)
print("TABLE 4: Response Pattern Distribution (Layer 2 - Task Accuracy)")
print("="*80)

pattern_data = []
for model_name, runs_data in layer2_data.items():
    if len(runs_data) >= 3:
        patterns, total = analyze_patterns(runs_data)
        for pattern, count in sorted(patterns.items(), key=lambda x: -x[1]):
            pattern_data.append({
                "Model": model_name,
                "Pattern": pattern,
                "Count": count,
                "Percentage": count / total * 100,
            })

pattern_df = pd.DataFrame(pattern_data)
pattern_pivot = pattern_df.pivot(index="Pattern", columns="Model", values="Percentage").fillna(0)
pattern_pivot.round(1)


TABLE 4: Response Pattern Distribution (Layer 2 - Task Accuracy)


Model,GPT-5,Gemini 3 Pro
Pattern,Unnamed: 1_level_1,Unnamed: 2_level_1
"1 PASS, 2 FAIL",18.7,9.9
"2 PASS, 1 FAIL",14.8,12.6
All FAIL (3/3),32.4,18.1
All PASS (3/3),34.1,59.3
