# Reproducibility Notebook: Recognition Theory in AI Tutoring

This notebook independently reproduces all 17 tables and key statistical findings from the paper,
using only the raw SQLite database and dialogue log files.

**Data sources:**
- `../data/evaluations.db` — 3,458 scored evaluation rows
- `../logs/tutor-dialogues/*.json` — 654 dialogue log files

**Six key run IDs:**

| Key | Run ID | Section | N |
|-----|--------|---------|---|
| recognition_validation | eval-2026-02-03-86b159cd | 6.1 | 36 |
| full_factorial | eval-2026-02-03-f5d4dd93 | 6.2 | 342 |
| ab_nemotron | eval-2026-02-04-948e04b3 | 6.3 | 17 |
| ab_kimi | eval-2026-02-05-10b344fb | 6.3 | 60 |
| domain_nemotron | eval-2026-02-04-79b633ca | 6.4 | 47 |
| domain_kimi | eval-2026-02-05-e87f452d | 6.4 | 60 |

## 0. Setup

In [None]:
# ── Data availability check ──────────────────────────────────────────────
# The evaluation database and dialogue logs are distributed separately
# as a GitHub Release artifact (~19 MB compressed).

import os, sys
from pathlib import Path

# Locations of the two external artifacts this notebook reads.
DB_PATH = Path('../data/evaluations.db')
LOGS_DIR = Path('../logs/tutor-dialogues')

# An artifact counts as present only if the database file exists and the
# log directory contains at least one JSON file.
db_present = DB_PATH.exists()
logs_present = LOGS_DIR.exists() and any(LOGS_DIR.glob('*.json'))

missing = []
if not db_present:
    missing.append(f'  - Database: {DB_PATH}')
if not logs_present:
    missing.append(f'  - Dialogue logs: {LOGS_DIR}/')

if not missing:
    db_size_mb = DB_PATH.stat().st_size / 1e6
    print(f'Database found: {DB_PATH} ({db_size_mb:.1f} MB)')
    n_logs = sum(1 for _ in LOGS_DIR.glob('*.json'))
    print(f'Dialogue logs found: {n_logs} files in {LOGS_DIR}/')
else:
    # Emit the download instructions one message at a time, in order.
    instructions = (
        'DATA NOT FOUND — this notebook requires the evaluation dataset.\n',
        'Missing:',
        '\n'.join(missing),
        '\nTo obtain the data:\n',
        '  1. Download machinespirits-eval-data-v0.2.0.tar.gz from:',
        '     https://github.com/liammagee/machinespirits-eval/releases/tag/v0.2.0\n',
        '  2. Extract from the repository root:',
        '     tar xzf machinespirits-eval-data-v0.2.0.tar.gz\n',
        'This will populate data/evaluations.db and logs/tutor-dialogues/.',
        'Then re-run this cell and continue.',
    )
    for message in instructions:
        print(message)
    # Uncomment the next line to halt execution if data is missing:
    # sys.exit(1)

In [None]:
import sqlite3
import json
import os
import re
import hashlib
from pathlib import Path
from collections import Counter

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style='whitegrid', font_scale=1.1)
%matplotlib inline

print('All imports successful.')

In [None]:
# ── Database connection ──────────────────────────────────────────────────
DB_PATH = '../data/evaluations.db'
LOGS_DIR = '../logs/tutor-dialogues'

# sqlite3.connect() creates a new, empty database when the file does not
# exist, which would leave a stray 0-byte evaluations.db behind and surface
# only later as a "no such table" error. Fail fast with a clear message.
if not Path(DB_PATH).is_file():
    raise FileNotFoundError(
        f'Evaluation database not found at {DB_PATH}; '
        'see the data availability cell above for download instructions.'
    )

conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row

# Load every successful evaluation row; all later cells filter this frame.
df_all = pd.read_sql_query("""
    SELECT *
    FROM evaluation_results
    WHERE success = 1
""", conn)

print(f'Total rows loaded: {len(df_all)}')
print(f'Rows with overall_score: {df_all["overall_score"].notna().sum()}')
print(f'Distinct run_ids: {df_all["run_id"].nunique()}')

In [None]:
# ── Run ID dictionary ────────────────────────────────────────────────────
# Maps a short analysis key to the run_id value stored in the database.
# get_run() and the per-table cells look runs up through these keys.
RUN_IDS = {
    'recognition_validation': 'eval-2026-02-03-86b159cd',
    'full_factorial':         'eval-2026-02-03-f5d4dd93',
    'ab_nemotron':            'eval-2026-02-04-948e04b3',
    'ab_kimi':                'eval-2026-02-05-10b344fb',
    'domain_nemotron':        'eval-2026-02-04-79b633ca',
    'domain_kimi':            'eval-2026-02-05-e87f452d',
}

# Expected N per run from the paper.
# Keyed identically to RUN_IDS; get_run() deduplicates a run only when it
# holds more scored rows than the value listed here.
EXPECTED_N = {
    'recognition_validation': 36,
    'full_factorial':         342,
    'ab_nemotron':            17,
    'ab_kimi':                60,
    'domain_nemotron':        47,
    'domain_kimi':            60,
}

def get_run(key):
    """Return the scored rows of a named run, deduplicated when necessary.

    Some runs have the same response judged by multiple models (rejudging).
    When a run holds more scored rows than the paper's N, only the first row
    per unique ``suggestions`` text is kept. Runs whose row count does not
    exceed the expected N are returned untouched.

    Rows whose ``suggestions`` value is not a string (e.g. NULL) have no
    content to compare, so each is treated as unique and never dropped as a
    duplicate of another such row.

    Args:
        key: A key of ``RUN_IDS`` / ``EXPECTED_N``.

    Returns:
        A new DataFrame (copy) with the selected rows of ``df_all``.

    Raises:
        KeyError: If ``key`` is not in ``RUN_IDS``.
    """
    run_id = RUN_IDS[key]
    in_run = (df_all['run_id'] == run_id) & df_all['overall_score'].notna()
    df_run = df_all[in_run].copy()

    expected = EXPECTED_N.get(key)
    if expected and len(df_run) > expected:
        # Key string content by its MD5 digest. Non-string values get a key
        # derived from the row label: id() of None/NaN is identical for every
        # row, which would wrongly merge all text-less rows into one.
        dedup_keys = pd.Series(
            [
                hashlib.md5(text.encode()).hexdigest()
                if isinstance(text, str) else f'_row_{label}'
                for label, text in df_run['suggestions'].items()
            ],
            index=df_run.index,
        )
        df_run = df_run.loc[~dedup_keys.duplicated(keep='first')].copy()

    return df_run

# Verify run sizes
for key, run_id in RUN_IDS.items():
    # Compare the deduplicated row count against the N reported in the paper.
    n = len(get_run(key))
    expected = EXPECTED_N[key]
    if n == expected:
        status = 'OK'
    else:
        status = f'MISMATCH (expected {expected})'
    print(f'  {key:30s}  N={n}  {status}')

In [None]:
# ── Helper functions ──────────────────────────────────────────────────────

def cohens_d(group1, group2):
    """Return Cohen's d for two samples using the pooled standard deviation.

    The sign follows ``mean(group1) - mean(group2)``. When the pooled
    standard deviation is exactly zero the function returns 0.0.
    """
    size_a = len(group1)
    size_b = len(group2)
    # Sample variances (ddof=1) weighted by their degrees of freedom.
    var_a = np.var(group1, ddof=1)
    var_b = np.var(group2, ddof=1)
    pooled_variance = ((size_a - 1) * var_a + (size_b - 1) * var_b) / (size_a + size_b - 2)
    pooled_sd = np.sqrt(pooled_variance)
    if pooled_sd == 0:
        return 0.0
    mean_gap = np.mean(group1) - np.mean(group2)
    return mean_gap / pooled_sd

def ci_95(data):
    """Return the (lower, upper) bounds of a 95% t-based CI for the mean."""
    center = np.mean(data)
    # Half-width = standard error times the two-sided 95% t critical value
    # with len(data) - 1 degrees of freedom.
    critical_t = stats.t.ppf(0.975, len(data) - 1)
    half_width = stats.sem(data) * critical_t
    return center - half_width, center + half_width

# Recognition dimensions extracted from the scores_with_reasoning JSON.
_RECOGNITION_DIMS = (
    'mutual_recognition', 'dialectical_responsiveness',
    'transformative_potential', 'memory_integration',
    'tutor_adaptation', 'learner_growth',
)


def parse_scores_json(row):
    """Parse scores_with_reasoning JSON into individual recognition dimension scores.

    Args:
        row: The raw JSON text of one ``scores_with_reasoning`` cell. Values
            that are not valid JSON text (including None/NaN) are tolerated.

    Returns:
        A dict with one entry per recognition dimension. A dimension maps to
        its ``score`` value when the JSON holds ``{dim: {"score": ...}}`` and
        to NaN otherwise. Each dimension is checked on its own, so a single
        malformed entry no longer blanks out the dimensions that did parse.
    """
    result = dict.fromkeys(_RECOGNITION_DIMS, np.nan)
    try:
        parsed = json.loads(row)
    except (json.JSONDecodeError, TypeError):
        return result
    if not isinstance(parsed, dict):
        return result
    for dim in _RECOGNITION_DIMS:
        entry = parsed.get(dim)
        # Only a mapping can carry a 'score' key; numbers, nulls and strings
        # are treated as "dimension not scored".
        if isinstance(entry, dict) and 'score' in entry:
            result[dim] = entry['score']
    return result

# Parse recognition dimension scores from JSON
# One dict per row, then expanded into one column per dimension.
parsed_dims = df_all['scores_with_reasoning'].apply(parse_scores_json)
recog_dims = parsed_dims.apply(pd.Series)
for dim_name in recog_dims.columns:
    df_all[f'score_{dim_name}'] = recog_dims[dim_name]

print('Recognition dimension columns added:')
# Substrings that identify the recognition-dimension columns in df_all.
name_markers = ('mutual', 'dialectical', 'transformative',
                'memory_int', 'tutor_adapt', 'learner_growth')
print([c for c in df_all.columns if any(marker in c for marker in name_markers)])

# ── Derive factors from profile_name ──────────────────────────────────────
# CRITICAL: factor columns are only 48% populated; always derive from profile_name

def derive_factors(profile):
    """Derive the experimental factors encoded in a profile name.

    Returns a Series with ``factor_A_recognition``, ``factor_B_multi``,
    ``factor_C_learner`` and ``prompt_type``. All four are None when the
    profile name is missing.
    """
    if pd.isna(profile):
        return pd.Series(dict.fromkeys(
            ['factor_A_recognition', 'factor_B_multi', 'factor_C_learner', 'prompt_type']
        ))

    name = str(profile).lower()

    # Prompt type: the first matching rule wins, 'base' is the fallback.
    enhanced_markers = ('enhanced', 'cell_9', 'cell_10', 'cell_11', 'cell_12')
    if 'recog' in name:
        prompt_type = 'recognition'
    elif any(marker in name for marker in enhanced_markers):
        prompt_type = 'enhanced'
    elif 'placebo' in name:
        prompt_type = 'placebo'
    else:
        prompt_type = 'base'

    return pd.Series({
        # Factor A: only the recognition prompt counts as recognition.
        'factor_A_recognition': prompt_type == 'recognition',
        # Factor B: multi-agent tutor.
        'factor_B_multi': '_multi_' in name,
        # Factor C: learner architecture.
        'factor_C_learner': 'psycho' in name,
        'prompt_type': prompt_type,
    })

factors = df_all['profile_name'].apply(derive_factors)
# Copy each derived factor into df_all under the same column name.
for factor_name, factor_values in factors.items():
    df_all[factor_name] = factor_values

print(f'\nFactor derivation complete. Sample:')
sample_columns = ['profile_name', 'factor_A_recognition', 'factor_B_multi',
                  'factor_C_learner', 'prompt_type']
distinct_profiles = df_all[sample_columns].drop_duplicates()
print(distinct_profiles.to_string(index=False))

## 1. Table 1 — Model Configuration

In [None]:
# Table 1: Verify model configurations per run
print('Table 1: Model Configuration by Run')
print('=' * 70)

for key, run_id in RUN_IDS.items():
    df_run = get_run(key)
    print(f'\n{key} ({run_id}):')
    print(f'  Tutor model(s):  {df_run["model"].unique().tolist()}')
    print(f'  Judge model(s):  {df_run["judge_model"].unique().tolist()}')

    # Ego/superego models are reported only when the run recorded an ego model.
    if 'ego_model' not in df_run.columns:
        continue
    ego = df_run['ego_model'].dropna().unique()
    if len(ego) == 0:
        continue
    print(f'  Ego model(s):    {ego.tolist()}')
    # Check the superego column separately instead of assuming that it
    # exists whenever ego_model does.
    if 'superego_model' in df_run.columns:
        sup = df_run['superego_model'].dropna().unique()
        print(f'  Superego model(s): {sup.tolist()}')

## 2. Table 2 — Sample Summary

In [None]:
# Table 2: Sample summary
print('Table 2: Evaluation Sample Summary')
print('=' * 70)

# Row-level flag shared by every run: does the row carry an overall score?
has_score = df_all['overall_score'].notna()

summary_rows = []
for key, run_id in RUN_IDS.items():
    in_run = df_all['run_id'] == run_id
    summary_rows.append({
        'Evaluation': key,
        'Run ID': run_id,
        'Total (raw)': int(in_run.sum()),
        'Scored (raw)': int((in_run & has_score).sum()),
        'Scored (dedup)': len(get_run(key)),
        'Expected N': EXPECTED_N[key],
    })

df_summary = pd.DataFrame(summary_rows)
print(df_summary.to_string(index=False))

# Column totals across the six runs.
column_totals = df_summary[['Total (raw)', 'Scored (raw)', 'Scored (dedup)']].sum()
raw_total = column_totals['Total (raw)']
raw_scored = column_totals['Scored (raw)']
dedup_scored = column_totals['Scored (dedup)']
print(f'\nRaw totals:   {raw_total} attempted, {raw_scored} scored')
print(f'Deduplicated: {dedup_scored} scored (paper primary N)')
print(f'Expected:     623 attempted, 562 scored')

## 3. Table 3 — Inter-Judge Reliability

In [None]:
# Table 3: Inter-judge reliability
# Match responses by MD5 hash of suggestions content
print('Table 3: Inter-Judge Reliability')
print('=' * 70)

has_judgment = (
    df_all['judge_model'].notna()
    & df_all['overall_score'].notna()
    & df_all['suggestions'].notna()
)
df_judged = df_all[has_judgment].copy()
df_judged['content_hash'] = df_judged['suggestions'].apply(
    lambda s: hashlib.md5(s.encode()).hexdigest() if isinstance(s, str) else None
)

# Group by content hash to find same-response multi-judge pairs
hash_groups = df_judged.groupby('content_hash')
paired_data = []

for content_hash, group in hash_groups:
    judge_list = sorted(group['judge_model'].unique())
    if len(judge_list) < 2:
        continue
    scores_by_judge = {
        judge: group.loc[group['judge_model'] == judge, 'overall_score'].tolist()
        for judge in judge_list
    }
    # Every unordered judge pair, and within a pair every combination of
    # their scores for this response (a judge may have scored it repeatedly).
    for pos, first_judge in enumerate(judge_list):
        for second_judge in judge_list[pos + 1:]:
            for first_score in scores_by_judge[first_judge]:
                for second_score in scores_by_judge[second_judge]:
                    paired_data.append({
                        'judge1': first_judge,
                        'judge2': second_judge,
                        'score1': first_score,
                        'score2': second_score,
                        'content_hash': content_hash,
                    })

df_pairs = pd.DataFrame(paired_data)
print(f'Total paired judgments: {len(df_pairs)}')

if len(df_pairs) > 0:
    # Per-pair correlation
    for (j1, j2), pair_df in df_pairs.groupby(['judge1', 'judge2']):
        n = len(pair_df)
        j1_short = j1.split('/')[-1]
        j2_short = j2.split('/')[-1]
        print(f'\n  {j1_short} vs {j2_short}  (N={n})')
        if n < 2:
            # A correlation is undefined for a single observation, so report
            # the pair without computing one.
            print('    Too few paired judgments for a correlation (need at least 2).')
        else:
            r_pearson, p_pearson = stats.pearsonr(pair_df['score1'], pair_df['score2'])
            r_spearman, p_spearman = stats.spearmanr(pair_df['score1'], pair_df['score2'])
            print(f'    Pearson r  = {r_pearson:.3f}  (p={p_pearson:.4f})')
            print(f'    Spearman ρ = {r_spearman:.3f}  (p={p_spearman:.4f})')
        mad = np.mean(np.abs(pair_df['score1'] - pair_df['score2']))
        m1, m2 = pair_df['score1'].mean(), pair_df['score2'].mean()
        print(f'    Mean Abs Diff = {mad:.1f} pts')
        print(f'    Mean scores: {m1:.1f} vs {m2:.1f}')

    # Mean score by judge
    print('\nMean scores by judge:')
    for judge in sorted(df_judged['judge_model'].unique()):
        m = df_judged[df_judged['judge_model'] == judge]['overall_score'].mean()
        print(f'  {judge.split("/")[-1]:30s}  {m:.1f}')
else:
    print('No paired judgments found. This requires rejudging runs with multiple judge models.')

In [None]:
# Scatter plot of inter-judge agreement (if pairs exist)
if len(df_pairs) > 0:
    pair_keys = df_pairs.groupby(['judge1', 'judge2']).ngroups
    # At most three panels are drawn; further judge pairs are not plotted.
    n_panels = min(pair_keys, 3)
    # squeeze=False always returns a 2-D array of axes, so a single panel
    # needs no special casing.
    fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), squeeze=False)
    axes = axes.ravel()

    for ax, ((j1, j2), pair_df) in zip(axes, df_pairs.groupby(['judge1', 'judge2'])):
        ax.scatter(pair_df['score1'], pair_df['score2'], alpha=0.5)
        ax.plot([0, 100], [0, 100], 'k--', alpha=0.3)  # line of perfect agreement
        ax.set_xlabel(j1.split('/')[-1])
        ax.set_ylabel(j2.split('/')[-1])
        if len(pair_df) >= 2:
            r, _ = stats.pearsonr(pair_df['score1'], pair_df['score2'])
            ax.set_title(f'r = {r:.3f}, N = {len(pair_df)}')
        else:
            # A correlation is undefined for a single point; show N only.
            ax.set_title(f'N = {len(pair_df)}')
        ax.set_xlim(0, 105)
        ax.set_ylim(0, 105)

    plt.suptitle('Table 3: Inter-Judge Reliability', fontsize=14)
    plt.tight_layout()
    plt.show()
else:
    print('(No scatter plot: no paired data)')

## 4. Table 4 — Recognition Validation (N=36)

In [None]:
# Table 4: Recognition Validation — 3-way comparison
print('Table 4: Base vs Enhanced vs Recognition (N=36)')
print('=' * 70)

df_val = get_run('recognition_validation')
print(f'Run N = {len(df_val)}')
print(f'Profiles: {df_val["profile_name"].unique().tolist()}')

# Short alias of prompt_type, also used as the x variable of the box plot.
df_val['pt'] = df_val['prompt_type']

for pt in ['base', 'enhanced', 'recognition']:
    subset = df_val[df_val['pt'] == pt]['overall_score']
    print(f'  {pt:12s}  N={len(subset):3d}  Mean={subset.mean():.1f}  SD={subset.std():.1f}')

# One-way ANOVA across the three prompt types.
groups = [df_val[df_val['pt'] == pt]['overall_score'].values for pt in ['base', 'enhanced', 'recognition']]
f_stat, p_val = stats.f_oneway(*groups)
# Denominator df counts only rows that entered the test, so rows with any
# other prompt type cannot inflate it.
df_within = sum(len(g) for g in groups) - len(groups)
print(f'\nOne-way ANOVA: F({len(groups) - 1}, {df_within}) = {f_stat:.2f}, p = {p_val:.4f}')

# Effect decomposition
base_mean = groups[0].mean()
enhanced_mean = groups[1].mean()
recog_mean = groups[2].mean()

total_effect = recog_mean - base_mean
engineering_effect = enhanced_mean - base_mean
unique_effect = recog_mean - enhanced_mean


def _share_of_total(part):
    """Return `part` as a percentage of the total effect (NaN if total is 0)."""
    return part / total_effect * 100 if total_effect else float('nan')


# The '+' format flag prints the real sign, so a negative effect reads
# '-1.2' rather than '+-1.2'.
print(f'\nEffect Decomposition:')
print(f'  Total recognition effect:        {total_effect:+.1f} pts')
print(f'  Prompt engineering (enh vs base): {engineering_effect:+.1f} pts ({_share_of_total(engineering_effect):.0f}%)')
print(f'  Recognition unique (rec vs enh):  {unique_effect:+.1f} pts ({_share_of_total(unique_effect):.0f}%)')
print(f'\nPaper reports: +20.1 total, +11.4 engineering, +8.7 unique')

In [None]:
# Box plot for Table 4
fig, ax = plt.subplots(figsize=(8, 5))
order = ['base', 'enhanced', 'recognition']
# Arguments shared by the box layer and the raw-point overlay.
shared_args = dict(data=df_val, x='pt', y='overall_score', order=order, ax=ax)
sns.boxplot(palette='Set2', **shared_args)
sns.stripplot(color='black', alpha=0.4, size=4, **shared_args)
ax.set(
    xlabel='Prompt Type',
    ylabel='Overall Score (0-100)',
    title=f'Table 4: Recognition Validation (N={len(df_val)}, F={f_stat:.2f}, p={p_val:.4f})',
)
plt.tight_layout()
plt.show()

## 5. Table 5 — Full Factorial ANOVA (N=342)

In [None]:
# Table 5: Full 2x2x2 Factorial
print('Table 5: Full Factorial ANOVA')
print('=' * 70)

df_fact = get_run('full_factorial')
print(f'Run N = {len(df_fact)}')

# 8-cell means table
print('\n8-Cell Means:')
cell_stats = df_fact.groupby('profile_name')['overall_score'].agg(['count', 'mean', 'std']).round(1)
cell_stats = cell_stats.sort_index()
print(cell_stats.to_string())

# Derive A/B/C as 0/1 integer codes of the three boolean factors
df_fact['A'] = df_fact['factor_A_recognition'].astype(int)
df_fact['B'] = df_fact['factor_B_multi'].astype(int)
df_fact['C'] = df_fact['factor_C_learner'].astype(int)

# Main effects with 95% CIs
print('\nMain Effects:')
for factor, label in [('A', 'Recognition'), ('B', 'Multi-agent'), ('C', 'Learner ego-superego')]:
    g0 = df_fact[df_fact[factor] == 0]['overall_score']
    g1 = df_fact[df_fact[factor] == 1]['overall_score']
    effect = g1.mean() - g0.mean()
    # Analytic normal-approximation CI (not a bootstrap): unpooled standard
    # error of the difference in means, times the 1.96 critical value.
    se = np.sqrt(g0.var() / len(g0) + g1.var() / len(g1))
    ci_lo = effect - 1.96 * se
    ci_hi = effect + 1.96 * se
    # The '+' format flag prints the real sign of a negative effect.
    print(f'  {label:30s}  {effect:+.1f} pts  95% CI [{ci_lo:.1f}, {ci_hi:.1f}]')

In [None]:
# ANOVA: Type II with statsmodels
# patsy looks names up in the data before its own builtins, so a data column
# literally named `C` shadows the categorical helper C(...) used in the
# formula. Fit on a renamed copy so that C(...) resolves to the helper.
_anova_data = df_fact.rename(columns={'C': 'C_'})
model = ols('overall_score ~ C(A) * C(B) * C(C_)', data=_anova_data).fit()
anova_table = anova_lm(model, typ=2)
# Restore the original term labels so lookups such as 'C(C)' or
# 'C(A):C(C)' on the ANOVA table keep working.
anova_table = anova_table.rename(index=lambda term: term.replace('C(C_)', 'C(C)'))

# Add eta-squared: each term's sum of squares over the sum of all rows of
# the table (effects plus residual).
ss_total = anova_table['sum_sq'].sum()
anova_table['eta_sq'] = anova_table['sum_sq'] / ss_total

print('\nType II ANOVA:')
print(anova_table[['sum_sq', 'df', 'F', 'PR(>F)', 'eta_sq']].round(4).to_string())

# Extract key values for concordance
f_A = anova_table.loc['C(A)', 'F']
eta_A = anova_table.loc['C(A)', 'eta_sq']
print(f'\nKey: F(A) = {f_A:.2f}, η²(A) = {eta_A:.3f}')
print(f'Paper reports: F = 43.27, η² = .109')

In [None]:
# Heatmap: Recognition x Multi-agent
# Mean score per (recognition, multi-agent) cell, reshaped to a 2x2 grid.
cell_means = df_fact.groupby(['factor_A_recognition', 'factor_B_multi'])['overall_score'].mean()
pivot = cell_means.unstack()
pivot.index = ['Base', 'Recognition']
pivot.columns = ['Single', 'Multi']

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(pivot, annot=True, fmt='.1f', cmap='YlOrRd', ax=ax, vmin=70, vmax=90)
ax.set(
    title=f'Table 5: Factorial Mean Scores (N={len(df_fact)})',
    ylabel='Factor A: Recognition',
    xlabel='Factor B: Architecture',
)
plt.tight_layout()
plt.show()

## 6. Table 6 — A×B Interaction

In [None]:
# Table 6: A x B Interaction
print('Table 6: A×B Interaction')
print('=' * 70)


def _print_ab_cells(df_run, prompt_types):
    """Print N and mean score for each non-empty prompt type x architecture cell."""
    for pt in prompt_types:
        for multi, arch in ((False, 'single'), (True, 'multi')):
            in_cell = (df_run['prompt_type'] == pt) & (df_run['factor_B_multi'] == multi)
            subset = df_run[in_cell]
            if len(subset) > 0:
                print(f'  {pt:12s} {arch:6s}  N={len(subset):3d}  Mean={subset["overall_score"].mean():.1f}')


# ── Nemotron (N=17) ──────────────────────────────────────────────────────
df_ab_nem = get_run('ab_nemotron')
print(f'\nNemotron A×B run: N={len(df_ab_nem)}')
print(f'Profiles: {df_ab_nem["profile_name"].unique().tolist()}')
# Prompt types are listed in order of first appearance for this run.
_print_ab_cells(df_ab_nem, df_ab_nem['prompt_type'].unique())

# ── Kimi replication (N=60) ──────────────────────────────────────────────
df_ab_kimi = get_run('ab_kimi')
print(f'\nKimi A×B replication: N={len(df_ab_kimi)}')
print(f'Profiles: {df_ab_kimi["profile_name"].unique().tolist()}')
# Prompt types are listed alphabetically for this run.
_print_ab_cells(df_ab_kimi, sorted(df_ab_kimi['prompt_type'].unique()))

In [None]:
# Bar chart for A x B interaction
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Run key -> panel title, in left-to-right panel order.
panels = {
    'ab_nemotron': 'Nemotron (N=17)',
    'ab_kimi': 'Kimi Replication (N=60)',
}

for ax, (key, title) in zip(axes, panels.items()):
    # Add display-friendly labels for the legend and the x axis.
    df_run = get_run(key).assign(
        Architecture=lambda frame: frame['factor_B_multi'].map({True: 'Multi', False: 'Single'}),
        Prompt=lambda frame: frame['prompt_type'].str.capitalize(),
    )
    sns.barplot(data=df_run, x='Prompt', y='overall_score', hue='Architecture', ax=ax, palette='Set1')
    ax.set(title=title, ylabel='Overall Score', ylim=(50, 100))

plt.suptitle('Table 6: A×B Interaction', fontsize=14)
plt.tight_layout()
plt.show()

## 7. Tables 7–8 — Domain Generalizability

In [None]:
# Tables 7-8: Domain Generalizability
print('Tables 7-8: Domain Generalizability')
print('=' * 70)

# ── Nemotron elementary (N=47) ───────────────────────────────────────────
df_dom_nem = get_run('domain_nemotron')
print(f'\nNemotron Elementary: N={len(df_dom_nem)}')

for factor, label in [('factor_A_recognition', 'A: Recognition'), ('factor_B_multi', 'B: Multi-agent'), ('factor_C_learner', 'C: Learner')]:
    g0 = df_dom_nem[df_dom_nem[factor] == False]['overall_score']
    g1 = df_dom_nem[df_dom_nem[factor] == True]['overall_score']
    if len(g0) > 0 and len(g1) > 0:
        effect = g1.mean() - g0.mean()
        # The '+' format flag prints the real sign, so a negative effect
        # reads '-1.2' rather than '+-1.2'.
        print(f'  {label:30s}  {effect:+.1f} pts (N: {len(g0)}+{len(g1)})')

print(f'  Overall mean: {df_dom_nem["overall_score"].mean():.1f}')

# ── Kimi elementary replication (N=60) ───────────────────────────────────
df_dom_kimi = get_run('domain_kimi')
print(f'\nKimi Elementary Replication: N={len(df_dom_kimi)}')

base_scores = df_dom_kimi[df_dom_kimi['factor_A_recognition'] == False]['overall_score']
recog_scores = df_dom_kimi[df_dom_kimi['factor_A_recognition'] == True]['overall_score']

print(f'  Base mean:        {base_scores.mean():.1f} (N={len(base_scores)})')
print(f'  Recognition mean: {recog_scores.mean():.1f} (N={len(recog_scores)})')
print(f'  Delta:            {recog_scores.mean() - base_scores.mean():+.1f}')

d = cohens_d(recog_scores.values, base_scores.values)
print(f'  Cohen\'s d:        {d:.2f}')
print(f'  Paper reports: d ≈ 0.61')

# Per-scenario breakdown (scenarios lacking either condition are skipped)
print(f'\n  Per-scenario effects (Kimi elementary):')
for scenario in sorted(df_dom_kimi['scenario_id'].unique()):
    in_scenario = df_dom_kimi['scenario_id'] == scenario
    sc_base = df_dom_kimi[in_scenario & (df_dom_kimi['factor_A_recognition'] == False)]['overall_score']
    sc_recog = df_dom_kimi[in_scenario & (df_dom_kimi['factor_A_recognition'] == True)]['overall_score']
    if len(sc_base) > 0 and len(sc_recog) > 0:
        delta = sc_recog.mean() - sc_base.mean()
        print(f'    {scenario:40s}  {delta:+.1f}')

In [None]:
# Side-by-side bar charts: Kimi elementary run vs the factorial run,
# each comparing Base against Recognition.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [
    (df_dom_kimi, 'Elementary (Kimi, N=60)'),
    (df_fact, 'Philosophy (Kimi Factorial, N=342)'),
]
condition_names = {True: 'Recognition', False: 'Base'}

for ax, (df_run, title) in zip(axes, panels):
    # Work on a copy so the source frames do not gain a 'Condition' column.
    df_plot = df_run.copy()
    df_plot['Condition'] = df_plot['factor_A_recognition'].map(condition_names)
    sns.barplot(data=df_plot, x='Condition', y='overall_score', ax=ax, palette='Set2',
                order=['Base', 'Recognition'])
    ax.set(title=title, ylabel='Overall Score', ylim=(40, 100))

plt.suptitle('Tables 7-8: Domain Generalizability', fontsize=14)
plt.tight_layout()
plt.show()

## 8. Table 9 — Superego Rejection Patterns

In [None]:
# Table 9: Superego Rejection Patterns
print('Table 9: Superego Rejection Patterns')
print('=' * 70)

log_dir = Path(LOGS_DIR)
log_files = sorted(log_dir.glob('*.json'))
print(f'Total dialogue log files: {len(log_files)}')

# Extract all superego rejections
all_rejections = []
files_with_rejections = 0
total_superego_entries = 0
skipped_files = []  # logs that could not be read or parsed

for log_file in log_files:
    try:
        # Read as UTF-8 explicitly so results do not depend on the
        # platform's default encoding.
        with open(log_file, encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError, OSError):
        skipped_files.append(log_file.name)
        continue
    if not isinstance(data, dict):
        skipped_files.append(log_file.name)
        continue

    file_has_rejection = False
    for entry in data.get('dialogueTrace') or []:
        if not isinstance(entry, dict) or entry.get('agent') != 'superego':
            continue
        total_superego_entries += 1
        if entry.get('approved') == False:
            verdict = entry.get('verdict')
            feedback = verdict.get('feedback') if isinstance(verdict, dict) else None
            all_rejections.append({
                'file': log_file.name,
                'round': entry.get('round'),
                # Always store text: a null or non-string feedback value
                # would otherwise crash the regex classification below.
                'feedback': '' if feedback is None else str(feedback),
                'profile': data.get('profileName', ''),
            })
            file_has_rejection = True
    if file_has_rejection:
        files_with_rejections += 1

print(f'Total superego entries: {total_superego_entries}')
print(f'Total rejections: {len(all_rejections)}')
print(f'Files with at least one rejection: {files_with_rejections}/{len(log_files)}')
if skipped_files:
    # Report skipped logs instead of dropping them silently, since they
    # change the denominators above.
    print(f'Skipped unreadable or malformed log files: {len(skipped_files)}')

# Classify feedback text by pattern. One rejection can match several
# categories, so the percentages below need not sum to 100.
REJECTION_CATEGORIES = {
    'Engagement': re.compile(r'engage|engag|interact|respond.*learner|learner.*position|acknowledge', re.I),
    'Specificity': re.compile(r'specific|vague|generic|concrete|lecture.?\d|reference', re.I),
    'Struggle': re.compile(r'struggl|frustr|stuck|difficult|confus|overwhelm|overload', re.I),
    'Memory': re.compile(r'previous|history|past|returning|remember|earlier|last time', re.I),
    'Level-matching': re.compile(r'level|advanced|beginner|appropriate|scaffold|zone|ZPD|mismatch', re.I),
}

category_counts = Counter()
for rej in all_rejections:
    fb = rej['feedback']
    for cat, pattern in REJECTION_CATEGORIES.items():
        if pattern.search(fb):
            category_counts[cat] += 1

print(f'\nRejection Pattern Frequency:')
total_rej = len(all_rejections)
for cat in ['Engagement', 'Specificity', 'Struggle', 'Memory', 'Level-matching']:
    count = category_counts[cat]
    pct = count / total_rej * 100 if total_rej > 0 else 0
    print(f'  {cat:20s}  {count:4d}  ({pct:.0f}%)')

## 9. Table 10 — Dimension Effect Sizes

In [None]:
# Table 10: Per-dimension Cohen's d (factorial run, base vs recognition)
print('Table 10: Dimension-Level Effect Sizes')
print('=' * 70)

STANDARD_DIMS = ['score_relevance', 'score_specificity', 'score_pedagogical',
                 'score_personalization', 'score_actionability', 'score_tone']
DIM_LABELS = ['Relevance', 'Specificity', 'Pedagogical', 'Personalization', 'Actionability', 'Tone']

base_mask = df_fact['factor_A_recognition'] == False
recog_mask = df_fact['factor_A_recognition'] == True

dim_effects = []
for dim, label in zip(STANDARD_DIMS, DIM_LABELS):
    # Missing dimension scores are dropped separately for each condition.
    base_vals = df_fact.loc[base_mask, dim].dropna()
    recog_vals = df_fact.loc[recog_mask, dim].dropna()
    base_avg = base_vals.mean()
    recog_avg = recog_vals.mean()
    d = cohens_d(recog_vals.values, base_vals.values)
    dim_effects.append({
        'Dimension': label,
        'Base Mean': base_avg,
        'Recognition Mean': recog_avg,
        "Cohen's d": d,
    })
    print(f'  {label:20s}  Base={base_avg:.2f}  Recog={recog_avg:.2f}  d={d:.2f}')

# Largest effect first.
df_dim_effects = pd.DataFrame(dim_effects).sort_values("Cohen's d", ascending=False)
print(f'\nPaper reports: Personalization d=1.82, Pedagogical d=1.39, Relevance d=1.11, Tone d=1.02')

In [None]:
# Forest plot for dimension effect sizes
fig, ax = plt.subplots(figsize=(8, 5))
df_plot = df_dim_effects.sort_values("Cohen's d")


def _effect_color(effect_size):
    """Return the bar colour for an effect size (red > 0.8, orange > 0.5, else blue)."""
    if effect_size > 0.8:
        return '#e74c3c'
    if effect_size > 0.5:
        return '#f39c12'
    return '#3498db'


colors = [_effect_color(value) for value in df_plot["Cohen's d"]]
ax.barh(df_plot['Dimension'], df_plot["Cohen's d"], color=colors)

# Reference lines at the large / medium / small effect-size thresholds.
thresholds = [
    (0.8, 'red', 'Large (d=0.8)'),
    (0.5, 'orange', 'Medium (d=0.5)'),
    (0.2, 'blue', 'Small (d=0.2)'),
]
for position, line_color, legend_label in thresholds:
    ax.axvline(x=position, color=line_color, linestyle='--', alpha=0.5, label=legend_label)

ax.set_xlabel("Cohen's d")
ax.set_title('Table 10: Dimension Effect Sizes (Recognition vs Base)')
ax.legend(loc='lower right')
plt.tight_layout()
plt.show()

## 10. Table 11 — Standard Dimensions Only

In [None]:
# Table 11: Re-weighted scores — standard-only vs recognition-only
print('Table 11: Standard Dimensions Only Analysis')
print('=' * 70)

# Raw weights from rubric YAML
# Per-dimension weights keyed by the df_all / df_fact column name.
# compute_weighted_score() re-normalizes by the weights it actually uses,
# so these values are not required to sum to 1.
WEIGHTS = {
    'score_relevance': 0.15,
    'score_specificity': 0.15,
    'score_pedagogical': 0.15,
    'score_personalization': 0.10,
    'score_actionability': 0.10,
    'score_tone': 0.10,
    'score_mutual_recognition': 0.083,
    'score_dialectical_responsiveness': 0.083,
    'score_transformative_potential': 0.083,
    'score_memory_integration': 0.05,
    'score_tutor_adaptation': 0.05,
    'score_learner_growth': 0.05,
}

# Dimension subsets used for the two re-weighted scores.
STANDARD_WEIGHT_KEYS = ['score_relevance', 'score_specificity', 'score_pedagogical',
                         'score_personalization', 'score_actionability', 'score_tone']
# NOTE(review): score_tutor_adaptation and score_learner_growth carry weights
# above but are not part of this list — confirm the omission is intentional.
RECOG_WEIGHT_KEYS = ['score_mutual_recognition', 'score_dialectical_responsiveness',
                      'score_transformative_potential', 'score_memory_integration']

def compute_weighted_score(row, weight_keys):
    """Compute weighted score using only specified dimensions, re-normalized."""
    total_weight = 0
    weighted_sum = 0
    for key in weight_keys:
        val = row.get(key)
        if pd.notna(val):
            w = WEIGHTS[key]
            weighted_sum += val * w
            total_weight += w
    if total_weight == 0:
        return np.nan
    return (weighted_sum / total_weight) * 20  # scale to 0-100

# Per-row composites, each re-normalized over its own dimension subset.
df_fact['standard_only_score'] = df_fact.apply(lambda r: compute_weighted_score(r, STANDARD_WEIGHT_KEYS), axis=1)
df_fact['recognition_only_score'] = df_fact.apply(lambda r: compute_weighted_score(r, RECOG_WEIGHT_KEYS), axis=1)

# One summary line per recognition condition.
for condition, label in [(False, 'Base (cells 1-4)'), (True, 'Recognition (cells 5-8)')]:
    mask = df_fact['factor_A_recognition'] == condition
    n = mask.sum()
    overall = df_fact.loc[mask, 'overall_score'].mean()
    std_only = df_fact.loc[mask, 'standard_only_score'].mean()
    rec_only = df_fact.loc[mask, 'recognition_only_score'].mean()
    print(f'  {label:25s}  N={n:3d}  Overall={overall:.1f}  Standard-only={std_only:.1f}  Recog-only={rec_only:.1f}')

# Differences
base_std = df_fact.loc[~df_fact['factor_A_recognition'], 'standard_only_score'].mean()
recog_std = df_fact.loc[df_fact['factor_A_recognition'], 'standard_only_score'].mean()
# Use the signed format so a negative difference prints as "-0.3" rather
# than "+-0.3"; positive differences print exactly as before.
print(f'\n  Standard-only difference: {recog_std - base_std:+.1f}')
print(f'  Paper reports: +6.1 pts advantage persists on standard-only dimensions')

## 11. Table 12 — Multi-Turn Scenarios

In [None]:
# Table 12: Multi-turn scenario results
print('Table 12: Multi-Turn Scenarios')
print('=' * 70)

MULTI_TURN_SCENARIOS = [
    'misconception_correction_flow',
    'mood_frustration_to_breakthrough',
    'mutual_transformation_journey',
]

# Use ALL data across runs (not run-specific per plan)
df_scored = df_all[df_all['overall_score'].notna()].copy()

mt_results = []
for scenario in MULTI_TURN_SCENARIOS:
    sc_data = df_scored[df_scored['scenario_id'] == scenario].copy()
    # A row counts as "recognition" when its profile name contains "recog".
    sc_data['is_recog'] = sc_data['profile_name'].str.contains('recog', case=False, na=False)

    recog_flag = sc_data['is_recog']
    base = sc_data.loc[~recog_flag, 'overall_score']
    recog = sc_data.loc[recog_flag, 'overall_score']

    if 'dialogue_rounds' in sc_data.columns:
        avg_rounds = sc_data['dialogue_rounds'].mean()
    else:
        avg_rounds = np.nan

    # Each group needs at least two observations for an effect size.
    if len(base) <= 1 or len(recog) <= 1:
        continue

    base_mean = base.mean()
    recog_mean = recog.mean()
    d = cohens_d(recog.values, base.values)
    delta = recog_mean - base_mean
    mt_results.append({
        'Scenario': scenario,
        'N': len(sc_data),
        'Avg Rounds': avg_rounds,
        'Base': base_mean,
        'Recognition': recog_mean,
        'Delta': delta,
        "Cohen's d": d,
    })
    print(f'  {scenario:40s}  N={len(sc_data):4d}  Base={base_mean:.1f}  Recog={recog_mean:.1f}  Δ={delta:+.1f}  d={d:.2f}')

print(f'\nPaper reports: d = 0.85 / 0.59 / 0.78')

In [None]:
# Grouped bar chart of mean Base vs Recognition score per multi-turn scenario,
# annotated with Cohen's d (no error bars are drawn).
if mt_results:
    df_mt = pd.DataFrame(mt_results)
    fig, ax = plt.subplots(figsize=(10, 5))
    x = np.arange(len(df_mt))
    width = 0.35

    ax.bar(x - width/2, df_mt['Base'], width, label='Base', color='#3498db')
    ax.bar(x + width/2, df_mt['Recognition'], width, label='Recognition', color='#e74c3c')

    ax.set_xlabel('Scenario')
    ax.set_ylabel('Overall Score')
    ax.set_title('Table 12: Multi-Turn Scenarios (Base vs Recognition)')
    ax.set_xticks(x)
    ax.set_xticklabels([s.replace('_', '\n') for s in df_mt['Scenario']], fontsize=9)
    ax.legend()
    ax.set_ylim(0, 100)

    # Annotate Cohen's d
    # The value is read into a local first: a backslash-escaped quote inside
    # an f-string replacement field is a SyntaxError before Python 3.12.
    # enumerate() supplies the bar position independently of the frame index.
    for pos, (_, row) in enumerate(df_mt.iterrows()):
        effect = row["Cohen's d"]
        ax.annotate(f'd={effect:.2f}', (pos, max(row['Base'], row['Recognition']) + 3),
                    ha='center', fontsize=10, fontweight='bold')

    plt.tight_layout()
    plt.show()

## 12. Table 13 — Bilateral Transformation

In [None]:
# Table 13: Bilateral Transformation Metrics
print('Table 13: Bilateral Transformation Metrics')
print('=' * 70)

log_dir = Path(LOGS_DIR)
transformation_data = []
skipped_files = 0  # files that could not be read or parsed as JSON

for log_file in sorted(log_dir.glob('*.json')):
    # Only the read/parse step is inside the try, so errors in the field
    # extraction below are not silently swallowed.
    try:
        # Explicit UTF-8 so the result does not depend on the platform's
        # default locale encoding.
        with open(log_file, encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError, OSError):
        skipped_files += 1
        continue

    # A valid JSON file may hold something other than an object.
    if not isinstance(data, dict):
        continue
    if not data.get('isMultiTurn') or not data.get('transformationAnalysis'):
        continue

    ta = data['transformationAnalysis']
    if not isinstance(ta, dict):
        continue
    # `or {}` also covers keys that are present but null.
    tp = ta.get('turnProgression') or {}
    dtr = ta.get('dialogueTraceReport') or {}
    oa = dtr.get('overallAssessment') or {}

    profile = data.get('profileName', '')
    # A dialogue counts as "recognition" when its profile name contains "recog".
    is_recog = 'recog' in str(profile).lower()

    transformation_data.append({
        'file': log_file.name,
        'profile': profile,
        'is_recog': is_recog,
        'adaptationIndex': tp.get('adaptationIndex'),
        'learnerGrowthIndex': tp.get('learnerGrowthIndex'),
        'bilateralTransformationIndex': tp.get('bilateralTransformationIndex'),
        'transformationQuality': oa.get('transformationQuality'),
    })

df_trans = pd.DataFrame(transformation_data)
print(f'Dialogue files with transformationAnalysis: {len(df_trans)}')
if skipped_files:
    print(f'Skipped {skipped_files} unreadable or malformed dialogue file(s)')

if len(df_trans) > 0:
    metrics = ['adaptationIndex', 'learnerGrowthIndex', 'bilateralTransformationIndex', 'transformationQuality']
    metric_labels = ['Tutor Adaptation Index', 'Learner Growth Index', 'Bilateral Transformation Index', 'Transformation Quality']

    # Mean and SD of each metric, per condition.
    for is_recog, label in [(False, 'Base'), (True, 'Recognition')]:
        subset = df_trans[df_trans['is_recog'] == is_recog]
        print(f'\n  {label} (N={len(subset)}):')
        for metric, mlabel in zip(metrics, metric_labels):
            vals = subset[metric].dropna()
            if len(vals) > 0:
                print(f'    {mlabel:40s}  Mean={vals.mean():.3f}  SD={vals.std():.3f}')

    # Deltas
    print(f'\n  Deltas (Recognition - Base):')
    for metric, mlabel in zip(metrics, metric_labels):
        base_m = df_trans.loc[~df_trans['is_recog'], metric].dropna().mean()
        recog_m = df_trans.loc[df_trans['is_recog'], metric].dropna().mean()
        # Skip metrics that have no data in one of the two conditions.
        if pd.notna(base_m) and pd.notna(recog_m):
            print(f'    {mlabel:40s}  Δ = {recog_m - base_m:+.3f}')

## 13. Tables 14–15 — Lexical Analysis

In [None]:
# Tables 14-15: Lexical Analysis
print('Tables 14-15: Lexical Analysis')
print('=' * 70)

# Stopword list (kept in step with scripts/qualitative-analysis.js).
# Written as a set literal; the repeated entries below collapse harmlessly.
STOPWORDS = {
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an',
    'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been',
    'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can',
    "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does',
    "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for',
    'from', 'further', 'get', 'got', 'had', "hadn't", 'has', "hasn't",
    'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her',
    'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how',
    "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into',
    'is', "isn't", 'it', "it's", 'its', 'itself', 'just', 'let', "let's",
    'like', 'make', 'me', 'might', 'more', 'most', "mustn't", 'my',
    'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or',
    'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
    'really', 'right', 'same', "shan't", 'she', "she'd", "she'll",
    "she's", 'should', "shouldn't", 'so', 'some', 'such', 'take', 'than',
    'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
    'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
    "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until',
    'up', 'us', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're",
    "we've", 'well', 'were', "weren't", 'what', "what's", 'when',
    "when's", 'where', "where's", 'which', 'while', 'who', "who's",
    'whom', 'why', "why's", 'will', 'with', "won't", 'would', "wouldn't",
    'you', "you'd", "you'll", "you're", "you've", 'your', 'yours',
    'yourself', 'yourselves', 'also', 'been', 'being', 'come', 'even',
    'first', 'going', 'good', 'know', 'look', 'much', 'need', 'new', 'now',
    'one', 'people', 'really', 'see', 'think', 'thing', 'time', 'two',
    'use', 'want', 'way', 'work', 'would', 'year', 'back', 'long', 'say',
    'still', 'tell', 'try', 'give', 'go', 'help', 'keep', 'many',
    'may', 'put', 'seem', 'show', 'start', 'turn', 'big', 'end', 'set',
    'll', 've', 're', 's', 't', 'd', 'don', 'isn', 'doesn', 'didn',
    'won', 'can', 'couldn', 'shouldn', 'wasn', 'weren', 'hasn', 'haven',
    'hadn', 'aren', 'mustn', 'shan', 'ain',
}

def tokenize(text):
    """Split text into lowercase word tokens (mirrors the JS implementation).

    Every character other than a-z, apostrophe, hyphen or whitespace is
    replaced by a space before splitting; tokens of length 1 are discarded.
    """
    lowered = text.lower()
    cleaned = re.sub(r"[^a-z'\s-]", ' ', lowered)
    tokens = []
    for word in cleaned.split():
        if len(word) > 1:
            tokens.append(word)
    return tokens

def tokenize_filtered(text):
    """Tokenize text with tokenize() and drop every token found in STOPWORDS."""
    kept = []
    for token in tokenize(text):
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept

def count_sentences(text):
    """Count sentences by splitting on runs of '.', '!' and '?'.

    Fragments that are empty or whitespace-only are ignored. The result is
    never below 1, so it is always safe to use as a divisor.
    """
    non_empty = sum(1 for fragment in re.split(r'[.!?]+', text) if fragment.strip())
    if non_empty < 1:
        return 1
    return non_empty

def extract_messages(suggestions_json):
    """Extract the message texts from a suggestions JSON string.

    Args:
        suggestions_json: JSON text expected to encode a list of objects,
            each optionally carrying a ``'message'`` field.

    Returns:
        The non-empty string ``'message'`` values, in order. Returns an empty
        list when the input is not valid JSON, is not a string/bytes value
        (e.g. ``None``), or does not decode to a list. List entries that are
        not objects, and messages that are missing, empty or not strings,
        are skipped rather than raising, so the result can always be passed
        to ``' '.join``.
    """
    try:
        parsed = json.loads(suggestions_json)
    except (json.JSONDecodeError, TypeError):
        return []
    if not isinstance(parsed, list):
        return []

    messages = []
    for entry in parsed:
        if not isinstance(entry, dict):
            continue
        message = entry.get('message')
        if isinstance(message, str) and message:
            messages.append(message)
    return messages

# Build corpora: cells 1-4 (base) vs cells 5-8 (recognition)
BASE_CELLS = [
    'cell_1_base_single_unified',
    'cell_2_base_single_psycho',
    'cell_3_base_multi_unified',
    'cell_4_base_multi_psycho',
]
RECOG_CELLS = [
    'cell_5_recog_single_unified',
    'cell_6_recog_single_psycho',
    'cell_7_recog_multi_unified',
    'cell_8_recog_multi_psycho',
]

# Keep only rows from the eight factorial cells that have a suggestions payload.
df_corpus = df_all[df_all['suggestions'].notna() & df_all['profile_name'].isin(BASE_CELLS + RECOG_CELLS)].copy()

base_messages = []
recog_messages = []

# Route each row's messages to the corpus of its condition; after the
# filter above, any profile that is not a base cell is a recognition cell.
for profile, raw_suggestions in zip(df_corpus['profile_name'], df_corpus['suggestions']):
    target = base_messages if profile in BASE_CELLS else recog_messages
    target.extend(extract_messages(raw_suggestions))

print(f'Base messages: {len(base_messages)}')
print(f'Recognition messages: {len(recog_messages)}')

# One long string per condition, used by the corpus-level metrics.
base_text = ' '.join(base_messages)
recog_text = ' '.join(recog_messages)

In [None]:
# Table 14: Lexical diversity metrics
print('Table 14: Lexical Diversity Metrics')
print('=' * 70)


def compute_lexical_metrics(text, label):
    """Summarize the lexical diversity of one corpus.

    Args:
        text: The corpus as a single string.
        label: Name stored under the ``'label'`` key of the result.

    Returns:
        A dict with the token count, vocabulary size, type-token ratio, mean
        word length and mean sentence length (tokens per sentence). Stopwords
        are not removed. The ratio and mean word length are 0 for a corpus
        with no tokens.
    """
    words = tokenize(text)
    vocabulary = set(words)
    n_tokens = len(words)
    n_sentences = count_sentences(text)

    if words:
        ttr = len(vocabulary) / n_tokens
        mean_word_len = np.mean([len(w) for w in words])
    else:
        ttr = 0
        mean_word_len = 0

    if n_sentences:
        mean_sent_len = n_tokens / n_sentences
    else:
        mean_sent_len = 0

    return {
        'label': label,
        'tokens': n_tokens,
        'vocabulary': len(vocabulary),
        'ttr': ttr,
        'mean_word_length': mean_word_len,
        'mean_sentence_length': mean_sent_len,
    }


lex_base = compute_lexical_metrics(base_text, 'Base (message)')
lex_recog = compute_lexical_metrics(recog_text, 'Recognition (message)')

# One row per condition, indexed by its label.
df_lex = pd.DataFrame([lex_base, lex_recog]).set_index('label')
print(df_lex.round(4).to_string())

print(f'\nPaper reports: TTR 0.039/0.044, Vocab 2319/3689')
print(f'Computed:      TTR {lex_base["ttr"]:.3f}/{lex_recog["ttr"]:.3f}, Vocab {lex_base["vocabulary"]}/{lex_recog["vocabulary"]}')

In [None]:
# Table 15: Differential word frequency
print('Table 15: Differential Word Frequency')
print('=' * 70)

base_tokens_filtered = tokenize_filtered(base_text)
recog_tokens_filtered = tokenize_filtered(recog_text)

base_freq = Counter(base_tokens_filtered)
recog_freq = Counter(recog_tokens_filtered)

base_total = len(base_tokens_filtered)
recog_total = len(recog_tokens_filtered)

all_words = set(base_freq.keys()) | set(recog_freq.keys())

# Column order of the differential table. Passing it to the DataFrame
# constructor keeps the columns present even when no word passes the
# frequency cut-off, so the filters below never raise KeyError.
DIFF_COLUMNS = ['word', 'base_count', 'recog_count', 'base_rate', 'recog_rate', 'ratio']

differential = []
# Iterate in sorted order: the iteration order of a set of strings changes
# between interpreter runs, which would make the table order unstable.
for word in sorted(all_words):
    bc = base_freq.get(word, 0)
    rc = recog_freq.get(word, 0)
    # Ignore words seen fewer than 10 times across both corpora.
    if bc + rc < 10:
        continue
    base_rate = bc / base_total if base_total else 0
    recog_rate = rc / recog_total if recog_total else 0
    if base_rate > 0 and recog_rate > 0:
        ratio = recog_rate / base_rate
    elif recog_rate > 0:
        # Word never occurs in the base corpus.
        ratio = float('inf')
    else:
        # Word never occurs in the recognition corpus.
        ratio = 0
    differential.append({
        'word': word, 'base_count': bc, 'recog_count': rc,
        'base_rate': base_rate, 'recog_rate': recog_rate, 'ratio': ratio
    })

df_diff = pd.DataFrame(differential, columns=DIFF_COLUMNS)

# Recognition-skewed (top 15)
# Words exclusive to one corpus (ratio inf or 0) are left out of both lists.
recog_skewed = df_diff[(df_diff['ratio'] != float('inf')) & (df_diff['ratio'] > 1) & (df_diff['recog_count'] >= 10)]
# Ties on ratio are broken alphabetically so the top 15 is reproducible.
recog_skewed = recog_skewed.sort_values(['ratio', 'word'], ascending=[False, True]).head(15)

print('\nRecognition-Skewed Words (top 15):')
for _, row in recog_skewed.iterrows():
    print(f'  {row["word"]:20s}  base={row["base_count"]:4d}  recog={row["recog_count"]:4d}  ratio={row["ratio"]:.1f}x')

# Base-skewed (top 15)
base_skewed = df_diff[(df_diff['ratio'] > 0) & (df_diff['ratio'] < 1) & (df_diff['base_count'] >= 10)]
base_skewed = base_skewed.sort_values(['ratio', 'word']).head(15)

print('\nBase-Skewed Words (top 15):')
for _, row in base_skewed.iterrows():
    print(f'  {row["word"]:20s}  base={row["base_count"]:4d}  recog={row["recog_count"]:4d}  ratio={row["ratio"]:.2f}x')

## 14. Table 16 — Thematic Coding

In [None]:
# Table 16: Thematic Coding with Chi-Square
print('Table 16: Thematic Code Frequency by Condition')
print('=' * 70)

# 6 regex categories matching scripts/qualitative-analysis.js exactly
# Each entry maps a category key to a display label and a list of regex
# patterns. The patterns are applied case-insensitively by count_matches()
# and count_response_presence().
# NOTE: categories are not mutually exclusive. r'your analysis' is listed
# under both 'engagement' and 'learner_as_subject', and r'grappling with'
# (struggle_honoring) also matches text hit by r"you're grappling with"
# (learner_as_subject), so one phrase can count toward two categories.
THEMATIC_CATEGORIES = {
    'engagement': {
        'label': 'Engagement markers',
        'patterns': [
            r'your insight', r'building on your', r'your question', r'your point',
            r'your observation', r'your analysis', r'your argument', r'your critique',
            r"you've (raised|identified|highlighted|noticed|pointed out)",
            r"you're (asking|raising|pushing|exploring|getting at)",
        ],
    },
    'transformation': {
        'label': 'Transformation language',
        'patterns': [
            r'reconsidering', r'that changes (how I|my)', r"I hadn't (thought|considered)",
            r'revising (my|the)', r'let me (revise|adjust|rethink)',
            r"you've (helped|pushed|made) me",
            r'your .{1,20} (complicates|enriches|changes)',
            r'shifts? (my|the|our) (understanding|framing|approach)',
        ],
    },
    'struggle_honoring': {
        'label': 'Struggle-honoring',
        'patterns': [
            r'wrestling with', r'productive confusion', r'working through',
            r'grappling with', r'sitting with (the|this)',
            r'tension (between|here|you)', r'difficulty (is|here)',
            r'struggle (with|is|here)', r'not (easy|simple|straightforward)',
        ],
    },
    'learner_as_subject': {
        'label': 'Learner-as-subject framing',
        'patterns': [
            r'your interpretation', r'your analysis', r'your understanding',
            r"you're grappling with", r'your perspective', r'your framework',
            r'your reading', r"what you're (doing|building|developing|constructing)",
            r'your (intellectual|philosophical|analytical)',
        ],
    },
    'directive': {
        'label': 'Directive framing',
        'patterns': [
            r'you should', r'you need to', r'you must',
            r'the correct (answer|approach|way)', r'the answer is',
            r'let me explain', r"here's what",
            r'make sure (to|you)', r'first,? you',
        ],
    },
    'generic': {
        'label': 'Generic/placeholder',
        'patterns': [
            r'foundational', r'key concepts', r'learning objectives',
            r'knowledge base', r'solid foundation', r'core concepts',
            r'build (a|your) (solid|strong)',
            r'comprehensive (understanding|overview|review)',
        ],
    },
}

def count_matches(text, patterns):
    """Return the total number of case-insensitive matches of all patterns in text.

    Matches are counted per pattern and summed, so a stretch of text that
    satisfies two patterns contributes twice.
    """
    return sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in patterns)

def count_response_presence(messages, patterns):
    """Return how many messages match at least one pattern (case-insensitive).

    Each message counts at most once, however many patterns it matches.
    """
    hits = 0
    for message in messages:
        if any(re.search(pattern, message, re.IGNORECASE) for pattern in patterns):
            hits += 1
    return hits

# Corpus sizes in tokens (stopwords included), used to normalize per 1000 words.
base_word_count = len(tokenize(base_text))
recog_word_count = len(tokenize(recog_text))

thematic_results = []

for cat_key, config in THEMATIC_CATEGORIES.items():
    patterns = config['patterns']

    # Raw match counts over each whole corpus.
    base_raw = count_matches(base_text, patterns)
    recog_raw = count_matches(recog_text, patterns)

    base_per1000 = (base_raw / base_word_count) * 1000 if base_word_count else 0
    recog_per1000 = (recog_raw / recog_word_count) * 1000 if recog_word_count else 0
    # Ratio is inf when only the recognition corpus matches, 1.0 when neither does.
    ratio = recog_per1000 / base_per1000 if base_per1000 > 0 else (float('inf') if recog_per1000 > 0 else 1.0)

    # Chi-square: response-level presence/absence with Yates correction
    base_present = count_response_presence(base_messages, patterns)
    base_absent = len(base_messages) - base_present
    recog_present = count_response_presence(recog_messages, patterns)
    recog_absent = len(recog_messages) - recog_present

    # 2x2 contingency table
    observed = np.array([[base_present, base_absent], [recog_present, recog_absent]])
    # chi2_contingency raises ValueError when an expected frequency is zero,
    # which happens when a row or column of the table sums to zero (category
    # never or always present, or an empty corpus). Report NaN in that case
    # instead of aborting the whole table.
    if observed.sum(axis=0).min() == 0 or observed.sum(axis=1).min() == 0:
        chi2, p_val = np.nan, np.nan
    else:
        chi2, p_val, dof, expected = stats.chi2_contingency(observed, correction=True)

    # NaN compares False, so an untestable category is never starred.
    sig = '*' if p_val < 0.05 else ''

    thematic_results.append({
        'Category': config['label'],
        'Base (per 1000)': base_per1000,
        'Recog (per 1000)': recog_per1000,
        'Ratio': ratio,
        'chi2': chi2,
        'p': p_val,
        'Sig': sig,
    })

    print(f'  {config["label"]:30s}  Base={base_per1000:.1f}  Recog={recog_per1000:.1f}  '
          f'Ratio={ratio:.2f}x  χ²={chi2:.2f}  p={p_val:.4f} {sig}')

print(f'\nPaper reports: χ² = 141.90 (struggle), 69.85 (engagement), 93.15 (generic)')

In [None]:
# Grouped bar chart for thematic coding
if thematic_results:
    df_theme = pd.DataFrame(thematic_results)
    x = np.arange(len(df_theme))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 5))

    # Base and Recognition bars sit side by side around each tick.
    bars1 = ax.bar(x - width/2, df_theme['Base (per 1000)'], width, label='Base', color='#3498db')
    bars2 = ax.bar(x + width/2, df_theme['Recog (per 1000)'], width, label='Recognition', color='#e74c3c')

    ax.set_xlabel('Thematic Category')
    ax.set_ylabel('Occurrences per 1000 words')
    ax.set_title('Table 16: Thematic Coding by Condition')
    ax.set_xticks(x)
    ax.set_xticklabels(list(df_theme['Category']), rotation=30, ha='right', fontsize=9)
    ax.legend()

    # Annotate significance
    # A red star goes just above the taller bar of each starred category.
    significant = df_theme[df_theme['Sig'] == '*']
    for i, row in significant.iterrows():
        y_max = max(row['Base (per 1000)'], row['Recog (per 1000)'])
        ax.annotate('*', (i, y_max + 0.3), ha='center', fontsize=16, fontweight='bold', color='red')

    plt.tight_layout()
    plt.show()

## 15. Table 17 — Cost-Benefit

In [None]:
# Table 17: Cost-Benefit by Domain and Architecture
print('Table 17: Cost-Benefit Analysis')
print('=' * 70)

# Philosophy: factorial cells 1,3 (base single/multi) and 5,7 (recog single/multi)
phil_cells_single = ['cell_1_base_single_unified', 'cell_5_recog_single_unified']
phil_cells_multi = ['cell_3_base_multi_unified', 'cell_7_recog_multi_unified']

phil_single = df_fact[df_fact['profile_name'].isin(phil_cells_single)]
phil_multi = df_fact[df_fact['profile_name'].isin(phil_cells_multi)]

# Elementary: domain nemotron run
df_elem = get_run('domain_nemotron')
elem_single = df_elem[df_elem['factor_B_multi'] == False]
elem_multi = df_elem[df_elem['factor_B_multi'] == True]

# One row per (domain, architecture) with mean score, mean latency and N.
cost_data = []
domain_frames = [
    ('Philosophy', phil_single, phil_multi),
    ('Elementary', elem_single, elem_multi),
]
for domain, single_df, multi_df in domain_frames:
    for arch, arch_df in [('Single-agent', single_df), ('Multi-agent', multi_df)]:
        avg_score = arch_df['overall_score'].mean()
        # Latency is reported in seconds; NaN when the column is absent or empty.
        has_latency = 'latency_ms' in arch_df.columns and arch_df['latency_ms'].notna().any()
        if has_latency:
            latency = arch_df['latency_ms'].mean() / 1000
        else:
            latency = np.nan
        cost_data.append({
            'Domain': domain,
            'Architecture': arch,
            'Avg Score': avg_score,
            'Latency (s)': latency,
            'N': len(arch_df),
        })

df_cost = pd.DataFrame(cost_data)
print(df_cost.round(1).to_string(index=False))

# Compute deltas
for domain in ['Philosophy', 'Elementary']:
    domain_rows = df_cost[df_cost['Domain'] == domain]
    single = domain_rows[domain_rows['Architecture'] == 'Single-agent'].iloc[0]
    multi = domain_rows[domain_rows['Architecture'] == 'Multi-agent'].iloc[0]
    score_delta = multi['Avg Score'] - single['Avg Score']
    # The multiplier is only defined for a positive single-agent latency.
    if single['Latency (s)'] > 0:
        latency_mult = multi['Latency (s)'] / single['Latency (s)']
    else:
        latency_mult = np.nan
    print(f'\n  {domain}: Score Δ = {score_delta:+.1f}, Latency = {latency_mult:.1f}x')

## 16. Transcript Excerpts

In [None]:
# Transcript excerpts: 3 high-contrast pairs
print('Transcript Excerpts: High-Contrast Pairs')
print('=' * 70)

PAIR_SCENARIOS = ['struggling_learner', 'recognition_seeking_learner', 'adversarial_tester']


def _print_excerpts(suggestions_json, max_items=2, max_chars=200):
    """Print the first few messages of a suggestions JSON string.

    Each message is truncated to ``max_chars`` characters, with "..." added
    when it was cut. Prints '(could not parse)' when the payload is not
    valid JSON or not a list, and for any entry that is not an object with
    a string message, instead of raising.
    """
    try:
        parsed = json.loads(suggestions_json)
    except (json.JSONDecodeError, TypeError):
        print('    (could not parse)')
        return
    if not isinstance(parsed, list):
        print('    (could not parse)')
        return
    for entry in parsed[:max_items]:
        message = entry.get('message', '') if isinstance(entry, dict) else None
        if not isinstance(message, str):
            print('    (could not parse)')
            continue
        excerpt = message[:max_chars]
        print(f'    "{excerpt}..."' if len(message) > max_chars else f'    "{excerpt}"')


# Only rows with a positive score and a suggestions payload can be quoted.
df_with_msgs = df_all[
    df_all['overall_score'].notna() &
    df_all['suggestions'].notna() &
    (df_all['overall_score'] > 0)
].copy()

for scenario in PAIR_SCENARIOS:
    sc = df_with_msgs[df_with_msgs['scenario_id'] == scenario]

    recog_sc = sc[sc['profile_name'].isin(RECOG_CELLS)]
    base_sc = sc[sc['profile_name'].isin(BASE_CELLS)]

    # A pair needs at least one row from each condition.
    if len(recog_sc) == 0 or len(base_sc) == 0:
        continue

    # Highest-scoring recognition row vs lowest-scoring base row.
    best_recog = recog_sc.loc[recog_sc['overall_score'].idxmax()]
    worst_base = base_sc.loc[base_sc['overall_score'].idxmin()]

    print(f'\n--- {scenario} ---')
    print(f'  Recognition (id={best_recog["id"]}, {best_recog["profile_name"]}, score={best_recog["overall_score"]:.1f}):')
    _print_excerpts(best_recog['suggestions'])

    print(f'  Base (id={worst_base["id"]}, {worst_base["profile_name"]}, score={worst_base["overall_score"]:.1f}):')
    _print_excerpts(worst_base['suggestions'])

    print(f'  Score gap: {best_recog["overall_score"] - worst_base["overall_score"]:.1f} points')

## 17. Concordance Check

In [None]:
# Concordance check: compare computed vs paper-reported values
print('CONCORDANCE CHECK')
print('=' * 70)
print(f'{"Statistic":45s} {"Paper":>12s} {"Computed":>12s} {"Match":>8s}')
print('-' * 80)

def check(label, paper_val, computed_val, tolerance=0.5):
    """Print one concordance row comparing a computed value with the paper's.

    Args:
        label: Name of the statistic shown in the first column.
        paper_val: Value reported in the paper.
        computed_val: Value computed by this notebook; may be NaN or None
            when it could not be computed.
        tolerance: Largest absolute difference still counted as a match.

    The Match column shows 'YES' within tolerance, 'NO (<signed diff>)'
    otherwise, and '?' when the computed value is missing.
    """
    missing = computed_val is None or pd.isna(computed_val)
    if missing:
        match = '?'
    elif abs(computed_val - paper_val) <= tolerance:
        match = 'YES'
    else:
        match = f'NO ({computed_val - paper_val:+.2f})'
    # None cannot take a float format spec, so show a placeholder instead of
    # raising; NaN still prints as "nan".
    if computed_val is None:
        computed_str = f'{"n/a":>12s}'
    else:
        computed_str = f'{computed_val:12.2f}'
    print(f'{label:45s} {paper_val:12.2f} {computed_str} {match:>8s}')

# Sample sizes
print('\n-- Sample Sizes --')
total_scored = 0
for run_key in RUN_IDS:
    total_scored += get_run(run_key).shape[0]
check('Total scored (6 primary runs)', 562, total_scored, 5)

expected_run_sizes = [
    ('recognition_validation', 36),
    ('full_factorial', 342),
    ('ab_nemotron', 17),
    ('ab_kimi', 60),
    ('domain_nemotron', 47),
    ('domain_kimi', 60),
]
for key, expected in expected_run_sizes:
    check(f'  {key} N', expected, get_run(key).shape[0], 1)

# Table 4 key values
print('\n-- Table 4: Recognition Validation --')
df_v = get_run('recognition_validation')
# Mean overall score for each prompt type.
prompt_means = {}
for prompt_type in ('base', 'enhanced', 'recognition'):
    prompt_means[prompt_type] = df_v.loc[df_v['prompt_type'] == prompt_type, 'overall_score'].mean()
base_m = prompt_means['base']
enh_m = prompt_means['enhanced']
rec_m = prompt_means['recognition']
check('Recognition unique effect (+8.7)', 8.7, rec_m - enh_m, 1.0)
check('Total effect (+20.1)', 20.1, rec_m - base_m, 1.0)
check('Engineering effect (+11.4)', 11.4, enh_m - base_m, 1.0)

# Table 5 key values
print('\n-- Table 5: Factorial ANOVA --')
check('F(A: Recognition) = 43.27', 43.27, f_A, 2.0)
check('eta-sq(A) = .109', 0.109, eta_A, 0.01)

# Multi-turn Cohen's d
print("\n-- Table 12: Multi-Turn Cohen's d --")
paper_multi_turn_d = [
    ('misconception_correction_flow', 0.85),
    ('mood_frustration_to_breakthrough', 0.59),
    ('mutual_transformation_journey', 0.78),
]
for scenario, expected_d in paper_multi_turn_d:
    sc_data = df_scored[df_scored['scenario_id'] == scenario].copy()
    sc_data['is_recog'] = sc_data['profile_name'].str.contains('recog', case=False, na=False)
    recog_flag = sc_data['is_recog']
    base_s = sc_data.loc[~recog_flag, 'overall_score']
    recog_s = sc_data.loc[recog_flag, 'overall_score']
    # Scenarios with fewer than two rows in either group are not checked.
    if len(base_s) <= 1 or len(recog_s) <= 1:
        continue
    d_val = cohens_d(recog_s.values, base_s.values)
    check(f'd({scenario[:25]})', expected_d, d_val, 0.15)

# Thematic chi-squares
print('\n-- Table 16: Thematic Chi-Square --')
chi_lookup = {}
for result in thematic_results:
    chi_lookup[result['Category']] = result
for label, expected in [('Struggle-honoring', 141.90), ('Engagement markers', 69.85), ('Generic/placeholder', 93.15)]:
    entry = chi_lookup.get(label)
    if entry is not None:
        check(f'chi2({label})', expected, entry['chi2'], 10.0)

# Lexical metrics
print('\n-- Table 14: Lexical Metrics --')
for stat_label, paper_value, computed_value, tol in (
    ('Base TTR', 0.039, lex_base['ttr'], 0.005),
    ('Recog TTR', 0.044, lex_recog['ttr'], 0.005),
    ('Base vocab', 2319, lex_base['vocabulary'], 50),
    ('Recog vocab', 3689, lex_recog['vocabulary'], 50),
):
    check(stat_label, paper_value, computed_value, tol)

print('\n' + '=' * 80)
print('Concordance check complete.')
print('Values marked YES match within tolerance. Values marked NO show the discrepancy.')
print('Small discrepancies may arise from floating-point differences, rounding,\n'
      'or slight DB content changes between paper finalization and current state.')

In [None]:
# Clean up
# Close `conn` now that all tables have been reproduced.
# NOTE(review): presumably the SQLite connection opened in the setup cells — confirm.
# Cells that query through `conn` cannot be re-run after this without
# re-running the cell that opens the connection.
conn.close()
print('Database connection closed. Notebook complete.')