# Reasoning Trace Analysis

Analyzes the length and structure of reasoning traces from CoT/reasoning models.
Compares reasoning effort across models, domains, and conditions.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Plot defaults shared by all figures created below.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 6),
    'figure.dpi': 100,
})

# ── Configuration ───────────────────────────────────────────────────────
# Root folder holding one sub-directory per condition.
# Change to Path('test_output') for test results.
BASE_DIR = Path('full_results')

# Model names double as directory names under each condition sub-directory.
REASONING_MODELS = [
    'gpt-5.2',
    'deepseek-v3.1',
    'qwen3-235b',
    'qwen3-235b-thinking',
    'qwen3-next-thinking',
]
NON_REASONING_MODELS = [
    'gpt-4o',
    'qwen-coder',
    'llama-4',
]
# Non-reasoning models come first; plots rely on this ordering.
ALL_MODELS = [*NON_REASONING_MODELS, *REASONING_MODELS]

# condition name -> where its files live and the marker embedded in file names.
CONDITIONS = {
    'regular':   dict(subdir='results', suffix=''),
    'no_guide':  dict(subdir='results_no_guide', suffix='_no_guide'),
    'math_only': dict(subdir='results_math_only', suffix='_math_only'),
}

print(f'Base directory: {BASE_DIR}')
print(f'Reasoning models: {REASONING_MODELS}')
print(f'Non-reasoning models: {NON_REASONING_MODELS}')

## 1. Load all results

In [None]:
def load_all_results(base_dir: Path, models: list, conditions: dict,
                     reasoning_models=None) -> pd.DataFrame:
    """Load all result TSVs into a single DataFrame.

    Files are looked up as ``base_dir / <subdir> / <model> / *_converted.tsv``
    for every condition and model. Missing model directories are skipped and
    unreadable files are reported and skipped.

    Args:
        base_dir: Root directory that contains one sub-directory per condition.
        models: Model names; each is also the name of a directory to scan.
        conditions: Mapping of condition name to a dict with the keys
            ``'subdir'`` (directory under ``base_dir``) and ``'suffix'``
            (marker embedded in the file names of that condition).
        reasoning_models: Names of the models to flag as reasoning models.
            Defaults to the module-level ``REASONING_MODELS``.

    Returns:
        The concatenation of all loaded files with four added columns:
        ``model``, ``condition``, ``domain`` and ``is_reasoning_model``.
        An empty DataFrame (after printing a warning) if nothing was found.
    """
    if reasoning_models is None:
        # Keep the original behaviour for callers that do not pass the list.
        reasoning_models = REASONING_MODELS
    reasoning_set = set(reasoning_models)

    # Condition markers to strip from file stems. Taken from the configuration
    # (rather than a second hard-coded list) so that new conditions are handled
    # automatically; longest first so overlapping markers strip cleanly.
    condition_markers = sorted(
        {cfg['suffix'] for cfg in conditions.values() if cfg['suffix']},
        key=len,
        reverse=True,
    )

    frames = []
    for condition, cfg in conditions.items():
        results_dir = base_dir / cfg['subdir']
        for model in models:
            model_dir = results_dir / model
            if not model_dir.exists():
                continue
            # Sorted so the row order of the result is reproducible.
            for f in sorted(model_dir.glob('*_converted.tsv')):
                try:
                    df = pd.read_csv(f, sep='\t')
                except Exception as e:
                    # Best effort: one bad file must not abort the whole load.
                    print(f'Error loading {f}: {e}')
                    continue
                # Extract domain name: file stem minus the '_converted' tag
                # and minus any condition marker.
                domain = f.stem.replace('_converted', '')
                for marker in condition_markers:
                    domain = domain.replace(marker, '')
                df['model'] = model
                df['condition'] = condition
                df['domain'] = domain
                df['is_reasoning_model'] = model in reasoning_set
                frames.append(df)
    if not frames:
        print('WARNING: No result files found!')
        return pd.DataFrame()
    data = pd.concat(frames, ignore_index=True)
    print(f'Loaded {len(data):,} rows from {len(frames)} files')
    return data

# Load every configured model under every configured condition, then show
# the first rows as the cell output.
df = load_all_results(base_dir=BASE_DIR, models=ALL_MODELS, conditions=CONDITIONS)
df.head()

## 2. Extract reasoning trace features

In [None]:
# Compiled once: the function below is applied to every result row.
# <think>...</think> tags (qwen3-235b inline reasoning)
_THINK_RE = re.compile(r'<think>(.*?)</think>', re.DOTALL)
# [REASONING]...[/REASONING] tags (captured from reasoning_content field)
_REASONING_RE = re.compile(r'\[REASONING\](.*?)\[/REASONING\]', re.DOTALL)


def extract_reasoning_features(raw: str) -> dict:
    """Parse a raw_response to extract reasoning trace metrics.

    Args:
        raw: The model's raw response. Missing values (NaN / None) are
            accepted and treated as an empty response.

    Returns:
        A dict that always has the same nine keys:
        ``total_response_chars`` / ``total_response_words`` (whole response),
        ``trace_chars`` / ``trace_words`` (reasoning trace only),
        ``has_think_tags``, ``has_reasoning_tags``, ``has_any_trace``,
        ``reasoning_text`` (stripped trace, ``''`` if none) and
        ``final_answer_text`` (stripped text after the trace, or the whole
        stripped response when no trace is present).
    """
    if pd.isna(raw):
        # Same schema as the normal path. Omitting keys here would surface as
        # NaN once the dicts are expanded into columns, which breaks integer
        # casts and boolean masks on those columns.
        return {
            'total_response_chars': 0,
            'total_response_words': 0,
            'trace_chars': 0,
            'trace_words': 0,
            'has_think_tags': False,
            'has_reasoning_tags': False,
            'has_any_trace': False,
            'reasoning_text': '',
            'final_answer_text': '',
        }

    raw = str(raw)
    total_chars = len(raw)
    total_words = len(raw.split())

    think_match = _THINK_RE.search(raw)
    reasoning_match = _REASONING_RE.search(raw)

    has_think = think_match is not None
    has_reasoning = reasoning_match is not None

    # <think> takes precedence when both tag styles are present. Only the
    # first occurrence is used; the answer is whatever follows its closing tag.
    trace_match = think_match if has_think else reasoning_match
    if trace_match is not None:
        reasoning_text = trace_match.group(1).strip()
        final_answer_text = raw[trace_match.end():].strip()
    else:
        reasoning_text = ''
        final_answer_text = raw.strip()

    return {
        'total_response_chars': total_chars,
        'total_response_words': total_words,
        'trace_chars': len(reasoning_text),
        'trace_words': len(reasoning_text.split()),
        'has_think_tags': has_think,
        'has_reasoning_tags': has_reasoning,
        'has_any_trace': has_think or has_reasoning,
        'reasoning_text': reasoning_text,
        'final_answer_text': final_answer_text,
    }

# Apply to all rows. The frame is built directly from the list of per-row
# dicts; `.apply(pd.Series)` would construct one Series per row instead.
features = pd.DataFrame(
    df['raw_response'].map(extract_reasoning_features).tolist(),
    index=df.index,
)
# Drop feature columns left over from an earlier run of this cell so that
# re-running it does not produce duplicate column labels.
df = pd.concat([df.drop(columns=features.columns, errors='ignore'), features], axis=1)

# Approximate token count (rough: 1 token ≈ 4 chars for English)
df['trace_tokens_approx'] = (df['trace_chars'] / 4).astype(int)
df['total_tokens_approx'] = (df['total_response_chars'] / 4).astype(int)

# Correct answer flag: a row counts as correct when its loss is exactly 0
df['correct'] = (df['loss'] == 0).astype(int)

print(f'\nRows with reasoning traces: {df["has_any_trace"].sum():,} / {len(df):,}')
print(f'Rows with <think> tags:     {df["has_think_tags"].sum():,}')
print(f'Rows with [REASONING] tags: {df["has_reasoning_tags"].sum():,}')

## 3. Summary statistics: response & trace length by model

In [None]:
# Summary table: reasoning models only
reasoning_df = df[df['is_reasoning_model']].copy()

summary = reasoning_df.groupby('model').agg(
    n_rows=('raw_response', 'count'),          # non-null responses per model
    pct_has_trace=('has_any_trace', 'mean'),
    avg_total_chars=('total_response_chars', 'mean'),
    avg_trace_chars=('trace_chars', 'mean'),
    median_trace_chars=('trace_chars', 'median'),
    max_trace_chars=('trace_chars', 'max'),
    avg_trace_tokens=('trace_tokens_approx', 'mean'),
    accuracy=('correct', 'mean'),
)

# Keep the unrounded fractions: rounding them to 2 decimals first would leave
# whole percentages only, making the 1-decimal percentage below meaningless.
rate_fractions = summary[['pct_has_trace', 'accuracy']].copy()
summary = summary.round(2)

for rate_col in ('pct_has_trace', 'accuracy'):
    summary[rate_col] = (rate_fractions[rate_col] * 100).round(1).astype(str) + '%'

print('=== Reasoning Model Summary ===')
display(summary)

## 4. Response length distributions

In [None]:
def _overlay_length_hists(ax, frame, models, column):
    """Draw one semi-transparent density histogram of `column` per model on `ax`.

    Models without rows in `frame` are skipped. Values are capped at each
    model's own 99th percentile before binning.
    """
    for name in models:
        rows = frame[frame['model'] == name]
        if rows.empty:
            continue
        lengths = rows[column]
        ax.hist(lengths.clip(upper=lengths.quantile(0.99)),
                bins=50, alpha=0.5, label=name, density=True)


# Output folder for every figure saved by this notebook.
Path('plots').mkdir(exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
total_ax, trace_ax = axes

# 4a. Total response length (all models)
_overlay_length_hists(total_ax, df, ALL_MODELS, 'total_response_chars')
total_ax.set(
    xlabel='Total response length (chars)',
    ylabel='Density',
    title='Total Response Length Distribution',
)
total_ax.legend(fontsize=8)

# 4b. Reasoning trace length (reasoning models only, where trace exists)
trace_df = reasoning_df[reasoning_df['has_any_trace']]
_overlay_length_hists(trace_ax, trace_df, REASONING_MODELS, 'trace_chars')
trace_ax.set(
    xlabel='Reasoning trace length (chars)',
    ylabel='Density',
    title='Reasoning Trace Length (where present)',
)
trace_ax.legend(fontsize=8)

fig.tight_layout()
fig.savefig('plots/reasoning_trace_length_dist.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Response length by model × condition

In [None]:
# Mean total response length per (model, condition), reshaped so that each
# model is a row and each condition a column; rows follow REASONING_MODELS.
mean_length = reasoning_df.groupby(['model', 'condition'])['total_response_chars'].mean()
plot_data = mean_length.reset_index()
pivot = (
    plot_data
    .pivot(index='model', columns='condition', values='total_response_chars')
    .reindex(REASONING_MODELS)
)

# One group of bars per model, one bar per condition.
fig, ax = plt.subplots(figsize=(14, 6))
pivot.plot(kind='bar', ax=ax, width=0.7)
ax.set(
    ylabel='Mean total response length (chars)',
    title='Mean Response Length by Model and Condition',
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right')
ax.legend(title='Condition')

fig.tight_layout()
fig.savefig('plots/reasoning_length_by_model_condition.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Response length by model × domain

In [None]:
# Heatmap: mean total response length — model vs domain
mean_by_domain = reasoning_df.groupby(['model', 'domain'])['total_response_chars'].mean()
trace_heatmap = mean_by_domain.reset_index()
pivot_heat = (
    trace_heatmap
    .pivot(index='model', columns='domain', values='total_response_chars')
    .reindex(REASONING_MODELS)   # rows in the configured model order
)

fig, ax = plt.subplots(figsize=(18, 5))
sns.heatmap(
    pivot_heat,
    annot=True,
    fmt='.0f',
    cmap='YlOrRd',
    ax=ax,
    linewidths=0.5,
    cbar_kws={'label': 'Mean response chars'},
)
# Axis names are dropped; the title already says what the axes are.
ax.set(title='Mean Response Length (chars) — Model × Domain', xlabel='', ylabel='')
plt.xticks(rotation=45, ha='right')

fig.tight_layout()
fig.savefig('plots/reasoning_heatmap_model_domain.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Does longer reasoning → better accuracy?

In [None]:
# One panel per reasoning model: accuracy within each response-length quintile.
n_models = len(REASONING_MODELS)
# squeeze=False always returns a 2-D array of Axes. With the default, a single
# configured model yields a bare Axes object and `axes[i]` below would fail.
fig, axes = plt.subplots(1, n_models, figsize=(4 * n_models, 5),
                         sharey=True, squeeze=False)
axes = axes.ravel()

for i, model in enumerate(REASONING_MODELS):
    ax = axes[i]
    subset = reasoning_df[reasoning_df['model'] == model].copy()
    if len(subset) == 0:
        ax.set_title(f'{model}\n(no data)')
        continue

    # Bin response length into quintiles; duplicate bin edges are merged, so
    # there may be fewer than five bins.
    subset['length_bin'] = pd.qcut(
        subset['total_response_chars'], q=5, labels=False, duplicates='drop'
    )
    by_bin = subset.groupby('length_bin')
    bin_acc = by_bin['correct'].mean()
    # Each bar is labelled with the mean response length of its bin.
    bin_labels = by_bin['total_response_chars'].mean()

    ax.bar(range(len(bin_acc)), bin_acc.values, color=sns.color_palette('viridis', 5))
    ax.set_xticks(range(len(bin_acc)))
    ax.set_xticklabels([f'{v:.0f}' for v in bin_labels.values], rotation=45, fontsize=8)
    ax.set_title(model, fontsize=10)
    ax.set_xlabel('Avg chars in bin')
    if i == 0:
        ax.set_ylabel('Accuracy')
    ax.set_ylim(0, 1.05)

fig.suptitle('Accuracy by Response Length Quintile', fontsize=13, y=1.02)
plt.tight_layout()
plt.savefig('plots/accuracy_vs_reasoning_length.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Reasoning vs non-reasoning: accuracy comparison

In [None]:
# Accuracy, row count and mean response length for every (model, condition).
acc_by_model = (
    df.groupby(['model', 'condition'])
    .agg(
        accuracy=('correct', 'mean'),
        n=('correct', 'count'),
        avg_response_len=('total_response_chars', 'mean'),
    )
    .round(4)
    .reset_index()
)

# Flag reasoning models, then order rows so that, within each condition,
# non-reasoning models are listed before reasoning models.
acc_by_model['is_reasoning'] = acc_by_model['model'].isin(REASONING_MODELS)
acc_by_model = acc_by_model.sort_values(['condition', 'is_reasoning', 'model'])

print('=== Accuracy & Response Length by Model and Condition ===')
column_formats = {
    'accuracy': '{:.1%}',
    'avg_response_len': '{:,.0f}',
}
styled_table = acc_by_model.style.format(column_formats)
styled_table = styled_table.background_gradient(
    subset=['accuracy'], cmap='RdYlGn', vmin=0, vmax=1
)
display(styled_table)

In [None]:
# Side-by-side bar chart: reasoning vs non-reasoning accuracy by condition
fig, ax = plt.subplots(figsize=(14, 6))

order = NON_REASONING_MODELS + REASONING_MODELS
n_non = len(NON_REASONING_MODELS)
n_reasoning = len(REASONING_MODELS)

sns.barplot(data=acc_by_model, x='model', y='accuracy', hue='condition',
            order=order, ax=ax)

# Add vertical separator between reasoning and non-reasoning
ax.axvline(x=n_non - 0.5, color='gray', linestyle='--', alpha=0.5)

# Categories sit at x = 0, 1, 2, ..., so a group of k bars starting at
# position s is centred at s + (k - 1) / 2. The x coordinate is in data units
# and the y coordinate in axes units (1.02 = just above the plot area).
label_transform = ax.get_xaxis_transform()
ax.text((n_non - 1) / 2, 1.02, 'Non-reasoning', transform=label_transform,
        ha='center', fontsize=10, color='gray')
ax.text(n_non + (n_reasoning - 1) / 2, 1.02, 'Reasoning',
        transform=label_transform, ha='center', fontsize=10, color='gray')

ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy: Reasoning vs Non-Reasoning')
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right')
ax.set_ylim(0, 1.05)
ax.legend(title='Condition')

plt.tight_layout()
plt.savefig('plots/accuracy_reasoning_vs_nonreasoning.png', dpi=150, bbox_inches='tight')
plt.show()

## 9. Detailed domain-level breakdown (reasoning models)

In [None]:
# Per (model, domain, condition): row count, accuracy, mean lengths and the
# share of rows that contain a reasoning trace.
domain_stats = (
    reasoning_df
    .groupby(['model', 'domain', 'condition'])
    .agg(
        n=('raw_response', 'count'),
        accuracy=('correct', 'mean'),
        avg_total_chars=('total_response_chars', 'mean'),
        avg_trace_chars=('trace_chars', 'mean'),
        pct_has_trace=('has_any_trace', 'mean'),
    )
    .round(3)
    .reset_index()
)

# Express the two fraction columns as percentages with one decimal.
for pct_col in ('accuracy', 'pct_has_trace'):
    domain_stats[pct_col] = (domain_stats[pct_col] * 100).round(1)

print('=== Domain-Level Reasoning Stats ===')
sorted_stats = domain_stats.sort_values(['model', 'condition', 'domain'])
display(sorted_stats)

## 10. Difficulty vs reasoning effort

In [None]:
# Do models reason more on harder problems?
# The analysis needs a 'difficulty' column with at least one non-null value.
has_difficulty = (
    'difficulty' in reasoning_df.columns
    and reasoning_df['difficulty'].notna().any()
)

if not has_difficulty:
    print('No difficulty column or all NaN — skipping this analysis.')
else:
    # Only rows that actually contain a reasoning trace feed panel 10a.
    diff_data = reasoning_df[reasoning_df['has_any_trace']].copy()

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    effort_ax, accuracy_ax = axes

    # 10a. Trace length vs difficulty
    for model in REASONING_MODELS:
        model_rows = diff_data[diff_data['model'] == model]
        if len(model_rows) == 0:
            continue
        mean_trace = model_rows.groupby('difficulty')['trace_chars'].mean()
        effort_ax.plot(mean_trace.index, mean_trace.values, marker='o', label=model)
    effort_ax.set(
        xlabel='Difficulty',
        ylabel='Mean trace length (chars)',
        title='Reasoning Effort vs Problem Difficulty',
    )
    effort_ax.legend(fontsize=8)

    # 10b. Accuracy vs difficulty by model type (uses all rows of `df`)
    model_groups = {
        'Reasoning': REASONING_MODELS,
        'Non-reasoning': NON_REASONING_MODELS,
    }
    for label, group in model_groups.items():
        group_rows = df[df['model'].isin(group)]
        if len(group_rows) == 0:
            continue
        mean_acc = group_rows.groupby('difficulty')['correct'].mean()
        accuracy_ax.plot(mean_acc.index, mean_acc.values, marker='o', label=label, linewidth=2)
    accuracy_ax.set(
        xlabel='Difficulty',
        ylabel='Accuracy',
        title='Accuracy vs Difficulty: Reasoning vs Non-Reasoning',
    )
    accuracy_ax.legend()

    fig.tight_layout()
    fig.savefig('plots/difficulty_vs_reasoning.png', dpi=150, bbox_inches='tight')
    plt.show()

## 11. Export summary tables