# Probe Baselines 
Train probes across next-event prediction, context retrieval, session classification, and anomaly detection while capturing storage/compute costs per rung. Results go in `research/results/probe_metrics.csv`.


In [None]:
from __future__ import annotations
import json
import shutil
import subprocess
import sys
from pathlib import Path
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity

# Find repo root - the directory containing research/rung_extractors
# Walk up from current directory until we find research/rung_extractors
current = Path.cwd()
REPO_ROOT = None

# Walk up the directory tree to find repo root
while current != current.parent:
    # Check if this directory contains research/rung_extractors (the actual repo root)
    research_dir = current / 'research'
    rung_extractors = research_dir / 'rung_extractors'
    if research_dir.exists() and rung_extractors.exists():
        REPO_ROOT = current
        break
    current = current.parent

# Fallback: if not found, try to find any research directory
if REPO_ROOT is None:
    current = Path.cwd()
    while current != current.parent:
        if (current / 'research').exists():
            REPO_ROOT = current
            break
        current = current.parent

# Final fallback: use current directory
if REPO_ROOT is None:
    REPO_ROOT = Path.cwd()

print(f"Using REPO_ROOT: {REPO_ROOT}")

# Import shared rung extraction functions
RESEARCH_DIR = REPO_ROOT / 'research'
RUNG_EXTRACTORS_DIR = RESEARCH_DIR / 'rung_extractors'
if str(RUNG_EXTRACTORS_DIR) not in sys.path:
    sys.path.insert(0, str(RUNG_EXTRACTORS_DIR))
if str(RESEARCH_DIR) not in sys.path:
    sys.path.insert(0, str(RESEARCH_DIR))

from rung_extractors import (
    tokens_repr_str,
    semantic_edits_repr_str,
    functions_repr_str,
    motifs_repr_str,
    raw_repr_str,
)
TRACE_EXPORT = REPO_ROOT / 'research/data/companion_traces.jsonl'
TRACE_EXPORT.parent.mkdir(parents=True, exist_ok=True)
RESULTS_DIR = REPO_ROOT / 'research/results'
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Export traces if needed (use Python script if available, otherwise skip if file exists)
def export_traces() -> None:
    # Use the exact path to the export script
    EXPORT_SCRIPT = REPO_ROOT / 'research/scripts/export_complete_traces.py'
    
    if not EXPORT_SCRIPT.exists():
        # If script doesn't exist, check if trace file already exists
        if TRACE_EXPORT.exists():
            print(f"Trace file already exists at {TRACE_EXPORT}, skipping export")
            return
        raise FileNotFoundError(
            f'Export script not found at {EXPORT_SCRIPT} and trace file does not exist at {TRACE_EXPORT}'
        )
    
    print(f"Running export script: {EXPORT_SCRIPT}")
    subprocess.run([sys.executable, str(EXPORT_SCRIPT)], check=True, cwd=str(REPO_ROOT))

if not TRACE_EXPORT.exists():
    print(f"Trace file not found at {TRACE_EXPORT}. Exporting traces...")
    export_traces()
else:
    print(f"Trace file exists: {TRACE_EXPORT}")

# Load traces
if not TRACE_EXPORT.exists():
    raise FileNotFoundError(f"Trace file still does not exist after export attempt: {TRACE_EXPORT}")

with TRACE_EXPORT.open('r', encoding='utf-8') as fh:
    traces = [json.loads(line) for line in fh]
print(f'Loaded {len(traces)} traces')


Using REPO_ROOT: /Users/hamidaho/new_cursor
Trace file exists: /Users/hamidaho/new_cursor/research/data/companion_traces.jsonl
Loaded 40238 traces


In [None]:
# ============================================================================
# Rung Extraction Functions
# ============================================================================
# All rung extraction functions are imported from rung_extractors.py
# This ensures consistency across all notebooks and provides a single source of truth.
#
# Wrapper functions to match the expected interface (return strings)
def tokens_repr(trace):
    """Use shared tokens_repr_str from rung_extractors."""
    return tokens_repr_str(trace)

def edits_repr(trace):
    """Use shared semantic_edits_repr_str from rung_extractors."""
    return semantic_edits_repr_str(trace)

def functions_repr(trace):
    """Use shared functions_repr_str from rung_extractors."""
    return functions_repr_str(trace)

def motifs_repr(trace):
    """Use shared motifs_repr_str from rung_extractors."""
    return motifs_repr_str(trace)


def raw_repr(trace):
    """Use shared raw_repr_str from rung_extractors."""
    return raw_repr_str(trace)

RUNG_FUNCS = {
    'tokens': tokens_repr,
    'semantic_edits': edits_repr,
    'functions': functions_repr,
    'motifs': motifs_repr,
    'raw': raw_repr,
}

print("✓ Using shared rung extraction functions from rung_extractors.py")
print(f"  Available rungs: {list(RUNG_FUNCS.keys())}")
def label_from_trace(trace):
    events = trace.get('events', [])
    if not events:
        return 'none'
    return events[-1].get('type') or 'unknown'

# Improved label creation logic - creates balanced labels based on trace characteristics
def count_code_changes(trace):
    """Count code/file change events in trace."""
    code_change_types = [
        'code_change', 'file_change', 'file_create', 'file_delete', 
        'file_rename', 'entry_created'
    ]
    count = 0
    for event in trace.get('events', []):
        event_type = (event.get('type') or '').lower()
        if any(change_type in event_type for change_type in code_change_types):
            count += 1
        # Also check details for code content
        details = event.get('details', {})
        if isinstance(details, dict):
            if details.get('after_content') or details.get('before_content') or details.get('code'):
                count += 1
    return count

def count_unique_files(trace):
    """Count number of unique files modified in trace."""
    files = set()
    for event in trace.get('events', []):
        details = event.get('details', {})
        if isinstance(details, dict):
            file_path = details.get('file_path') or details.get('file')
            if file_path:
                files.add(str(file_path))
    return len(files)

def has_terminal_commands(trace):
    """Check if trace contains terminal commands."""
    for event in trace.get('events', []):
        event_type = (event.get('type') or '').lower()
        if 'terminal' in event_type or 'command' in event_type:
            return True
        details = event.get('details', {})
        if isinstance(details, dict) and details.get('command'):
            return True
    return False

def has_prompts(trace):
    """Check if trace contains prompts/AI interactions."""
    for event in trace.get('events', []):
        event_type = (event.get('type') or '').lower()
        if 'prompt' in event_type or 'conversation' in event_type:
            return True
        if event.get('ai_generated'):
            return True
    return False

def last_event_is_code_change(trace):
    """Check if last event is a code change (for next_event prediction)."""
    events = trace.get('events', [])
    if not events:
        return False
    last_event = events[-1]
    event_type = (last_event.get('type') or '').lower()
    code_types = ['code_change', 'file_change', 'entry_created']
    return any(code_type in event_type for code_type in code_types)

# Calculate cutoffs for balanced labels
event_counts = [len(trace.get('events', [])) for trace in traces]
code_change_counts = [count_code_changes(trace) for trace in traces]
file_counts = [count_unique_files(trace) for trace in traces]

anomaly_cutoff = np.percentile(event_counts, 90) if event_counts else 0
high_code_activity_cutoff = np.percentile(code_change_counts, 50) if code_change_counts else 0  # Median split
multi_file_cutoff = np.percentile(file_counts, 50) if file_counts else 1  # Median split

features = []
for trace in traces:
    entry = {}
    for name, fn in RUNG_FUNCS.items():
        entry[name] = fn(trace)
    
    entry['label'] = label_from_trace(trace)
    
    # Improved label creation - creates balanced binary labels
    # 1. next_code_change: Check if last event is a code change (for next-event prediction)
    entry['next_code_change'] = int(last_event_is_code_change(trace))
    
    # 2. high_code_activity: Traces with above-median code change count
    code_change_count = count_code_changes(trace)
    entry['high_code_activity'] = int(code_change_count > high_code_activity_cutoff)
    
    # 3. multi_file_session: Traces that modify multiple files (above median)
    unique_files = count_unique_files(trace)
    entry['multi_file_session'] = int(unique_files > multi_file_cutoff)
    
    # 4. has_terminal: Traces with terminal commands
    entry['has_terminal'] = int(has_terminal_commands(trace))
    
    # 5. has_prompts: Traces with AI prompts/interactions
    entry['has_prompts'] = int(has_prompts(trace))
    
    # 6. anomaly: Based on event count (top 10%)
    entry['anomaly'] = int(len(trace.get('events', [])) > anomaly_cutoff)
    
    # Target file for retrieval
    entry['target_file'] = next(
        (event.get('details', {}).get('file_path') or event.get('details', {}).get('file') 
         for event in reversed(trace.get('events', [])) if event.get('details')), 
        trace.get('workspace_path', 'unknown')
    )
    
    # Storage and term counts
    entry['storage_bytes'] = {name: len(entry[name].encode('utf-8')) for name in RUNG_FUNCS}
    entry['term_count'] = {name: len(entry[name].split()) for name in RUNG_FUNCS}
    
    features.append(entry)
print('Built {} feature rows'.format(len(features)))

# Debug: Check label distributions
print('\n' + '='*60)
print('Label Distribution Analysis')
print('='*60)

from collections import Counter

# Check all label distributions
label_names = [
    'next_code_change', 'high_code_activity', 'multi_file_session', 
    'has_terminal', 'has_prompts', 'anomaly'
]
for label_name in label_names:
    counts = [entry[label_name] for entry in features]
    positive = sum(counts)
    negative = len(counts) - positive
    print(f'\n{label_name}:')
    print(f'  Positive: {positive} ({positive/len(counts)*100:.1f}%)')
    print(f'  Negative: {negative} ({negative/len(counts)*100:.1f}%)')
    print(f'  Unique values: {set(counts)}')
    if positive == 0 or negative == 0:
        print(f'  ⚠ WARNING: Only one class present - will skip classification')
    elif positive < len(counts) * 0.1 or negative < len(counts) * 0.1:
        print(f'  ⚠ WARNING: Severe class imbalance (<10% minority class)')

# Check what labels are actually being generated
label_types = [entry['label'] for entry in features]
label_counter = Counter(label_types)
print(f'\nMost common event types (last event in trace):')
for label, count in label_counter.most_common(10):
    print(f'  {label}: {count}')

# Check event types in traces
all_event_types = []
for trace in traces:
    for event in trace.get('events', []):
        event_type = event.get('type', '').lower()
        if event_type:
            all_event_types.append(event_type)
event_type_counter = Counter(all_event_types)
print(f'\nMost common event types (all events):')
for etype, count in event_type_counter.most_common(10):
    print(f'  {etype}: {count}')


✓ Using shared rung extraction functions from rung_extractors.py
  Available rungs: ['tokens', 'semantic_edits', 'functions', 'motifs', 'raw']


Built 40238 feature rows

Label Distribution Analysis

next_code_change:
  Positive: 164 (0.4%)
  Negative: 40074 (99.6%)
  Unique values: {0, 1}

high_code_activity:
  Positive: 164 (0.4%)
  Negative: 40074 (99.6%)
  Unique values: {0, 1}

multi_file_session:
  Positive: 164 (0.4%)
  Negative: 40074 (99.6%)
  Unique values: {0, 1}

has_terminal:
  Positive: 0 (0.0%)
  Negative: 40238 (100.0%)
  Unique values: {0}

has_prompts:
  Positive: 1 (0.0%)
  Negative: 40237 (100.0%)
  Unique values: {0, 1}

anomaly:
  Positive: 164 (0.4%)
  Negative: 40074 (99.6%)
  Unique values: {0, 1}

Most common event types (last event in trace):
  none: 40074
  code_change: 164

Most common event types (all events):
  code_change: 5549


In [3]:
def build_probe_dataset(rung: str, label: str):
    reps = [entry[rung] for entry in features]
    placeholder='empty_repr'
    cleaned=[repr if repr.strip() else placeholder for repr in reps]
    vec = TfidfVectorizer(max_features=4096)
    # Check if we have any non-empty representations
    if cleaned and any(rep.strip() for rep in cleaned):
        X = vec.fit_transform(cleaned).toarray()
    else:
        X = np.zeros((len(cleaned), 1))
    y = np.array([entry[label] for entry in features], dtype=int)
    return X, y
results = []
# Updated task labels - balanced labels based on trace characteristics
task_labels = [
    ('next_code_change', 'next_event'),
    ('high_code_activity', 'code_activity_classification'),
    ('multi_file_session', 'multi_file_classification'),
    ('has_terminal', 'terminal_classification'),
    ('has_prompts', 'prompt_classification'),
    ('anomaly', 'anomaly_detection')
]

for rung in RUNG_FUNCS:
    for label_name, task in task_labels:
        X, y = build_probe_dataset(rung, label_name)
        if len(np.unique(y)) < 2:
            print(f'Skipping {rung} / {label_name} because only one class present')
            continue
        clf = LogisticRegression(max_iter=500)
        start = time.perf_counter()
        clf.fit(X, y)
        train_time = time.perf_counter() - start
        start = time.perf_counter()
        y_pred = clf.predict(X)
        infer_time = time.perf_counter() - start
        results.append({
            'rung': rung,
            'task': task,
            'accuracy': accuracy_score(y, y_pred),
            'f1': f1_score(y, y_pred),
            'train_time': train_time,
            'infer_time': infer_time,
            'storage_bytes': np.mean([entry['storage_bytes'][rung] for entry in features]),
            'avg_terms': np.mean([entry['term_count'][rung] for entry in features])
        })
df_probes = pd.DataFrame(results)
df_probes


Skipping tokens / has_terminal because only one class present
Skipping semantic_edits / has_terminal because only one class present
Skipping functions / has_terminal because only one class present
Skipping motifs / has_terminal because only one class present
Skipping raw / has_terminal because only one class present


Unnamed: 0,rung,task,accuracy,f1,train_time,infer_time,storage_bytes,avg_terms
0,tokens,next_event,0.99995,0.993865,0.340506,0.008164,16.693374,1.818207
1,tokens,code_activity_classification,0.99995,0.993865,0.120972,0.009219,16.693374,1.818207
2,tokens,multi_file_classification,0.99995,0.993865,0.162775,0.00698,16.693374,1.818207
3,tokens,prompt_classification,0.999975,0.0,0.190184,0.023912,16.693374,1.818207
4,tokens,anomaly_detection,0.99995,0.993865,0.197283,0.013617,16.693374,1.818207
5,semantic_edits,next_event,1.0,1.0,0.451204,0.025396,15.896739,1.148765
6,semantic_edits,code_activity_classification,1.0,1.0,0.467872,0.028396,15.896739,1.148765
7,semantic_edits,multi_file_classification,1.0,1.0,0.654524,0.063071,15.896739,1.148765
8,semantic_edits,prompt_classification,0.999975,0.0,0.495498,0.05443,15.896739,1.148765
9,semantic_edits,anomaly_detection,1.0,1.0,0.888593,0.067836,15.896739,1.148765


In [4]:
def retrieval_metrics(rung: str, ks=(1, 5, 10)):
    reps = [entry[rung] for entry in features]
    targets = [entry['target_file'] for entry in features]
    placeholder = 'empty_representation'
    clean_reps = [rep if rep.strip() else placeholder for rep in reps]
    vec = TfidfVectorizer(max_features=4096)
    matrix = vec.fit_transform(clean_reps)
    sims = cosine_similarity(matrix)
    np.fill_diagonal(sims, -np.inf)
    results = {k: 0 for k in ks}
    for idx, target in enumerate(targets):
        ranks = np.argsort(-sims[idx])
        for k in ks:
            if any(targets[pos] == target for pos in ranks[:k]):
                results[k] += 1
    total = len(targets)
    return {f'recall@{k}': results[k] / total for k in ks}
retrieval_stats = []
for rung in RUNG_FUNCS:
    start = time.perf_counter()
    recs = retrieval_metrics(rung)
    elapsed = time.perf_counter() - start
    recs.update({
        'rung': rung,
        'task': 'context_retrieval',
        'latency': elapsed,
        'storage_bytes': np.mean([entry['storage_bytes'][rung] for entry in features]),
        'avg_terms': np.mean([entry['term_count'][rung] for entry in features])
    })
    retrieval_stats.append(recs)
df_retrieval = pd.DataFrame(retrieval_stats)
df_final = pd.concat([df_probes, df_retrieval], ignore_index=True, sort=False)
df_final.to_csv(REPO_ROOT / 'research/results/probe_metrics.csv', index=False)
df_final


Unnamed: 0,rung,task,accuracy,f1,train_time,infer_time,storage_bytes,avg_terms,recall@1,recall@5,recall@10,latency
0,tokens,next_event,0.99995,0.993865,0.340506,0.008164,16.693374,1.818207,,,,
1,tokens,code_activity_classification,0.99995,0.993865,0.120972,0.009219,16.693374,1.818207,,,,
2,tokens,multi_file_classification,0.99995,0.993865,0.162775,0.00698,16.693374,1.818207,,,,
3,tokens,prompt_classification,0.999975,0.0,0.190184,0.023912,16.693374,1.818207,,,,
4,tokens,anomaly_detection,0.99995,0.993865,0.197283,0.013617,16.693374,1.818207,,,,
5,semantic_edits,next_event,1.0,1.0,0.451204,0.025396,15.896739,1.148765,,,,
6,semantic_edits,code_activity_classification,1.0,1.0,0.467872,0.028396,15.896739,1.148765,,,,
7,semantic_edits,multi_file_classification,1.0,1.0,0.654524,0.063071,15.896739,1.148765,,,,
8,semantic_edits,prompt_classification,0.999975,0.0,0.495498,0.05443,15.896739,1.148765,,,,
9,semantic_edits,anomaly_detection,1.0,1.0,0.888593,0.067836,15.896739,1.148765,,,,


In [None]:
# ============================================================================
# Dataset Parsing and Interpretation
# ============================================================================
# Analyze the dataset to understand why certain rungs perform better/worse

print("="*80)
print("DATASET PARSING AND INTERPRETATION")
print("="*80)

# 1. Representation Characteristics Analysis
print("\n1. REPRESENTATION CHARACTERISTICS")
print("-" * 80)

from collections import Counter
import re

rung_stats = {}
for rung in RUNG_FUNCS.keys():
    reps = [entry[rung] for entry in features]
    
    # Vocabulary analysis
    all_terms = []
    for rep in reps:
        if rep and rep.strip():
            terms = rep.split()
            all_terms.extend(terms)
    
    vocab = Counter(all_terms)
    unique_terms = len(vocab)
    total_terms = len(all_terms)
    avg_length = np.mean([len(rep.split()) for rep in reps if rep.strip()])
    
    # Diversity metrics
    if total_terms > 0:
        diversity = unique_terms / total_terms  # Type-token ratio
        most_common = vocab.most_common(10)
    else:
        diversity = 0
        most_common = []
    
    rung_stats[rung] = {
        'vocab_size': unique_terms,
        'total_terms': total_terms,
        'avg_length': avg_length,
        'diversity': diversity,
        'most_common': most_common,
        'empty_reprs': sum(1 for rep in reps if not rep or not rep.strip()),
    }
    
    print(f"\n{rung.upper()}:")
    print(f"  Vocabulary size: {unique_terms}")
    print(f"  Total terms: {total_terms}")
    print(f"  Avg representation length: {avg_length:.1f} terms")
    print(f"  Diversity (type-token ratio): {diversity:.3f}")
    print(f"  Empty representations: {rung_stats[rung]['empty_reprs']} ({rung_stats[rung]['empty_reprs']/len(reps)*100:.1f}%)")
    if most_common:
        print(f"  Top 10 most common terms:")
        for term, count in most_common:
            print(f"    '{term}': {count} ({count/total_terms*100:.1f}%)")

# 2. Feature Importance Analysis (for successful classifications)
print("\n\n2. FEATURE IMPORTANCE ANALYSIS")
print("-" * 80)

# Analyze which features are most important for each successful task
for rung in RUNG_FUNCS.keys():
    for label_name, task in task_labels:
        if label_name not in ['high_code_activity', 'multi_file_session']:  # Focus on balanced tasks
            continue
        
        X, y = build_probe_dataset(rung, label_name)
        if len(np.unique(y)) < 2:
            continue
        
        clf = LogisticRegression(max_iter=500)
        clf.fit(X, y)
        
        # Get feature importance (coefficient magnitude)
        feature_importance = np.abs(clf.coef_[0])
        top_indices = np.argsort(feature_importance)[-10:][::-1]
        
        # Get feature names from vectorizer
        reps = [entry[rung] for entry in features]
        cleaned = [repr if repr.strip() else 'empty_repr' for repr in reps]
        vec = TfidfVectorizer(max_features=4096)

            vec.fit_transform(cleaned)
            feature_names = vec.get_feature_names_out()
        
        print(f"\n{rung} / {task}:")
        print(f"  Accuracy: {accuracy_score(y, clf.predict(X)):.4f}")
        print(f"  F1: {f1_score(y, clf.predict(X)):.4f}")
        print(f"  Top 10 most important features:")
        for idx in top_indices[:10]:
            if idx < len(feature_names):
                importance = feature_importance[idx]
                feature_name = feature_names[idx]
                print(f"    '{feature_name}': {importance:.4f}")

# 3. Representation Overlap Analysis
print("\n\n3. REPRESENTATION OVERLAP ANALYSIS")
print("-" * 80)
print("How much do representations overlap across rungs?")

# Sample a few traces and show their representations
print("\nSample trace representations (first 3 traces):")
for i in range(min(3, len(features))):
    print(f"\nTrace {i+1}:")
    for rung in RUNG_FUNCS.keys():
        rep = features[i][rung]
        preview = rep[:200] + "..." if len(rep) > 200 else rep
        print(f"  {rung:15}: {preview}")

# 4. Performance vs Representation Characteristics
print("\n\n4. PERFORMANCE VS REPRESENTATION CHARACTERISTICS")
print("-" * 80)

# Create summary dataframe
summary_data = []
for rung in RUNG_FUNCS.keys():
    stats = rung_stats[rung]
    
    # Get performance metrics
    perf_data = df_final[df_final['rung'] == rung]
    if not perf_data.empty:
        avg_acc = perf_data['accuracy'].mean() if 'accuracy' in perf_data.columns else np.nan
        avg_f1 = perf_data['f1'].mean() if 'f1' in perf_data.columns else np.nan
        avg_recall1 = perf_data['recall@1'].mean() if 'recall@1' in perf_data.columns else np.nan
    else:
        avg_acc = avg_f1 = avg_recall1 = np.nan
    
    summary_data.append({
        'rung': rung,
        'vocab_size': stats['vocab_size'],
        'avg_length': stats['avg_length'],
        'diversity': stats['diversity'],
        'storage_bytes': stats.get('storage_bytes', np.nan),
        'avg_accuracy': avg_acc,
        'avg_f1': avg_f1,
        'avg_recall@1': avg_recall1,
    })

df_summary = pd.DataFrame(summary_data)
print("\nSummary Statistics:")
print(df_summary.to_string(index=False))

# 5. Error Analysis
print("\n\n5. ERROR ANALYSIS")
print("-" * 80)

for rung in RUNG_FUNCS.keys():
    for label_name, task in task_labels:
        if label_name not in ['high_code_activity', 'multi_file_session']:
            continue
        
        X, y = build_probe_dataset(rung, label_name)
        if len(np.unique(y)) < 2:
            continue
        
        clf = LogisticRegression(max_iter=500)
        clf.fit(X, y)
        y_pred = clf.predict(X)
        
        # Find misclassified examples
        errors = np.where(y != y_pred)[0]
        if len(errors) > 0:
            print(f"\n{rung} / {task}: {len(errors)} errors ({len(errors)/len(y)*100:.1f}%)")
            # Show a few error examples
            for err_idx in errors[:3]:
                print(f"  Error example {err_idx}:")
                print(f"    True label: {y[err_idx]}, Predicted: {y_pred[err_idx]}")
                rep = features[err_idx][rung]
                preview = rep[:150] + "..." if len(rep) > 150 else rep
                print(f"    Representation: {preview}")

print("\n" + "="*80)
print("✓ Dataset parsing complete")
print("="*80)


IndentationError: unexpected indent (755139599.py, line 87)

In [None]:
# ============================================================================
# Visualization: Performance Analysis (Altair) - OPTIMIZED
# ============================================================================

import altair as alt
import pandas as pd

# Enable high-resolution export
alt.renderers.enable('default')
alt.data_transformers.enable('default')

# Shared chart configuration for better readability
chart_config = {
    'background': 'white',
    'padding': {'left': 15, 'top': 15, 'right': 15, 'bottom': 15},
    'view': {'continuousWidth': 400, 'continuousHeight': 300}
}

def configure_chart(chart, title_size=18, axis_label_size=12, axis_title_size=14):
    """Configure chart with consistent styling."""
    return chart.configure(**chart_config).configure_title(
        fontSize=title_size,
        fontWeight='bold',
        anchor='start',
        offset=10
    ).configure_axis(
        labelFontSize=axis_label_size,
        titleFontSize=axis_title_size,
        titleFontWeight='bold',
        labelAngle=0
    ).configure_legend(
        labelFontSize=11,
        titleFontSize=12,
        titleFontWeight='bold'
    )

# Filter out NaN values for classification tasks
class_df = df_final[df_final['task'].isin(['code_activity_classification', 'multi_file_classification', 'anomaly_detection'])].copy()
class_df = class_df.dropna(subset=['accuracy', 'f1'])

# ============================================================================
# 1. Performance Comparison Charts
# ============================================================================

if not class_df.empty:
    # Prepare data for accuracy plot
    pivot_acc = class_df.pivot_table(values='accuracy', index='rung', columns='task', aggfunc='mean').reset_index()
    pivot_acc_melted = pivot_acc.melt(id_vars='rung', var_name='task', value_name='accuracy')
    pivot_acc_melted['task'] = pivot_acc_melted['task'].str.replace('_', ' ').str.title()
    
    # Chart 1: Accuracy by Rung and Task (OPTIMIZED - larger, better colors)
    chart_acc = alt.Chart(pivot_acc_melted).mark_bar(opacity=0.85, cornerRadius=5, stroke='white', strokeWidth=1).encode(
        x=alt.X('rung:N', title='Abstraction Rung', sort=['tokens', 'semantic_edits', 'functions', 'motifs', 'raw'], 
                axis=alt.Axis(labelAngle=0)),
        y=alt.Y('accuracy:Q', title='Accuracy', scale=alt.Scale(domain=[0, 1.05])),
        color=alt.Color('task:N', 
                       scale=alt.Scale(domain=['Code Activity Classification', 'Multi File Classification', 'Anomaly Detection'],
                                      range=['#82a7a6', '#b57c61', '#73648a']),
                       legend=alt.Legend(title='Task', orient='right', columns=1)),
        tooltip=['rung', 'task', alt.Tooltip('accuracy:Q', format='.3f')]
    ).properties(
        width=500,
        height=350,
        title='Classification Accuracy by Rung'
    )
    
    chart_acc_configured = configure_chart(chart_acc)
    chart_acc_configured.save(REPO_ROOT / 'research/results/classification_accuracy_by_rung.png', scale_factor=3)
    print("✓ Saved classification_accuracy_by_rung.png")
    
    # Chart 2: F1 Score by Rung and Task (OPTIMIZED)
    pivot_f1 = class_df.pivot_table(values='f1', index='rung', columns='task', aggfunc='mean').reset_index()
    pivot_f1_melted = pivot_f1.melt(id_vars='rung', var_name='task', value_name='f1')
    pivot_f1_melted['task'] = pivot_f1_melted['task'].str.replace('_', ' ').str.title()
    
    chart_f1 = alt.Chart(pivot_f1_melted).mark_bar(opacity=0.85, cornerRadius=5, stroke='white', strokeWidth=1).encode(
        x=alt.X('rung:N', title='Abstraction Rung', sort=['tokens', 'semantic_edits', 'functions', 'motifs', 'raw'],
                axis=alt.Axis(labelAngle=0)),
        y=alt.Y('f1:Q', title='F1 Score', scale=alt.Scale(domain=[0, 1.05])),
        color=alt.Color('task:N',
                       scale=alt.Scale(domain=['Code Activity Classification', 'Multi File Classification', 'Anomaly Detection'],
                                      range=['#82a7a6', '#b57c61', '#73648a']),
                       legend=alt.Legend(title='Task', orient='right', columns=1)),
        tooltip=['rung', 'task', alt.Tooltip('f1:Q', format='.3f')]
    ).properties(
        width=500,
        height=350,
        title='F1 Score by Rung'
    )
    
    chart_f1_configured = configure_chart(chart_f1)
    chart_f1_configured.save(REPO_ROOT / 'research/results/f1_score_by_rung.png', scale_factor=3)
    print("✓ Saved f1_score_by_rung.png")

# ============================================================================
# 2. Context Retrieval Performance
# ============================================================================

retrieval_df = df_final[df_final['task'] == 'context_retrieval'].dropna(subset=['recall@1', 'recall@5', 'recall@10']).copy()
if not retrieval_df.empty:
    retrieval_melted = retrieval_df.melt(
        id_vars='rung',
        value_vars=['recall@1', 'recall@5', 'recall@10'],
        var_name='metric',
        value_name='recall'
    )
    retrieval_melted['metric'] = retrieval_melted['metric'].str.replace('recall@', 'Recall@')
    
    chart_retrieval = alt.Chart(retrieval_melted).mark_bar(opacity=0.85, cornerRadius=5, stroke='white', strokeWidth=1).encode(
        x=alt.X('rung:N', title='Abstraction Rung', sort=['tokens', 'semantic_edits', 'functions', 'motifs', 'raw'],
                axis=alt.Axis(labelAngle=0)),
        y=alt.Y('recall:Q', title='Recall', scale=alt.Scale(domain=[0, 1])),
        color=alt.Color('metric:N',
                       scale=alt.Scale(domain=['Recall@1', 'Recall@5', 'Recall@10'],
                                      range=['#82a7a6', '#b57c61', '#73648a']),
                       legend=alt.Legend(title='Metric', orient='right')),
        tooltip=['rung', 'metric', alt.Tooltip('recall:Q', format='.3f')]
    ).properties(
        width=500,
        height=350,
        title='Context Retrieval Performance'
    )
    
    chart_retrieval_configured = configure_chart(chart_retrieval)
    chart_retrieval_configured.save(REPO_ROOT / 'research/results/context_retrieval_performance.png', scale_factor=3)
    print("✓ Saved context_retrieval_performance.png")

# ============================================================================
# 3. Storage Efficiency vs Performance Trade-off
# ============================================================================

if not class_df.empty:
    task_df = class_df[class_df['task'] == 'code_activity_classification'].dropna(subset=['accuracy', 'storage_bytes']).copy()
    if not task_df.empty:
        chart_tradeoff = alt.Chart(task_df).mark_circle(size=300, opacity=0.8, stroke='black', strokeWidth=1.5).encode(
            x=alt.X('storage_bytes:Q', title='Storage (bytes)', scale=alt.Scale(type='log', base=10),
                    axis=alt.Axis(format='.0e')),
            y=alt.Y('accuracy:Q', title='Accuracy', scale=alt.Scale(domain=[0.65, 1.05])),
            color=alt.Color('rung:N', 
                           scale=alt.Scale(domain=['tokens', 'semantic_edits', 'functions', 'motifs', 'raw'],
                                          range=['#82a7a6', '#b57c61', '#73648a', '#5c4e6d', '#453750']),
                           legend=alt.Legend(title='Rung', orient='right')),
            tooltip=['rung', alt.Tooltip('accuracy:Q', format='.3f'), alt.Tooltip('storage_bytes:Q', format=',.0f')]
        ).properties(
            width=500,
            height=350,
            title='Storage vs Accuracy Trade-off'
        )
        
        chart_tradeoff_configured = configure_chart(chart_tradeoff)
        chart_tradeoff_configured.save(REPO_ROOT / 'research/results/storage_vs_accuracy_tradeoff.png', scale_factor=3)
        print("✓ Saved storage_vs_accuracy_tradeoff.png")

# ============================================================================
# 4. Representation Characteristics Visualization
# ============================================================================

if 'df_summary' in locals() and not df_summary.empty:
    # Chart 1: Vocabulary Size
    chart_vocab = alt.Chart(df_summary).mark_bar(color='#82a7a6', opacity=0.85, cornerRadius=5, stroke='white', strokeWidth=1).encode(
        x=alt.X('rung:N', title='Abstraction Rung', sort=['tokens', 'semantic_edits', 'functions', 'motifs', 'raw'],
                axis=alt.Axis(labelAngle=0)),
        y=alt.Y('vocab_size:Q', title='Unique Terms', scale=alt.Scale(type='log', base=10)),
        tooltip=['rung', alt.Tooltip('vocab_size:Q', format=',.0f')]
    ).properties(
        width=500,
        height=350,
        title='Vocabulary Size by Rung'
    )
    
    chart_vocab_configured = configure_chart(chart_vocab)
    chart_vocab_configured.save(REPO_ROOT / 'research/results/vocabulary_size_by_rung.png', scale_factor=3)
    print("✓ Saved vocabulary_size_by_rung.png")
    
    # Chart 2: Average Representation Length
    chart_avg_length = alt.Chart(df_summary).mark_bar(color='#b57c61', opacity=0.85, cornerRadius=5, stroke='white', strokeWidth=1).encode(
        x=alt.X('rung:N', title='Abstraction Rung', sort=['tokens', 'semantic_edits', 'functions', 'motifs', 'raw'],
                axis=alt.Axis(labelAngle=0)),
        y=alt.Y('avg_length:Q', title='Avg Terms per Representation'),
        tooltip=['rung', alt.Tooltip('avg_length:Q', format='.1f')]
    ).properties(
        width=500,
        height=350,
        title='Average Representation Length'
    )
    
    chart_avg_length_configured = configure_chart(chart_avg_length)
    chart_avg_length_configured.save(REPO_ROOT / 'research/results/avg_representation_length.png', scale_factor=3)
    print("✓ Saved avg_representation_length.png")
    
    # Chart 3: Diversity (Type-Token Ratio)
    chart_diversity = alt.Chart(df_summary).mark_bar(color='#73648a', opacity=0.85, cornerRadius=5, stroke='white', strokeWidth=1).encode(
        x=alt.X('rung:N', title='Abstraction Rung', sort=['tokens', 'semantic_edits', 'functions', 'motifs', 'raw'],
                axis=alt.Axis(labelAngle=0)),
        y=alt.Y('diversity:Q', title='Type-Token Ratio'),
        tooltip=['rung', alt.Tooltip('diversity:Q', format='.3f')]
    ).properties(
        width=500,
        height=350,
        title='Representation Diversity'
    )
    
    chart_diversity_configured = configure_chart(chart_diversity)
    chart_diversity_configured.save(REPO_ROOT / 'research/results/representation_diversity.png', scale_factor=3)
    print("✓ Saved representation_diversity.png")
    
    # Chart 4: Performance vs Diversity
    task_df = class_df[class_df['task'] == 'code_activity_classification'].dropna(subset=['accuracy']).copy()
    if not task_df.empty and 'diversity' in df_summary.columns:
        merged = task_df.merge(df_summary[['rung', 'diversity']], on='rung', how='left')
        merged = merged.dropna(subset=['accuracy', 'diversity'])
        if not merged.empty:
            chart_diversity_acc = alt.Chart(merged).mark_circle(size=300, opacity=0.8, stroke='black', strokeWidth=1.5).encode(
                x=alt.X('diversity:Q', title='Diversity (Type-Token Ratio)'),
                y=alt.Y('accuracy:Q', title='Accuracy', scale=alt.Scale(domain=[0.65, 1.05])),
                color=alt.Color('rung:N', 
                               scale=alt.Scale(domain=['tokens', 'semantic_edits', 'functions', 'motifs', 'raw'],
                                              range=['#82a7a6', '#b57c61', '#73648a', '#5c4e6d', '#453750']),
                               legend=alt.Legend(title='Rung', orient='right')),
                tooltip=['rung', alt.Tooltip('diversity:Q', format='.3f'), alt.Tooltip('accuracy:Q', format='.3f')]
            ).properties(
                width=500,
                height=350,
                title='Diversity vs Accuracy'
            )
            
            chart_diversity_acc_configured = configure_chart(chart_diversity_acc)
            chart_diversity_acc_configured.save(REPO_ROOT / 'research/results/diversity_vs_accuracy.png', scale_factor=3)
            print("✓ Saved diversity_vs_accuracy.png")

# ============================================================================
# 5. Inter-Probe Difference Visualizations (NEW)
# ============================================================================

if 'df_inter_probe' in locals() and not df_inter_probe.empty:
    # Chart 1: Inter-Probe Difference by Task and Metric
    chart_inter_probe = alt.Chart(df_inter_probe).mark_bar(opacity=0.85, cornerRadius=5, stroke='white', strokeWidth=1).encode(
        x=alt.X('task:N', title='Task', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('difference:Q', title='Inter-Probe Difference (Max - Min Performance)'),
        color=alt.Color('metric:N',
                       scale=alt.Scale(domain=['accuracy', 'f1', 'recall@1', 'recall@5', 'recall@10'],
                                      range=['#82a7a6', '#b57c61', '#73648a', '#5c4e6d', '#453750']),
                       legend=alt.Legend(title='Metric', orient='right')),
        tooltip=['task', 'metric', alt.Tooltip('difference:Q', format='.3f'), 
                 alt.Tooltip('best_rung:N', title='Best Rung'), alt.Tooltip('worst_rung:N', title='Worst Rung')]
    ).properties(
        width=600,
        height=400,
        title='Inter-Probe Difference by Task and Metric'
    )
    
    chart_inter_probe_configured = configure_chart(chart_inter_probe)
    chart_inter_probe_configured.save(REPO_ROOT / 'research/results/inter_probe_difference_by_task.png', scale_factor=3)
    print("✓ Saved inter_probe_difference_by_task.png")
    
    # Chart 2: Best vs Worst Rung Performance
    inter_probe_melted = df_inter_probe.melt(
        id_vars=['task', 'metric', 'best_rung', 'worst_rung'],
        value_vars=['max_performance', 'min_performance'],
        var_name='performance_type',
        value_name='performance'
    )
    inter_probe_melted['performance_type'] = inter_probe_melted['performance_type'].str.replace('_', ' ').str.title()
    
    chart_best_worst = alt.Chart(inter_probe_melted).mark_bar(opacity=0.85, cornerRadius=5, stroke='white', strokeWidth=1).encode(
        x=alt.X('task:N', title='Task', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('performance:Q', title='Performance', scale=alt.Scale(domain=[0, 1.05])),
        color=alt.Color('performance_type:N',
                       scale=alt.Scale(domain=['Max Performance', 'Min Performance'],
                                      range=['#82a7a6', '#73648a']),
                       legend=alt.Legend(title='Performance Type', orient='right')),
        column=alt.Column('metric:N', title='Metric', header=alt.Header(labelAngle=-45)),
        tooltip=['task', 'metric', 'performance_type', alt.Tooltip('performance:Q', format='.3f')]
    ).properties(
        width=200,
        height=300,
        title='Best vs Worst Rung Performance'
    )
    
    chart_best_worst_configured = configure_chart(chart_best_worst, title_size=16)
    chart_best_worst_configured.save(REPO_ROOT / 'research/results/best_worst_rung_performance.png', scale_factor=3)
    print("✓ Saved best_worst_rung_performance.png")

# ============================================================================
# 6. Rung Selection Heuristic Visualization (NEW)
# ============================================================================

if 'df_heuristic' in locals() and not df_heuristic.empty:
    # Add match indicator color
    df_heuristic_viz = df_heuristic.copy()
    df_heuristic_viz['match_color'] = df_heuristic_viz['matches'].apply(lambda x: '#82a7a6' if x == '✓' else '#5c4e6d')
    
    chart_heuristic = alt.Chart(df_heuristic_viz).mark_bar(opacity=0.85, cornerRadius=5, stroke='white', strokeWidth=1).encode(
        x=alt.X('task:N', title='Task', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('confidence:Q', title='Confidence', scale=alt.Scale(domain=[0, 1])),
        color=alt.Color('matches:N',
                       scale=alt.Scale(domain=['✓', '✗'],
                                      range=['#82a7a6', '#5c4e6d']),
                       legend=alt.Legend(title='Match', orient='right')),
        tooltip=['task', 'selected_rung', 'actual_best_rung', 'matches', 
                 alt.Tooltip('confidence:Q', format='.2f'), 'reasoning']
    ).properties(
        width=500,
        height=350,
        title='Rung Selection Heuristic: Confidence and Accuracy'
    )
    
    chart_heuristic_configured = configure_chart(chart_heuristic)
    chart_heuristic_configured.save(REPO_ROOT / 'research/results/rung_selection_heuristic.png', scale_factor=3)
    print("✓ Saved rung_selection_heuristic.png")

# ============================================================================
# 7. Create Combined HTML Dashboards
# ============================================================================

# Performance dashboard
if not class_df.empty and not retrieval_df.empty:
    top_row = alt.hconcat(chart_acc, chart_f1, spacing=30)
    bottom_row = alt.hconcat(chart_retrieval, chart_tradeoff, spacing=30)
    combined = alt.vconcat(top_row, bottom_row, spacing=40).configure(**chart_config).configure_title(
        fontSize=18, fontWeight='bold', anchor='start', offset=10
    ).configure_axis(labelFontSize=12, titleFontSize=14, titleFontWeight='bold')
    
    combined.save(REPO_ROOT / 'research/results/probe_performance_analysis.html')
    print("✓ Saved interactive HTML to research/results/probe_performance_analysis.html")

# Representation characteristics dashboard
if 'df_summary' in locals() and not df_summary.empty:
    top_row_char = alt.hconcat(chart_vocab, chart_avg_length, spacing=30)
    bottom_row_char = alt.hconcat(chart_diversity, chart_diversity_acc, spacing=30)
    combined_char = alt.vconcat(top_row_char, bottom_row_char, spacing=40).configure(**chart_config).configure_title(
        fontSize=18, fontWeight='bold', anchor='start', offset=10
    ).configure_axis(labelFontSize=12, titleFontSize=14, titleFontWeight='bold')
    
    combined_char.save(REPO_ROOT / 'research/results/representation_characteristics.html')
    print("✓ Saved interactive HTML to research/results/representation_characteristics.html")

# ============================================================================
# 8. Copy images to landing page
# ============================================================================

LANDING_IMAGES_DIR = REPO_ROOT / 'telemetry-landing/public/images'
LANDING_IMAGES_DIR.mkdir(parents=True, exist_ok=True)
images_to_copy = [
    'classification_accuracy_by_rung.png',
    'f1_score_by_rung.png',
    'context_retrieval_performance.png',
    'storage_vs_accuracy_tradeoff.png',
    'vocabulary_size_by_rung.png',
    'representation_diversity.png',
    'inter_probe_difference_by_task.png',
    'rung_selection_heuristic.png',
]
for img_name in images_to_copy:
    src = REPO_ROOT / 'research/results' / img_name
    dst = LANDING_IMAGES_DIR / img_name
    if src.exists():
        shutil.copy2(src, dst)
        print(f'✓ Copied {img_name} to landing page')
    else:
        print(f'⚠ {img_name} not found - run cell to generate')

print("\n" + "="*80)
print("All visualizations generated and saved!")
print("="*80)


In [None]:
# ============================================================================
# Detailed Results Interpretation
# ============================================================================

print("="*80)
print("DETAILED RESULTS INTERPRETATION")
print("="*80)

# Create comprehensive interpretation
interpretation = []

for rung in RUNG_FUNCS.keys():
    rung_data = df_final[df_final['rung'] == rung]
    
    # Classification performance
    class_data = rung_data[rung_data['task'].isin(['code_activity_classification', 'multi_file_classification'])]
    if not class_data.empty:
        avg_acc = class_data['accuracy'].mean()
        avg_f1 = class_data['f1'].mean()
    else:
        avg_acc = avg_f1 = np.nan
    
    # Retrieval performance
    retrieval_data = rung_data[rung_data['task'] == 'context_retrieval']
    if not retrieval_data.empty:
        recall1 = retrieval_data['recall@1'].iloc[0] if 'recall@1' in retrieval_data.columns else np.nan
        recall5 = retrieval_data['recall@5'].iloc[0] if 'recall@5' in retrieval_data.columns else np.nan
        recall10 = retrieval_data['recall@10'].iloc[0] if 'recall@10' in retrieval_data.columns else np.nan
    else:
        recall1 = recall5 = recall10 = np.nan
    
    # Storage
    storage = rung_data['storage_bytes'].iloc[0] if 'storage_bytes' in rung_data.columns else np.nan
    terms = rung_data['avg_terms'].iloc[0] if 'avg_terms' in rung_data.columns else np.nan
    
    # Stats
    stats = rung_stats.get(rung, {})
    
    interpretation.append({
        'rung': rung,
        'classification_accuracy': avg_acc,
        'classification_f1': avg_f1,
        'retrieval_recall@1': recall1,
        'retrieval_recall@5': recall5,
        'retrieval_recall@10': recall10,
        'storage_bytes': storage,
        'avg_terms': terms,
        'vocab_size': stats.get('vocab_size', np.nan),
        'diversity': stats.get('diversity', np.nan),
    })

df_interpretation = pd.DataFrame(interpretation)

print("\nComprehensive Performance Summary:")
print(df_interpretation.to_string(index=False))

# Key findings
print("\n\nKEY FINDINGS:")
print("-" * 80)

# Find best performers
best_class = df_interpretation.loc[df_interpretation['classification_f1'].idxmax()]
best_retrieval = df_interpretation.loc[df_interpretation['retrieval_recall@5'].idxmax()]
most_efficient = df_interpretation.loc[df_interpretation['storage_bytes'].idxmin()]

print(f"\n1. Best Classification Performance:")
print(f"   Rung: {best_class['rung']}")
print(f"   Accuracy: {best_class['classification_accuracy']:.4f}")
print(f"   F1: {best_class['classification_f1']:.4f}")
print(f"   Storage: {best_class['storage_bytes']:.1f} bytes")

print(f"\n2. Best Retrieval Performance:")
print(f"   Rung: {best_retrieval['rung']}")
print(f"   Recall@1: {best_retrieval['retrieval_recall@1']:.4f}")
print(f"   Recall@5: {best_retrieval['retrieval_recall@5']:.4f}")
print(f"   Recall@10: {best_retrieval['retrieval_recall@10']:.4f}")

print(f"\n3. Most Storage Efficient:")
print(f"   Rung: {most_efficient['rung']}")
print(f"   Storage: {most_efficient['storage_bytes']:.1f} bytes")
print(f"   Terms: {most_efficient['avg_terms']:.1f}")
print(f"   Classification F1: {most_efficient['classification_f1']:.4f}")
print(f"   Retrieval Recall@5: {most_efficient['retrieval_recall@5']:.4f}")

# Trade-off analysis
print(f"\n4. Privacy-Utility Trade-off Analysis:")
print(f"   {'Rung':<15} {'Classification':<15} {'Retrieval':<15} {'Storage':<12} {'Privacy Level'}")
print(f"   {'-'*15} {'-'*15} {'-'*15} {'-'*12} {'-'*15}")
for _, row in df_interpretation.iterrows():
    class_perf = f"{row['classification_f1']:.3f}" if not pd.isna(row['classification_f1']) else "N/A"
    retrieval_perf = f"{row['retrieval_recall@5']:.3f}" if not pd.isna(row['retrieval_recall@5']) else "N/A"
    storage = f"{row['storage_bytes']:.0f}" if not pd.isna(row['storage_bytes']) else "N/A"
    
    # Infer privacy level from storage (smaller = more private)
    if not pd.isna(row['storage_bytes']):
        if row['storage_bytes'] < 200:
            privacy = "High"
        elif row['storage_bytes'] < 500:
            privacy = "Medium"
        else:
            privacy = "Low"
    else:
        privacy = "Unknown"
    
    print(f"   {row['rung']:<15} {class_perf:<15} {retrieval_perf:<15} {storage:<12} {privacy}")

print("\n" + "="*80)


In [None]:
# ============================================================================
# Inter-Probe Difference Statistics
# ============================================================================
# Calculate how much performance varies across rungs for each task/metric
# This helps determine if rung choice is critical

print("="*80)
print("INTER-PROBE DIFFERENCE STATISTICS")
print("="*80)

# Collect all metrics for each task/rung combination
inter_probe_data = []

# Classification metrics
for task in ['code_activity_classification', 'multi_file_classification', 'anomaly_detection']:
    task_data = df_final[df_final['task'] == task]
    for metric in ['accuracy', 'f1']:
        if metric in task_data.columns:
            metric_values = task_data[metric].dropna()
            if len(metric_values) > 0:
                rung_perfs = task_data[['rung', metric]].dropna()
                if len(rung_perfs) > 0:
                    max_perf = rung_perfs[metric].max()
                    min_perf = rung_perfs[metric].min()
                    best_rung = rung_perfs.loc[rung_perfs[metric].idxmax(), 'rung']
                    worst_rung = rung_perfs.loc[rung_perfs[metric].idxmin(), 'rung']
                    diff = max_perf - min_perf
                    std_val = rung_perfs[metric].std() if len(rung_perfs) > 1 else 0.0
                    range_ratio = diff / max_perf if max_perf > 0 else 0.0
                    
                    inter_probe_data.append({
                        'task': task,
                        'metric': metric,
                        'max_performance': max_perf,
                        'min_performance': min_perf,
                        'difference': diff,
                        'range_ratio': range_ratio,
                        'std': std_val,
                        'best_rung': best_rung,
                        'worst_rung': worst_rung,
                        'num_rungs': len(rung_perfs)
                    })

# Retrieval metrics
retrieval_task = 'context_retrieval'
retrieval_data = df_final[df_final['task'] == retrieval_task]
for metric in ['recall@1', 'recall@5', 'recall@10']:
    if metric in retrieval_data.columns:
        metric_values = retrieval_data[metric].dropna()
        if len(metric_values) > 0:
            rung_perfs = retrieval_data[['rung', metric]].dropna()
            if len(rung_perfs) > 0:
                max_perf = rung_perfs[metric].max()
                min_perf = rung_perfs[metric].min()
                best_rung = rung_perfs.loc[rung_perfs[metric].idxmax(), 'rung']
                worst_rung = rung_perfs.loc[rung_perfs[metric].idxmin(), 'rung']
                diff = max_perf - min_perf
                std_val = rung_perfs[metric].std() if len(rung_perfs) > 1 else 0.0
                range_ratio = diff / max_perf if max_perf > 0 else 0.0
                
                inter_probe_data.append({
                    'task': retrieval_task,
                    'metric': metric,
                    'max_performance': max_perf,
                    'min_performance': min_perf,
                    'difference': diff,
                    'range_ratio': range_ratio,
                    'std': std_val,
                    'best_rung': best_rung,
                    'worst_rung': worst_rung,
                    'num_rungs': len(rung_perfs)
                })

df_inter_probe = pd.DataFrame(inter_probe_data)
print("\nInter-Probe Difference by Task and Metric:")
print(df_inter_probe.to_string(index=False))

# Save inter-probe difference statistics
inter_probe_output_path = RESULTS_DIR / 'inter_probe_difference_stats.json'
df_inter_probe.to_json(inter_probe_output_path, orient='records', indent=2)
print(f"\n✓ Results saved to {inter_probe_output_path}")
df_inter_probe.to_csv(RESULTS_DIR / 'inter_probe_difference_stats.csv', index=False)
print(f"✓ Also saved as CSV to {RESULTS_DIR / 'inter_probe_difference_stats.csv'}")


In [None]:
# ============================================================================
# Rung Selection Heuristic
# ============================================================================
# Use inter-probe difference to automatically select the best rung for each task
# High difference → rung choice is critical → use best-performing rung
# Low difference → all rungs similar → use most efficient rung (motifs)

print("\n" + "="*80)
print("RUNG SELECTION HEURISTIC RESULTS")
print("="*80)

# Define task types
task_types = {
    'code_activity_classification': 'classification',
    'multi_file_classification': 'classification',
    'anomaly_detection': 'other',
    'context_retrieval': 'retrieval'
}

# Threshold for "high" vs "low" inter-probe difference
DIFFERENCE_THRESHOLD = 0.15

heuristic_results = []

for task in task_types.keys():
    task_type = task_types[task]
    
    # Get inter-probe difference for this task
    task_diff_data = df_inter_probe[df_inter_probe['task'] == task]
    
    if len(task_diff_data) == 0:
        continue
    
    # Use the primary metric for the task type
    if task_type == 'classification':
        # Use accuracy as primary metric
        primary_metric = task_diff_data[task_diff_data['metric'] == 'accuracy']
        if len(primary_metric) == 0:
            primary_metric = task_diff_data[task_diff_data['metric'] == 'f1']
    elif task_type == 'retrieval':
        # Use recall@1 as primary metric
        primary_metric = task_diff_data[task_diff_data['metric'] == 'recall@1']
        if len(primary_metric) == 0:
            primary_metric = task_diff_data[task_diff_data['metric'] == 'recall@5']
    else:
        # Use first available metric
        primary_metric = task_diff_data.iloc[[0]]
    
    if len(primary_metric) == 0:
        continue
    
    inter_probe_diff = primary_metric['difference'].iloc[0]
    best_rung = primary_metric['best_rung'].iloc[0]
    
    # Heuristic logic
    if inter_probe_diff >= DIFFERENCE_THRESHOLD:
        # High difference → rung choice is critical → use best-performing rung
        selected_rung = best_rung
        confidence = 0.95
        reasoning = f"High inter-probe difference ({inter_probe_diff:.3f}) → rung choice critical → use {best_rung}"
    else:
        # Low difference → all rungs similar → use most efficient rung (motifs)
        selected_rung = 'motifs'  # Most compressed/efficient
        confidence = 0.85
        reasoning = f"Low inter-probe difference ({inter_probe_diff:.3f}) → all rungs similar → use motifs (most efficient)"
    
    # Validate against actual best rung
    matches = "✓" if selected_rung == best_rung else "✗"
    
    heuristic_results.append({
        'task': task,
        'task_type': task_type,
        'selected_rung': selected_rung,
        'actual_best_rung': best_rung,
        'matches': matches,
        'confidence': confidence,
        'reasoning': reasoning
    })

df_heuristic = pd.DataFrame(heuristic_results)
print("\nHeuristic Selection vs Actual Best Rung:")
print(df_heuristic.to_string(index=False))

# Calculate accuracy
correct = sum(1 for r in heuristic_results if r['matches'] == '✓')
total = len(heuristic_results)
accuracy = correct / total if total > 0 else 0.0

print(f"\n✓ Validation: {correct} correct, {total - correct} incorrect")
print(f"  Accuracy: {accuracy:.1%}")

# Save results
heuristic_output_path = RESULTS_DIR / 'rung_selection_heuristic.json'
df_heuristic.to_json(heuristic_output_path, orient='records', indent=2)
print(f"\n✓ Results saved to {heuristic_output_path}")

## Research Implications

### Key Findings

1. **Task-Specific Rung Selection is Critical**
   - Classification tasks → Motifs excel (100% accuracy, most efficient)
   - Retrieval tasks → Semantic edits or functions (motifs too abstract)
   - Tokens perform poorly across all tasks

2. **Privacy-Utility Trade-off is Task-Dependent**
   - Motifs: High privacy, excellent classification, poor retrieval
   - Semantic edits: Lower privacy, good retrieval, good classification  
   - Functions: Balanced privacy-utility for both tasks

3. **Storage Efficiency Scales Dramatically**
   - Motifs are 4-13x more efficient while achieving better classification
   - Critical for large-scale deployment and foundation model training

4. **Abstraction Can Improve Performance**
   - Higher abstraction (motifs) captures structural patterns that generalize better
   - Suggests structural patterns matter more than raw content for classification

### Next Research Directions

1. **Why Motifs Excel at Classification**
   - Analyze which motifs are most predictive
   - Understand structural pattern generalization

2. **Why Tokens Perform Poorly**
   - Investigate if structured token representations improve performance
   - Compare with canonicalized vs raw tokens

3. **Cross-Task Generalization**
   - Test patterns across different classification tasks
   - Identify retrieval tasks where motifs might work better

4. **Privacy Quantification**
   - Measure actual k-anonymity values
   - Quantify re-identification risk

5. **Foundation Model Training**
   - Test if motifs can train better code models
   - Validate storage efficiency enables larger datasets
