# 01a: Distribution Profiles

**Goal**: Establish a baseline characterization of the oracle data.

**Key Questions**:
1. Does `n_states` follow a power law with depth? (fractal branching)
2. Does `v_unique` saturate early? (low intrinsic dimensionality)
3. Does `v_entropy` scale logarithmically or slower? (structure)
4. Is `v_std` predictable from depth? (exploitable for compression)

**Reference**: docs/analysis-draft.md Section 1.1

In [None]:
# === CONFIGURATION ===
# Machine-specific paths: edit these two values before running elsewhere.
# DATA_DIR is the directory passed to loading.find_shard_files() later on.
DATA_DIR = "/mnt/d/shards-standard/"
# PROJECT_ROOT is put on sys.path below; presumably the `forge` package
# lives under it (confirm against the repository layout).
PROJECT_ROOT = "/home/jason/v2/mk5-tailwind"

# === Setup imports ===
import sys
# Membership check keeps sys.path free of duplicates when the cell is re-run.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Project-local helpers; these imports need PROJECT_ROOT on sys.path.
from forge.analysis.utils import loading, features, compression, viz
from forge.oracle import schema

# Project plotting defaults (behaviour defined in forge.analysis.utils.viz).
viz.setup_notebook_style()
print("✓ Ready")

## 1. Load Sample Seeds

Load multiple seeds to get robust statistics.

In [None]:
# Discover every shard file under DATA_DIR and report how many there are
shard_files = loading.find_shard_files(DATA_DIR)
print(f"Total shards available: {len(shard_files)}")

# Working sample: the first 20 shards, or all of them when fewer exist.
# Slicing caps the length, so N_SEEDS equals min(20, number of shards).
sample_files = shard_files[:20]
N_SEEDS = len(sample_files)
print(f"Analyzing {N_SEEDS} shards")

In [None]:
# Compute per-depth statistics for each shard.
# Produces stats_df with one row per (shard, depth) and the columns:
#   seed, decl_id, depth, n_states, v_mean, v_std, v_unique, v_entropy


def _entropy_bits(counts):
    """Return the Shannon entropy (bits) of the distribution described by *counts*.

    *counts* must contain strictly positive occurrence counts, which is what
    ``np.unique(..., return_counts=True)`` yields. Because every probability
    is therefore > 0, no epsilon is added inside the logarithm; an epsilon
    would bias the result and make a single-valued distribution come out
    slightly negative instead of exactly zero.
    """
    probs = counts / counts.sum()
    # log2(1/p) instead of -log2(p): a lone value with p == 1 gives +0.0, not -0.0.
    return float(np.sum(probs * np.log2(1.0 / probs)))


# Limit to 10 seeds for faster analysis (adjust as needed).
# NOTE: this rebinds N_SEEDS and sample_files, superseding any values they
# held before this cell ran.
N_SEEDS = min(10, len(shard_files))
sample_files = shard_files[:N_SEEDS]
print(f"Analyzing {N_SEEDS} shards...")

records = []
for path in tqdm(sample_files, desc="Loading shards"):
    df, seed, decl_id = schema.load_file(path)
    V = df['V'].values
    depths = features.depth(df['state'].values)

    # One record per distinct depth value present in this shard
    for d in np.unique(depths):
        V_at_depth = V[depths == d]

        # Distinct V values and how often each occurs at this depth
        unique_vals, counts = np.unique(V_at_depth, return_counts=True)

        records.append({
            'seed': seed,
            'decl_id': decl_id,
            'depth': d,
            'n_states': len(V_at_depth),
            'v_mean': V_at_depth.mean(),
            'v_std': V_at_depth.std(),
            'v_unique': len(unique_vals),
            'v_entropy': _entropy_bits(counts),
        })

stats_df = pd.DataFrame(records)
print(f"✓ Built stats_df: {len(stats_df)} rows ({len(sample_files)} shards × ~28 depths)")
print(f"Total states: {stats_df['n_states'].sum():,}")

## 2. State Count Scaling

Does `n_states` follow a power law with depth?

In [None]:
# Collapse stats_df to one row per depth.
# n_states gets mean / std / row count; every V statistic is averaged.
depth_avg = (
    stats_df.groupby('depth')
    .agg(
        n_states_mean=('n_states', 'mean'),
        n_states_std=('n_states', 'std'),
        n_seeds=('n_states', 'count'),
        v_mean=('v_mean', 'mean'),
        v_std=('v_std', 'mean'),
        v_unique=('v_unique', 'mean'),
        v_entropy=('v_entropy', 'mean'),
    )
    .reset_index()
)

print("Average statistics by depth:")
report_columns = ['depth', 'n_states_mean', 'v_unique', 'v_entropy']
print(depth_avg[report_columns].to_string())

In [None]:
# State count vs depth: semi-log view (left) and log-log view for the
# power-law check (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: linear depth axis, logarithmic count axis.
# The title says "Semi-Log" so that it matches the log-scaled y-axis set below.
axes[0].scatter(depth_avg['depth'], depth_avg['n_states_mean'], alpha=0.7)
axes[0].set_xlabel('Depth (dominoes remaining)')
axes[0].set_ylabel('Average states')
axes[0].set_title('State Count vs Depth (Semi-Log)')
axes[0].set_yscale('log')

# Right panel: both axes logarithmic. Rows whose depth or count is not
# strictly positive have no finite logarithm, so they are dropped first.
valid = depth_avg[(depth_avg['depth'] > 0) & (depth_avg['n_states_mean'] > 0)]
viz.plot_log_log(
    valid['depth'].values.astype(float),
    valid['n_states_mean'].values,
    ax=axes[1],
    title='State Count vs Depth (Log-Log)',
    xlabel='Depth',
    ylabel='Average states',
)

plt.tight_layout()
plt.show()

## 3. V Unique Values

Does `v_unique` saturate? (Low intrinsic dimensionality)

In [None]:
# Unique V values per depth: absolute count (left) and share of 85 (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_count, ax_share = axes
unique_by_depth = depth_avg['v_unique']

# Left: mean number of distinct V values, with a reference line at 85
ax_count.plot(depth_avg['depth'], unique_by_depth, 'o-', markersize=6)
ax_count.axhline(y=85, color='red', linestyle='--', alpha=0.5, label='Max possible (85)')
ax_count.set(
    xlabel='Depth',
    ylabel='Unique V values',
    title='V Unique Values vs Depth',
)
ax_count.legend()

# Right: the same series divided by 85
ax_share.plot(depth_avg['depth'], unique_by_depth / 85, 'o-', markersize=6)
ax_share.set(
    xlabel='Depth',
    ylabel='Fraction of possible values used',
    title='V Coverage vs Depth',
    ylim=(0, 1.1),
)

plt.tight_layout()
plt.show()

# Saturation check: largest per-depth average relative to 85
max_unique = unique_by_depth.max()
print(f"Maximum unique V values: {max_unique:.0f} / 85 ({100*max_unique/85:.1f}%)")

## 4. V Entropy Scaling

Does entropy scale logarithmically or slower?

In [None]:
# V entropy per depth (left) and entropy against log10 state count (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_depth, ax_count = axes

# Left: H(V) by depth, with log2(85) drawn as the reference ceiling
ax_depth.plot(depth_avg['depth'], depth_avg['v_entropy'], 'o-', markersize=6)
max_entropy = np.log2(85)
ax_depth.axhline(y=max_entropy, color='red', linestyle='--', alpha=0.5, label=f'Max ({max_entropy:.2f} bits)')
ax_depth.set(xlabel='Depth', ylabel='H(V) (bits)', title='V Entropy vs Depth')
ax_depth.legend()

# Right: keep only depths with a positive state count and positive entropy
has_states = depth_avg['n_states_mean'] > 0
has_entropy = depth_avg['v_entropy'] > 0
valid = depth_avg[has_states & has_entropy]
ax_count.scatter(np.log10(valid['n_states_mean']), valid['v_entropy'], alpha=0.7)
ax_count.set(xlabel='log10(n_states)', ylabel='H(V) (bits)', title='Entropy vs State Count')

plt.tight_layout()
plt.show()

# Summary: mean entropy within each seed, then averaged over seeds
entropy_per_seed = stats_df.groupby('seed')['v_entropy'].mean()
overall_entropy = entropy_per_seed.mean()
print(f"Average H(V) across seeds: {overall_entropy:.3f} bits")
print(f"Efficiency vs max: {100*overall_entropy/max_entropy:.1f}%")

## 5. V Standard Deviation

Is `v_std` predictable from depth?

In [None]:
# V spread (left) and V centre (right) as functions of depth
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_std, ax_mean = axes
depth_values = depth_avg['depth']

# Left: per-depth average of the V standard deviation
ax_std.plot(depth_values, depth_avg['v_std'], 'o-', markersize=6)
ax_std.set(xlabel='Depth', ylabel='V std', title='V Standard Deviation vs Depth')

# Right: per-depth average of the V mean, with a zero reference line
# (should be near 0 for symmetric game)
ax_mean.plot(depth_values, depth_avg['v_mean'], 'o-', markersize=6)
ax_mean.axhline(y=0, color='red', linestyle='--', alpha=0.5)
ax_mean.set(xlabel='Depth', ylabel='V mean', title='V Mean vs Depth')

plt.tight_layout()
plt.show()

# Pearson correlation between depth and v_std (off-diagonal matrix entry)
corr_matrix = np.corrcoef(depth_values, depth_avg['v_std'])
corr = corr_matrix[0, 1]
print(f"Correlation(depth, v_std): {corr:.4f}")

## 6. Per-Seed Variation

How much do distributions vary across seeds?

In [None]:
# Plot the V distribution of up to six sample shards on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Never take more shards than there are panels
shown_files = sample_files[:len(axes)]
for ax, path in zip(axes, shown_files):
    df, seed, decl_id = schema.load_file(path)
    V = df['V'].values
    # Fall back to the raw id when the declaration has no registered name,
    # so an unknown id labels the panel instead of raising KeyError.
    decl_name = schema.DECL_NAMES.get(decl_id, str(decl_id))
    viz.plot_v_distribution(
        V, ax=ax,
        title=f'Seed {seed}, {decl_name}',
    )

# With fewer shards than panels, hide the leftover empty axes
for ax in axes[len(shown_files):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# One row per seed: total state count plus depth-averaged V mean and entropy
seed_stats = (
    stats_df.groupby('seed')
    .agg(
        n_states=('n_states', 'sum'),
        v_mean=('v_mean', 'mean'),
        v_entropy=('v_entropy', 'mean'),
    )
    .reset_index()
)

print("Per-seed summary:")
print(seed_stats.describe())

# Coefficient of variation = std / mean of the per-seed entropies
seed_entropy = seed_stats['v_entropy']
entropy_cv = seed_entropy.std() / seed_entropy.mean()
print(f"\nEntropy coefficient of variation: {entropy_cv:.4f}")

## 7. Declaration Type Analysis

Does distribution vary by declaration type?

In [None]:
# One row per declaration id: total states and averaged V statistics
decl_stats = (
    stats_df.groupby('decl_id')
    .agg(
        n_states=('n_states', 'sum'),
        v_mean=('v_mean', 'mean'),
        v_std=('v_std', 'mean'),
        v_entropy=('v_entropy', 'mean'),
    )
    .reset_index()
)

# Human-readable label; ids missing from DECL_NAMES are shown as their string form
decl_stats['decl_name'] = [
    schema.DECL_NAMES.get(decl, str(decl)) for decl in decl_stats['decl_id']
]
print("By declaration type:")
display_columns = ['decl_name', 'n_states', 'v_mean', 'v_entropy']
print(decl_stats[display_columns])

## Summary

Key findings from distribution profiles:

In [None]:
# Headline numbers, all taken over the rows of stats_df / depth_avg
total_states = stats_df['n_states'].sum()
avg_entropy = stats_df['v_entropy'].mean()
max_unique = stats_df['v_unique'].max()

# Depth whose average state count is largest
peak_row = depth_avg['n_states_mean'].idxmax()
peak_depth = depth_avg.loc[peak_row, 'depth']

# Insertion order below is the row order of the rendered table
summary = {}
summary['Total states analyzed'] = f"{total_states:,}"
summary['Seeds analyzed'] = N_SEEDS
summary['Average H(V)'] = f"{avg_entropy:.3f} bits"
summary['Max unique V values'] = f"{max_unique} / 85"
summary['Peak state count depth'] = int(peak_depth)
summary['V mean (overall)'] = f"{stats_df['v_mean'].mean():.3f}"
summary['V std (overall)'] = f"{stats_df['v_std'].mean():.3f}"

print(viz.create_summary_table(summary, "Distribution Profile Summary"))

In [None]:
# Save the per-depth statistics table as CSV
from pathlib import Path

# The path is relative to the working directory the notebook runs in.
output_path = Path('../../results/tables/01a_depth_stats.csv')
# DataFrame.to_csv does not create missing folders, so create the target
# directory first; exist_ok keeps re-runs from failing.
output_path.parent.mkdir(parents=True, exist_ok=True)
stats_df.to_csv(output_path, index=False)
print("Results saved to results/tables/01a_depth_stats.csv")