# 02b: Kolmogorov Compression Analysis

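**Goal**: Estimate the algorithmic (Kolmogorov) complexity of V via compression. Kolmogorov complexity itself is uncomputable, so the compressed size serves as a practical upper bound.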

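**Key Questions**:
1. Does depth ordering compress much better than the other orderings? If so, V has depth-coherent structure.
2. Do all orderings compress similarly? If so, the structure is global rather than local.
3. How small is the absolute ratio (compressed size / original size)? Around 0.1 means very structured; around 0.9 means near random.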

**Reference**: docs/analysis-draft.md Section 3.2

In [None]:
# === CONFIGURATION ===
# Machine-specific locations: the shard directory to analyse and the project
# checkout that is put on the import path further down.
DATA_DIR = "/mnt/d/shards-standard/"
PROJECT_ROOT = "/home/jason/v2/mk5-tailwind"

# === Setup imports ===
import sys

# Prepend the checkout only when it is absent, so re-running this cell does
# not keep growing sys.path with duplicate entries.
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from forge.analysis.utils import loading, features, compression, viz
from forge.oracle import schema

# Project helper; presumably applies the shared plotting style for this
# notebook series (defined in forge.analysis.utils.viz — confirm there).
viz.setup_notebook_style()
# Status marker is U+2713 CHECK MARK.
print("✓ Ready")

## 1. Single Seed Analysis

Analyze compression for one seed in detail.

In [None]:
# Load first shard
shard_files = loading.find_shard_files(DATA_DIR)
# Fail with an actionable message instead of a bare IndexError on
# shard_files[0] when the data directory is empty, missing, or not mounted.
if len(shard_files) == 0:
    raise FileNotFoundError(f"No shard files found under {DATA_DIR!r}")

df, seed, decl_id = schema.load_file(shard_files[0])

# Per-state columns of the shard. V is narrowed to one byte per state, so
# V.nbytes below equals the number of states.
# NOTE(review): astype(np.int8) wraps silently if V can fall outside
# [-128, 127] — confirm the value range against the shard schema.
states = df['state'].values
V = df['V'].values.astype(np.int8)

print(f"Seed: {seed}, Declaration: {schema.DECL_NAMES[decl_id]}")
print(f"States: {len(states):,}")
print(f"V size: {V.nbytes:,} bytes")

In [None]:
# Compression ratio of V under each ordering produced by the project helper
# (a mapping of ordering name -> ratio, as the loop below relies on .items()).
comp_results = compression.compression_analysis(states, V)

print("Compression ratios by ordering:")
for ordering, value in comp_results.items():
    # Two-space indent, four decimal places per ordering.
    print("  {}: {:.4f}".format(ordering, value))

In [None]:
# Visualize
from pathlib import Path

fig, ax = plt.subplots(figsize=(8, 5))
viz.plot_compression_comparison(comp_results, ax=ax, title=f"Compression by Ordering (seed={seed})")
plt.tight_layout()

# savefig does not create missing directories, so make sure the target
# folder exists before writing (a fresh checkout may not have it yet).
single_fig_path = Path('../../results/figures/02b_compression_single.png')
single_fig_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(single_fig_path, dpi=150, bbox_inches='tight')
plt.show()

## 2. Multi-Seed Comparison

Is the pattern consistent across seeds?

In [None]:
# Analyze multiple seeds (limit to 10 for faster analysis)
N_SEEDS = min(10, len(shard_files))

multi_results = []
for path in tqdm(shard_files[:N_SEEDS], desc="Analyzing seeds"):
    df, seed, decl_id = schema.load_file(path)
    states, V = df['state'].values, df['V'].values.astype(np.int8)

    # Per-ordering ratios for this shard, tagged with the shard's identity so
    # each result becomes one self-describing row of the table built below.
    comp = compression.compression_analysis(states, V)
    for key, value in (('seed', seed), ('decl_id', decl_id), ('n_states', len(states))):
        comp[key] = value
    multi_results.append(comp)

# One row per analysed shard.
comp_df = pd.DataFrame(multi_results)
print(f"Analyzed {len(comp_df)} seeds")

In [None]:
# Summary statistics
# Descriptive statistics across seeds for the three ordering columns.
print("\nCompression ratio statistics:")
ordering_columns = ['depth_order', 'state_order', 'random_order']
ordering_stats = comp_df[ordering_columns].describe()
print(ordering_stats)

In [None]:
# Plot comparison across seeds
from pathlib import Path

import seaborn as sns

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot comparison: reshape to long form (one row per seed/ordering pair)
# so seaborn can group the ratios by ordering.
comp_melted = comp_df.melt(
    id_vars=['seed', 'decl_id'],
    value_vars=['depth_order', 'state_order', 'random_order'],
    var_name='ordering',
    value_name='ratio'
)
sns.boxplot(data=comp_melted, x='ordering', y='ratio', ax=axes[0])
axes[0].set_ylabel('Compression Ratio')
axes[0].set_title('Compression by Ordering (all seeds)')

# Scatter: depth vs random
axes[1].scatter(comp_df['random_order'], comp_df['depth_order'], alpha=0.6)
# Draw the y=x reference across the observed ratios rather than a fixed
# [0.1, 0.6] window, so it stays meaningful whatever the data range is.
diag_lo = min(comp_df['random_order'].min(), comp_df['depth_order'].min())
diag_hi = max(comp_df['random_order'].max(), comp_df['depth_order'].max())
axes[1].plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'r--', alpha=0.5, label='y=x')
axes[1].set_xlabel('Random Order Ratio')
axes[1].set_ylabel('Depth Order Ratio')
axes[1].set_title('Depth vs Random Ordering')
axes[1].legend()

plt.tight_layout()
# savefig does not create missing directories; ensure the folder exists.
multi_fig_path = Path('../../results/figures/02b_compression_multi.png')
multi_fig_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(multi_fig_path, dpi=150, bbox_inches='tight')
plt.show()

## 3. Per-Depth Compression

Does compressibility vary with game depth?

In [None]:
# Analyze compression per depth
df, seed, decl_id = schema.load_file(shard_files[0])
states = df['state'].values
V = df['V'].values.astype(np.int8)
depths = features.depth(states)

# One record per depth level from 1 up to the deepest level present; levels
# with fewer than 100 states are left out as too sparse.
depth_compression = []
max_depth = int(depths.max())
for d in tqdm(range(1, max_depth + 1), desc="Analyzing depths"):
    mask = depths == d
    n_at_depth = mask.sum()
    if n_at_depth >= 100:
        v_d = V[mask]
        depth_compression.append({
            'depth': d,
            'n_states': n_at_depth,
            'lzma_ratio': compression.lzma_ratio(v_d.tobytes()),
            'entropy': compression.entropy_bits(v_d),
            'v_unique': len(np.unique(v_d)),
        })

depth_comp_df = pd.DataFrame(depth_compression)

In [None]:
# Three panels from the per-depth table: ratio vs depth, entropy vs depth,
# and ratio vs entropy.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
ax_lzma, ax_entropy, ax_scatter = axes

depth_col = depth_comp_df['depth']
entropy_col = depth_comp_df['entropy']
lzma_col = depth_comp_df['lzma_ratio']

# LZMA ratio vs depth
ax_lzma.plot(depth_col, lzma_col, 'o-')
ax_lzma.set(xlabel='Depth', ylabel='LZMA Ratio', title='Compressibility vs Depth')

# Entropy vs depth
ax_entropy.plot(depth_col, entropy_col, 'o-', color='orange')
ax_entropy.set(xlabel='Depth', ylabel='H(V) (bits)', title='Entropy vs Depth')

# LZMA vs entropy (should correlate)
ax_scatter.scatter(entropy_col, lzma_col, alpha=0.6)
ax_scatter.set(xlabel='Entropy (bits)', ylabel='LZMA Ratio', title='Compression vs Entropy')

# Annotate the scatter with the Pearson correlation (off-diagonal entry of
# the 2x2 correlation matrix), anchored in axes coordinates at the top left.
corr = np.corrcoef(entropy_col, lzma_col)[0, 1]
ax_scatter.text(
    0.05, 0.95, f'r = {corr:.3f}',
    transform=ax_scatter.transAxes,
    verticalalignment='top',
    fontsize=12,
)

plt.tight_layout()
plt.show()

## 4. Alternative Compression Presets

Does stronger compression reveal more structure?

In [None]:
# Compare LZMA presets
import gzip

df, seed, decl_id = schema.load_file(shard_files[0])
V = df['V'].values.astype(np.int8)

# Serialize V once: every compressor below sees the same bytes, and
# tobytes() copies the whole array on each call.
v_bytes = V.tobytes()

# Ratio per LZMA preset, keyed as 'preset_<n>'.
preset_results = {}
for preset in [0, 3, 6, 9]:
    ratio = compression.lzma_ratio(v_bytes, preset=preset)
    preset_results[f'preset_{preset}'] = ratio
    print(f"Preset {preset}: {ratio:.4f}")

# Also test gzip for comparison (ratio = compressed size / original size)
gzip_compressed = gzip.compress(v_bytes, compresslevel=9)
gzip_ratio = len(gzip_compressed) / len(v_bytes)
print(f"gzip-9: {gzip_ratio:.4f}")

## 5. Q-Value Compression

How compressible are Q-values compared to V?

In [None]:
# Load Q-values
df, seed, decl_id = schema.load_file(shard_files[0])
V = df['V'].values.astype(np.int8)
# The seven Q columns q0..q6, taken as one (n_states, 7) int8 array.
q_columns = [f'q{i}' for i in range(7)]
q_values = df[q_columns].values.astype(np.int8)

# Compare compression
v_ratio = compression.lzma_ratio(V.tobytes())
q_ratio = compression.lzma_ratio(q_values.tobytes())

for label, value in (("V compression ratio", v_ratio), ("Q compression ratio", q_ratio)):
    print(f"{label}: {value:.4f}")
print(f"Q/V ratio: {q_ratio/v_ratio:.2f}")

In [None]:
# Per-column Q compression
# Compress each of the seven Q columns on its own, keyed 'q0'..'q6'.
q_col_ratios = {
    f'q{i}': compression.lzma_ratio(q_values[:, i].astype(np.int8).tobytes())
    for i in range(7)
}

print("\nPer-column Q compression:")
for col, ratio in q_col_ratios.items():
    print("  {}: {:.4f}".format(col, ratio))

## 6. Delta Encoding

Does encoding V as differences improve compression?

In [None]:
# Delta encoding (differences between adjacent values)
# Uses df and V as left by the Q-value cell above.
depths = features.depth(df['state'].values)
# Depth has many ties, and the default argsort algorithm is not stable, so
# the within-depth order (and therefore both ratios below) would depend on
# the sort implementation. A stable sort keeps the shard's original order
# inside each depth level, which makes the result well-defined.
order = np.argsort(depths, kind='stable')
V_ordered = V[order]

# Compute deltas
# NOTE(review): np.diff keeps V's int8 dtype, so a difference outside
# [-128, 127] would wrap around — confirm V's range keeps deltas in bounds.
V_delta = np.diff(V_ordered).astype(np.int8)

# Compare compression
v_ordered_ratio = compression.lzma_ratio(V_ordered.tobytes())
v_delta_ratio = compression.lzma_ratio(V_delta.tobytes())

print(f"V ordered ratio: {v_ordered_ratio:.4f}")
print(f"V delta ratio: {v_delta_ratio:.4f}")
# Positive means the delta stream compresses better than the raw ordered stream.
print(f"Improvement from delta: {100*(v_ordered_ratio - v_delta_ratio)/v_ordered_ratio:.1f}%")

In [None]:
# Delta distribution
# Histogram of adjacent differences of V in depth order, with a dashed
# marker at zero for reference.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(V_delta, bins=100, alpha=0.7, edgecolor='black')
ax.axvline(x=0, color='red', linestyle='--', alpha=0.5)
ax.set(
    xlabel='V delta (V[i+1] - V[i]) in depth order',
    ylabel='Count',
    title='Distribution of V Deltas',
)
plt.show()

# Share of adjacent pairs whose value does not change.
zero_share = 100*(V_delta == 0).mean()
print("Delta mean: {:.3f}".format(V_delta.mean()))
print("Delta std: {:.3f}".format(V_delta.std()))
print("% zero deltas: {:.1f}%".format(zero_share))

## Summary

In [None]:
# Overall assessment
# Mean ratios across the analysed seeds, and the relative gain of depth
# ordering over random ordering in percent.
avg_depth = comp_df['depth_order'].mean()
avg_random = comp_df['random_order'].mean()
improvement = 100 * (avg_random - avg_depth) / avg_random

# Relative gain of delta encoding over the plain depth-ordered stream.
delta_benefit = 100*(v_ordered_ratio - v_delta_ratio)/v_ordered_ratio

summary = {
    'Seeds analyzed': len(comp_df),
    'Avg depth-order ratio': format(avg_depth, '.4f'),
    'Avg random-order ratio': format(avg_random, '.4f'),
    'Improvement from ordering': format(improvement, '.1f') + '%',
    'V ratio (seed 0)': format(v_ratio, '.4f'),
    'Q ratio (seed 0)': format(q_ratio, '.4f'),
    'Delta encoding benefit': format(delta_benefit, '.1f') + '%',
}

summary_table = viz.create_summary_table(summary, "Compression Analysis Summary")
print(summary_table)

In [None]:
# Interpretation
# (exclusive upper bound on the mean depth-order ratio, verdict) pairs in
# ascending order; the first band the ratio falls under wins, and anything
# at or above the last bound is treated as near random.
verdict_bands = (
    (0.3, "HIGHLY STRUCTURED - V has strong depth-coherent patterns"),
    (0.5, "MODERATELY STRUCTURED - V has exploitable patterns"),
    (0.7, "WEAKLY STRUCTURED - Some patterns exist"),
)
interpretation = next(
    (text for bound, text in verdict_bands if avg_depth < bound),
    "NEAR RANDOM - Little compressible structure",
)

print(f"\nInterpretation: {interpretation}")

In [None]:
# Save results
from pathlib import Path

# to_csv does not create missing directories; without this, a fresh
# checkout would fail here only after the whole analysis has already run.
tables_dir = Path('../../results/tables')
tables_dir.mkdir(parents=True, exist_ok=True)

# Multi-seed ratios and the per-depth table, both without the index column.
comp_df.to_csv(tables_dir / '02b_compression.csv', index=False)
depth_comp_df.to_csv(tables_dir / '02b_compression_by_depth.csv', index=False)
print("Results saved")