# 02a: Entropy Decomposition

**Goal**: Understand which features reduce uncertainty about V.

**Key Questions**:
1. Which features eliminate the most uncertainty?
2. Does any feature set drive H(V|features) -> 0? (If so, V is a deterministic function of those features.)
3. Diminishing-returns curve: how many features are needed to capture 90%, 99%, 99.9% of H(V)?

**Reference**: docs/analysis-draft.md Section 3.1

In [None]:
# === CONFIGURATION ===
# NOTE(review): both values are machine-specific absolute paths; edit them
# before running this notebook on another machine.
# DATA_DIR is the directory handed to loading.find_shard_files() further down.
DATA_DIR = "/mnt/d/shards-standard/"
# PROJECT_ROOT is prepended to sys.path below (presumably where the `forge`
# package lives -- confirm).
PROJECT_ROOT = "/home/jason/v2/mk5-tailwind"

# === Setup imports ===
import sys
# The path tweak has to happen before the `forge` imports; the membership
# check keeps re-running this cell from adding duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from forge.analysis.utils import loading, features, compression, viz
from forge.oracle import schema

# Apply the project's notebook plotting style, then confirm setup finished.
viz.setup_notebook_style()
print("✓ Ready")

## 1. Load Sample Data

In [None]:
# Discover the available shards and keep only the first few of them:
# several seeds make the analysis more robust, but memory limits how many
# can be held at once.
shard_files = loading.find_shard_files(DATA_DIR)
if len(shard_files) == 0:
    # Fail here with an actionable message rather than letting pd.concat
    # raise "No objects to concatenate" two cells further down.
    raise FileNotFoundError(f"No shard files found under {DATA_DIR!r}")
N_SEEDS = min(5, len(shard_files))  # Use fewer seeds due to memory
sample_files = shard_files[:N_SEEDS]
print(f"Analyzing {N_SEEDS} shards")

In [None]:
# Read every selected shard, down-sampling the large ones so the combined
# frame stays manageable (a single shard can hold millions of states).

SAMPLE_PER_SHARD = 50_000  # 50k samples per shard = 250k total for 5 seeds
print(f"Loading {N_SEEDS} shards ({SAMPLE_PER_SHARD:,} samples each)...")

dfs = []
for path in tqdm(sample_files, desc="Loading shards"):
    df, seed, decl_id = schema.load_file(path)

    # Small shards are kept whole; large ones are sampled, with the shard's
    # own seed driving the draw so reruns pick the same rows.
    needs_sampling = len(df) > SAMPLE_PER_SHARD
    df = df.sample(n=SAMPLE_PER_SHARD, random_state=seed) if needs_sampling else df

    # Tag each row with the shard it came from
    df['seed'], df['decl_id'] = seed, decl_id
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)
n_sampled = len(combined_df)
print(f"✓ Built combined_df: {n_sampled:,} sampled states from {N_SEEDS} shards")

## 2. Extract Features

In [None]:
# Pull the raw columns out of the sampled frame as plain arrays
V = combined_df['V'].values
states = combined_df['state'].values

# Cheap per-state features, each derived from the state array alone
depth_vals = features.depth(states)
player_vals = features.player(states)
balance_vals = features.hand_balance(states)
team_vals = features.team(states).astype(int)  # stored as plain ints

# Leader and trick length come back together from a single call
leader_vals, trick_len_vals = features.trick_info(states)

n_states = len(states)
print(f"Features extracted for {n_states:,} states")

In [None]:
# Count-domino features are computed one seed at a time (slower than the
# basic features) and scattered back into full-length arrays through a
# per-seed boolean mask.
counts_remaining_vals = np.zeros(len(states), dtype=np.int32)
team0_counts_vals = np.zeros_like(counts_remaining_vals)
team1_counts_vals = np.zeros_like(counts_remaining_vals)

seed_column = combined_df['seed'].values
for seed in tqdm(combined_df['seed'].unique(), desc="Computing count features"):
    mask = seed_column == seed
    seed_states = states[mask]
    seed_int = int(seed)  # the helpers get a Python int, not a numpy int64
    counts_remaining_vals[mask] = features.counts_remaining(seed_states, seed_int)
    team0_counts_vals[mask], team1_counts_vals[mask] = features.counts_by_team(seed_states, seed_int)

print("Count features computed")

## 3. Baseline Entropy

In [None]:
# Baseline: entropy of V with no side information, set against the ceiling
# that would be reached if all values were equally likely.
max_H = np.log2(85)  # 85 possible values
H_V = compression.entropy_bits(V)
efficiency_pct = 100*H_V/max_H

print(f"H(V) = {H_V:.4f} bits")
print(f"Max possible H = {max_H:.4f} bits")
print(f"Efficiency: {efficiency_pct:.1f}%")

## 4. Conditional Entropy Analysis

Compute H(V|feature) for each feature.

In [None]:
# Gather every candidate feature under a readable name; insertion order is
# the order in which the features are handed to the ranking helper.
feature_dict = dict(
    depth=depth_vals,
    team=team_vals,
    player=player_vals,
    leader=leader_vals,
    trick_len=trick_len_vals,
    hand_balance=balance_vals,
    counts_remaining=counts_remaining_vals,
    team0_counts=team0_counts_vals,
    team1_counts=team1_counts_vals,
    seed=combined_df['seed'].values,
    decl_id=combined_df['decl_id'].values,
)

# One (name, I(V;F), H(V|F)) row per feature
info_results = compression.information_gain_ranking(V, feature_dict)

print("\nInformation gain ranking:")
print(f"{'Feature':<20} {'I(V;F)':<10} {'H(V|F)':<10} {'Reduction':<10}")
print("-" * 50)
for name, mi, h_cond in info_results:
    # Last column: the gain expressed as a share of the baseline entropy
    print(f"{name:<20} {mi:<10.4f} {h_cond:<10.4f} {100 * mi / H_V:<10.1f}%")

In [None]:
# Draw the per-feature ranking and save the figure next to the other results
fig, ax = plt.subplots(figsize=(10, 8))
viz.plot_entropy_curve(info_results, ax=ax, title="Information Gain by Feature")
fig.tight_layout()
fig.savefig('../../results/figures/02a_info_gain.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Cumulative Information Gain

How much entropy reduction from combining features?

In [None]:
# Greedy forward selection: grow the conditioning key one feature at a time.
# Every candidate needs a full conditional-entropy pass, so this is
# expensive -- keep max_features small.

def greedy_feature_selection(V, feature_dict, max_features=5):
    """Greedily select features to minimize conditional entropy.

    Starting from no features, each round tries every unused feature, joins
    it onto the key built so far, and keeps the one giving the lowest
    H(V | key). The search ends after ``max_features`` rounds, when the
    features run out, or as soon as no remaining feature lowers the entropy
    any further.

    Args:
        V: 1-D array of target values.
        feature_dict: Mapping of feature name -> 1-D array aligned with ``V``.
        max_features: Upper bound on the number of features to select.

    Returns:
        ``(selected, history)``. ``selected`` lists the chosen feature names
        in selection order. ``history`` holds
        ``(label, H(V|features), reduction %)`` tuples, starting with the
        ``'(none)'`` baseline row.
    """
    # Derive the baseline from the V that was passed in rather than from a
    # notebook-level global, so the result is right for any input.
    baseline = compression.entropy_bits(V)

    # A list (not a set) keeps candidate order equal to feature_dict order,
    # which makes tie-breaking reproducible across kernel restarts.
    remaining = list(feature_dict)
    selected = []
    history = [('(none)', baseline, 0)]

    current_key = None    # String key combining every feature selected so far
    current_h = baseline  # H(V | current_key)

    for _ in range(max_features):
        best_name = None
        best_combined = None
        # A candidate must beat the entropy already achieved, not merely the
        # unconditional baseline; otherwise zero-gain features get selected.
        best_h_cond = current_h

        for name in remaining:
            candidate = np.asarray(feature_dict[name]).astype(str)
            if current_key is None:
                combined = candidate
            else:
                # np.char.add handles string arrays on every NumPy release,
                # unlike `'_' + array`, which NumPy 1.x rejects.
                combined = np.char.add(np.char.add(current_key, '_'), candidate)

            h_cond = compression.conditional_entropy(V, combined)
            if h_cond < best_h_cond:
                best_h_cond = h_cond
                best_name = name
                best_combined = combined

        if best_name is None:
            # Nothing left that reduces uncertainty any further.
            break

        selected.append(best_name)
        remaining.remove(best_name)
        current_key = best_combined
        current_h = best_h_cond
        # baseline > best_h_cond >= 0 here, so the division is safe.
        reduction = 100 * (baseline - best_h_cond) / baseline
        history.append(('+' + best_name, best_h_cond, reduction))

        print(f"Selected: {best_name}, H(V|features) = {best_h_cond:.4f}, reduction = {reduction:.1f}%")

    return selected, history

# Kick off the greedy search, allowing up to six features in the combined key
print(f"\nGreedy feature selection (starting H(V) = {H_V:.4f}):\n")
selected_features, selection_history = greedy_feature_selection(
    V,
    feature_dict,
    max_features=6,
)

In [None]:
# Two views of the greedy selection path: the entropy still unexplained after
# each step, and the cumulative share of H(V) removed.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
entropy_ax, gain_ax = axes

# Split the (label, entropy, reduction) rows into parallel lists
steps, h_conds, reductions = (list(column) for column in zip(*selection_history))
positions = range(len(steps))

# Conditional entropy curve, with a reference line at zero
entropy_ax.plot(positions, h_conds, 'o-', markersize=8)
entropy_ax.set_xticks(positions)
entropy_ax.set_xticklabels(steps, rotation=45, ha='right')
entropy_ax.set_ylabel('H(V|features) (bits)')
entropy_ax.set_title('Conditional Entropy vs Features Added')
entropy_ax.axhline(y=0, color='red', linestyle='--', alpha=0.5)

# Cumulative reduction, with the 90% / 99% targets marked
gain_ax.plot(positions, reductions, 'o-', markersize=8, color='green')
gain_ax.set_xticks(positions)
gain_ax.set_xticklabels(steps, rotation=45, ha='right')
gain_ax.set_ylabel('% Entropy Reduction')
gain_ax.set_title('Cumulative Information Gain')
for level, colour, label in ((90, 'orange', '90%'), (99, 'red', '99%')):
    gain_ax.axhline(y=level, color=colour, linestyle='--', alpha=0.5, label=label)
gain_ax.legend()

plt.tight_layout()
plt.savefig('../../results/figures/02a_cumulative_info.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Feature Interaction Analysis

Are there synergies between features?

In [None]:
# Pairwise information gain matrix
top_features = ['depth', 'seed', 'team', 'counts_remaining', 'leader']
n_top = len(top_features)

# String form of every feature, converted once and reused for each pair
feature_strs = {f: np.asarray(feature_dict[f]).astype(str) for f in top_features}

# Single feature gains: I(V; F) = H(V) - H(V | F)
single_gains = {
    f: H_V - compression.conditional_entropy(V, feature_dict[f])
    for f in top_features
}

# Pairwise gains: only the upper triangle is computed, then mirrored, since
# conditioning on (F1, F2) and on (F2, F1) splits the data identically.
pair_gains = np.zeros((n_top, n_top))
for i, f1 in enumerate(top_features):
    for j in range(i, n_top):
        f2 = top_features[j]
        # np.char.add handles string arrays on every NumPy release, unlike
        # `'_' + array`, which NumPy 1.x rejects.
        combined = np.char.add(np.char.add(feature_strs[f1], '_'), feature_strs[f2])
        pair_gains[i, j] = H_V - compression.conditional_entropy(V, combined)
        pair_gains[j, i] = pair_gains[i, j]

# Compute synergy: pair_gain - max(single_gain_1, single_gain_2)
synergy = np.zeros_like(pair_gains)
for i, f1 in enumerate(top_features):
    for j, f2 in enumerate(top_features):
        expected = max(single_gains[f1], single_gains[f2])
        synergy[i, j] = pair_gains[i, j] - expected

In [None]:
import seaborn as sns

# Side-by-side heatmaps: raw pairwise gain on the left, synergy on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
tick_labels = dict(xticklabels=top_features, yticklabels=top_features)

# Pairwise information gain heatmap
sns.heatmap(pair_gains, annot=True, fmt='.3f', ax=axes[0], cmap='YlOrRd', **tick_labels)
axes[0].set_title('Pairwise Information Gain I(V; F1,F2)')

# Synergy heatmap, on a diverging palette centred at zero
sns.heatmap(synergy, annot=True, fmt='.3f', ax=axes[1], cmap='RdBu_r', center=0, **tick_labels)
axes[1].set_title('Feature Synergy (excess over max single)')

plt.tight_layout()
plt.show()

## 7. Sufficient Statistics Analysis

Can we find a small set that captures most information?

In [None]:
# What if we have (seed, depth, team)? This should capture most info
# since seed determines the deal and depth+team determine game state

key_features = ['seed', 'depth', 'team']

# Build one string key per state by joining the features with '_'. The first
# component is taken from key_features itself so that editing the list can
# never leave the key out of sync with the printed label.
combined_key = np.asarray(feature_dict[key_features[0]]).astype(str)
for f in key_features[1:]:
    # np.char.add handles string arrays on every NumPy release, unlike
    # `'_' + array`, which NumPy 1.x rejects.
    part = np.asarray(feature_dict[f]).astype(str)
    combined_key = np.char.add(np.char.add(combined_key, '_'), part)

h_cond_key = compression.conditional_entropy(V, combined_key)
reduction_key = 100 * (H_V - h_cond_key) / H_V

print(f"H(V | {', '.join(key_features)}) = {h_cond_key:.4f} bits")
print(f"Reduction: {reduction_key:.1f}%")
print(f"Remaining uncertainty: {h_cond_key:.4f} bits ({100*h_cond_key/H_V:.1f}% of original)")

In [None]:
# Add more features to key
extended_keys = ['seed', 'depth', 'team', 'leader', 'trick_len']

# Same '_'-joined string key as before, now over five features. The first
# component comes from extended_keys itself so the key always matches the
# list that is printed below.
combined_ext = np.asarray(feature_dict[extended_keys[0]]).astype(str)
for f in extended_keys[1:]:
    # np.char.add handles string arrays on every NumPy release, unlike
    # `'_' + array`, which NumPy 1.x rejects.
    part = np.asarray(feature_dict[f]).astype(str)
    combined_ext = np.char.add(np.char.add(combined_ext, '_'), part)

h_cond_ext = compression.conditional_entropy(V, combined_ext)
reduction_ext = 100 * (H_V - h_cond_ext) / H_V

print(f"H(V | {', '.join(extended_keys)}) = {h_cond_ext:.4f} bits")
print(f"Reduction: {reduction_ext:.1f}%")

## 8. Per-Seed Analysis

Does the pattern hold across seeds?

In [None]:
# Repeat the depth analysis inside each seed to check that the pattern is
# not an artefact of pooling several deals together.
seed_column = combined_df['seed'].values
seed_entropy_results = []

for seed in combined_df['seed'].unique():
    mask = seed_column == seed
    V_seed, depth_seed = V[mask], depth_vals[mask]

    h_v = compression.entropy_bits(V_seed)
    h_v_depth = compression.conditional_entropy(V_seed, depth_seed)

    seed_entropy_results.append(dict(
        seed=seed,
        H_V=h_v,
        H_V_depth=h_v_depth,
        reduction_depth=100 * (h_v - h_v_depth) / h_v,
    ))

seed_entropy_df = pd.DataFrame(seed_entropy_results)
print("Per-seed entropy analysis:")
print(seed_entropy_df)

## Summary

In [None]:
# For each target, find the first greedy step whose cumulative reduction
# reaches it (None when the target is never reached). Step numbering starts
# at 1 because row 0 of the history is the no-feature baseline.
steps_to_reach = {
    target: next(
        (step for step, (_, _, red) in enumerate(selection_history[1:], 1) if red >= target),
        None,
    )
    for target in (90, 99)
}
features_for_90 = steps_to_reach[90]
features_for_99 = steps_to_reach[99]

top_name, top_mi = info_results[0][0], info_results[0][1]
final_h, final_reduction = selection_history[-1][1], selection_history[-1][2]

summary = {
    'Total states': f"{len(V):,}",
    'H(V) baseline': f"{H_V:.4f} bits",
    'Top feature': top_name,
    'Top feature I(V;F)': f"{top_mi:.4f} bits",
    'Top feature reduction': f"{100*top_mi/H_V:.1f}%",
    'Features for 90%': features_for_90 or 'N/A',
    'Features for 99%': features_for_99 or 'N/A',
    'Final H(V|features)': f"{final_h:.4f} bits",
    'Final reduction': f"{final_reduction:.1f}%",
}

print(viz.create_summary_table(summary, "Entropy Decomposition Summary"))

In [None]:
# Persist the per-feature ranking, adding the gain as a percentage of H(V)
results_df = pd.DataFrame(
    info_results,
    columns=['feature', 'mutual_info', 'conditional_entropy'],
).assign(reduction_pct=lambda frame: 100 * frame['mutual_info'] / H_V)
results_df.to_csv('../../results/tables/02a_info_gain.csv', index=False)
print("Results saved to results/tables/02a_info_gain.csv")