# 12b: Unified Feature Extraction (201 base seeds enumerated, n=200 usable)

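**Goal**: Create a single consistent feature dataset covering every base seed that has complete data (201 base seeds are enumerated; 200 have all 3 opponent configs and are kept).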

**Output**: `forge/analysis/results/tables/12b_unified_features.csv` (relative to the project root) - master feature file for all downstream analyses.

In [1]:
# === CONFIGURATION ===
import os
import sys

# Repository root. Defaults to the original checkout location; set the
# MK5_PROJECT_ROOT environment variable to run this notebook from a different
# machine or directory without editing the cell. An empty value falls back to
# the default.
PROJECT_ROOT = os.environ.get("MK5_PROJECT_ROOT") or "/home/jason/v2/mk5-tailwind"
DATA_DIR = f"{PROJECT_ROOT}/data/shards-marginalized/train"

# === Setup imports ===
# Put the project root on sys.path (once) so the `forge.*` imports that follow
# resolve; the guard keeps re-running this cell from adding duplicates.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

from forge.analysis.utils.seed_db import SeedDB
from forge.analysis.utils.hand_features import extract_hand_features, HAND_FEATURE_NAMES
from forge.oracle.rng import deal_from_seed

# Echo the resolved configuration so the saved output records what was used.
print("Data directory:", DATA_DIR)
print("Feature columns:", HAND_FEATURE_NAMES)
print("Ready")

Data directory: /home/jason/v2/mk5-tailwind/data/shards-marginalized/train
Feature columns: ['n_doubles', 'trump_count', 'max_suit_length', 'n_6_high', 'n_5_high', 'n_4_high', 'count_points', 'n_count_dominoes', 'total_pips', 'has_trump_double', 'n_voids', 'n_singletons']
Ready


## 1. Enumerate All Base Seeds

In [2]:
data_path = Path(DATA_DIR)

# The opp0 shards are used to enumerate which base seeds exist on disk.
files = sorted(data_path.glob("seed_*_opp0_decl_*.parquet"))
if not files:
    # Fail here with a clear message instead of the opaque
    # "min() arg is an empty sequence" raised by the summary print below.
    raise FileNotFoundError(
        f"No 'seed_*_opp0_decl_*.parquet' shards found in {data_path}"
    )

# File stems look like "seed_<seed>_opp0_decl_<decl>"; field 1 is the seed number.
# Going through a set guarantees each seed is listed (and later processed) once,
# even if a seed happens to have more than one opp0 shard.
base_seeds = sorted({int(f.stem.split('_')[1]) for f in files})

print(f"Total base seeds: {len(base_seeds)}")
print(f"Seed range: [{min(base_seeds)}, {max(base_seeds)}]")

Total base seeds: 201
Seed range: [0, 200]


## 2. Extract Features for All Seeds

For each base_seed:
1. Get P0's hand (fixed across opponent configs)
2. Extract hand features using unified module
3. Get V values across all 3 opponent configs
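4. Compute E[V], σ(V), V_spread, V_min, V_max (σ is the population standard deviation, i.e. `np.std` with its default `ddof=0`); seeds missing any of the 3 opponent configs are skipped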

In [3]:
def extract_unified_features(db: SeedDB, base_seed: int) -> dict | None:
    """Build the feature record for a single base seed.

    The record combines summary statistics of the root value V, read from the
    three opponent-config shards (opp0..opp2) of this seed, with the hand
    features of P0's hand.

    Args:
        db: Open SeedDB used to read the root V of each shard.
        base_seed: Base seed; the declaration id is derived as ``base_seed % 10``.

    Returns:
        A dict with ``base_seed``, ``decl_id``, ``trump_suit``, the V statistics
        (``V_mean``, ``V_std``, ``V_spread``, ``V_min``, ``V_max``) and every
        entry returned by ``extract_hand_features``; or ``None`` when fewer
        than three root values could be read (missing shard file or empty
        query result).
    """
    decl_id = base_seed % 10
    # NOTE(review): decl_id spans 0-9 and is passed unchanged as the trump suit;
    # confirm extract_hand_features treats every declaration id as intended.
    trump_suit = decl_id

    # P0's hand is the first hand of the deal (fixed across opponent configs).
    p0_hand = deal_from_seed(base_seed)[0]

    # Collect the root value of each opponent config that is present on disk.
    root_values = []
    for opp_seed in range(3):
        shard_name = f"seed_{base_seed:08d}_opp{opp_seed}_decl_{decl_id}.parquet"
        if (Path(DATA_DIR) / shard_name).exists():
            lookup = db.get_root_v(shard_name)
            if lookup.data is not None:
                root_values.append(float(lookup.data))

    # Only seeds with a complete set of three configs are usable.
    if len(root_values) != 3:
        return None

    hand_features = extract_hand_features(p0_hand, trump_suit)

    v_low = min(root_values)
    v_high = max(root_values)
    v_stats = {
        'base_seed': base_seed,
        'decl_id': decl_id,
        'trump_suit': trump_suit,
        # V statistics; np.std is the population standard deviation (ddof=0).
        'V_mean': np.mean(root_values),
        'V_std': np.std(root_values),
        'V_spread': v_high - v_low,
        'V_min': v_low,
        'V_max': v_high,
    }
    # Hand features come last, matching the column order of the output table.
    return {**v_stats, **hand_features}

In [4]:
# Extract features for all seeds
db = SeedDB(DATA_DIR)

all_results = []
skipped_seeds = []  # seeds for which extract_unified_features returned None
try:
    for base_seed in tqdm(base_seeds, desc="Extracting features"):
        result = extract_unified_features(db, base_seed)
        if result is None:
            skipped_seeds.append(base_seed)
        else:
            all_results.append(result)
finally:
    # Always release the database handle, even if extraction raises midway.
    db.close()

df = pd.DataFrame(all_results)
print(f"\nExtracted features for {len(df)} seeds")
if skipped_seeds:
    # Make dropped seeds visible instead of silently shrinking the dataset.
    print(f"Skipped {len(skipped_seeds)} seed(s) without all 3 opponent configs: {skipped_seeds}")

Extracting features:   0%|          | 0/201 [00:00<?, ?it/s]


Extracted features for 200 seeds


## 3. Verify Feature Quality

In [5]:
print("Feature Summary:")
print("=" * 60)

# (label, column, decimals) for each V statistic whose range is reported.
print("\nV Statistics:")
for label, column, decimals in (
    ("E[V]", "V_mean", 1),
    ("σ(V)", "V_std", 1),
    ("V_spread", "V_spread", 0),
):
    lowest, highest = df[column].min(), df[column].max()
    print(f"  {label} range: [{lowest:.{decimals}f}, {highest:.{decimals}f}]")

# Mean and observed range of every hand feature.
print("\nHand Features:")
for col in HAND_FEATURE_NAMES:
    feature = df[col]
    print(f"  {col}: mean={feature.mean():.2f}, range=[{feature.min()}, {feature.max()}]")

Feature Summary:

V Statistics:
  E[V] range: [-29.3, 42.0]
  σ(V) range: [0.0, 37.7]
  V_spread range: [0, 82]

Hand Features:
  n_doubles: mean=1.73, range=[0, 4]
  trump_count: mean=1.32, range=[0, 5]
  max_suit_length: mean=3.25, range=[2, 5]
  n_6_high: mean=1.74, range=[0, 4]
  n_5_high: mean=1.50, range=[0, 4]
  n_4_high: mean=1.35, range=[0, 4]
  count_points: mean=9.20, range=[0, 25]
  n_count_dominoes: mean=1.29, range=[0, 3]
  total_pips: mean=42.40, range=[22, 60]
  has_trump_double: mean=0.17, range=[0, 1]
  n_voids: mean=0.67, range=[0, 3]
  n_singletons: mean=2.27, range=[0, 5]


In [6]:
# Count nulls per column; report the affected columns, or confirm completeness.
missing = df.isnull().sum()
if not missing.any():
    print("No missing values - dataset is complete.")
else:
    print("Missing values:")
    print(missing.loc[missing.gt(0)])

No missing values - dataset is complete.


## 4. Quick Correlation Check

In [7]:
from scipy import stats

print("Feature Correlations with E[V]:")
print("=" * 60)

# Pearson r (and its p-value) between each hand feature and V_mean.
expected_v = df['V_mean']
correlations = []
for col in HAND_FEATURE_NAMES:
    r, p = stats.pearsonr(df[col], expected_v)
    correlations.append(dict(feature=col, r_with_EV=r, p_value=p))
    print(f"  {col}: r = {r:+.3f} (p = {p:.2e})")

corr_df = pd.DataFrame(correlations)
# "Strongest" means the largest |r|, regardless of sign.
strongest_row = corr_df['r_with_EV'].abs().idxmax()
strongest_feature = corr_df.loc[strongest_row, 'feature']
print(f"\nStrongest predictor: {strongest_feature}")

Feature Correlations with E[V]:
  n_doubles: r = +0.395 (p = 6.91e-09)
  trump_count: r = +0.229 (p = 1.12e-03)
  max_suit_length: r = -0.084 (p = 2.39e-01)
  n_6_high: r = -0.161 (p = 2.32e-02)
  n_5_high: r = +0.078 (p = 2.72e-01)
  n_4_high: r = +0.026 (p = 7.18e-01)
  count_points: r = +0.197 (p = 5.23e-03)
  n_count_dominoes: r = +0.148 (p = 3.59e-02)
  total_pips: r = +0.035 (p = 6.19e-01)
  has_trump_double: r = +0.242 (p = 5.55e-04)
  n_voids: r = +0.200 (p = 4.45e-03)
  n_singletons: r = +0.001 (p = 9.91e-01)

Strongest predictor: n_doubles


In [8]:
print("\nFeature Correlations with σ(V):")
print("=" * 60)

# Same Pearson check as for E[V], now against the spread statistic V_std.
sigma_v = df['V_std']
for col in HAND_FEATURE_NAMES:
    r, p = stats.pearsonr(df[col], sigma_v)
    print(f"  {col}: r = {r:+.3f} (p = {p:.2e})")


Feature Correlations with σ(V):
  n_doubles: r = -0.136 (p = 5.49e-02)
  trump_count: r = -0.090 (p = 2.04e-01)
  max_suit_length: r = +0.050 (p = 4.86e-01)
  n_6_high: r = +0.191 (p = 6.77e-03)
  n_5_high: r = -0.101 (p = 1.53e-01)
  n_4_high: r = +0.074 (p = 2.97e-01)
  count_points: r = -0.089 (p = 2.12e-01)
  n_count_dominoes: r = -0.058 (p = 4.11e-01)
  total_pips: r = +0.149 (p = 3.51e-02)
  has_trump_double: r = -0.095 (p = 1.80e-01)
  n_voids: r = -0.051 (p = 4.75e-01)
  n_singletons: r = -0.000 (p = 9.95e-01)


## 5. Save Unified Features

In [9]:
# Save to results
output_path = Path(PROJECT_ROOT) / "forge/analysis/results/tables/12b_unified_features.csv"
# to_csv does not create missing directories, so make sure the target folder
# exists (e.g. on a fresh checkout) before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print(f"Saved {len(df)} rows to {output_path}")
print(f"Columns: {list(df.columns)}")

Saved 200 rows to /home/jason/v2/mk5-tailwind/forge/analysis/results/tables/12b_unified_features.csv
Columns: ['base_seed', 'decl_id', 'trump_suit', 'V_mean', 'V_std', 'V_spread', 'V_min', 'V_max', 'n_doubles', 'trump_count', 'max_suit_length', 'n_6_high', 'n_5_high', 'n_4_high', 'count_points', 'n_count_dominoes', 'total_pips', 'has_trump_double', 'n_voids', 'n_singletons']


## Summary

Created unified feature dataset with:
- **n = 200** of the 201 enumerated base seeds (one seed lacked complete 3-opponent-config data and was excluded)
- **V statistics**: E[V], σ(V), V_spread, V_min, V_max
- **12 hand features**: n_doubles, trump_count, max_suit_length, n_6_high, n_5_high, n_4_high, count_points, n_count_dominoes, total_pips, has_trump_double, n_voids, n_singletons

This replaces the ad-hoc feature extraction in individual run_11*.py scripts with a single source of truth.