# Layer 2 — Feature Exploration

Profile all numeric features (null %, variance, correlations with LTV30).
Summarize cohort features. Write the report to `feature_store_overview.md`.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import sys

# The project root is the parent of the current working directory; put it at
# the front of sys.path so the shared `utils.reporting` helpers can be imported.
ROOT = Path('.').resolve().parent
sys.path.insert(0, str(ROOT))
from utils.reporting import write_report, md_table, save_plot, timestamp_line

# Use the full dataset when it is present, otherwise fall back to the sample file.
_data_dir = ROOT / 'data'
_full_csv = _data_dir / 'cfm_pltv.csv'
DATA_PATH = _full_csv if _full_csv.exists() else _data_dir / 'cfm_pltv_sample.csv'

df = pd.read_csv(DATA_PATH)
print(f'Loaded {len(df):,} rows')

In [None]:
# Numeric columns to profile; the order here drives the row order of the
# profile table and the column order fed into the correlation step.
NUMERIC = [
    'login_rows_d7',
    'active_days_d7',
    'loginchannel_variety_d7',
    'network_variety_d7',
    'clientversion_variety_d7',
    'max_level_seen_d7',
    'max_ladderscore_d7',
    'games_d7',
    'win_rate_d7',
    'avg_game_duration_d7',
    'avg_score_d7',
    'kills_d7',
    'deaths_d7',
    'assists_d7',
    'kd_d7',
    'max_level_game_d7',
    'max_ladderlevel_d7',
    'rev_d7',
    'txn_cnt_d7',
]

# Feature profiling: describe() statistics transposed to one row per feature,
# plus the percentage of missing values rounded to one decimal place.
numeric_frame = df[NUMERIC]
profile = numeric_frame.describe().T
null_counts = numeric_frame.isnull().sum()
profile['null_pct'] = (null_counts / len(df) * 100).round(1)

# Bare trailing expression: this is what the notebook cell renders as output.
profile[['null_pct', 'mean', 'std', 'min', '50%', 'max']]

In [None]:
# Spearman correlation with LTV30
# Build the full rank-correlation matrix over the numeric features plus the
# `ltv30` column, keep only the `ltv30` column of that matrix, remove its
# self-correlation entry, and order features from highest to lowest value.
spearman_matrix = df[[*NUMERIC, 'ltv30']].corr(method='spearman')
corr = spearman_matrix['ltv30'].drop('ltv30').sort_values(ascending=False)

fig = px.bar(
    x=corr.values,
    y=corr.index,
    orientation='h',
    title='Spearman Correlation with LTV30',
    labels={'x': 'Spearman ρ', 'y': 'Feature'},
    color=corr.values,
    color_continuous_scale='RdYlGn',
)
# Reverse the y-axis so the first entry of `corr` is drawn at the top.
fig.update_layout(height=500, yaxis={'autorange': 'reversed'})
fig.show()

In [None]:
# Cohort distributions
# Both charts count rows per category and split the bars side by side by the
# `is_payer_30` column, so the shared options are declared once.
cohort_hist_options = {'color': 'is_payer_30', 'barmode': 'group'}

fig_ms = px.histogram(
    df, x='media_source', title='Users by Media Source', **cohort_hist_options
)
fig_ms.show()

fig_cc = px.histogram(
    df, x='first_country_code', title='Users by Country', **cohort_hist_options
)
fig_cc.show()

In [None]:
# Scatter: rev_d7 vs ltv30
# Restrict to rows with positive `ltv30`, then draw a reproducible sample of at
# most 5000 of them (fixed seed) to keep the scatter plot light.
positive_ltv = df[df['ltv30'] > 0]
sample_size = min(5000, len(positive_ltv))
payers = positive_ltv.sample(sample_size, random_state=42)

fig = px.scatter(
    payers,
    x='rev_d7',
    y='ltv30',
    color='first_country_code',
    title='D7 Revenue vs LTV30 (payers)',
    opacity=0.5,
    labels={'rev_d7': 'Revenue D7', 'ltv30': 'LTV30'},
)
fig.show()

In [None]:
# Generate report
# Table rows: the first ten entries of `corr` (already sorted descending),
# each value formatted to three decimals.
corr_rows = [[name, f'{rho:.3f}'] for name, rho in corr.head(10).items()]

# Render the dynamic pieces first (timestamp, then table) so the template
# below only interpolates ready-made strings.
stamp_line = timestamp_line()
top_corr_table = md_table(['Feature', 'Spearman ρ'], corr_rows)

# NOTE(review): the category counts in the table below are hard-coded and not
# derived from the data; e.g. "Payment D7" says 3 while NUMERIC lists only
# rev_d7 and txn_cnt_d7 — confirm against the actual feature set.
report = f"""# Layer 2 — Feature Store Overview

{stamp_line}

## Feature Categories

| Category | # Features | Examples |
|----------|-----------|----------|
| UA Attribution | 8 | media_source, country, OS |
| Login D7 | 7 | active_days_d7, login_rows_d7 |
| Gameplay D7 | 10 | games_d7, win_rate_d7, kd_d7 |
| Payment D7 | 3 | rev_d7, txn_cnt_d7 |

## Top Correlations with LTV30

{top_corr_table}
"""

write_report('feature_store_overview.md', report)
print('Done!')