# 03. Feature Selection

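**Step 1**: Split into train/test first, then filter columns using train-only statistics (missing rate > 60%, zero variance, |Pearson r| > 0.7)
**Step 2**: Univariate tests (Welch t-test for numeric, chi-square or Fisher's exact for categorical) + LGBM importance → combined rank → top-N core features shared across all labels (global union)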

In [None]:
import sys
sys.path.insert(0, "..")

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind, chi2_contingency, fisher_exact
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns

from src.config import PROJECT_ROOT, SPLIT_SEED, MODEL_SEED, LABELS
from src.variables import (
    CATEGORY_COLS, LLM_COLS, LAB_COLS, CODE_COLS, TARGET_COLS, ID_COLS
)

## Step 1: Train-first split & correlation filtering

In [None]:
# ── Input / output locations ──
RAW_DATA_PATH = PROJECT_ROOT / "data/raw/ADER_windowday_dataset_number_with_llm_v2.csv"
OUTPUT_DIR = PROJECT_ROOT / "data/new_analysis/corr0.7_filtered_data/split_result"
CORR_OUTPUT_DIR = PROJECT_ROOT / "results/new_analysis/split_correlation"
for _out_dir in (OUTPUT_DIR, CORR_OUTPUT_DIR):
    # Create the full directory chain; an already existing directory is fine.
    _out_dir.mkdir(parents=True, exist_ok=True)

raw_df = pd.read_csv(RAW_DATA_PATH)

# The split is done before any filtering so that every filter statistic in the
# following cells is computed on the training rows only (avoids data leakage).
# 70/30 split, stratified on the `label_30d` column.
stratify_labels = raw_df["label_30d"]
train_df, test_df = train_test_split(
    raw_df,
    test_size=0.3,
    stratify=stratify_labels,
    random_state=SPLIT_SEED,
)
print(f"Train: {train_df.shape}, Test: {test_df.shape}")

In [None]:
# Missing rate > 60% removal (train-based)
# `missing_rate_train` is computed once here, before any column is dropped;
# the correlation-filter cell below reuses it to decide which member of a
# correlated pair to discard.
missing_rate_train = train_df.isna().mean()
high_missing_cols = missing_rate_train[missing_rate_train > 0.60].index.tolist()
if high_missing_cols:
    print(f"Removing {len(high_missing_cols)} cols with >60% missing: {high_missing_cols}")
    # The same columns are removed from both splits so their schemas stay aligned.
    train_df = train_df.drop(columns=high_missing_cols)
    test_df = test_df.drop(columns=high_missing_cols)

# Zero variance removal (train-based)
# ID and target columns are never candidates for removal here.
numeric_cols_for_var = train_df.select_dtypes(include=np.number).columns.drop(
    ID_COLS + TARGET_COLS, errors="ignore"
)
# A column is constant when it has at most one distinct non-missing value.
# Counting distinct values is exact, whereas `std() == 0` is a floating-point
# equality check that can miss a constant non-integer column (the computed
# mean may be off by a rounding error, giving a tiny non-zero std).
zero_var_cols = [
    col for col in numeric_cols_for_var if train_df[col].nunique(dropna=True) <= 1
]
if zero_var_cols:
    print(f"Removing {len(zero_var_cols)} zero-variance cols: {zero_var_cols}")
    train_df = train_df.drop(columns=zero_var_cols)
    test_df = test_df.drop(columns=zero_var_cols)

print(f"After filtering: Train {train_df.shape}")

In [None]:
# Correlation > 0.7 removal (train-based, continuous cols only)
all_cols = train_df.columns.tolist()
# ID, target, categorical, LLM and code columns are not treated as continuous.
exclude_cols = set(ID_COLS + TARGET_COLS + CATEGORY_COLS + LLM_COLS + CODE_COLS)
continuous_cols = [
    c for c in all_cols
    if c not in exclude_cols and pd.api.types.is_numeric_dtype(train_df[c])
]

# Temporary median-imputed copy (train medians) used only to compute the
# correlation matrix; `train_df` itself is not imputed here.
X_train_temp = train_df[continuous_cols].fillna(train_df[continuous_cols].median())
corr_matrix = X_train_temp.corr(method="pearson")

THRESH = 0.70
# Keep only the strict upper triangle so that every unordered pair appears
# exactly once and the diagonal (self-correlation) is excluded.
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
pairs = upper_tri.stack().reset_index()
pairs.columns = ["feature_1", "feature_2", "r"]
pairs["abs_r"] = pairs["r"].abs()
high_corr_pairs = pairs[pairs["abs_r"] > THRESH].sort_values("abs_r", ascending=False)
high_corr_pairs["missing_rate_1"] = high_corr_pairs["feature_1"].map(missing_rate_train)
high_corr_pairs["missing_rate_2"] = high_corr_pairs["feature_2"].map(missing_rate_train)
# The report lists every pair above the threshold, whether or not it ends up
# triggering a removal below.
high_corr_pairs.to_csv(CORR_OUTPUT_DIR / "high_corr_pairs_train_based.csv", index=False)

# Greedy removal, strongest correlation first: from each pair drop the feature
# with the higher train missing rate (ties drop feature_1).
to_drop = set()
for f1, f2 in zip(high_corr_pairs["feature_1"], high_corr_pairs["feature_2"]):
    # If one member of the pair is already scheduled for removal, the pair is
    # resolved. Dropping a second feature here would discard a column that may
    # not be correlated with anything that remains.
    if f1 in to_drop or f2 in to_drop:
        continue
    if missing_rate_train.get(f1, 0) >= missing_rate_train.get(f2, 0):
        to_drop.add(f1)
    else:
        to_drop.add(f2)

if to_drop:
    print(f"Removing {len(to_drop)} high-correlation cols")
    # Sorted for a deterministic drop order; both splits lose the same columns.
    train_df = train_df.drop(columns=sorted(to_drop))
    test_df = test_df.drop(columns=sorted(to_drop))

print(f"Final: Train {train_df.shape}")

train_df.to_csv(OUTPUT_DIR / "final_train_dataset.csv", index=False, encoding="utf-8-sig")
test_df.to_csv(OUTPUT_DIR / "final_test_dataset.csv", index=False, encoding="utf-8-sig")

## Step 2: Global union feature selection (univariate + LGBM)

In [None]:
# ── Experiment-specific locations ──
# ⚠️ Both directories below must be adjusted per experiment.
DATA_DIR = PROJECT_ROOT / "data/processed_imp/260114_split_corr_LLM_ADER/imputation/simple_imput"
OUT_DIR = PROJECT_ROOT / "results/new_analysis/260114_qwen/Feature_Selection/simple_20/step2_FS"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Feature-count settings. CORE_N is the number of core features kept by the
# global selection cell; FINAL_N and TOP_N_FOR_CORE are not referenced in the
# cells shown here (presumably used elsewhere; verify before removing).
TOP_N_FOR_CORE = 30
FINAL_N = 20
CORE_N = 20

# Categorical columns for feature selection: CATEGORY_COLS extended with any
# CODE_COLS not already listed (sleep/appetite/weight treated as categorical here).
FS_CATEGORY_COLS = [*CATEGORY_COLS, *(c for c in CODE_COLS if c not in CATEGORY_COLS)]


In [None]:
def calculate_p_values(df: pd.DataFrame, target: str) -> pd.DataFrame:
    """Rank features by a univariate association test against a 0/1 target.

    Test used per column (chosen by dtype only, so numerically coded
    categorical columns are handled by the t-test branch):

    * numeric dtype: Welch's t-test (``equal_var=False``) between the rows
      where the target equals 0 and the rows where it equals 1, after dropping
      missing values; skipped unless both groups have more than one value.
    * any other dtype: chi-square test on the feature-by-target contingency
      table, or Fisher's exact test when the table is 2x2 and any observed
      cell count is below 5.

    Columns with at most one distinct non-missing value are skipped.

    Args:
        df: Frame containing the target column and all candidate features.
        target: Name of the target column; every other column is a candidate.

    Returns:
        DataFrame with columns ``feature``, ``p_value`` and ``p_value_rank``
        (1 = smallest p-value), sorted by ascending p-value. Features whose
        p-value could not be computed are omitted. When no feature yields a
        p-value the result is an empty frame with the same columns.
    """
    results = []
    y = df[target]
    feature_cols = [col for col in df.columns if col != target]

    for col in feature_cols:
        # Constant (or all-missing) columns carry no signal to test.
        if df[col].nunique(dropna=True) <= 1:
            continue
        s = df[col]
        p_value = np.nan

        if pd.api.types.is_numeric_dtype(s):
            g0, g1 = s[y == 0].dropna(), s[y == 1].dropna()
            if len(g0) > 1 and len(g1) > 1:
                _, p_value = ttest_ind(g0, g1, equal_var=False)
        else:
            ct = pd.crosstab(s, y)
            if ct.shape[0] > 1 and ct.shape[1] > 1:
                if ct.shape == (2, 2) and (ct.values < 5).any():
                    _, p_value = fisher_exact(ct)
                else:
                    _, p_value, _, _ = chi2_contingency(ct)
        results.append({"feature": col, "p_value": p_value})

    # Explicit columns keep the frame well-formed when `results` is empty;
    # without them `sort_values("p_value")` raises KeyError on a column-less frame.
    res_df = (
        pd.DataFrame(results, columns=["feature", "p_value"])
        .dropna()  # untested features and NaN p-values are not ranked
        .sort_values("p_value")
        .reset_index(drop=True)
    )
    res_df["p_value_rank"] = res_df.index + 1
    return res_df


def calculate_model_importance(df, target):
    """Rank features by the importance scores of a fitted LightGBM classifier.

    Every column except ``target`` is used as a predictor; predictors listed in
    ``FS_CATEGORY_COLS`` are cast to pandas ``category`` dtype before fitting.

    Args:
        df: Frame containing the target column and all candidate features.
        target: Name of the target column.

    Returns:
        DataFrame with columns ``feature``, ``importance`` and ``model_rank``
        (1 = highest importance), sorted by descending importance.
    """
    labels = df[target]
    features = df.drop(columns=[target])  # new frame, so the casts below do not touch `df`

    categorical = [name for name in features.columns if name in FS_CATEGORY_COLS]
    for name in categorical:
        features[name] = features[name].astype("category")

    clf = LGBMClassifier(random_state=MODEL_SEED, verbose=-1, is_unbalance=True)
    clf.fit(features, labels)

    ranking = (
        pd.DataFrame({"feature": features.columns, "importance": clf.feature_importances_})
        .sort_values("importance", ascending=False)
        .reset_index(drop=True)
    )
    ranking["model_rank"] = ranking.index + 1
    return ranking

In [None]:
# Run feature selection for each time point (excluding LLM cols)
# For every label: univariate p-value ranking + LGBM importance ranking, merged
# into a combined rank (mean of the two ranks) and saved under OUT_DIR.
all_rankings = {}

for label in LABELS:
    file_path = DATA_DIR / f"simple_{label}_train.csv"  # ⚠️ Adjust prefix per imputation
    if not file_path.exists():
        print(f"Skipping {label}: file not found")
        continue

    df_train = pd.read_csv(file_path)
    # NOTE(review): only LLM columns are excluded here. If the per-label file
    # also carries ID columns or other labels, they are ranked as features.
    # Confirm against the imputation step's output schema.
    features_for_selection = [col for col in df_train.columns if col not in LLM_COLS]
    df_for_fs = df_train[features_for_selection]

    p_ranks = calculate_p_values(df_for_fs, label)
    p_ranks.to_csv(OUT_DIR / f"univariate_results_{label}.csv", index=False)

    m_ranks = calculate_model_importance(df_for_fs, label)
    m_ranks.to_csv(OUT_DIR / f"model_importance_{label}.csv", index=False)

    merged = pd.merge(p_ranks, m_ranks, on="feature", how="outer")
    # A feature absent from one ranking gets the worst possible rank there.
    # Only the rank columns are filled: a blanket fillna would also overwrite
    # missing `p_value` / `importance` values with this placeholder and write
    # misleading numbers into the saved ranking file.
    worst_rank = len(df_for_fs.columns)
    rank_cols = ["p_value_rank", "model_rank"]
    merged[rank_cols] = merged[rank_cols].fillna(worst_rank)
    merged["combined_rank"] = (merged["p_value_rank"] + merged["model_rank"]) / 2
    merged = merged.sort_values("combined_rank").reset_index(drop=True)
    all_rankings[label] = merged
    merged.to_csv(OUT_DIR / f"ranking_{label}.csv", index=False)
    print(f"[{label}] Ranking complete: {len(merged)} features")

In [None]:
# Global core features (shared across all time points)
# One column of combined ranks per label, aligned on the feature name.
per_label_ranks = [
    ranking.set_index("feature")["combined_rank"].rename(name)
    for name, ranking in all_rankings.items()
]
rank_table = pd.concat(per_label_ranks, axis=1)

# A feature missing from a label's ranking is penalised with a rank one worse
# than the worst combined rank observed for any label.
max_rank = max(ranking["combined_rank"].max() for ranking in all_rankings.values())
rank_table = rank_table.fillna(max_rank + 1)
rank_table["mean_rank"] = rank_table.mean(axis=1)

# The CORE_N features with the best (lowest) mean rank across labels.
core_features = rank_table.sort_values("mean_rank").index[:CORE_N].tolist()
core_features_df = pd.DataFrame({"feature": core_features})

# Save per-label final feature sets (all use the same core features);
# labels that were skipped during ranking get no file.
for label in LABELS:
    if label in all_rankings:
        core_features_df.to_csv(
            OUT_DIR / f"final_features_{label}.csv", index=False, encoding="utf-8-sig"
        )

core_features_df.to_csv(OUT_DIR / "core_features.csv", index=False, encoding="utf-8-sig")

print(f"\nCore features ({len(core_features)}): {core_features}")