# 03 — Feature Selection

Techniques:
- Chi-Square (for categorical / non-negative features)
- RFE (with LogisticRegression or RandomForest)
- Model-based importances (RandomForest / XGBoost)

In [None]:
import numpy as np, pandas as pd, joblib
from pathlib import Path
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

RANDOM_STATE = 42  # shared seed passed to every estimator below

train = pd.read_csv("../data/processed/train.csv")

# The label column is looked up by name; the first candidate present wins.
target_candidates = ["target", "num", "condition", "disease"]
target = next((t for t in target_candidates if t in train.columns), None)
if target is None:
    # Without this guard the failure surfaces one line later as an opaque
    # "[None] not found in axis" KeyError raised by DataFrame.drop.
    raise KeyError(
        f"No target column found in train.csv; expected one of "
        f"{target_candidates}, got columns {list(train.columns)}"
    )
X_train, y_train = train.drop(columns=[target]), train[target]

preprocessor = joblib.load("../models/preprocessor.pkl")
# NOTE(review): fit_transform re-fits the loaded preprocessor on X_train. If the
# pickle already holds a fitted transformer, transform() may be what is
# intended here — confirm against the notebook that saved it.
X_proc = preprocessor.fit_transform(X_train)
# Densify sparse output so the scaler / estimators below receive a plain array.
if hasattr(X_proc, "toarray"):
    X_proc = X_proc.toarray()

# --- Chi2 (requires non-negative features) ---
# Rescale every column into [0, 1] so chi2's non-negativity requirement holds.
X_nonneg = MinMaxScaler().fit_transform(X_proc)
k = min(10, X_nonneg.shape[1])
skb = SelectKBest(chi2, k=k).fit(X_nonneg, y_train)

# chi2 can produce NaN scores (e.g. 0/0 for an all-zero column). np.argsort
# places NaN last, which would put those columns in the "top-k"; chi2 scores
# are non-negative, so mapping NaN to 0 ranks them lowest instead.
chi2_scores = np.nan_to_num(skb.scores_, nan=0.0)
chi2_top_idx = np.argsort(chi2_scores)[-k:]  # ascending: best feature is last

plt.figure()
plt.bar(range(k), chi2_scores[chi2_top_idx])
# Label each bar with its real column index; the bar positions 0..k-1 alone
# would not identify which post-transform feature a score belongs to.
plt.xticks(range(k), chi2_top_idx)
plt.title("Top-k Chi² Scores")
plt.xlabel("Feature Index (post-transform)")
plt.ylabel("Score")
plt.show()

# --- RFE with Logistic Regression ---
# Wrap a logistic-regression estimator in RFE and keep k features.
est = LogisticRegression(random_state=RANDOM_STATE, max_iter=2000)
rfe = RFE(estimator=est, n_features_to_select=k)
rfe.fit(X_proc, y_train)

# Boolean mask over the post-transform columns: True where RFE kept the feature.
rfe_support = rfe.support_
selected_by_rfe = np.flatnonzero(rfe_support)[:k]
print("RFE selected features:", selected_by_rfe)

# --- Model-based (RandomForest) ---
# Fit a forest on all post-transform features and rank them by importance.
rf = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)
rf.fit(X_proc, y_train)
importances = rf.feature_importances_
top_idx = np.argsort(importances)[-k:]  # ascending: most important is last

plt.figure()
plt.bar(range(k), importances[top_idx])
# Label each bar with its real column index; the bar positions 0..k-1 alone
# would not identify which post-transform feature an importance belongs to.
plt.xticks(range(k), top_idx)
plt.title("Top-k RF Importances")
plt.xlabel("Feature Index (post-transform)")
plt.ylabel("Importance")
plt.show()

# Persist a boolean mask over the post-transform columns (True for the RF
# top-k indices) so later steps can reuse the selection.
n_features = X_proc.shape[1]
feature_mask = np.isin(np.arange(n_features), top_idx)
np.save("../models/feature_mask.npy", feature_mask)
print("Saved ../models/feature_mask.npy")