# Feature Selection

This notebook covers:
- Feature importance using Random Forest
- Recursive Feature Elimination (RFE)
- ANOVA F-test for feature significance
- Feature selection visualization


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
# Silence every warning for the rest of the notebook session.
# NOTE(review): this is a blanket filter, so library deprecation and
# convergence warnings raised by later cells will not be shown either.
warnings.filterwarnings('ignore')

# Set style for better plots
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release (3.6+) - confirm against the target environment.
plt.style.use('seaborn-v0_8')
# Use seaborn's "husl" palette as the default color cycle for later plots.
sns.set_palette("husl")


In [None]:
# Load the preprocessed data
# X_scaled: feature matrix read from CSV; y: target flattened to a 1-D array.
X_scaled = pd.read_csv('data/X_scaled.csv')
y = pd.read_csv('data/y_target.csv').values.ravel()

# Fail fast when features and target are misaligned. ravel() flattens *all*
# columns of the target file, so an extra column (e.g. a saved index) would
# silently produce a vector of the wrong length and only blow up later in fit().
if len(y) != len(X_scaled):
    raise ValueError(
        f"Row mismatch: X_scaled has {len(X_scaled)} rows but y has {len(y)} values; "
        "check that 'data/y_target.csv' contains exactly one target column"
    )

print("Data loaded successfully!")
print(f"Features shape: {X_scaled.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature names: {list(X_scaled.columns)}")


In [None]:
# 1. Feature Importance using Random Forest
print("1. Feature Importance using Random Forest:")
print("=" * 45)

# Fit a 100-tree forest with a fixed seed on the full feature matrix
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_scaled, y)

# Pair each column name with the importance the fitted forest assigned to it
feature_names = X_scaled.columns
feature_importance = rf.feature_importances_

# Tabulate the scores, most important feature first
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
).sort_values('importance', ascending=False)

print("Feature Importance Scores:")
for name, score in zip(importance_df['feature'], importance_df['importance']):
    print(f"{name:12}: {score:.4f}")

# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance (Random Forest)')
ax.set_xlabel('Importance Score')
fig.tight_layout()
plt.show()


In [None]:
# 2. Recursive Feature Elimination (RFE)
print("\n2. Recursive Feature Elimination (RFE):")
print("=" * 40)

# Use Logistic Regression as base estimator for RFE
estimator = LogisticRegression(random_state=42, max_iter=1000)

# Apply RFE to select top 8 features
rfe = RFE(estimator=estimator, n_features_to_select=8)
rfe.fit(X_scaled, y)

# Get selected features (support_ is a boolean mask over X_scaled.columns)
selected_features_rfe = X_scaled.columns[rfe.support_]
print(f"Selected features (RFE): {list(selected_features_rfe)}")

# Get feature rankings, best (rank 1) first
feature_rankings = pd.DataFrame({
    'feature': X_scaled.columns,
    'ranking': rfe.ranking_,
    'selected': rfe.support_
}).sort_values('ranking')

print("\nFeature Rankings (RFE):")
for idx, row in feature_rankings.iterrows():
    status = "✓" if row['selected'] else "✗"
    print(f"{row['feature']:12}: Rank {row['ranking']:2d} {status}")

# Visualize RFE results
plt.figure(figsize=(10, 6))
# Build the colors from the *sorted* frame: bars are drawn in feature_rankings
# row order, so taking them from rfe.support_ (original column order) would
# paint green/red on the wrong features.
colors = ['green' if selected else 'red' for selected in feature_rankings['selected']]
sns.barplot(data=feature_rankings, x='ranking', y='feature', palette=colors)
plt.title('RFE Feature Rankings (Green = Selected)')
plt.xlabel('Ranking (1 = Best)')
plt.tight_layout()
plt.show()


In [None]:
# 3. Statistical Tests for Feature Selection
print("\n3. Statistical Tests for Feature Selection:")
print("=" * 45)

# F-test for feature selection
# Cap k at the number of available columns: depending on the scikit-learn
# version, SelectKBest rejects (or warns about) k larger than n_features.
n_f_test_features = min(8, X_scaled.shape[1])
f_selector = SelectKBest(score_func=f_classif, k=n_f_test_features)
X_f_selected = f_selector.fit_transform(X_scaled, y)

# Get F-scores and p-values (one entry per column of X_scaled)
f_scores = f_selector.scores_
p_values = f_selector.pvalues_

# Create statistical test results DataFrame, strongest F-score first
statistical_results = pd.DataFrame({
    'feature': X_scaled.columns,
    'f_score': f_scores,
    'p_value': p_values,
    'selected': f_selector.get_support()
}).sort_values('f_score', ascending=False)

print("F-test Results:")
for idx, row in statistical_results.iterrows():
    status = "✓" if row['selected'] else "✗"
    print(f"{row['feature']:12}: F={row['f_score']:8.2f}, p={row['p_value']:.4f} {status}")

# Visualize F-scores
plt.figure(figsize=(12, 8))

# F-scores plot
plt.subplot(2, 1, 1)
# Build the colors from the *sorted* frame: bars are drawn in
# statistical_results row order, so taking them from get_support()
# (original column order) would color the wrong features.
colors = ['green' if selected else 'red' for selected in statistical_results['selected']]
sns.barplot(data=statistical_results, x='f_score', y='feature', palette=colors)
plt.title('F-test Scores (Green = Selected)')
plt.xlabel('F-Score')

# P-values plot (log scale); same row order, so the same colors apply
plt.subplot(2, 1, 2)
sns.barplot(data=statistical_results, x='p_value', y='feature', palette=colors)
plt.title('P-values (Green = Selected)')
plt.xlabel('P-value')
plt.xscale('log')

plt.tight_layout()
plt.show()


In [None]:
# 4. Combine Feature Selection Results
print("\n4. Feature Selection Summary:")
print("=" * 30)

# Get selected features from different methods
rf_top_features = importance_df.head(8)['feature'].tolist()
rfe_selected = list(selected_features_rfe)
f_test_selected = X_scaled.columns[f_selector.get_support()].tolist()

print("Top 8 features by Random Forest Importance:")
print(rf_top_features)

print("\nSelected features by RFE:")
print(rfe_selected)

print("\nSelected features by F-test:")
print(f_test_selected)

# Find common features across methods (kept as a set, as before)
common_features = set(rf_top_features) & set(rfe_selected) & set(f_test_selected)
# Print in X_scaled column order: iterating a set of strings directly gives an
# order that can change from one kernel session to the next.
common_in_column_order = [col for col in X_scaled.columns if col in common_features]
print(f"\nCommon features across all methods: {common_in_column_order}")

# Create final feature selection: union of all three methods.
# The union is ordered by X_scaled's columns rather than by set iteration so
# that the saved CSV has the same column order on every run.
selected_union = set(rf_top_features) | set(rfe_selected) | set(f_test_selected)
final_features = [col for col in X_scaled.columns if col in selected_union]
print(f"\nFinal selected features (union): {final_features}")

# Save selected features
X_selected = X_scaled[final_features]
X_selected.to_csv('data/X_selected.csv', index=False)

print("\nSelected features saved to 'data/X_selected.csv'")
print(f"Original features: {X_scaled.shape[1]}")
print(f"Selected features: {X_selected.shape[1]}")
print(f"Reduction: {X_scaled.shape[1] - X_selected.shape[1]} features removed")
