# 4. Seleção de Features

Este notebook avalia e seleciona as features mais relevantes para o modelo.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Visualization settings.
# The bare 'seaborn' style sheet was deprecated in matplotlib 3.6 and later
# removed; use the renamed 'seaborn-v0_8' sheet when this matplotlib ships
# it, and fall back to the legacy name on older releases.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')
# Do not truncate columns when displaying DataFrames.
pd.set_option('display.max_columns', None)

In [None]:
# Load the processed dataset from the sibling data directory.
data_path = Path('..') / 'data' / 'heart_disease_processed.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

## 1. Análise de Correlação com Target

In [None]:
# Correlation of every int64/float64 column with the 'target' column,
# ordered from the highest to the lowest coefficient.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
correlations = (
    df[numeric_cols]
    .corr()['target']
    .sort_values(ascending=False)
)

# Bar chart of the coefficients; 'target' itself is left out of the plot.
fig, ax = plt.subplots(figsize=(12, 6))
correlations.drop('target').plot(kind='bar', ax=ax)
ax.set_title('Correlação das Features com Target')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

## 2. Seleção via ANOVA F-value

In [None]:
# Prepare data: every numeric column except the label is a candidate feature.
X = df[numeric_cols.drop('target')]
y = df['target']

# Apply SelectKBest with the ANOVA F-test.
# k is capped at the number of available features: a k larger than the
# feature count is rejected (or warned about, depending on the scikit-learn
# release), so a fixed 10 would misbehave on datasets with fewer columns.
k_best = min(10, X.shape[1])
selector = SelectKBest(score_func=f_classif, k=k_best)
selector.fit(X, y)

# Pair each feature with its F-score and order from highest to lowest.
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Score': selector.scores_
}).sort_values('Score', ascending=False)

# Horizontal bar chart of the scores.
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_scores, x='Score', y='Feature')
plt.title('Feature Importance (ANOVA F-value)')
plt.show()

## 3. Seleção via Random Forest

In [None]:
# Standardize the features; the scaled matrix is what the forest is fit on.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Fit a 100-tree random forest with a fixed seed.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_scaled, y)

# Pair each feature with its importance and order from highest to lowest.
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rf.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the importances.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importance (Random Forest)')
plt.show()

## 4. Seleção Recursiva de Features

In [None]:
# Recursive feature elimination around the random forest, keeping 10 features.
rfe = RFE(estimator=rf, n_features_to_select=10).fit(X_scaled, y)

# One row per feature with its selection flag and elimination rank,
# ordered by rank.
rfe_results = pd.DataFrame(
    {
        'Feature': X.columns,
        'Selected': rfe.support_,
        'Ranking': rfe.ranking_,
    }
)
rfe_results = rfe_results.sort_values(by='Ranking')

print("Features selecionadas via RFE:")
# Bare expression so the notebook renders the table as the cell output.
rfe_results