# Aula 3: Desenvolvimento e Engenharia de Modelos

## Objetivos de Aprendizagem
- Aplicar técnicas de feature engineering
- Implementar pipelines de processamento
- Tratar dados desbalanceados
- Otimizar hiperparâmetros sistematicamente
- Validar modelos com técnicas apropriadas

## Exercício Prático
Desenvolver um modelo robusto com feature engineering completo.

## 1. Importação de Bibliotecas

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.feature_selection import SelectKBest, f_classif
import mlflow
import mlflow.sklearn
import warnings
# Suppress every warning for the rest of the session so cell output stays clean.
# NOTE(review): this also hides deprecation and convergence warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Apply the dark-grid style to all matplotlib figures created afterwards.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require
# matplotlib >= 3.6 — confirm against the course environment.
plt.style.use('seaborn-v0_8-darkgrid')

## 2. Carregamento e Análise Exploratória

In [None]:
# Load the scikit-learn breast-cancer dataset: the label vector goes into `y`
# and the measurements into a DataFrame with named columns.
data = load_breast_cancer()
y = data.target
X = pd.DataFrame(data=data.data, columns=data.feature_names)

# Quick overview: dimensions, class names and number of samples per class.
print(
    f"Dataset shape: {X.shape}",
    f"\nClasses: {data.target_names}",
    f"Distribuição de classes: {np.bincount(y)}",
    sep="\n",
)

# Preview of the first rows (kept as the final expression of the cell).
X.head()

In [None]:
# Descriptive statistics for every feature column of X; as the last expression
# of the cell, the resulting table is what gets displayed.
X.describe()

In [None]:
# Missing-value check: show the count for each column (which is what the
# heading announces) and then the grand total across the whole frame.
null_counts = X.isnull().sum()
print("Valores nulos por coluna:")
print(null_counts)
print(f"\nTotal de valores nulos: {null_counts.sum()}")

## 3. Feature Engineering

### Tarefa 1: Crie novas features baseadas nas existentes

In [None]:
# Derive six extra columns from the raw measurements. `assign` returns a new
# frame, so X itself is left untouched.
X_engineered = X.assign(
    # Ratios between related measurements; the small constant in the
    # denominator guards against division by zero.
    mean_area_radius_ratio=X['mean area'] / (X['mean radius'] + 1e-6),
    mean_perimeter_radius_ratio=X['mean perimeter'] / (X['mean radius'] + 1e-6),
    # Interaction term and worst-vs-mean area ratio.
    mean_texture_symmetry_product=X['mean texture'] * X['mean symmetry'],
    worst_area_mean_area_ratio=X['worst area'] / (X['mean area'] + 1e-6),
    # Squared terms.
    mean_concavity_squared=X['mean concavity'] ** 2,
    worst_perimeter_squared=X['worst perimeter'] ** 2,
)

# Report the resulting width and how many columns were added.
n_engineered_cols = X_engineered.shape[1]
print(f"Features após engenharia: {n_engineered_cols}")
print(f"Novas features criadas: {n_engineered_cols - X.shape[1]}")

## 4. Seleção de Features

### Tarefa 2: Selecione as features mais relevantes

In [None]:
# Hold out 20% of the samples for testing, stratified by class.
X_train, X_test, y_train, y_test = train_test_split(
    X_engineered,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Score features by ANOVA F-value on the training split only and keep the
# top 20; the test split is reduced with the same fitted selector.
# NOTE(review): the selector is fitted outside the pipeline, so the
# cross-validation later run on X_train_selected evaluates folds whose
# features were chosen using all training rows — CV scores may be slightly
# optimistic.
selector = SelectKBest(score_func=f_classif, k=20)
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Names of the columns that survived the selection.
selected_features = X_train.columns[selector.get_support()]
print(f"\nFeatures selecionadas ({len(selected_features)}):")
for position, column in enumerate(selected_features, start=1):
    print(f"{position}. {column}")

## 5. Criação de Pipeline de Processamento

### Tarefa 3: Construa um pipeline completo de ML

In [None]:
# Select the MLflow experiment that will collect the runs of this notebook.
mlflow.set_experiment("breast_cancer_feature_engineering")

# Two-step pipeline: standardise the inputs, then fit a random forest with a
# fixed seed.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

print("Pipeline criado:", pipeline, sep="\n")

## 6. Otimização de Hiperparâmetros

### Tarefa 4: Use Grid Search para encontrar os melhores hiperparâmetros

In [None]:
# Search space for the random-forest step; the `classifier__` prefix targets
# the pipeline step named 'classifier'.
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[5, 10, None],
    classifier__min_samples_split=[2, 5],
    classifier__min_samples_leaf=[1, 2],
)

# Exhaustive search over the grid with 5-fold cross-validation, scored by
# ROC-AUC.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

print("Iniciando Grid Search...")
grid_search.fit(X_train_selected, y_train)

# Report the winning combination and its mean cross-validated score.
print(
    f"\nMelhores parâmetros: {grid_search.best_params_}",
    f"Melhor score (ROC-AUC): {grid_search.best_score_:.4f}",
    sep="\n",
)

## 7. Treinamento do Modelo Final

### Tarefa 5: Treine o modelo com os melhores parâmetros e registre no MLFlow

In [None]:
with mlflow.start_run(run_name="optimized_random_forest"):
    # Best pipeline found by the grid search.
    best_model = grid_search.best_estimator_

    # Hard predictions and positive-class probabilities on the held-out split.
    y_pred = best_model.predict(X_test_selected)
    y_pred_proba = best_model.predict_proba(X_test_selected)[:, 1]

    # Hold-out ROC-AUC, then a 5-fold cross-validated ROC-AUC on the
    # training split.
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    cv_scores = cross_val_score(
        best_model, X_train_selected, y_train, scoring='roc_auc', cv=5
    )
    cv_mean, cv_std = cv_scores.mean(), cv_scores.std()

    # Log the chosen hyperparameters together with the run's metadata.
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_params({
        "n_features": X_train_selected.shape[1],
        "feature_engineering": "yes",
    })

    # Log all evaluation metrics in one call.
    mlflow.log_metrics({
        "test_roc_auc": roc_auc,
        "cv_roc_auc_mean": cv_mean,
        "cv_roc_auc_std": cv_std,
    })

    # Store the fitted pipeline as a run artifact.
    mlflow.sklearn.log_model(best_model, "model")

    print(f"\nROC-AUC no teste: {roc_auc:.4f}")
    print(f"ROC-AUC CV (média ± std): {cv_mean:.4f} ± {cv_std:.4f}")

## 8. Avaliação Detalhada

### Tarefa 6: Analise o desempenho do modelo

In [None]:
# Text report of the per-class metrics on the test split, labelled with the
# dataset's class names.
report = classification_report(y_test, y_pred, target_names=data.target_names)
print("=== RELATÓRIO DE CLASSIFICAÇÃO ===", report, sep="\n")

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
cm = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap with the class names on both axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=data.target_names,
    yticklabels=data.target_names,
    ax=ax,
)
ax.set_title('Matriz de Confusão')
ax.set_ylabel('Valor Real')
ax.set_xlabel('Valor Predito')
fig.tight_layout()
# NOTE(review): hard-coded POSIX path — saving will fail where /tmp does not exist.
fig.savefig('/tmp/confusion_matrix.png')
plt.show()

In [None]:
# Points of the ROC curve computed from the positive-class probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.3f})', linewidth=2)
# Diagonal reference line for a classifier that guesses at random.
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
ax.grid(alpha=0.3)
fig.tight_layout()
# NOTE(review): hard-coded POSIX path — saving will fail where /tmp does not exist.
fig.savefig('/tmp/roc_curve.png')
plt.show()

## 9. Feature Importance

### Tarefa 7: Analise a importância das features

In [None]:
# Importances come from the fitted random forest inside the best pipeline and
# are paired with the names of the columns kept by the selector.
feature_importance = best_model.named_steps['classifier'].feature_importances_
importance_df = (
    pd.DataFrame({'feature': selected_features, 'importance': feature_importance})
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart of the 15 highest-ranked features.
top_features = importance_df.head(15)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
ax.set_title('Top 15 Features Mais Importantes')
ax.set_xlabel('Importância')
ax.set_ylabel('Feature')
fig.tight_layout()
# NOTE(review): hard-coded POSIX path — saving will fail where /tmp does not exist.
fig.savefig('/tmp/feature_importance.png')
plt.show()

print("\nTop 10 Features:", importance_df.head(10), sep="\n")

## 10. Comparação: Modelo Baseline vs Engenharia de Features

### Tarefa 8: Compare com um modelo sem feature engineering

In [None]:
# Baseline on the original (non-engineered) columns. The labels are unpacked
# from this very split, so features and targets stay aligned even if the
# split settings used for the engineered data are changed later.
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

with mlflow.start_run(run_name="baseline_no_feature_engineering"):
    # Simple pipeline: scaling followed by a random forest with fixed
    # hyperparameters (no tuning, no feature selection).
    baseline_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(
            n_estimators=100, max_depth=10, random_state=42
        ))
    ])

    # Fit on the baseline's own training split.
    baseline_pipeline.fit(X_train_orig, y_train_orig)

    # Evaluate on the matching held-out split.
    y_pred_baseline = baseline_pipeline.predict(X_test_orig)
    y_pred_proba_baseline = baseline_pipeline.predict_proba(X_test_orig)[:, 1]
    roc_auc_baseline = roc_auc_score(y_test_orig, y_pred_proba_baseline)

    # Log the same kind of metadata as the optimized run so both runs can be
    # compared side by side in MLflow.
    mlflow.log_param("feature_engineering", "no")
    mlflow.log_param("n_features", X_train_orig.shape[1])
    mlflow.log_metric("test_roc_auc", roc_auc_baseline)
    mlflow.sklearn.log_model(baseline_pipeline, "model")

    print(f"Baseline ROC-AUC: {roc_auc_baseline:.4f}")

# NOTE(review): the baseline uses fixed hyperparameters and every original
# column, while the other model was tuned by grid search after feature
# selection — the difference below is therefore not attributable to feature
# engineering alone.
print("\n=== COMPARAÇÃO ===")
print(f"Baseline (sem FE): {roc_auc_baseline:.4f}")
print(f"Com Feature Engineering: {roc_auc:.4f}")
print(f"Melhoria: {((roc_auc - roc_auc_baseline) / roc_auc_baseline * 100):.2f}%")

## 11. Exercícios Adicionais

### Desafios para Praticar:

1. **Experimente outros scalers**: Teste RobustScaler e MinMaxScaler
2. **PCA**: Aplique PCA para redução de dimensionalidade e compare resultados
3. **Ensemble Methods**: Combine múltiplos modelos usando VotingClassifier
4. **Feature Engineering Avançado**: Crie features usando domain knowledge
5. **Calibração**: Use CalibratedClassifierCV para melhorar probabilidades
6. **Análise de Erro**: Identifique padrões nos casos mal classificados

### Questões para Reflexão:

1. Quais das features criadas na etapa de feature engineering foram mais importantes?
2. O feature engineering melhorou significativamente o desempenho?
3. Como você validaria que não há data leakage nas novas features?
4. Quais outras técnicas de feature engineering você aplicaria?