In [2]:
# --- 13. Feature Selection Experiment ---
# Goal: Train a model using ONLY the 15 clinical features from the form.
# Target: Binary Severity (0=Mild, 1=Sintomático)
# Model: Logistic Regression (Balanced)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import joblib
from pathlib import Path

# ==============================================================================
# --- 1. Load Data, Model and Preprocess (Standard Block) ---
# ==============================================================================
print("--- Loading and Preprocessing Data ---")
file_path = '../data/raw/leish_dataset.csv'
df = pd.read_csv(file_path)
df_processed = df.copy()
# ... (código de pré-processamento, limpeza de NaNs, etc.)
for col in df_processed.select_dtypes(include=['object']).columns:
    df_processed[col] = df_processed[col].fillna('Unknown')
target_map = {'positivo': 1, 'negativo': 0, 'Unknown': 0}
df_processed['diagnosis'] = df_processed['diagnosis'].map(target_map).astype(int)
X_categorical = df_processed.drop('diagnosis', axis=1)
y_numeric = df_processed['diagnosis']
X_numeric = pd.get_dummies(X_categorical, drop_first=True, dtype=int)
print("Data preprocessing complete.")

# --- 2. Get Training Columns ---
X_train_cols, _, y_train_cols, _ = train_test_split(X_numeric, y_numeric, test_size=0.2, random_state=42, stratify=y_numeric)
training_cols = X_train_cols.columns.tolist()
X_aligned = X_numeric.reindex(columns=training_cols, fill_value=0)
print(f"Data aligned to {len(training_cols)} training columns.")

# ==============================================================================
# --- 3. Get Feature Importances (from our previous RF model) ---
# ==============================================================================
# We need to load the model that gave us the feature importances
# Let's re-train the Unbalanced RF model from notebook 11 to get them
from sklearn.ensemble import RandomForestClassifier
rf_unbalanced = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_unbalanced.fit(X_aligned, y_numeric)
importances = rf_unbalanced.feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': training_cols,
    'importance': importances
}).sort_values(by='importance', ascending=False)

# --- Define our list of TOP features ---
# Let's select the Top 15 features based on our previous analysis
TOP_FEATURES = feature_importance_df.head(15)['feature'].tolist()

print(f"\n--- Selected Top 15 Features ---")
print(TOP_FEATURES)

# ==============================================================================
# --- 4. Split Data using ONLY Top Features ---
# ==============================================================================
X_top_features = X_aligned[TOP_FEATURES]
y = y_numeric

X_train, X_test, y_train, y_test = train_test_split(
    X_top_features, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\n--- Data Split using Top Features ---")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# ==============================================================================
# --- 5. Train our OLD Champion (Balanced LR) on the NEW Top Features ---
# ==============================================================================
print("\n--- Training BALANCED Logistic Regression on TOP FEATURES ---")
lr_balanced_top_features = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced'
)
lr_balanced_top_features.fit(X_train, y_train)
y_pred = lr_balanced_top_features.predict(X_test)

print("\n--- Model Performance (Balanced LR + Top Features) ---")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ==============================================================================
# --- 6. Run the "Normal Patient" Test (Check Calibration) ---
# ==============================================================================
print("\n--- Running 'Normal Patient' Test (Top Features Model) ---")
normal_patient_data = {
    'general_state': 'Bom', 'ectoparasites': 'Ausente', 'nutritional_state': 'Adequado/Eutrófico',
    'coat': 'Normal', 'nails': 'Normal', 'mucosa_color': 'Normal (Rosa-claro)',
    'muzzle_ear_lesion': 'Ausente', 'lymph_nodes': 'Normal', 'blepharitis': 'Ausente',
    'conjunctivitis': 'Ausente', 'alopecia': 'Ausente', 'bleeding': 'Ausente',
    'skin_lesion': 'Ausente', 'muzzle_lip_depigmentation': 'Ausente',
    'animal_sex': 'M', 'breed_name': 'SRD'
}
input_df = pd.DataFrame([normal_patient_data])
input_encoded = pd.get_dummies(input_df)
final_df = input_encoded.reindex(columns=training_cols, fill_value=0)
final_df_top_features = final_df[TOP_FEATURES] # Select only the top features

probabilities_normal = lr_balanced_top_features.predict_proba(final_df_top_features)[0]
print(f"Probabilities for 'Normal' patient (Balanced LR + Top Features):")
print(f"  P(Negativo): {(probabilities_normal[0] * 100):.1f}%")
print(f"  P(Positivo): {(probabilities_normal[1] * 100):.1f}%")

--- Loading and Preprocessing Data ---
Data preprocessing complete.
Data aligned to 43 training columns.

--- Selected Top 15 Features ---
['ectoparasites_leve', 'muzzle_lip_depigmentation_presente', 'animal_sex_M', 'muzzle_ear_lesion_presente', 'nails_normal', 'nutritional_state_leve_moderado', 'lymph_nodes_normal', 'alopecia_presente', 'skin_lesion_Leve/Moderada', 'ectoparasites_grave', 'skin_lesion_Grave/Generalizada', 'mucosa_color_levemente_hipercorada', 'coat_normal', 'coat_leves_moderadas', 'conjunctivitis_Conjuntivite Leve']

--- Data Split using Top Features ---
Training set shape: (364, 15)
Test set shape: (92, 15)

--- Training BALANCED Logistic Regression on TOP FEATURES ---

--- Model Performance (Balanced LR + Top Features) ---
Confusion Matrix:
[[43 22]
 [12 15]]

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.66      0.72        65
           1       0.41      0.56      0.47        27

    accuracy           