In [3]:
# --- 07-Ensemble_Techniques.ipynb (Light Version) ---

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.ensemble import EasyEnsembleClassifier 
import time

# ==============================================================================
# --- 1. Load and Preprocess the Data (Standard Block) ---
# ==============================================================================
# Reads the raw CSV, fills missing values in object-typed columns with the
# placeholder 'Unknown', encodes the 'diagnosis' target as 0/1, and one-hot
# encodes the remaining categorical feature columns into X_numeric.
print("--- Loading and Preprocessing Data ---")

file_path = '../data/raw/leish_dataset.csv'
df = pd.read_csv(file_path)

# Work on a copy so the raw frame `df` stays untouched.
df_processed = df.copy()
for col in df_processed.select_dtypes(include=['object']).columns:
    df_processed[col] = df_processed[col].fillna('Unknown')

# NOTE(review): a missing diagnosis ('Unknown') is encoded as negative (0);
# confirm this labelling is intended rather than dropping those rows.
target_map = {'positivo': 1, 'negativo': 0, 'Unknown': 0}
diagnosis_encoded = df_processed['diagnosis'].map(target_map)

# Any label absent from target_map becomes NaN after .map(). Fail here with
# the offending values instead of letting astype(int) raise an opaque
# "cannot convert non-finite values" error.
unmapped_labels = df_processed.loc[diagnosis_encoded.isna(), 'diagnosis'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected 'diagnosis' labels not present in target_map: {list(unmapped_labels)}"
    )
df_processed['diagnosis'] = diagnosis_encoded.astype(int)

X_categorical = df_processed.drop('diagnosis', axis=1)
y_numeric = df_processed['diagnosis']
X_numeric = pd.get_dummies(X_categorical, drop_first=True, dtype=int)

print("--- Data Preprocessing Complete ---")
print(f"Shape of numerical features (X): {X_numeric.shape}")

# ==============================================================================
# --- 2. Split Data AFTER Preprocessing ---
# ==============================================================================
# Stratified hold-out split (20% test) on the encoded features; the seed is
# fixed so the partition is the same on every run.
split_parts = train_test_split(
    X_numeric,
    y_numeric,
    test_size=0.2,
    random_state=42,
    stratify=y_numeric,
)
X_train, X_test, y_train, y_test = split_parts

print("\n--- Data Split ---")
print(f"Original positive cases in training set: {y_train.sum()} ({y_train.mean()*100:.1f}%)")

# ==============================================================================
# --- 3. Train and Evaluate EasyEnsembleClassifier (Light Version) ---
# ==============================================================================
# Fits an EasyEnsembleClassifier on the training split, reports how long the
# fit took, and produces predictions for the test split in `y_pred_eec`.
print("\n--- Training EasyEnsembleClassifier (Light Version) ---")

# NOTE(review): earlier comments in this cell described a reduction from 10
# to 5 estimators, but the value actually passed is 50. The value is left
# unchanged here; confirm which setting is intended for the "light" run.
eec = EasyEnsembleClassifier(
    n_estimators=50,
    random_state=42,
    n_jobs=1,
    verbose=1
)

# Time only the fit so the figure printed below really is training time
# (previously the prediction step was included in the measured window).
# perf_counter() is the clock intended for measuring elapsed intervals.
start_time = time.perf_counter()
eec.fit(X_train, y_train)
end_time = time.perf_counter()

y_pred_eec = eec.predict(X_test)
print(f"\nTraining completed in {end_time - start_time:.2f} seconds.")

# --- 4. Evaluate the Model ---
# Score the test-set predictions: the confusion matrix first, then the
# classification report, each printed under its own heading.
test_confusion = confusion_matrix(y_test, y_pred_eec)
test_report = classification_report(y_test, y_pred_eec)

print("\n--- EasyEnsembleClassifier (Light) Performance ---")
print("Confusion Matrix:")
print(test_confusion)
print("\nClassification Report:")
print(test_report)

--- Loading and Preprocessing Data ---
--- Data Preprocessing Complete ---
Shape of numerical features (X): (456, 43)

--- Data Split ---
Original positive cases in training set: 109 (29.9%)

--- Training EasyEnsembleClassifier (Light Version) ---

Training completed in 2.34 seconds.

--- EasyEnsembleClassifier (Light) Performance ---
Confusion Matrix:
[[42 23]
 [ 9 18]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.65      0.72        65
           1       0.44      0.67      0.53        27

    accuracy                           0.65        92
   macro avg       0.63      0.66      0.63        92
weighted avg       0.71      0.65      0.67        92



[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
