In [2]:
# --- Advanced Techniques for Imbalanced Learning Notebook ---

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE # Import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline # Import Pipeline from imblearn

# ==============================================================================
# --- 1. Load and Preprocess the Data ---
# ==============================================================================
print("--- Loading and Preprocessing Data ---")

# Load the raw dataset
file_path = '../data/raw/leish_dataset.csv'
df = pd.read_csv(file_path)

# Work on a copy so the raw frame `df` is left exactly as loaded
df_processed = df.copy()

# Handle Missing Values: every object-dtype (text) column gets an explicit
# 'Unknown' category in place of NaN. Non-object columns are not touched here.
for col in df_processed.select_dtypes(include=['object']).columns:
    df_processed[col] = df_processed[col].fillna('Unknown')

# Encode the Target Variable
# NOTE(review): 'Unknown' (which includes missing diagnoses filled in above) is
# encoded as 0, so unlabeled rows are trained and evaluated as negatives.
# Confirm this is intended rather than dropping those rows.
target_map = {'positivo': 1, 'negativo': 0, 'Unknown': 0}
diagnosis_encoded = df_processed['diagnosis'].map(target_map)

# Labels that are not keys of target_map come back from .map() as NaN. Fail
# here and name the offending values, instead of letting astype(int) raise an
# opaque "cannot convert NaN to integer" error.
unmapped_mask = diagnosis_encoded.isna()
if unmapped_mask.any():
    unexpected_labels = sorted(
        df_processed.loc[unmapped_mask, 'diagnosis'].astype(str).unique()
    )
    raise ValueError(
        f"Unexpected values in 'diagnosis' column: {unexpected_labels}. "
        f"Expected one of {list(target_map)}."
    )
df_processed['diagnosis'] = diagnosis_encoded.astype(int)

# Separate features from the target BEFORE encoding features, so the target
# column is never one-hot encoded along with the predictors
X_categorical = df_processed.drop(columns='diagnosis')
y_numeric = df_processed['diagnosis']  # integer-encoded target (0/1)

# Apply One-Hot Encoding to categorical features; drop_first=True omits the
# first level of each encoded column, dtype=int yields 0/1 integers
X_numeric = pd.get_dummies(X_categorical, drop_first=True, dtype=int)

print("--- Data Preprocessing Complete ---")
print(f"Shape of numerical features (X): {X_numeric.shape}")

# ==============================================================================
# --- 2. Split Data AFTER Preprocessing ---
# ==============================================================================
# Now we split the fully numeric X_numeric and y_numeric
# Hold out 20% of the rows as a test set. Stratifying on the target keeps the
# class proportions approximately equal in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric,
    y_numeric,
    test_size=0.2,
    stratify=y_numeric,
    random_state=42,
)

# Class balance of the training partition, measured before any resampling
n_positive_train = y_train.sum()
pct_positive_train = y_train.mean() * 100

print("\n--- Data Split ---")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Original positive cases in training set: {n_positive_train} ({pct_positive_train:.1f}%)")

# ==============================================================================
# --- 3. Apply SMOTE within Pipelines ---
# ==============================================================================
print("\n--- Applying SMOTE within Pipelines ---")

# NOTE(review): the features come from pd.get_dummies, so they include 0/1
# indicator columns. SMOTE builds synthetic rows by interpolating between
# neighbours, which can give those indicators fractional values. Consider
# SMOTEN / SMOTENC if the features are categorical; confirm with the data owner.


def _train_and_report(model_name, pipeline, X_fit, y_fit, X_eval, y_eval):
    """Fit a resampling pipeline, predict on held-out data and print its metrics.

    Parameters
    ----------
    model_name : str
        Human-readable model label inserted into the printed section headers.
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    X_fit, y_fit
        Training features and labels passed to ``pipeline.fit``.
    X_eval, y_eval
        Held-out features passed to ``pipeline.predict`` and the true labels
        the predictions are compared against.

    Returns
    -------
    Predictions returned by ``pipeline.predict(X_eval)``.
    """
    print(f"\n--- Training {model_name} with SMOTE ---")
    pipeline.fit(X_fit, y_fit)
    y_pred = pipeline.predict(X_eval)

    print(f"\n--- {model_name} (SMOTE) Performance ---")
    print("Confusion Matrix:")
    print(confusion_matrix(y_eval, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_eval, y_pred))
    return y_pred


# --- Model 1: Logistic Regression with SMOTE ---
# SMOTE is a pipeline step rather than a separate pre-processing pass, so the
# oversampling is fitted on whatever data is passed to fit() (X_train here).
pipeline_logreg = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])
y_pred_logreg_smote = _train_and_report(
    "Logistic Regression", pipeline_logreg, X_train, y_train, X_test, y_test
)

# --- Model 2: XGBoost with SMOTE ---
pipeline_xgb = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42
    ))
])
y_pred_xgb_smote = _train_and_report(
    "XGBoost", pipeline_xgb, X_train, y_train, X_test, y_test
)

--- Loading and Preprocessing Data ---
--- Data Preprocessing Complete ---
Shape of numerical features (X): (456, 43)

--- Data Split ---
Training set shape: (364, 43)
Test set shape: (92, 43)
Original positive cases in training set: 109 (29.9%)

--- Applying SMOTE within Pipelines ---

--- Training Logistic Regression with SMOTE ---

--- Logistic Regression (SMOTE) Performance ---
Confusion Matrix:
[[44 21]
 [14 13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.68      0.72        65
           1       0.38      0.48      0.43        27

    accuracy                           0.62        92
   macro avg       0.57      0.58      0.57        92
weighted avg       0.65      0.62      0.63        92


--- Training XGBoost with SMOTE ---

--- XGBoost (SMOTE) Performance ---
Confusion Matrix:
[[45 20]
 [16 11]]

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.69    