In [2]:
# --- Undersampling Experiment Notebook ---

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import RandomUnderSampler # Import the undersampler
from imblearn.pipeline import Pipeline as ImbPipeline

# ==============================================================================
# --- 1. Load and Preprocess the Data ---
# Reads the raw CSV, fills missing categorical values, encodes the target as
# 0/1 and one-hot encodes the features, so that everything is numeric before
# the train/test split.
# ==============================================================================
print("--- Loading and Preprocessing Data ---")

# Load the raw dataset (path is relative to the notebook's working directory)
file_path = '../data/raw/leish_dataset.csv'
df = pd.read_csv(file_path)

# Work on a copy so the raw frame `df` stays untouched and the cell can be re-run
df_processed = df.copy()

# Handle missing values: every object-dtype column gets the placeholder
# 'Unknown'. Columns of any other dtype are left as they are.
object_cols = df_processed.select_dtypes(include=['object']).columns
df_processed[object_cols] = df_processed[object_cols].fillna('Unknown')

# Encode the target variable.
# NOTE(review): rows whose diagnosis was missing (now 'Unknown') are labelled
# as negative (0). Kept as-is to preserve existing results; confirm this is
# intended rather than dropping those rows.
target_map = {'positivo': 1, 'negativo': 0, 'Unknown': 0}
diagnosis_encoded = df_processed['diagnosis'].map(target_map)

# Series.map yields NaN for any label missing from target_map. Report the
# offending labels explicitly instead of failing inside the integer cast.
unmapped_labels = df_processed.loc[diagnosis_encoded.isna(), 'diagnosis'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Unexpected values in 'diagnosis' column: "
        f"{sorted(map(str, unmapped_labels))}. Expected one of {list(target_map)}."
    )
df_processed['diagnosis'] = diagnosis_encoded.astype(int)

# Separate features from the target BEFORE encoding the features, so the
# target column is never one-hot encoded.
X_categorical = df_processed.drop('diagnosis', axis=1)
y_numeric = df_processed['diagnosis']  # integer-encoded target

# One-hot encode the categorical features; drop_first removes one level per
# encoded column, and dtype=int produces 0/1 integers.
X_numeric = pd.get_dummies(X_categorical, drop_first=True, dtype=int)

print("--- Data Preprocessing Complete ---")
print(f"Shape of numerical features (X): {X_numeric.shape}")

# ==============================================================================
# --- 2. Split Data AFTER Preprocessing ---
# ==============================================================================
# Stratified hold-out split of the fully numeric features and target; the
# fixed seed keeps the partition reproducible across runs.
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': y_numeric,
}
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y_numeric, **split_options)

print("\n--- Data Split ---")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
# The target is 0/1 encoded, so sum() counts positives and mean() is their share
print(f"Original positive cases in training set: {y_train.sum()} ({y_train.mean()*100:.1f}%)")


# ==============================================================================
# --- 3. Define and Train RandomForest with RandomUnderSampler ---
# ==============================================================================
print("\n--- Applying RandomUnderSampler within Pipeline ---")

# Hyperparameters carried over from the earlier RandomForest tuning
best_rf_params = {
    'class_weight': 'balanced_subsample',
    'criterion': 'entropy',
    'max_depth': 20,
    'min_samples_leaf': 4,
    'min_samples_split': 10,
    'n_estimators': 300,
}

# class_weight is deliberately not passed to the classifier when undersampling
# is used, so build a copy of the tuned parameters without it.
rf_params_without_weight = {
    name: value
    for name, value in best_rf_params.items()
    if name != 'class_weight'
}

# --- Create the pipeline: undersampling step first, then the forest ---
undersampler_step = ('undersampler', RandomUnderSampler(random_state=42))
classifier_step = (
    'classifier',
    RandomForestClassifier(random_state=42, **rf_params_without_weight),
)
pipeline_rf_rus = ImbPipeline([undersampler_step, classifier_step])

print("\n--- Training RandomForest with RandomUnderSampler ---")
pipeline_rf_rus.fit(X_train, y_train)
y_pred_rf_rus = pipeline_rf_rus.predict(X_test)

# --- 4. Evaluate the Model ---
# Compare the held-out labels with the pipeline's predictions.
print("\n--- RandomForest (RandomUnderSampler) Performance ---")

print("Confusion Matrix:")
rus_confusion = confusion_matrix(y_test, y_pred_rf_rus)
print(rus_confusion)

print("\nClassification Report:")
rus_report = classification_report(y_test, y_pred_rf_rus)
print(rus_report)

--- Loading and Preprocessing Data ---
--- Data Preprocessing Complete ---
Shape of numerical features (X): (456, 43)

--- Data Split ---
Training set shape: (364, 43)
Test set shape: (92, 43)
Original positive cases in training set: 109 (29.9%)

--- Applying RandomUnderSampler within Pipeline ---

--- Training RandomForest with RandomUnderSampler ---

--- RandomForest (RandomUnderSampler) Performance ---
Confusion Matrix:
[[41 24]
 [14 13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.63      0.68        65
           1       0.35      0.48      0.41        27

    accuracy                           0.59        92
   macro avg       0.55      0.56      0.54        92
weighted avg       0.63      0.59      0.60        92

