In [1]:
# --- ADASYN (Adaptive Synthetic Sampling) Experiment Notebook ---

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import ADASYN # Import ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline
import time

# ==============================================================================
# --- 1. Load and Preprocess the Data (Our Standard Block) ---
# ==============================================================================
# Reads the raw CSV, fills missing categorical values, encodes the binary
# target, and one-hot encodes the features. Produces `X_numeric` (features)
# and `y_numeric` (0/1 target) for the later sections.
print("--- Loading and Preprocessing Data ---")

# Load the raw dataset
file_path = '../data/raw/leish_dataset.csv'
df = pd.read_csv(file_path)

# Work on a copy so `df` keeps the untouched raw data
df_processed = df.copy()

# Handle Missing Values
# NOTE(review): only object-dtype columns are filled here; NaNs in numeric
# columns (if the dataset has any) pass through unchanged.
for col in df_processed.select_dtypes(include=['object']).columns:
    df_processed[col] = df_processed[col].fillna('Unknown')

# Encode the Target Variable
# NOTE(review): rows whose diagnosis was missing were filled with 'Unknown'
# above and are therefore labelled as negatives (0) by this map — confirm this
# is intended rather than dropping those rows.
target_map = {'positivo': 1, 'negativo': 0, 'Unknown': 0}
diagnosis_encoded = df_processed['diagnosis'].map(target_map)

# Series.map yields NaN for labels missing from the map, and casting NaN to int
# fails with an error that does not name the offending label. Fail early with
# a message that lists the unexpected values instead.
unmapped_mask = diagnosis_encoded.isna()
if unmapped_mask.any():
    unexpected_labels = sorted(
        df_processed.loc[unmapped_mask, 'diagnosis'].astype(str).unique()
    )
    raise ValueError(
        f"Unexpected 'diagnosis' labels {unexpected_labels}; "
        f"expected one of {sorted(target_map)}"
    )
df_processed['diagnosis'] = diagnosis_encoded.astype(int)

# Separate features from the target BEFORE encoding features
X_categorical = df_processed.drop('diagnosis', axis=1)
y_numeric = df_processed['diagnosis']

# Apply One-Hot Encoding to categorical features
# (drop_first=True drops one dummy level per encoded column)
X_numeric = pd.get_dummies(X_categorical, drop_first=True, dtype=int)

print("--- Data Preprocessing Complete ---")
print(f"Shape of numerical features (X): {X_numeric.shape}")

# ==============================================================================
# --- 2. Split Data AFTER Preprocessing ---
# ==============================================================================
# Hold out 20% of the rows as a test set. The split is stratified on the target
# and uses a fixed seed so it is repeatable across runs.
split_options = dict(test_size=0.2, random_state=42, stratify=y_numeric)
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y_numeric, **split_options
)

print("\n--- Data Split ---")
# The target is 0/1, so the sum is the positive count and the mean is the
# positive fraction of the training set.
positive_count = y_train.sum()
positive_pct = y_train.mean() * 100
print(f"Original positive cases in training set: {positive_count} ({positive_pct:.1f}%)")

# ==============================================================================
# --- 3. Define Models and Apply ADASYN using a Pipeline ---
# ==============================================================================
def _fit_and_report_adasyn(pipeline, model_name, X_fit, y_fit, X_eval, y_eval):
    """Fit an ADASYN pipeline, predict on the evaluation set, and print metrics.

    Parameters
    ----------
    pipeline : estimator exposing ``fit`` and ``predict``
        The sampler + classifier pipeline to train.
    model_name : str
        Human-readable model name used in the printed section headers.
    X_fit, y_fit
        Training features and labels passed to ``pipeline.fit``.
    X_eval, y_eval
        Held-out features passed to ``pipeline.predict`` and the true labels
        the predictions are scored against.

    Returns
    -------
    The predictions returned by ``pipeline.predict(X_eval)``.
    """
    print(f"\n--- Training {model_name} with ADASYN ---")
    pipeline.fit(X_fit, y_fit)
    y_pred = pipeline.predict(X_eval)

    print(f"\n--- {model_name} (ADASYN) Performance ---")
    print("Confusion Matrix:")
    print(confusion_matrix(y_eval, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_eval, y_pred))
    return y_pred


print("\n--- Applying ADASYN within Pipelines ---")
# perf_counter is a monotonic clock intended for measuring elapsed time; unlike
# the wall clock it is not affected by system clock adjustments.
start_time = time.perf_counter()

# --- Model 1: Logistic Regression with ADASYN ---
# The sampler is a pipeline step, so oversampling happens as part of fitting on
# the training data; the classifier itself is left without class_weight.
pipeline_logreg_adasyn = ImbPipeline([
    ('sampler', ADASYN(random_state=42)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)) # No class_weight
])
y_pred_logreg_adasyn = _fit_and_report_adasyn(
    pipeline_logreg_adasyn, "Logistic Regression", X_train, y_train, X_test, y_test
)

# --- Model 2: XGBoost with ADASYN ---
pipeline_xgb_adasyn = ImbPipeline([
    ('sampler', ADASYN(random_state=42)),
    ('classifier', xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        # scale_pos_weight is NOT needed
        random_state=42
    ))
])
y_pred_xgb_adasyn = _fit_and_report_adasyn(
    pipeline_xgb_adasyn, "XGBoost", X_train, y_train, X_test, y_test
)

end_time = time.perf_counter()
print(f"\n--- ADASYN experiments completed in {end_time - start_time:.2f} seconds. ---")

--- Loading and Preprocessing Data ---
--- Data Preprocessing Complete ---
Shape of numerical features (X): (456, 43)

--- Data Split ---
Original positive cases in training set: 109 (29.9%)

--- Applying ADASYN within Pipelines ---

--- Training Logistic Regression with ADASYN ---

--- Logistic Regression (ADASYN) Performance ---
Confusion Matrix:
[[41 24]
 [14 13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.63      0.68        65
           1       0.35      0.48      0.41        27

    accuracy                           0.59        92
   macro avg       0.55      0.56      0.54        92
weighted avg       0.63      0.59      0.60        92


--- Training XGBoost with ADASYN ---

--- XGBoost (ADASYN) Performance ---
Confusion Matrix:
[[44 21]
 [15 12]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.68      0.71        65
           1       0.36      0.44