In [3]:
# --- Model Training and Evaluation Notebook ---

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import models and metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ==============================================================================
# --- 1. Load and Preprocess the Data ---
# ==============================================================================
# Reads the raw CSV, removes rows that have no usable label, fills missing
# categorical values, encodes the target as 0/1 and one-hot encodes the features.
# Produces: df (raw), df_processed, X_categorical, y, X_numeric.
print("--- Loading and Preprocessing Data ---")

# Load the raw dataset created by our loader script
file_path = '../data/raw/leish_dataset.csv'
df = pd.read_csv(file_path)

# Work on a copy so the raw frame stays untouched for later inspection
df_processed = df.copy()

# Rows without a recorded diagnosis carry no label information. Encoding them
# as class 0 would silently inject false negatives into the training data, so
# they are excluded instead. 'Unknown' is the same sentinel used below for
# missing categorical values.
unlabelled = df_processed['diagnosis'].isna() | df_processed['diagnosis'].eq('Unknown')
if unlabelled.any():
    print(f"Dropping {int(unlabelled.sum())} rows with no recorded diagnosis")
df_processed = df_processed.loc[~unlabelled].copy()

# Handle missing values in the remaining text (object) columns
for col in df_processed.select_dtypes(include=['object']).columns:
    df_processed[col] = df_processed[col].fillna('Unknown')

# Encode the target variable: positive -> 1, negative -> 0
target_map = {'positivo': 1, 'negativo': 0}
encoded_target = df_processed['diagnosis'].map(target_map)

# Series.map yields NaN for labels missing from the mapping; fail with a clear
# message instead of an opaque NaN-to-int cast error.
unmapped = encoded_target.isna()
if unmapped.any():
    unexpected_labels = sorted(df_processed.loc[unmapped, 'diagnosis'].astype(str).unique())
    raise ValueError(f"Unexpected diagnosis labels: {unexpected_labels}")
df_processed['diagnosis'] = encoded_target.astype(int)

# Separate features from the target before encoding
X_categorical = df_processed.drop('diagnosis', axis=1)
y = df_processed['diagnosis']

# One-hot encode the categorical features; drop_first removes one level per
# feature to avoid a redundant (perfectly collinear) column.
X_numeric = pd.get_dummies(X_categorical, drop_first=True, dtype=int)

print("--- Data Preprocessing Complete ---")
print(f"Shape of numerical features (X): {X_numeric.shape}")

# ==============================================================================
# --- 2. Split the Data into Training and Test sets ---
# ==============================================================================
# Hold out 20% of the encoded rows for testing. Stratifying on y keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Summarise the resulting partitions in a single call
print(
    "\n--- Data Split ---",
    f"Training set shape: {X_train.shape}",
    f"Test set shape: {X_test.shape}",
    f"Positive cases in test set: {y_test.sum()} out of {len(y_test)}",
    sep="\n",
)

# ==============================================================================
# --- Shared helper: fit a model and print its test-set metrics ---
# ==============================================================================
def fit_and_report(model, label, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model``, predict on the evaluation split and print its metrics.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    label : str
        Human-readable model name used in the printed section header.
    X_fit, y_fit :
        Features and labels the model is trained on.
    X_eval, y_eval :
        Features and true labels used for evaluation.

    Returns
    -------
    The predictions produced by ``model.predict(X_eval)``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)

    print(f"\n--- {label} Performance ---")
    print("Confusion Matrix:")
    print(confusion_matrix(y_eval, predictions))
    print("\nClassification Report:")
    print(classification_report(y_eval, predictions))
    return predictions


# ==============================================================================
# --- 3. Model 1: Logistic Regression (Balanced) ---
# ==============================================================================
print("\n--- Training Model 1: Logistic Regression (Balanced) ---")
log_reg_balanced = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced'
)
y_pred_log_reg = fit_and_report(
    log_reg_balanced, "Logistic Regression", X_train, y_train, X_test, y_test
)

# ==============================================================================
# --- 4. Model 2: Random Forest (Optimized) ---
# ==============================================================================
print("\n--- Training Model 2: Random Forest (Optimized) ---")
# Using the best parameters found during our hyperparameter tuning
best_rf_params = {
    'class_weight': 'balanced_subsample', 'criterion': 'entropy',
    'max_depth': 20, 'min_samples_leaf': 4,
    'min_samples_split': 10, 'n_estimators': 300
}
rf_clf_best = RandomForestClassifier(random_state=42, **best_rf_params)
y_pred_rf = fit_and_report(
    rf_clf_best, "Random Forest", X_train, y_train, X_test, y_test
)

# ==============================================================================
# --- 5. Model 3: XGBoost Classifier ---
# ==============================================================================
print("\n--- Training Model 3: XGBoost Classifier ---")

# Ratio of negative to positive training labels, used by XGBoost to up-weight
# the positive class. The class counts are computed once and reused.
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]
print(f"Calculated scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")

xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    # use_label_encoder=False is deprecated and has been removed
    random_state=42
)
y_pred_xgb = fit_and_report(
    xgb_clf, "XGBoost", X_train, y_train, X_test, y_test
)

--- Loading and Preprocessing Data ---
--- Data Preprocessing Complete ---
Shape of numerical features (X): (456, 43)

--- Data Split ---
Training set shape: (364, 43)
Test set shape: (92, 43)
Positive cases in test set: 27 out of 92

--- Training Model 1: Logistic Regression (Balanced) ---

--- Logistic Regression Performance ---
Confusion Matrix:
[[45 20]
 [11 16]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.69      0.74        65
           1       0.44      0.59      0.51        27

    accuracy                           0.66        92
   macro avg       0.62      0.64      0.63        92
weighted avg       0.70      0.66      0.67        92


--- Training Model 2: Random Forest (Optimized) ---

--- Random Forest Performance ---
Confusion Matrix:
[[44 21]
 [14 13]]

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.68      0.72        65
           1       0.

In [4]:
# --- 6. Save the Best Model ---
import joblib
from pathlib import Path

print("\n--- Saving the Best Performing Model ---")

# Destination directory for all persisted artifacts; created on demand,
# including any missing parent directories.
model_dir = Path('../models')
model_dir.mkdir(parents=True, exist_ok=True)

# The balanced Logistic Regression is the model chosen for persistence
best_model = log_reg_balanced
model_file = model_dir.joinpath('leish_model_v1.joblib')
joblib.dump(best_model, model_file)

print(f"Model successfully saved to: {model_file}")

# Persist the ordered list of training feature columns next to the model so
# that code preparing new data can reproduce the same column layout.
training_columns = X_train.columns.to_list()
columns_file = model_dir.joinpath('training_columns_v1.joblib')
joblib.dump(training_columns, columns_file)

print(f"Training columns saved to: {columns_file}")


--- Saving the Best Performing Model ---
Model successfully saved to: ../models/leish_model_v1.joblib
Training columns saved to: ../models/training_columns_v1.joblib


In [5]:
# --- 12. Hyperparameter Tuning for XGBoost ---
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
import time

print("--- Starting Hyperparameter Tuning for XGBoost ---")

# Negative-to-positive ratio of the training labels, passed to XGBoost as
# scale_pos_weight to compensate for the class imbalance.
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

# Search space: 3 * 3 * 2 * 2 * 2 = 72 parameter combinations.
# With 5-fold cross-validation this amounts to 72 * 5 = 360 model fits.
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1],
    gamma=[0, 0.1],
    subsample=[0.8, 1.0],
)

# Base estimator handed to the grid search
xgb_clf = xgb.XGBClassifier(
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    objective='binary:logistic',
)

# Exhaustive search scored on recall, run sequentially (n_jobs=1) with
# per-fit progress logging (verbose=2).
grid_search_xgb = GridSearchCV(
    xgb_clf,
    param_grid_xgb,
    scoring='recall',
    cv=5,
    n_jobs=1,
    verbose=2,
)

# Run the search and time it with wall-clock timestamps
print("GridSearchCV configured. Starting the fit process...")
start_time = time.time()
grid_search_xgb.fit(X_train, y_train)
end_time = time.time()
print(f"\nGridSearch fitting completed in {end_time - start_time:.2f} seconds.")


print("\n--- Hyperparameter Tuning Complete ---")
print("Best parameters found for XGBoost: ", grid_search_xgb.best_params_)

# Evaluate the best estimator from the search on the held-out test split
best_xgb_model = grid_search_xgb.best_estimator_
y_pred_best_xgb = best_xgb_model.predict(X_test)

print("\n--- Optimized XGBoost Model Performance ---", "\nNew Confusion Matrix:", sep="\n")
print(confusion_matrix(y_test, y_pred_best_xgb))

print("\nNew Classification Report:")
print(classification_report(y_test, y_pred_best_xgb))

--- Starting Hyperparameter Tuning for XGBoost ---
GridSearchCV configured. Starting the fit process...
Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.4s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.0s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END g