In [1]:
# --- 12. Final Model Training and Saving ---
# This cell loads, preprocesses, and trains the final champion model
# (EasyEnsembleClassifier n=50) and saves the artifacts for the API.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.ensemble import EasyEnsembleClassifier # Our champion model
import joblib
from pathlib import Path
import time

print("--- Running Final Training and Saving Pipeline ---")

# ==============================================================================
# --- 1. Load and Preprocess Data (Standard Block) ---
# ==============================================================================
# Reads the raw CSV, fills missing values in text columns, encodes the
# 'diagnosis' target as 0/1 and one-hot encodes the remaining columns.
# Produces: df (raw), df_processed, X_categorical, y_numeric, X_numeric.
print("Loading and preprocessing data...")
file_path = '../data/raw/leish_dataset.csv'
df = pd.read_csv(file_path)
df_processed = df.copy()  # work on a copy so the raw frame stays untouched

# Missing values in object (text) columns become an explicit 'Unknown' category.
for col in df_processed.select_dtypes(include=['object']).columns:
    df_processed[col] = df_processed[col].fillna('Unknown')

# NOTE(review): a missing diagnosis ('Unknown') is labelled as negative (0) —
# confirm this is intended rather than dropping those rows.
target_map = {'positivo': 1, 'negativo': 0, 'Unknown': 0}
encoded_target = df_processed['diagnosis'].map(target_map)

# Series.map yields NaN for any label that is not a key of target_map, and
# casting NaN to int fails with an opaque error. Fail early with a message
# that names the offending labels instead.
unmapped_rows = encoded_target.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(
        df_processed.loc[unmapped_rows, 'diagnosis'].astype(str).unique()
    )
    raise ValueError(
        "Unexpected values in 'diagnosis' column: "
        f"{unexpected_labels}; expected one of {sorted(target_map)}"
    )
df_processed['diagnosis'] = encoded_target.astype(int)

X_categorical = df_processed.drop('diagnosis', axis=1)
y_numeric = df_processed['diagnosis']
# One-hot encode; drop_first=True drops the first level of each encoded column.
X_numeric = pd.get_dummies(X_categorical, drop_first=True, dtype=int)
print("Data preprocessing complete.")

# ==============================================================================
# --- 2. Split Data (to get final training columns) ---
# ==============================================================================
# Stratified 80/20 hold-out split with a fixed seed. The training portion also
# provides the feature-name list (X_train.columns) that the API needs when it
# preprocesses incoming data.
split_parts = train_test_split(
    X_numeric,
    y_numeric,
    test_size=0.2,
    random_state=42,
    stratify=y_numeric,
)
X_train, X_test, y_train, y_test = split_parts
print(f"Data split. Training set has {X_train.shape[1]} features.")

# ==============================================================================
# --- 3. Train the Champion Model ---
# ==============================================================================
# Fits the EasyEnsembleClassifier on the training split and reports how long
# the fit took. Produces: final_model (fitted), start_time, end_time.
print("\nTraining the final EasyEnsembleClassifier (n_estimators=50)...")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the model is training.
start_time = time.perf_counter()

# We define our final model with the champion parameters
final_model = EasyEnsembleClassifier(
    n_estimators=50,
    random_state=42,  # fixed seed so repeated runs use the same random state
    n_jobs=1,  # Keep it stable
    verbose=0,  # No need for verbose logging in the final run
)
# We fit the model on the training data
final_model.fit(X_train, y_train)

end_time = time.perf_counter()
print(f"Final model training completed in {end_time - start_time:.2f} seconds.")

# ==============================================================================
# --- 4. Save Model and Column Artifacts ---
# ==============================================================================
print("\n--- Saving the Best Performing Model and Columns ---")

# Output folder for both artifacts; created (including parents) when missing.
model_dir = Path('../models')
model_dir.mkdir(parents=True, exist_ok=True)

# Artifact 1: the fitted classifier.
model_file = model_dir.joinpath('leish_model_v1.joblib')
joblib.dump(final_model, model_file)
print(f"Model successfully saved to: {model_file}")

# Artifact 2: the training feature names as a plain list.
# This is CRITICAL for the API to preprocess new data correctly.
columns_file = model_dir.joinpath('training_columns_v1.joblib')
training_columns = X_train.columns.tolist()
joblib.dump(training_columns, columns_file)
print(f"Training columns (total: {len(training_columns)}) saved to: {columns_file}")

print("\n--- Artifacts saved. IA Model phase is complete. ---")

--- Running Final Training and Saving Pipeline ---
Loading and preprocessing data...
Data preprocessing complete.
Data split. Training set has 43 features.

Training the final EasyEnsembleClassifier (n_estimators=50)...
Final model training completed in 2.11 seconds.

--- Saving the Best Performing Model and Columns ---
Model successfully saved to: ../models/leish_model_v1.joblib
Training columns (total: 43) saved to: ../models/training_columns_v1.joblib

--- Artifacts saved. IA Model phase is complete. ---
