In [1]:
import os

file_path = '/content/drive/MyDrive/mundos_risk_directory/leads.csv'

if os.path.exists(file_path):
    print(f"The file '{file_path}' exists.")
else:
    print(f"The file '{file_path}' does not exist.")

The file '/content/drive/MyDrive/mundos_risk_directory/leads.csv' exists.


In [2]:
# Install uv
!pip install uv

# Install necessary libraries using uv
!uv pip install pandas scikit-learn optuna transformers torch plotly kaleido --system

Collecting uv
  Downloading uv-0.8.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.8.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m110.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.8.11
[2mUsing Python 3.11.13 environment at: /usr[0m
[2K[2mResolved [1m60 packages[0m [2min 403ms[0m[0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/18)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/18)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/18)
[2mcolorlog            [0m [32m[2m------------------------------[0m[0m     0 B/11.16 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/18)
[2mcolorlog            [0m [32m------------------------------[2m[0m[0m 11.16 KiB/11.16 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/18)
[2mcolorlog            [0m [3

In [2]:
import os
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, log_loss, precision_recall_fscore_support, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import optuna
import torch
from transformers import AutoModel, AutoTokenizer
import plotly.express as px
import plotly.figure_factory as ff
import joblib # for saving the model

# Define the main directory
MAIN_DIR = '/content/drive/MyDrive/mundos_risk_directory'
os.makedirs(MAIN_DIR, exist_ok=True)

# Define file paths based on the main directory and plan placeholders
DATA_FILE = os.path.join(MAIN_DIR, 'leads.csv') # Assuming leads.csv is the DATA_FILE
DATA_INTEGRITY_REPORT = os.path.join(MAIN_DIR, 'data_integrity_report.json')
EMBEDDINGS_FILE = os.path.join(MAIN_DIR, 'embeddings_interest.npz')
SENTIMENT_FILE = os.path.join(MAIN_DIR, 'sentiment_interest.csv')
# MODEL_PICKLE_FILE = os.path.join(MAIN_DIR, 'best_interest_pipeline_1.pkl')
ARTIFACTS_DIR = os.path.join(MAIN_DIR, 'artifacts')
os.makedirs(ARTIFACTS_DIR, exist_ok=True)
BASELINE_MODEL_FILE = os.path.join(ARTIFACTS_DIR, f'baseline_interest_{pd.Timestamp.now().strftime("%Y%m%dT%H%M%SZ")}.pkl')
METRICS_FILE = os.path.join(ARTIFACTS_DIR, f'metrics_interest_{pd.Timestamp.now().strftime("%Y%m%dT%H%M%SZ")}.json')
PCA_RESULTS_FILE = os.path.join(MAIN_DIR, 'pca_interest_results.json')
TEST_RESULTS_FILE = os.path.join(MAIN_DIR, 'test_results_multiclass.json')
README_FILE = os.path.join(MAIN_DIR, 'README.md')
PROJECT_OVERVIEW_FILE = os.path.join(MAIN_DIR, 'PROJECT_OVERVIEW.md')


# Define constants from the plan
ID_COL = 'lead_id'
TEXT_COLS = ['customer_summary', 'agent_summary']
LABEL_COL = 'interest_label'
CLASS_NAMES = ["no_interest", "mild_interest", "strong_interest"]
N_CLASSES = 3

### Step 1: Data Validation

Load the dataset and perform initial checks for missing columns, duplicate IDs, nulls in text columns, label space, and sample counts per class.

In [3]:
# Step 1: Data Validation
print("Starting Data Validation...")

# Load the dataset
try:
    df = pd.read_csv(DATA_FILE)
    print(f"Successfully loaded data from {DATA_FILE}. Shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: The file {DATA_FILE} was not found.")
    # Exit or handle the error appropriately
    # For a notebook, you might want to inform the user and stop execution here
    raise # Re-raise the exception to stop execution if file is not found
except Exception as e:
    print(f"Error loading data: {e}")
    raise # Re-raise other exceptions as well

# Rename 'stage' column to 'interest_label'
if 'stage' in df.columns:
    df.rename(columns={'stage': LABEL_COL}, inplace=True)
    print(f"Renamed 'stage' column to '{LABEL_COL}'.")
else:
    print("Warning: 'stage' column not found. Proceeding with existing columns.")


# Initialize data integrity report
integrity_report = {
    "file_path": DATA_FILE,
    "shape": {
        "rows": df.shape[0],
        "columns": df.shape[1]
    },
    "checks": {}
}

# Check for essential columns
required_cols = [ID_COL] + TEXT_COLS + [LABEL_COL]
missing_cols = [col for col in required_cols if col not in df.columns]
integrity_report["checks"]["missing_required_columns"] = missing_cols
if missing_cols:
    print(f"Error: Missing required columns: {missing_cols}")
    # Depending on severity, you might stop execution here
    # raise ValueError(f"Missing required columns: {missing_cols}") # Example of stopping
else:
    print("All required columns are present.")

# Check for duplicates in ID_COL
duplicate_ids = df[ID_COL].duplicated().sum()
integrity_report["checks"]["duplicate_ids"] = int(duplicate_ids) # Ensure JSON serializable
if duplicate_ids > 0:
    print(f"Warning: Found {duplicate_ids} duplicate entries in '{ID_COL}'.")
    # Decide if duplicate IDs should stop the process or be handled later (e.g., dropping)
else:
    print(f"No duplicate entries found in '{ID_COL}'.")

# Check for nulls in text columns
null_text_counts = df[TEXT_COLS].isnull().sum().to_dict()
integrity_report["checks"]["null_counts_in_text_columns"] = null_text_counts
if any(null_text_counts.values()):
    print(f"Warning: Found null values in text columns: {null_text_counts}")
    # Decide how to handle nulls - imputation, dropping rows, etc.
else:
    print("No null values found in text columns.")

# Check label space
unique_labels = df[LABEL_COL].unique().tolist()
integrity_report["checks"]["unique_labels_found"] = sorted(unique_labels)

expected_labels = list(range(N_CLASSES))
unexpected_labels = [label for label in unique_labels if label not in expected_labels]
integrity_report["checks"]["unexpected_labels"] = unexpected_labels

if sorted(unique_labels) == expected_labels and not unexpected_labels:
    print(f"Label space is exactly {expected_labels}.")
elif unexpected_labels:
     print(f"Error: Found unexpected labels: {unexpected_labels}. Expected labels: {expected_labels}")
     # Decide if unexpected labels should stop the process
     # raise ValueError(f"Found unexpected labels: {unexpected_labels}") # Example of stopping
else:
    print(f"Warning: Label space is not exactly {expected_labels}. Found: {sorted(unique_labels)}")


# Check at least 5 samples per class
label_counts = df[LABEL_COL].value_counts().to_dict()
integrity_report["checks"]["label_counts"] = label_counts

low_count_labels = {label: count for label, count in label_counts.items() if count < 5}
integrity_report["checks"]["labels_with_less_than_5_samples"] = low_count_labels

if low_count_labels:
    print(f"Warning: The following labels have less than 5 samples: {low_count_labels}. Stratified split might be challenging.")
    # Consider resampling techniques if needed
else:
    print("All labels have at least 5 samples.")

# Save the integrity report
try:
    with open(DATA_INTEGRITY_REPORT, 'w') as f:
        json.dump(integrity_report, f, indent=4)
    print(f"Data integrity report saved to {DATA_INTEGRITY_REPORT}")
except Exception as e:
    print(f"Error saving data integrity report: {e}")


print("Data Validation Complete.")

Starting Data Validation...
Successfully loaded data from /content/drive/MyDrive/mundos_risk_directory/leads.csv. Shape: (5000, 4)
Renamed 'stage' column to 'interest_label'.
All required columns are present.
No duplicate entries found in 'lead_id'.
No null values found in text columns.
Label space is exactly [0, 1, 2].
All labels have at least 5 samples.
Data integrity report saved to /content/drive/MyDrive/mundos_risk_directory/data_integrity_report.json
Data Validation Complete.


### Step 2: Feature Extraction (Embeddings)

This step generates embeddings from the text columns using a pre-trained transformer model (MiniLM) and saves them.

In [6]:
# Step 2: Feature Extraction - Embeddings
print("Starting Feature Extraction (Embeddings)...")

# Load the MiniLM model and tokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    print("Successfully loaded MiniLM model and tokenizer.")
except Exception as e:
    print(f"Error loading transformer model: {e}")
    # Exit or handle the error appropriately
    raise # Re-raise the exception to stop execution if model loading fails

# Function to generate embeddings
def get_embeddings(text_list):
    # Replace NaN with empty strings
    text_list = [str(text) if pd.notna(text) else "" for text in text_list]
    encoded_input = tokenizer(text_list, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Mean pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return sentence_embeddings.numpy()

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

# Generate embeddings for each text column
customer_summary_embeddings = get_embeddings(df['customer_summary'].tolist())
agent_summary_embeddings = get_embeddings(df['agent_summary'].tolist())

print(f"Generated customer_summary embeddings with shape: {customer_summary_embeddings.shape}")
print(f"Generated agent_summary embeddings with shape: {agent_summary_embeddings.shape}")


# Concatenate embeddings
combined_embeddings = np.concatenate((customer_summary_embeddings, agent_summary_embeddings), axis=1)
print(f"Combined embeddings shape: {combined_embeddings.shape}")

# Save embeddings
try:
    np.savez_compressed(EMBEDDINGS_FILE, embeddings=combined_embeddings)
    print(f"Embeddings saved to {EMBEDDINGS_FILE}")
except Exception as e:
    print(f"Error saving embeddings: {e}")
    # Decide how to handle this - likely stop if embeddings can't be saved
    raise # Re-raise the exception

print("Feature Extraction (Embeddings) Complete.")

Starting Feature Extraction (Embeddings)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Successfully loaded MiniLM model and tokenizer.
Generated customer_summary embeddings with shape: (5000, 384)
Generated agent_summary embeddings with shape: (5000, 384)
Combined embeddings shape: (5000, 768)
Embeddings saved to /content/drive/MyDrive/mundos_risk_directory/embeddings_interest.npz
Feature Extraction (Embeddings) Complete.


### Step 2 (Continued): Feature Extraction (Sentiment Analysis - Optional)

This step performs sentiment analysis on the text columns and combines the resulting sentiment features with the previously generated embeddings.

In [7]:
# Step 2 (Continued): Feature Extraction (Sentiment Analysis - Optional)
print("Starting Feature Extraction (Sentiment Analysis)...")

try:
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    import torch.nn.functional as F

    # Using a model finetuned for sentiment, e.g., on SST-2 (binary)
    # This model outputs logits for positive/negative classes.
    sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
    print(f"Successfully loaded sentiment classification model and tokenizer: {sentiment_model_name}")

    # Move model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    sentiment_model.to(device)
    print(f"Using device: {device}")

    # Function to get sentiment features (probabilities for positive/negative)
    def get_sentiment_probs(text_list):
        # Replace NaN with empty strings
        text_list = [str(text) if pd.notna(text) else "" for text in text_list]

        encoded_input = sentiment_tokenizer(text_list, padding=True, truncation=True, return_tensors='pt')

        # Move tensors to device
        encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

        with torch.no_grad():
            model_output = sentiment_model(**encoded_input)

        # Get probabilities (softmax over logits)
        probabilities = F.softmax(model_output.logits, dim=1).cpu().numpy() # Move back to CPU for numpy conversion

        # The model outputs probabilities for 2 classes. Need to check the mapping (usually 0: negative, 1: positive)
        # We can verify this by checking model.config.id2label
        # Assuming 0: negative, 1: positive
        neg_probs = probabilities[:, 0]
        pos_probs = probabilities[:, 1]

        return pos_probs, neg_probs

    # Get sentiment probabilities for customer and agent summaries
    customer_pos_probs, customer_neg_probs = get_sentiment_probs(df['customer_summary'].tolist())
    agent_pos_probs, agent_neg_probs = get_sentiment_probs(df['agent_summary'].tolist())

    print(f"Generated customer sentiment probabilities (pos/neg) with shape: {customer_pos_probs.shape}")
    print(f"Generated agent sentiment probabilities (pos/neg) with shape: {agent_pos_probs.shape}")

    # Calculate the 7 sentiment features
    sentiment_cust = customer_pos_probs - customer_neg_probs # Simple sentiment score (-1 to 1)
    sentiment_agent = agent_pos_probs - agent_neg_probs     # Simple sentiment score (-1 to 1)
    sentiment_cust_pos = customer_pos_probs
    sentiment_cust_neg = customer_neg_probs
    sentiment_agent_pos = agent_pos_probs
    sentiment_agent_neg = agent_neg_probs
    sentiment_gap = sentiment_cust - sentiment_agent # Difference in sentiment scores

    # Create a DataFrame for sentiment features
    sentiment_df = pd.DataFrame({
        'lead_id': df[ID_COL], # Include lead_id for merging later
        'sentiment_cust': sentiment_cust,
        'sentiment_agent': sentiment_agent,
        'sentiment_cust_pos': sentiment_cust_pos,
        'sentiment_cust_neg': sentiment_cust_neg,
        'sentiment_agent_pos': sentiment_agent_pos,
        'sentiment_agent_neg': sentiment_agent_neg,
        'sentiment_gap': sentiment_gap
    })

    print(f"Generated sentiment features DataFrame with shape: {sentiment_df.shape}")
    display(sentiment_df.head())

    # Save sentiment features to CSV
    try:
        sentiment_df.to_csv(SENTIMENT_FILE, index=False)
        print(f"Sentiment features saved to {SENTIMENT_FILE}")
    except Exception as e:
        print(f"Error saving sentiment features to CSV: {e}")


    # Load existing embeddings to combine
    try:
        loaded_embeddings = np.load(EMBEDDINGS_FILE)['embeddings']
        print(f"Loaded embeddings with shape: {loaded_embeddings.shape}")

        # Combine embeddings and sentiment features
        # We need to merge sentiment_df with the main df or align correctly if not using lead_id for merge
        # Assuming the order of rows in sentiment_df is the same as in df
        combined_sentiment_features_np = sentiment_df[['sentiment_cust', 'sentiment_agent',
                                                     'sentiment_cust_pos', 'sentiment_cust_neg',
                                                     'sentiment_agent_pos', 'sentiment_agent_neg',
                                                     'sentiment_gap']].values

        final_features = np.concatenate((loaded_embeddings, combined_sentiment_features_np), axis=1)
        print(f"Final feature set shape (embeddings + 7 sentiment features): {final_features.shape}")

        # Update the combined_embeddings variable to hold the final feature set
        combined_embeddings = final_features
        print("Combined embeddings and 7 sentiment features.")

        # Optionally save combined features to a new file if needed for later steps
        COMBINED_FEATURES_FILE = os.path.join(MAIN_DIR, 'combined_features_interest.npz')
        np.savez_compressed(COMBINED_FEATURES_FILE, features=final_features)
        print(f"Combined features saved to {COMBINED_FEATURES_FILE}")


    except FileNotFoundError:
        print(f"Error: Embeddings file {EMBEDDINGS_FILE} not found. Cannot combine with sentiment features.")
        # In this case, combined_embeddings should just be the sentiment features if they were generated
        if 'combined_sentiment_features_np' in locals():
            combined_embeddings = combined_sentiment_features_np
            print(f"Proceeding with 7 sentiment features only with shape: {combined_embeddings.shape}")
        else:
             print("No sentiment features generated either. Cannot proceed without features.")
             exit()

    except Exception as e:
        print(f"Error combining embeddings and sentiment features: {e}")
        print("Error during feature combination. Proceeding with embeddings only if available.")
        try:
             combined_embeddings = np.load(EMBEDDINGS_FILE)['embeddings']
             print(f"Using embeddings only with shape: {combined_embeddings.shape}")
        except FileNotFoundError:
            print(f"Error: Embeddings file {EMBEDDINGS_FILE} not found. Cannot proceed without features.")
            exit()

except Exception as e:
    print(f"An error occurred during sentiment feature extraction: {e}")
    print("Sentiment analysis will be skipped. Proceeding with embeddings only.")
    # Ensure combined_embeddings variable still holds the embeddings if sentiment failed
    try:
         combined_embeddings = np.load(EMBEDDINGS_FILE)['embeddings']
         print(f"Using embeddings only with shape: {combined_embeddings.shape}")
    except FileNotFoundError:
        print(f"Error: Embeddings file {EMBEDDINGS_FILE} not found. Cannot proceed without features.")
        exit()


# Note: The combined_embeddings variable now holds the feature set (embeddings + sentiment if generated)
print("Feature Extraction (Sentiment Analysis) Complete.")

Starting Feature Extraction (Sentiment Analysis)...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Successfully loaded sentiment classification model and tokenizer: distilbert-base-uncased-finetuned-sst-2-english
Using device: cuda
Generated customer sentiment probabilities (pos/neg) with shape: (5000,)
Generated agent sentiment probabilities (pos/neg) with shape: (5000,)
Generated sentiment features DataFrame with shape: (5000, 8)


Unnamed: 0,lead_id,sentiment_cust,sentiment_agent,sentiment_cust_pos,sentiment_cust_neg,sentiment_agent_pos,sentiment_agent_neg,sentiment_gap
0,1,-0.974439,-0.986235,0.01278,0.98722,0.006882,0.993117,0.011796
1,2,-0.996158,0.986215,0.001921,0.998079,0.993107,0.006893,-1.982373
2,3,-0.991958,0.956741,0.004021,0.995979,0.978371,0.021629,-1.948699
3,4,-0.964047,-0.98094,0.017976,0.982024,0.00953,0.99047,0.016893
4,5,-0.977945,0.547085,0.011028,0.988972,0.773543,0.226457,-1.52503


Sentiment features saved to /content/drive/MyDrive/mundos_risk_directory/sentiment_interest.csv
Loaded embeddings with shape: (5000, 768)
Final feature set shape (embeddings + 7 sentiment features): (5000, 775)
Combined embeddings and 7 sentiment features.
Combined features saved to /content/drive/MyDrive/mundos_risk_directory/combined_features_interest.npz
Feature Extraction (Sentiment Analysis) Complete.


# Task
Build an NLP pipeline for predicting interest levels (no_interest, mild_interest, strong_interest) from text summaries in the "leads.csv" file located at "/content/drive/MyDrive/mundos_risk_directory/leads.csv". The pipeline should include data validation, feature extraction (embeddings and 7 sentiment features), hyperparameter tuning using Optuna with k-fold cross-validation, optional PCA compression, model training (Multinomial Logistic Regression), evaluation with specified metrics, building an inference pipeline, testing, and documentation. Save all generated artifacts (reports, features, models, metrics, graphs) within the "/content/drive/MyDrive/mundos_risk_directory" directory. Ensure GPU utilization where applicable and address potential issues like overfitting and data leakage.

## Data preparation for model training

### Subtask:
Split the data into features (X) and labels (y). Apply standard scaling to the features.


**Reasoning**:
Split the data into features and labels, and then apply standard scaling to the features.



In [8]:
# Split data into features (X) and labels (y)
X = df.drop(columns=[LABEL_COL, ID_COL])
y = df[LABEL_COL]

# Apply standard scaling to features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Data splitting and scaling complete.")
print(f"Features shape (X_scaled): {X_scaled.shape}")
print(f"Labels shape (y): {y.shape}")

ValueError: could not convert string to float: "Inquired about the cost of a routine cleaning and a check-up, mentioning they haven't been to a dentist in over a year."

### Step 3: Data Preparation and K-Fold Cross-Validation Setup

Split the data into features and labels and set up Stratified K-Fold cross-validation.

In [1]:
# Step 3: Data Preparation for Model Training

# Split data into features (X) and labels (y)
X = combined_embeddings # Use the combined features (embeddings + sentiment)
y = df[LABEL_COL]

print(f"Features shape (X): {X.shape}")
print(f"Labels shape (y): {y.shape}")

# Split data into training and testing sets (70/30 split)
# Use stratification to maintain class distribution
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Original data shape: {X.shape}")
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Testing labels shape: {y_test.shape}")

# Apply Standard Scaling to features
# Fit scaler only on training data to prevent data leakage
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled using StandardScaler.")

print("Data Preparation for Model Training Complete.")

NameError: name 'combined_embeddings' is not defined

### Step 5: Hyperparameter Tuning with Optuna (including PCA)

Use Optuna to find the best hyperparameters for the Logistic Regression model and the optimal number of PCA components, evaluated using Stratified K-Fold cross-validation.

In [11]:
# Step 5: Hyperparameter Tuning with Optuna (including PCA)
print("Starting Hyperparameter Tuning with Optuna...")

from sklearn.model_selection import cross_val_score

# Define the Optuna objective function
def objective(trial):
    # Hyperparameters to tune for Logistic Regression
    # Note: These keys should NOT have the pipeline step prefix here
    lr_params_trial = {
        'C': trial.suggest_float('logisticregression__C', 1e-4, 1e4, log=True),
        'solver': trial.suggest_categorical('logisticregression__solver', ['lbfgs', 'liblinear']), # 'liblinear' is good for small datasets and L1/L2 penalty
        'penalty': trial.suggest_categorical('logisticregression__penalty', ['l2']), # 'lbfgs' only supports l2 or None. If adding 'liblinear', could add 'l1'
        'class_weight': 'balanced', # Keep class weight balanced
        'max_iter': 1000 # Fixed max_iter
    }

    # Optional PCA step
    use_pca = trial.suggest_categorical('use_pca', [True, False])

    if use_pca:
        # Tune the number of PCA components
        # Max components is min(n_samples, n_features) - 1. Here, n_features is X.shape[1]
        n_components = trial.suggest_int('pca__n_components', 10, min(X.shape[0], X.shape[1]) - 1)
        pca = PCA(n_components=n_components, random_state=42)
        # print(f"Trial {trial.number}: Using PCA with {n_components} components.") # Keep print less verbose during trials
    else:
        pca = 'passthrough' # No PCA
        # print(f"Trial {trial.number}: Not using PCA.") # Keep print less verbose during trials


    # Create the pipeline
    pipeline_steps = [
        ('scaler', StandardScaler())
    ]
    if use_pca:
        pipeline_steps.append(('pca', pca))

    # Pass the parameters WITHOUT the prefix to the LogisticRegression constructor
    pipeline_steps.append(('logisticregression', LogisticRegression(multi_class='multinomial', random_state=42, **lr_params_trial)))

    model_pipeline = Pipeline(pipeline_steps)


    # Evaluate the pipeline using Stratified K-Fold Cross-Validation
    # Use macro_roc_auc_ovr as the optimization metric
    # Need to handle potential errors during cross-validation for some hyperparameter combinations
    try:
        scores = cross_val_score(
            model_pipeline,
            X, # Use the full feature set X for cross_val_score
            y,
            cv=skf, # Use the StratifiedKFold object
            scoring='roc_auc_ovr', # Use One-vs-Rest ROC AUC for multiclass
            error_score='raise' # Raise errors to debug
        )
        # Optuna minimizes the objective, so we minimize the negative AUC
        mean_roc_auc = scores.mean()
        # print(f"Trial {trial.number}: Mean ROC AUC (OvR) across folds: {mean_roc_auc}") # Keep print less verbose during trials
        return -mean_roc_auc # Minimize negative AUC
    except Exception as e:
        print(f"Trial {trial.number}: Error during cross-validation: {e}")
        return float('inf') # Return infinity to signal failure for Optuna


# Set up the Optuna study
# Direction 'minimize' because we are minimizing the negative ROC AUC
study = optuna.create_study(direction='minimize')

# Run the Optuna study
N_TRIALS = 50 # You can adjust the number of trials
print(f"Running Optuna study for {N_TRIALS} trials...")
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

print("Optuna study complete.")

# Print the best trial's results
print("\nBest trial:")
print(f"  Value (Negative ROC AUC): {study.best_value}")
print(f"  Params: {study.best_params}")

# Store the best parameters and whether PCA was used
best_params = study.best_params
use_pca_best = best_params.pop('use_pca') # Remove use_pca from LR params

best_pca_n_components = None
if use_pca_best:
    best_pca_n_components = best_params.pop('pca__n_components') # Remove n_components from LR params

print(f"Best use_pca: {use_pca_best}")
if use_pca_best:
    print(f"Best PCA n_components: {best_pca_n_components}")
print(f"Best Logistic Regression params: {best_params}")

# Save the PCA results (best n_components and variance explained)
pca_results = {
    "best_use_pca": use_pca_best,
    "best_n_components": best_pca_n_components,
    # We can calculate variance explained later when fitting the final PCA
    "variance_explained": None
}

# If PCA was used, fit a temporary PCA to get variance explained for the best n_components
if use_pca_best:
    temp_pca = PCA(n_components=best_pca_n_components, random_state=42)
    # Fit on the full data after scaling for reporting total variance explained
    temp_scaler = StandardScaler()
    X_scaled_full = temp_scaler.fit_transform(X)
    temp_pca.fit(X_scaled_full)
    pca_results["variance_explained"] = float(np.sum(temp_pca.explained_variance_ratio_)) # Ensure serializable

try:
    with open(PCA_RESULTS_FILE, 'w') as f:
        json.dump(pca_results, f, indent=4)
    print(f"PCA results saved to {PCA_RESULTS_FILE}")
except Exception as e:
    print(f"Error saving PCA results: {e}")


print("Hyperparameter Tuning with Optuna Complete.")

[I 2025-08-17 12:22:41,773] A new study created in memory with name: no-name-0caea7c4-6ea8-48bd-a46e-a0a81a5ce199


Starting Hyperparameter Tuning with Optuna...
Running Optuna study for 50 trials...


  0%|          | 0/50 [00:00<?, ?it/s]



[I 2025-08-17 12:22:43,566] Trial 0 finished with value: -0.9989900722772481 and parameters: {'logisticregression__C': 19.03509903590418, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': False}. Best is trial 0 with value: -0.9989900722772481.




[I 2025-08-17 12:22:48,537] Trial 1 finished with value: -0.9993229127203841 and parameters: {'logisticregression__C': 0.077602708482895, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 283}. Best is trial 1 with value: -0.9993229127203841.
Trial 2: Error during cross-validation: Solver liblinear does not support a multinomial backend.
[I 2025-08-17 12:22:48,624] Trial 2 finished with value: inf and parameters: {'logisticregression__C': 254.1363981004536, 'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l2', 'use_pca': False}. Best is trial 1 with value: -0.9993229127203841.




[I 2025-08-17 12:22:50,784] Trial 3 finished with value: -0.9994061993943225 and parameters: {'logisticregression__C': 0.0003757466023201523, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 68}. Best is trial 3 with value: -0.9994061993943225.
Trial 4: Error during cross-validation: Solver liblinear does not support a multinomial backend.
[I 2025-08-17 12:22:50,867] Trial 4 finished with value: inf and parameters: {'logisticregression__C': 4.8861318363361015, 'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l2', 'use_pca': False}. Best is trial 3 with value: -0.9994061993943225.




[I 2025-08-17 12:23:01,094] Trial 5 finished with value: -0.9995813180444196 and parameters: {'logisticregression__C': 0.0010911359016099716, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 597}. Best is trial 5 with value: -0.9995813180444196.
Trial 6: Error during cross-validation: Solver liblinear does not support a multinomial backend.
[I 2025-08-17 12:23:01,155] Trial 6 finished with value: inf and parameters: {'logisticregression__C': 4.8826514500658185, 'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l2', 'use_pca': False}. Best is trial 5 with value: -0.9995813180444196.




Trial 7: Error during cross-validation: Solver liblinear does not support a multinomial backend.
[I 2025-08-17 12:23:02,339] Trial 7 finished with value: inf and parameters: {'logisticregression__C': 0.00014809180377844752, 'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 355}. Best is trial 5 with value: -0.9995813180444196.




[I 2025-08-17 12:23:05,722] Trial 8 finished with value: -0.9994554993865833 and parameters: {'logisticregression__C': 0.026802141289756803, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': False}. Best is trial 5 with value: -0.9995813180444196.




[I 2025-08-17 12:23:12,947] Trial 9 finished with value: -0.9989610733904501 and parameters: {'logisticregression__C': 411.8621123512712, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 451}. Best is trial 5 with value: -0.9995813180444196.




[I 2025-08-17 12:23:18,544] Trial 10 finished with value: -0.9994782378001011 and parameters: {'logisticregression__C': 0.008188696673662049, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 764}. Best is trial 5 with value: -0.9995813180444196.




[I 2025-08-17 12:23:26,441] Trial 11 finished with value: -0.9995107625810156 and parameters: {'logisticregression__C': 0.006645071353882225, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 760}. Best is trial 5 with value: -0.9995813180444196.




[I 2025-08-17 12:23:31,228] Trial 12 finished with value: -0.9995952041500468 and parameters: {'logisticregression__C': 0.001742153724148741, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 772}. Best is trial 12 with value: -0.9995952041500468.




[I 2025-08-17 12:23:40,656] Trial 13 finished with value: -0.9995770481829215 and parameters: {'logisticregression__C': 0.0009943201150723433, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 571}. Best is trial 12 with value: -0.9995952041500468.




[I 2025-08-17 12:23:44,776] Trial 14 finished with value: -0.9992151889633978 and parameters: {'logisticregression__C': 0.2749261010091291, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 622}. Best is trial 12 with value: -0.9995952041500468.




[I 2025-08-17 12:23:49,563] Trial 15 finished with value: -0.999539197730764 and parameters: {'logisticregression__C': 0.002709426306110036, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 640}. Best is trial 12 with value: -0.9995952041500468.




[I 2025-08-17 12:23:57,870] Trial 16 finished with value: -0.9992239819282754 and parameters: {'logisticregression__C': 0.20456053852805198, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 499}. Best is trial 12 with value: -0.9995952041500468.




[I 2025-08-17 12:24:01,872] Trial 17 finished with value: -0.9993596369368457 and parameters: {'logisticregression__C': 0.00024292214374334388, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 685}. Best is trial 12 with value: -0.9995952041500468.




Trial 18: Error during cross-validation: Solver liblinear does not support a multinomial backend.
[I 2025-08-17 12:24:04,138] Trial 18 finished with value: inf and parameters: {'logisticregression__C': 0.028265922383205272, 'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 530}. Best is trial 12 with value: -0.9995952041500468.




[I 2025-08-17 12:24:09,887] Trial 19 finished with value: -0.9995972692501774 and parameters: {'logisticregression__C': 0.0016226216802560952, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 225}. Best is trial 19 with value: -0.9995972692501774.




[I 2025-08-17 12:24:12,862] Trial 20 finished with value: -0.9990443015170681 and parameters: {'logisticregression__C': 1.189197369059155, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 182}. Best is trial 19 with value: -0.9995972692501774.




[I 2025-08-17 12:24:17,046] Trial 21 finished with value: -0.9996007401632355 and parameters: {'logisticregression__C': 0.001564571524483743, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 247}. Best is trial 21 with value: -0.9996007401632355.




[I 2025-08-17 12:24:24,248] Trial 22 finished with value: -0.9995313850989341 and parameters: {'logisticregression__C': 0.0042190041772690165, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 229}. Best is trial 21 with value: -0.9996007401632355.




[I 2025-08-17 12:24:27,067] Trial 23 finished with value: -0.998477531880984 and parameters: {'logisticregression__C': 4935.940536182803, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 126}. Best is trial 21 with value: -0.9996007401632355.




[I 2025-08-17 12:24:33,290] Trial 24 finished with value: -0.9995694126125997 and parameters: {'logisticregression__C': 0.0009152597710093827, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 325}. Best is trial 21 with value: -0.9996007401632355.




[I 2025-08-17 12:24:35,575] Trial 25 finished with value: -0.967275559364036 and parameters: {'logisticregression__C': 0.02326781397982727, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 11}. Best is trial 21 with value: -0.9996007401632355.
Trial 26: Error during cross-validation: Solver liblinear does not support a multinomial backend.
[I 2025-08-17 12:24:35,694] Trial 26 finished with value: inf and parameters: {'logisticregression__C': 0.00011497230095719807, 'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l2', 'use_pca': False}. Best is trial 21 with value: -0.9996007401632355.




[I 2025-08-17 12:24:41,982] Trial 27 finished with value: -0.9995948468232881 and parameters: {'logisticregression__C': 0.0018285192748541876, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 418}. Best is trial 21 with value: -0.9996007401632355.




[I 2025-08-17 12:24:49,532] Trial 28 finished with value: -0.9994786314654803 and parameters: {'logisticregression__C': 0.011230379209148945, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 242}. Best is trial 21 with value: -0.9996007401632355.




[I 2025-08-17 12:24:52,375] Trial 29 finished with value: -0.999263140449899 and parameters: {'logisticregression__C': 0.11192375523462716, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': False}. Best is trial 21 with value: -0.9996007401632355.




[I 2025-08-17 12:24:56,045] Trial 30 finished with value: -0.9990939983090863 and parameters: {'logisticregression__C': 0.700425690355712, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 146}. Best is trial 21 with value: -0.9996007401632355.




[I 2025-08-17 12:25:04,764] Trial 31 finished with value: -0.9996012685674184 and parameters: {'logisticregression__C': 0.0013470201758870066, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 405}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:25:10,060] Trial 32 finished with value: -0.9994966594336891 and parameters: {'logisticregression__C': 0.0004551350426234065, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 308}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:25:18,618] Trial 33 finished with value: -0.9993561819514394 and parameters: {'logisticregression__C': 0.0529323102479979, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 372}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:25:23,616] Trial 34 finished with value: -0.999538151055181 and parameters: {'logisticregression__C': 0.0027066346519488715, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 221}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:25:26,885] Trial 35 finished with value: -0.9994694053706672 and parameters: {'logisticregression__C': 0.0004375356731070597, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 100}. Best is trial 31 with value: -0.9996012685674184.
Trial 36: Error during cross-validation: Solver liblinear does not support a multinomial backend.
[I 2025-08-17 12:25:26,992] Trial 36 finished with value: inf and parameters: {'logisticregression__C': 0.013138761274376597, 'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l2', 'use_pca': False}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:25:34,637] Trial 37 finished with value: -0.998984660929667 and parameters: {'logisticregression__C': 29.93738244087915, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 453}. Best is trial 31 with value: -0.9996012685674184.




Trial 38: Error during cross-validation: Solver liblinear does not support a multinomial backend.
[I 2025-08-17 12:25:35,907] Trial 38 finished with value: inf and parameters: {'logisticregression__C': 0.0007574058386790837, 'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 283}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:25:38,043] Trial 39 finished with value: -0.9990728460187013 and parameters: {'logisticregression__C': 0.00012976815358910855, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': False}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:25:42,024] Trial 40 finished with value: -0.9995357353380588 and parameters: {'logisticregression__C': 0.0027020335306665614, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 173}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:25:51,168] Trial 41 finished with value: -0.9995530762784838 and parameters: {'logisticregression__C': 0.0021034820918730004, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 424}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:25:56,908] Trial 42 finished with value: -0.9995967663601223 and parameters: {'logisticregression__C': 0.001301381197601984, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 397}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:26:03,750] Trial 43 finished with value: -0.9994487038565119 and parameters: {'logisticregression__C': 0.00035236504858062687, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 257}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:26:09,759] Trial 44 finished with value: -0.9994785699005716 and parameters: {'logisticregression__C': 0.008419114510456729, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 340}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:26:18,966] Trial 45 finished with value: -0.9995814900158903 and parameters: {'logisticregression__C': 0.0010984001302399996, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 490}. Best is trial 31 with value: -0.9996012685674184.




Trial 46: Error during cross-validation: Solver liblinear does not support a multinomial backend.
[I 2025-08-17 12:26:20,268] Trial 46 finished with value: inf and parameters: {'logisticregression__C': 0.005227200746893511, 'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 295}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:26:22,431] Trial 47 finished with value: -0.9993766637793301 and parameters: {'logisticregression__C': 0.00025770826919420105, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': False}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:26:30,375] Trial 48 finished with value: -0.9993557975721292 and parameters: {'logisticregression__C': 0.0559992756875371, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 387}. Best is trial 31 with value: -0.9996012685674184.




[I 2025-08-17 12:26:33,419] Trial 49 finished with value: -0.9989668774373482 and parameters: {'logisticregression__C': 42.49646344294965, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 708}. Best is trial 31 with value: -0.9996012685674184.
Optuna study complete.

Best trial:
  Value (Negative ROC AUC): -0.9996012685674184
  Params: {'logisticregression__C': 0.0013470201758870066, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 405}
Best use_pca: True
Best PCA n_components: 405
Best Logistic Regression params: {'logisticregression__C': 0.0013470201758870066, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2'}
PCA results saved to /content/drive/MyDrive/mundos_risk_directory/pca_interest_results.json
Hyperparameter Tuning with Optuna Complete.


### Step 6: Model Training with Best Hyperparameters

Train the final pipeline (including scaling, optional PCA, and Logistic Regression) on the full dataset using the best hyperparameters found by Optuna.

In [13]:
# Step 6: Model Training with Best Hyperparameters
print("Starting Model Training with Best Hyperparameters...")

# Build the final pipeline with the best parameters
final_pipeline_steps = [
    ('scaler', StandardScaler())
]

if use_pca_best:
    final_pipeline_steps.append(('pca', PCA(n_components=best_pca_n_components, random_state=42)))

# The best_params dictionary from Optuna already contains the LR parameters
# Extract LR parameters without the pipeline prefix
lr_final_params = {k.replace('logisticregression__', ''): v for k, v in best_params.items()}

final_pipeline_steps.append(('logisticregression', LogisticRegression(multi_class='multinomial', random_state=42, **lr_final_params)))

final_model_pipeline = Pipeline(final_pipeline_steps)

# Train the final model on the full dataset (X, y) as cross_val_score was performed on full data
# If you had an explicit train/test split earlier, you would train on X_train, y_train here.
print("Training final model on the full dataset...")
final_model_pipeline.fit(X, y)

print("Final model training complete.")

# Save the trained model pipeline
try:
    joblib.dump(final_model_pipeline, MODEL_PICKLE_FILE)
    print(f"Trained model pipeline saved to {MODEL_PICKLE_FILE}")
except Exception as e:
    print(f"Error saving model pipeline: {e}")
    # Decide how to handle this - might not be able to proceed without saving
    raise # Re-raise the exception


print("Model Training with Best Hyperparameters Complete.")

Starting Model Training with Best Hyperparameters...
Training final model on the full dataset...




Final model training complete.
Trained model pipeline saved to /content/drive/MyDrive/mundos_risk_directory/best_interest_pipeline.pkl
Model Training with Best Hyperparameters Complete.


### Step 8: Model Evaluation

Evaluate the trained model on the test set using specified metrics and save the results.

In [1]:
# Step 8: Model Evaluation
print("Starting Model Evaluation...")

# We need X_test and y_test from the data splitting step.
# If that cell was not run or variables are not available, rerun the data splitting cell.
# Assuming X_test and y_test are available in the environment.

if 'X_test' not in locals() or 'y_test' not in locals():
    print("Error: X_test or y_test not found. Please run the Data Preparation cell first.")
    # Exit or handle appropriately
    exit()

# Make predictions on the test set
y_pred = final_model_pipeline.predict(X_test)
y_pred_proba = final_model_pipeline.predict_proba(X_test)

print("Predictions made on the test set.")

# Calculate evaluation metrics
metrics = {}

# Macro ROC AUC (OvR)
try:
    macro_roc_auc_ovr = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')
    metrics['macro_roc_auc_ovr'] = float(macro_roc_auc_ovr) # Ensure serializable
    print(f"Macro ROC AUC (OvR): {macro_roc_auc_ovr:.4f}")
except Exception as e:
    print(f"Could not calculate Macro ROC AUC (OvR): {e}")
    metrics['macro_roc_auc_ovr'] = None


# Macro F1
macro_f1 = f1_score(y_test, y_pred, average='macro')
metrics['macro_f1'] = float(macro_f1) # Ensure serializable
print(f"Macro F1: {macro_f1:.4f}")

# Weighted F1
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
metrics['weighted_f1'] = float(weighted_f1) # Ensure serializable
print(f"Weighted F1: {weighted_f1:.4f}")

# Log Loss
try:
    logloss = log_loss(y_test, y_pred_proba)
    metrics['log_loss'] = float(logloss) # Ensure serializable
    print(f"Log Loss: {logloss:.4f}")
except Exception as e:
    print(f"Could not calculate Log Loss: {e}")
    metrics['log_loss'] = None

# Per-class Precision, Recall, F1
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None, labels=np.unique(y_test))

per_class_metrics = {}
for i, class_name in enumerate(CLASS_NAMES):
    # Map class index to class name from constants
    class_index = np.unique(y_test)[i] # Get the actual index from the unique labels in y_test
    per_class_metrics[class_name] = {
        'precision': float(precision[i]), # Ensure serializable
        'recall': float(recall[i]),       # Ensure serializable
        'f1-score': float(f1[i])          # Ensure serializable
    }
metrics['per_class'] = per_class_metrics
print("\nPer-Class Metrics:")
for class_name, scores in per_class_metrics.items():
    print(f"  {class_name}: Precision={scores['precision']:.4f}, Recall={scores['recall']:.4f}, F1-Score={scores['f1-score']:.4f}")


# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
metrics['confusion_matrix'] = conf_matrix.tolist() # Convert numpy array to list for JSON serializability
print("\nConfusion Matrix:")
print(conf_matrix)


# Save metrics to JSON file
try:
    with open(METRICS_FILE, 'w') as f:
        json.dump(metrics, f, indent=4)
    print(f"\nEvaluation metrics saved to {METRICS_FILE}")
except Exception as e:
    print(f"Error saving evaluation metrics: {e}")


print("Model Evaluation Complete.")

Starting Model Evaluation...
Error: X_test or y_test not found. Please run the Data Preparation cell first.


NameError: name 'final_model_pipeline' is not defined

In [4]:
MAIN_DIR = '/content/drive/MyDrive/mundos_risk_directory'

In [6]:
# Step 3: Data Preparation and K-Fold Cross-Validation Setup
print("Starting Data Preparation and K-Fold Cross-Validation Setup...")
import os
# Load combined features from the .npz file
COMBINED_FEATURES_FILE = os.path.join(MAIN_DIR, 'combined_features_interest.npz')
try:
    loaded_features = np.load(COMBINED_FEATURES_FILE)['features']
    print(f"Successfully loaded combined features from {COMBINED_FEATURES_FILE}. Shape: {loaded_features.shape}")
    X = loaded_features # Use the loaded features as X
except FileNotFoundError:
    print(f"Error: Combined features file {COMBINED_FEATURES_FILE} not found.")
    print("Please run the Feature Extraction steps (Step 2) first to generate this file.")
    # Exit or handle appropriately
    exit()
except Exception as e:
    print(f"Error loading combined features: {e}")
    exit()

# Split data into features (X) and labels (y) - y still comes from the original dataframe
# Ensure df is available. If not, you might need to load it here as well,
# but assuming it's still in memory from Step 1.
if 'df' not in locals():
     print("Error: Original dataframe 'df' not found. Please run Step 1 first.")
     exit()
y = df[LABEL_COL]


print(f"Features shape (X): {X.shape}")
print(f"Labels shape (y): {y.shape}")

# Split data into training and testing sets (80/20 split)
# Use stratification to maintain class distribution
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Original data shape: {X.shape}")
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Testing labels shape: {y_test.shape}")
# Set up Stratified K-Fold Cross-Validation - will be used on TRAINING data during Optuna
from sklearn.model_selection import StratifiedKFold

N_SPLITS = 3 # Number of folds as requested (still 3 folds for cross-validation on training data)
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

print(f"Set up Stratified K-Fold with {N_SPLITS} splits (for training data).")

# Apply Standard Scaling to features
# Fit scaler only on training data to prevent data leakage
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled using StandardScaler.")
N_SPLITS = 3 # Number of folds as requested (still 3 folds for cross-validation on training data)
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

print(f"Set up Stratified K-Fold with {N_SPLITS} splits (for training data).")

print("Data Preparation and K-Fold Cross-Validation Setup Complete.")

Starting Data Preparation and K-Fold Cross-Validation Setup...
Successfully loaded combined features from /content/drive/MyDrive/mundos_risk_directory/combined_features_interest.npz. Shape: (5000, 775)
Features shape (X): (5000, 775)
Labels shape (y): (5000,)
Original data shape: (5000, 775)
Training data shape: (4000, 775)
Testing data shape: (1000, 775)
Training labels shape: (4000,)
Testing labels shape: (1000,)
Set up Stratified K-Fold with 3 splits (for training data).
Features scaled using StandardScaler.
Set up Stratified K-Fold with 3 splits (for training data).
Data Preparation and K-Fold Cross-Validation Setup Complete.


### Step 5: Hyperparameter Tuning with Optuna (including PCA)

Use Optuna to find the best hyperparameters for the Logistic Regression model and the optimal number of PCA components, evaluated using Stratified K-Fold cross-validation.

In [7]:
# Step 5: Hyperparameter Tuning with Optuna (including PCA)
print("Starting Hyperparameter Tuning with Optuna...")

from sklearn.model_selection import cross_val_score

# Define the Optuna objective function
def objective(trial):
    # Hyperparameters to tune for Logistic Regression
    # Note: These keys should NOT have the pipeline step prefix here
    lr_params_trial = {
        'C': trial.suggest_float('logisticregression__C', 1e-4, 1e4, log=True),
        'solver': trial.suggest_categorical('logisticregression__solver', ['lbfgs', 'liblinear']), # 'liblinear' is good for small datasets and L1/L2 penalty
        'penalty': trial.suggest_categorical('logisticregression__penalty', ['l2']), # 'lbfgs' only supports l2 or None. If adding 'liblinear', could add 'l1'
        'class_weight': 'balanced', # Keep class weight balanced
        'max_iter': 1000 # Fixed max_iter
    }

    # Optional PCA step
    use_pca = trial.suggest_categorical('use_pca', [True, False])

    if use_pca:
        # Tune the number of PCA components
        # Max components for PCA is min(n_samples, n_features) - 1. Here, n_features is X_train.shape[1]
        # and n_samples is X_train.shape[0] // N_SPLITS (approx for a fold) or X_train.shape[0]
        # Let's use X_train.shape[1] as the upper bound for safety and Optuna will explore.
        n_components = trial.suggest_int('pca__n_components', 10, X_train.shape[1])
        pca = PCA(n_components=n_components, random_state=42)
        # print(f"Trial {trial.number}: Using PCA with {n_components} components.") # Keep print less verbose during trials
    else:
        pca = 'passthrough' # No PCA
        # print(f"Trial {trial.number}: Not using PCA.") # Keep print less verbose during trials


    # Create the pipeline
    pipeline_steps = [
        ('scaler', StandardScaler())
    ]
    if use_pca:
        pipeline_steps.append(('pca', pca))

    # Pass the parameters WITHOUT the prefix to the LogisticRegression constructor
    # Removed multi_class='multinomial' as it's deprecated and the default for multiclass
    pipeline_steps.append(('logisticregression', LogisticRegression(random_state=42, **lr_params_trial)))

    model_pipeline = Pipeline(pipeline_steps)


    # Evaluate the pipeline using Stratified K-Fold Cross-Validation on the TRAINING data
    # Use macro_roc_auc_ovr as the optimization metric
    # Need to handle potential errors during cross-validation for some hyperparameter combinations
    try:
        scores = cross_val_score(
            model_pipeline,
            X_train, # Use the TRAINING data for cross_val_score
            y_train,
            cv=skf, # Use the StratifiedKFold object set up for the training data
            scoring='roc_auc_ovr', # Use One-vs-Rest ROC AUC for multiclass
            error_score='raise' # Raise errors to debug
        )
        # Optuna minimizes the objective, so we minimize the negative AUC
        mean_roc_auc = scores.mean()
        # print(f"Trial {trial.number}: Mean ROC AUC (OvR) across folds: {mean_roc_auc}") # Keep print less verbose during trials
        return -mean_roc_auc # Minimize negative AUC
    except Exception as e:
        print(f"Trial {trial.number}: Error during cross-validation: {e}")
        return float('inf') # Return infinity to signal failure for Optuna


# Set up the Optuna study
# Direction 'minimize' because we are minimizing the negative ROC AUC
study = optuna.create_study(direction='minimize')

# Run the Optuna study
N_TRIALS = 50 # You can adjust the number of trials
print(f"Running Optuna study for {N_TRIALS} trials...")
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

print("Optuna study complete.")

# Print the best trial's results
print("\nBest trial:")
print(f"  Value (Negative ROC AUC): {study.best_value}")
print(f"  Params: {study.best_params}")

# Store the best parameters and whether PCA was used
best_params = study.best_params
use_pca_best = best_params.pop('use_pca') # Remove use_pca from LR params

best_pca_n_components = None
if use_pca_best:
    best_pca_n_components = best_params.pop('pca__n_components') # Remove n_components from LR params

print(f"Best use_pca: {use_pca_best}")
if use_pca_best:
    print(f"Best PCA n_components: {best_pca_n_components}")
print(f"Best Logistic Regression params: {best_params}")

# Save the PCA results (best n_components and variance explained)
pca_results = {
    "best_use_pca": use_pca_best,
    "best_n_components": best_pca_n_components,
    # We can calculate variance explained later when fitting the final PCA
    "variance_explained": None
}

# If PCA was used, fit a temporary PCA on the scaled training data to get variance explained for the best n_components
if use_pca_best:
    temp_pca = PCA(n_components=best_pca_n_components, random_state=42)
    # Use the fitted scaler from Step 3 to transform X_train for PCA fitting
    X_train_scaled_temp = scaler.transform(X_train) # Use the scaler fitted in Step 3
    temp_pca.fit(X_train_scaled_temp)
    pca_results["variance_explained"] = float(np.sum(temp_pca.explained_variance_ratio_)) # Ensure serializable


try:
    with open(PCA_RESULTS_FILE, 'w') as f:
        json.dump(pca_results, f, indent=4)
    print(f"PCA results saved to {PCA_RESULTS_FILE}")
except Exception as e:
    print(f"Error saving PCA results: {e}")


print("Hyperparameter Tuning with Optuna Complete.")

[I 2025-08-17 13:25:52,435] A new study created in memory with name: no-name-44b039a1-4f51-430a-86fc-628d5b510d22


Starting Hyperparameter Tuning with Optuna...
Running Optuna study for 50 trials...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-08-17 13:25:54,982] Trial 0 finished with value: -0.9987289258275515 and parameters: {'logisticregression__C': 0.0005059387500080754, 'logisticregression__solver': 'liblinear', 'logisticregression__penalty': 'l2', 'use_pca': False}. Best is trial 0 with value: -0.9987289258275515.
[I 2025-08-17 13:26:02,864] Trial 1 finished with value: -0.9999476443811793 and parameters: {'logisticregression__C': 883.4743443300043, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 169}. Best is trial 1 with value: -0.9999476443811793.
[I 2025-08-17 13:26:06,932] Trial 2 finished with value: -0.9997695853171628 and parameters: {'logisticregression__C': 0.0030404291546291926, 'logisticregression__solver': 'lbfgs', 'logisticregression__penalty': 'l2', 'use_pca': True, 'pca__n_components': 746}. Best is trial 1 with value: -0.9999476443811793.
[I 2025-08-17 13:26:08,539] Trial 3 finished with value: -0.9994616069738491 and parameters:

### Step 6: Model Training with Best Hyperparameters

Train the final pipeline (including scaling, optional PCA, and Logistic Regression) on the full training dataset using the best hyperparameters found by Optuna.

In [8]:
# Step 6: Model Training with Best Hyperparameters
print("Starting Model Training with Best Hyperparameters...")

# Build the final pipeline with the best parameters found by Optuna
final_pipeline_steps = [
    ('scaler', StandardScaler())
]

# Include PCA if Optuna determined it was best
if use_pca_best:
    final_pipeline_steps.append(('pca', PCA(n_components=best_pca_n_components, random_state=42)))

# The best_params dictionary from Optuna already contains the LR parameters without the prefix
# Ensure the LR parameters are extracted correctly
lr_final_params = {k.replace('logisticregression__', ''): v for k, v in best_params.items()}


final_pipeline_steps.append(('logisticregression', LogisticRegression(random_state=42, **lr_final_params)))

final_model_pipeline = Pipeline(final_pipeline_steps)

# Train the final model on the entire TRAINING dataset (X_train, y_train)
print("Training final model on the training dataset...")
final_model_pipeline.fit(X_train, y_train)

print("Final model training complete.")
MODEL_PICKLE_FILE = os.path.join(MAIN_DIR, 'best_interest_pipeline_1.pkl')
# Save the trained model pipeline
try:
    joblib.dump(final_model_pipeline, MODEL_PICKLE_FILE)
    print(f"Trained model pipeline saved to {MODEL_PICKLE_FILE}")
except Exception as e:
    print(f"Error saving model pipeline: {e}")
    # Decide how to handle this - might not be able to proceed without saving
    raise # Re-raise the exception


print("Model Training with Best Hyperparameters Complete.")

Starting Model Training with Best Hyperparameters...
Training final model on the training dataset...
Final model training complete.
Trained model pipeline saved to /content/drive/MyDrive/mundos_risk_directory/best_interest_pipeline_1.pkl
Model Training with Best Hyperparameters Complete.


### Step 8: Model Evaluation

Evaluate the trained model on the test set using specified metrics and save the results.

In [9]:
# Step 8: Model Evaluation
print("Starting Model Evaluation...")

# We need X_test and y_test from the data splitting step.
# If that cell was not run or variables are not available, rerun the data splitting cell.
# Assuming X_test and y_test are available in the environment from Step 3.

if 'X_test' not in locals() or 'y_test' not in locals():
    print("Error: X_test or y_test not found. Please run the Data Preparation cell first.")
    # Exit or handle appropriately
    exit()

# Make predictions on the test set
y_pred = final_model_pipeline.predict(X_test)
y_pred_proba = final_model_pipeline.predict_proba(X_test)

print("Predictions made on the test set.")

# Calculate evaluation metrics
metrics = {}

# Macro ROC AUC (OvR)
try:
    macro_roc_auc_ovr = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')
    metrics['macro_roc_auc_ovr'] = float(macro_roc_auc_ovr) # Ensure serializable
    print(f"Macro ROC AUC (OvR): {macro_roc_auc_ovr:.4f}")
except Exception as e:
    print(f"Could not calculate Macro ROC AUC (OvR): {e}")
    metrics['macro_roc_auc_ovr'] = None


# Macro F1
macro_f1 = f1_score(y_test, y_pred, average='macro')
metrics['macro_f1'] = float(macro_f1) # Ensure serializable
print(f"Macro F1: {macro_f1:.4f}")

# Weighted F1
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
metrics['weighted_f1'] = float(weighted_f1) # Ensure serializable
print(f"Weighted F1: {weighted_f1:.4f}")

# Log Loss
try:
    logloss = log_loss(y_test, y_pred_proba)
    metrics['log_loss'] = float(logloss) # Ensure serializable
    print(f"Log Loss: {logloss:.4f}")
except Exception as e:
    print(f"Could not calculate Log Loss: {e}")
    metrics['log_loss'] = None

# Per-class Precision, Recall, F1
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None, labels=np.unique(y_test))

per_class_metrics = {}
# Ensure the order of class names matches the order of labels returned by precision_recall_fscore_support
# and confusion_matrix. np.unique(y_test) gives sorted unique labels.
sorted_unique_labels = sorted(np.unique(y_test))
label_to_class_name = {i: CLASS_NAMES[i] for i in sorted(np.unique(y))} # Map the actual label value to class name

for i, label in enumerate(sorted_unique_labels):
    class_name = label_to_class_name.get(label, f"Label_{label}") # Use get with a default in case of missing labels
    per_class_metrics[class_name] = {
        'precision': float(precision[i]), # Ensure serializable
        'recall': float(recall[i]),       # Ensure serializable
        'f1-score': float(f1[i])          # Ensure serializable
    }
metrics['per_class'] = per_class_metrics
print("\nPer-Class Metrics:")
for class_name, scores in per_class_metrics.items():
    print(f"  {class_name}: Precision={scores['precision']:.4f}, Recall={scores['recall']:.4f}, F1-Score={scores['f1-score']:.4f}")


# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
metrics['confusion_matrix'] = conf_matrix.tolist() # Convert numpy array to list for JSON serializability
print("\nConfusion Matrix:")
print(conf_matrix)

# Add class names to metrics for clarity in the report
metrics['class_names'] = CLASS_NAMES


# Save metrics to JSON file
try:
    # Use the METRICS_FILE variable defined earlier, which includes a timestamp
    with open(METRICS_FILE, 'w') as f:
        json.dump(metrics, f, indent=4)
    print(f"\nEvaluation metrics saved to {METRICS_FILE}")
except Exception as e:
    print(f"Error saving evaluation metrics: {e}")


print("Model Evaluation Complete.")

Starting Model Evaluation...
Predictions made on the test set.
Macro ROC AUC (OvR): 0.9974
Macro F1: 0.9842
Weighted F1: 0.9890
Log Loss: 0.0669

Per-Class Metrics:
  no_interest: Precision=0.9701, Recall=0.9701, F1-Score=0.9701
  mild_interest: Precision=0.9890, Recall=0.9837, F1-Score=0.9863
  strong_interest: Precision=0.9940, Recall=0.9980, F1-Score=0.9960

Confusion Matrix:
[[130   4   0]
 [  3 361   3]
 [  1   0 498]]

Evaluation metrics saved to /content/drive/MyDrive/mundos_risk_directory/artifacts/metrics_interest_20250817T131233Z.json
Model Evaluation Complete.


### Step 8 (Continued): Model Evaluation - Visualizations

Generate visualizations for PCA explained variance and ROC AUC curves.

In [15]:
# Step 8 (Continued): Model Evaluation - Visualizations
print("Generating Model Evaluation Visualizations...")
import plotly.graph_objects as go
# 1. PCA Explained Variance Plot (if PCA was used)
if use_pca_best:
    print("Generating PCA Explained Variance Plot...")
    # We need to fit a PCA on the scaled training data to get explained variance ratio
    # Use the scaler fitted in Step 3
    X_train_scaled = scaler.transform(X_train)

    temp_pca = PCA(random_state=42)
    temp_pca.fit(X_train_scaled)

    explained_variance_ratio = temp_pca.explained_variance_ratio_
    cumulative_explained_variance = np.cumsum(explained_variance_ratio)

    # Create a DataFrame for plotting
    pca_variance_df = pd.DataFrame({
        'Number of Components': range(1, len(explained_variance_ratio) + 1),
        'Explained Variance Ratio': explained_variance_ratio,
        'Cumulative Explained Variance': cumulative_explained_variance
    })

    # Find the cumulative variance at the best_pca_n_components
    if best_pca_n_components is not None:
         variance_at_best_n = cumulative_explained_variance[best_pca_n_components - 1]
         print(f"Cumulative variance explained by {best_pca_n_components} components: {variance_at_best_n:.4f}")


    fig_pca_variance = px.line(
        pca_variance_df,
        x='Number of Components',
        y='Cumulative Explained Variance',
        title='PCA Cumulative Explained Variance',
        labels={'Cumulative Explained Variance': 'Cumulative Explained Variance Ratio'}
    )

    # Add a vertical line for the chosen number of components
    if best_pca_n_components is not None:
        fig_pca_variance.add_vline(
            x=best_pca_n_components,
            line_dash="dash",
            line_color="red",
            annotation_text=f"Chosen {best_pca_n_components} Components",
            annotation_position="bottom right"
        )
        # Add horizontal line for the explained variance at the chosen components
        fig_pca_variance.add_hline(
            y=variance_at_best_n,
            line_dash="dash",
            line_color="green",
            annotation_text=f"{variance_at_best_n:.4f} Variance Explained",
            annotation_position="top right"
        )


    fig_pca_variance.show()

    # Save the PCA variance plot
    try:
        PCA_VARIANCE_PLOT = os.path.join(ARTIFACTS_DIR, f'pca_variance_plot_{pd.Timestamp.now().strftime("%Y%m%dT%H%M%SZ")}.png')
        fig_pca_variance.write_image(PCA_VARIANCE_PLOT)
        print(f"PCA variance plot saved to {PCA_VARIANCE_PLOT}")
    except Exception as e:
        print(f"Error saving PCA variance plot: {e}")

else:
    print("PCA was not used. Skipping PCA Explained Variance Plot.")


# 2. ROC AUC Curves (One-vs-Rest)
print("\nGenerating ROC AUC Curves...")
from sklearn.metrics import roc_curve, auc

# Need y_test (true labels) and y_pred_proba (predicted probabilities) from evaluation step
if 'y_test' in locals() and 'y_pred_proba' in locals():
    fpr = {}
    tpr = {}
    roc_auc = {}

    # Compute ROC curve and ROC area for each class (OvR)
    for i in range(N_CLASSES):
        # Need to handle multi-class ROC AUC calculation carefully
        # For OvR, we treat one class as positive and all others as negative
        # y_test is 0, 1, or 2. y_pred_proba has probabilities for each class.
        # For class i, the true labels are 1 if y_test == i, else 0.
        # The predicted probabilities for class i are y_pred_proba[:, i].

        y_true_class_i = (y_test == i).astype(int)
        y_score_class_i = y_pred_proba[:, i]

        try:
            fpr[i], tpr[i], _ = roc_curve(y_true_class_i, y_score_class_i)
            roc_auc[i] = auc(fpr[i], tpr[i])
            print(f"ROC AUC for class {CLASS_NAMES[i]} ({i}): {roc_auc[i]:.4f}")
        except Exception as e:
             print(f"Could not compute ROC curve/AUC for class {CLASS_NAMES[i]} ({i}): {e}")
             fpr[i], tpr[i], roc_auc[i] = None, None, None


    # Plot ROC curves
    fig_roc_auc = go.Figure()
    for i in range(N_CLASSES):
        if fpr[i] is not None:
            fig_roc_auc.add_trace(go.Scatter(x=fpr[i], y=tpr[i],
                mode='lines',
                name=f'{CLASS_NAMES[i]} (AUC = {roc_auc[i]:.4f})'))

    fig_roc_auc.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )

    fig_roc_auc.update_layout(
        title='ROC Curve (One-vs-Rest)',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain="domain"),
        width=700,
        height=500
    )
    fig_roc_auc.show()

    # Save the ROC AUC plot
    try:
        ROC_AUC_PLOT = os.path.join(ARTIFACTS_DIR, f'roc_auc_plot_{pd.Timestamp.now().strftime("%Y%m%dT%H%M%SZ")}.png')
        fig_roc_auc.write_image(ROC_AUC_PLOT)
        print(f"ROC AUC plot saved to {ROC_AUC_PLOT}")
    except Exception as e:
        print(f"Error saving ROC AUC plot: {e}")

else:
    print("Test labels (y_test) or predicted probabilities (y_pred_proba) not found. Skipping ROC AUC plot.")
    print("Please ensure the Model Evaluation step (Step 8) was run successfully.")


print("Model Evaluation Visualizations Complete.")

Generating Model Evaluation Visualizations...
Generating PCA Explained Variance Plot...
Cumulative variance explained by 130 components: 0.9944


Error saving PCA variance plot: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido


Generating ROC AUC Curves...
ROC AUC for class no_interest (0): 0.9936
ROC AUC for class mild_interest (1): 0.9990
ROC AUC for class strong_interest (2): 0.9995


Error saving ROC AUC plot: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Model Evaluation Visualizations Complete.


In [14]:
! pip install -U kaleido

