# Heart Disease Classification - Training WITH Optuna

This notebook trains 8 models (4 algorithms √ó 2 PCA conditions) with Optuna hyperparameter tuning:

**Algorithms:** Logistic Regression, Random Forest, SVM, XGBoost  
**Conditions:** With PCA, Without PCA  
**Metric:** F1-Score (for classification)  
**Optimization:** Optuna with 20 trials per model

## Experiment Matrix (8 total experiments)

| Algorithm | No PCA + Optuna | With PCA + Optuna |
|-----------|-----------------|-------------------|
| Logistic Regression | ‚úì | ‚úì |
| Random Forest | ‚úì | ‚úì |
| SVM | ‚úì | ‚úì |
| XGBoost | ‚úì | ‚úì |


In [4]:
import os
import sys
import time
from pathlib import Path
from dotenv import load_dotenv

import numpy as np
import pandas as pd
import joblib
import optuna
from optuna.samplers import TPESampler

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline

import mlflow
from mlflow.models import infer_signature

# Set base folder
base_folder = Path(os.getcwd()).parent
sys.path.insert(0, str(base_folder))

print(f"Base folder: {base_folder}")
start_time = time.monotonic()

# Suppress Optuna logs
optuna.logging.set_verbosity(optuna.logging.WARNING)

Base folder: /Users/kusumareddy/python_final


In [5]:
# Load environment variables for MLflow/Dagshub
env_path = base_folder /  ".env"
if env_path.exists():
    load_dotenv(env_path)
    print(f"‚úì Loaded environment from {env_path}")
else:
    print(f"‚ö†Ô∏è  No .env file found at {env_path}")
    print("   Create notebooks/.env with your Dagshub credentials for experiment tracking")

# Set up MLflow
MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI", "")
MLFLOW_TRACKING_USERNAME = os.getenv("MLFLOW_TRACKING_USERNAME", "")
MLFLOW_TRACKING_PASSWORD = os.getenv("MLFLOW_TRACKING_PASSWORD", "")

if MLFLOW_TRACKING_USERNAME:
    os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_TRACKING_USERNAME
if MLFLOW_TRACKING_PASSWORD:
    os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_TRACKING_PASSWORD

if MLFLOW_TRACKING_URI:
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    print(f"‚úì MLflow tracking URI: {MLFLOW_TRACKING_URI}")
else:
    print("‚ö†Ô∏è  No MLflow tracking URI configured")

‚úì Loaded environment from /Users/kusumareddy/python_final/.env
‚úì MLflow tracking URI: https://dagshub.com/kusumayanna9/python_final.mlflow


## Load Data from PostgreSQL Database

In [6]:
# Load data from PostgreSQL database using utilities
from db_utils import load_heart_data, test_database_connection

# Test database connection first
test_result = test_database_connection()
if not test_result["success"]:
    print(f"‚ùå Database connection failed: {test_result['error']}")
    print("Please check your DATABASE_URL in .env file")
    raise ConnectionError("Database connection failed")

print(f"‚úÖ Connected to {test_result['database_type'].upper()} database")
print(f"   Patient count: {test_result['patient_count']}")

# Load heart disease data
heart_data = load_heart_data()
print(f"  Target distribution: {heart_data['target'].value_counts().to_dict()}")
heart_data.head()

‚úÖ Connected to POSTGRESQL database
   Patient count: 1025


  df = pd.read_sql_query(query, conn)


‚úì Loaded 1025 patients from PostgreSQL database
  Target distribution: {1: 526, 0: 499}


Unnamed: 0,patient_id,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


## Split Data and Setup Preprocessing

In [7]:
# Split data
X = heart_data.drop(['patient_id', 'target'], axis=1)
y = heart_data['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

print(f"‚úì Train size: {len(X_train)}, Test size: {len(X_test)}")

# Import preprocessing
from classification_pipeline import build_preprocessing, FEATURE_NAMES
preprocessing = build_preprocessing()
print(f"‚úì Preprocessing pipeline created")

‚úì Train size: 820, Test size: 205
‚úì Preprocessing pipeline created


## Define Optuna Objective Functions

In [8]:
# Optuna objective functions (MAXIMIZE F1-score)

def objective_logistic(trial):
    """Logistic Regression hyperparameter tuning"""
    C = trial.suggest_float("C", 0.001, 100, log=True)
    solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    
    model = make_pipeline(
        preprocessing,
        LogisticRegression(C=C, solver=solver, max_iter=1000, random_state=42)
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1")
    return scores.mean()

def objective_rf(trial):
    """Random Forest hyperparameter tuning"""
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    
    model = make_pipeline(
        preprocessing,
        RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            random_state=42,
            n_jobs=-1
        )
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1")
    return scores.mean()

def objective_svm(trial):
    """SVM hyperparameter tuning"""
    C = trial.suggest_float("C", 0.1, 100, log=True)
    kernel = trial.suggest_categorical("kernel", ["rbf", "linear"])
    
    model = make_pipeline(
        preprocessing,
        SVC(C=C, kernel=kernel, probability=True, random_state=42)
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1")
    return scores.mean()

def objective_xgb(trial):
    """XGBoost hyperparameter tuning"""
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    
    model = make_pipeline(
        preprocessing,
        XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            random_state=42,
            use_label_encoder=False,
            eval_metric="logloss"
        )
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1")
    return scores.mean()

print("‚úì Objective functions defined")

‚úì Objective functions defined


## Experiment 1-4: Models WITHOUT PCA + Optuna

In [9]:
print("\n" + "="*80)
print("TRAINING 4 MODELS WITHOUT PCA (WITH OPTUNA)")
print("="*80)

model_configs = [
    ("logistic_optuna", objective_logistic),
    ("randomforest_optuna", objective_rf),
    ("svm_optuna", objective_svm),
    ("xgboost_optuna", objective_xgb),
]

results = {}
N_TRIALS = 20
models_dir = base_folder / "models"
models_dir.mkdir(exist_ok=True)

for name, objective in model_configs:
    print(f"\nüîç Optimizing {name.upper()} (NO PCA) - {N_TRIALS} trials")
    
    study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)
    
    best_params = study.best_params
    cv_f1 = study.best_value
    print(f"  Best CV F1: {cv_f1:.4f}")
    print(f"  Best params: {best_params}")
    
    # Build final model with best params
    if name.startswith("logistic"):
        final_model = make_pipeline(
            preprocessing,
            LogisticRegression(C=best_params["C"], solver=best_params["solver"], 
                             max_iter=1000, random_state=42)
        )
    elif name.startswith("randomforest"):
        final_model = make_pipeline(
            preprocessing,
            RandomForestClassifier(
                n_estimators=best_params["n_estimators"],
                max_depth=best_params["max_depth"],
                min_samples_split=best_params["min_samples_split"],
                random_state=42, n_jobs=-1
            )
        )
    elif name.startswith("svm"):
        final_model = make_pipeline(
            preprocessing,
            SVC(C=best_params["C"], kernel=best_params["kernel"], 
                probability=True, random_state=42)
        )
    elif name.startswith("xgboost"):
        final_model = make_pipeline(
            preprocessing,
            XGBClassifier(
                n_estimators=best_params["n_estimators"],
                max_depth=best_params["max_depth"],
                learning_rate=best_params["learning_rate"],
                random_state=42, use_label_encoder=False, eval_metric="logloss"
            )
        )
    
    final_model.fit(X_train, y_train)
    y_pred = final_model.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    test_acc = accuracy_score(y_test, y_pred)
    
    print(f"  Test F1: {test_f1:.4f}, Test Accuracy: {test_acc:.4f}")
    
    results[name] = {
        "pipeline": final_model,
        "cv_f1": cv_f1,
        "test_f1": test_f1,
        "test_acc": test_acc,
        "best_params": best_params
    }
    
    # Save model
    model_path = models_dir / f"{name}.pkl"
    joblib.dump(final_model, model_path)
    print(f"  ‚úì Model saved to {model_path}")
    
    # Log to MLflow
    if MLFLOW_TRACKING_URI:
        try:
            with mlflow.start_run(run_name=name):
                mlflow.log_param("model", name.split("_")[0])
                mlflow.log_param("uses_pca", False)
                mlflow.log_param("uses_optuna", True)
                mlflow.log_params(best_params)
                mlflow.log_metric("cv_f1", cv_f1)
                mlflow.log_metric("test_f1", test_f1)
                mlflow.log_metric("test_accuracy", test_acc)
                
                signature = infer_signature(X_train, final_model.predict(X_train))
                mlflow.sklearn.log_model(final_model, "model", signature=signature)
            print(f"  ‚úì Logged to MLflow")
        except Exception as e:
            print(f"  ‚ö†Ô∏è  MLflow logging failed: {e}")


TRAINING 4 MODELS WITHOUT PCA (WITH OPTUNA)

üîç Optimizing LOGISTIC_OPTUNA (NO PCA) - 20 trials
  Best CV F1: 0.8568
  Best params: {'C': 0.3725393839578886, 'solver': 'lbfgs'}
  Test F1: 0.8312, Test Accuracy: 0.8098
  ‚úì Model saved to /Users/kusumareddy/python_final/models/logistic_optuna.pkl




üèÉ View run logistic_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/d4ef0a09550841e9ae8e19001f4565b0
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow

üîç Optimizing RANDOMFOREST_OPTUNA (NO PCA) - 20 trials
  Best CV F1: 0.9835
  Best params: {'n_estimators': 284, 'max_depth': 15, 'min_samples_split': 4}
  Test F1: 1.0000, Test Accuracy: 1.0000
  ‚úì Model saved to /Users/kusumareddy/python_final/models/randomforest_optuna.pkl




üèÉ View run randomforest_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/8ae65af511c6446b85e7892f128679dc
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow

üîç Optimizing SVM_OPTUNA (NO PCA) - 20 trials
  Best CV F1: 0.9808
  Best params: {'C': 31.428808908401084, 'kernel': 'rbf'}
  Test F1: 1.0000, Test Accuracy: 1.0000
  ‚úì Model saved to /Users/kusumareddy/python_final/models/svm_optuna.pkl




üèÉ View run svm_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/efd3d95891054ca09419fe3a4f7a42d0
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow

üîç Optimizing XGBOOST_OPTUNA (NO PCA) - 20 trials


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


  Best CV F1: 0.9882
  Best params: {'n_estimators': 124, 'max_depth': 8, 'learning_rate': 0.08229845204010139}
  Test F1: 1.0000, Test Accuracy: 1.0000
  ‚úì Model saved to /Users/kusumareddy/python_final/models/xgboost_optuna.pkl




üèÉ View run xgboost_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/1b2a9a23806645be8c4e9e531b7b8868
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow


## Define PCA Objective Functions

In [10]:
# PCA versions of objective functions

def objective_logistic_pca(trial):
    C = trial.suggest_float("C", 0.001, 100, log=True)
    solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    n_components = trial.suggest_int("n_components", 5, 10)
    
    model = make_pipeline(
        preprocessing,
        PCA(n_components=n_components),
        LogisticRegression(C=C, solver=solver, max_iter=1000, random_state=42)
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1")
    return scores.mean()

def objective_rf_pca(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    n_components = trial.suggest_int("n_components", 5, 10)
    
    model = make_pipeline(
        preprocessing,
        PCA(n_components=n_components),
        RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            random_state=42,
            n_jobs=-1
        )
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1")
    return scores.mean()

def objective_svm_pca(trial):
    C = trial.suggest_float("C", 0.1, 100, log=True)
    kernel = trial.suggest_categorical("kernel", ["rbf", "linear"])
    n_components = trial.suggest_int("n_components", 5, 10)
    
    model = make_pipeline(
        preprocessing,
        PCA(n_components=n_components),
        SVC(C=C, kernel=kernel, probability=True, random_state=42)
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1")
    return scores.mean()

def objective_xgb_pca(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    n_components = trial.suggest_int("n_components", 5, 10)
    
    model = make_pipeline(
        preprocessing,
        PCA(n_components=n_components),
        XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            random_state=42,
            use_label_encoder=False,
            eval_metric="logloss"
        )
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1")
    return scores.mean()

print("‚úì PCA objective functions defined")

‚úì PCA objective functions defined


## Experiment 5-8: Models WITH PCA + Optuna

In [11]:
print("\n" + "="*80)
print("TRAINING 4 MODELS WITH PCA (WITH OPTUNA)")
print("="*80)

pca_model_configs = [
    ("logistic_with_pca_optuna", objective_logistic_pca),
    ("randomforest_with_pca_optuna", objective_rf_pca),
    ("svm_with_pca_optuna", objective_svm_pca),
    ("xgboost_with_pca_optuna", objective_xgb_pca),
]

for name, objective in pca_model_configs:
    print(f"\nüîç Optimizing {name.upper()} - {N_TRIALS} trials")
    
    study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)
    
    best_params = study.best_params
    cv_f1 = study.best_value
    n_components = best_params.pop("n_components")
    print(f"  Best CV F1: {cv_f1:.4f}")
    print(f"  Best params: {best_params}, n_components: {n_components}")
    
    # Build final model with best params
    base_name = name.replace("_with_pca_optuna", "")
    if base_name == "logistic":
        final_model = make_pipeline(
            preprocessing,
            PCA(n_components=n_components),
            LogisticRegression(C=best_params["C"], solver=best_params["solver"], 
                             max_iter=1000, random_state=42)
        )
    elif base_name == "randomforest":
        final_model = make_pipeline(
            preprocessing,
            PCA(n_components=n_components),
            RandomForestClassifier(
                n_estimators=best_params["n_estimators"],
                max_depth=best_params["max_depth"],
                min_samples_split=best_params["min_samples_split"],
                random_state=42, n_jobs=-1
            )
        )
    elif base_name == "svm":
        final_model = make_pipeline(
            preprocessing,
            PCA(n_components=n_components),
            SVC(C=best_params["C"], kernel=best_params["kernel"], 
                probability=True, random_state=42)
        )
    elif base_name == "xgboost":
        final_model = make_pipeline(
            preprocessing,
            PCA(n_components=n_components),
            XGBClassifier(
                n_estimators=best_params["n_estimators"],
                max_depth=best_params["max_depth"],
                learning_rate=best_params["learning_rate"],
                random_state=42, use_label_encoder=False, eval_metric="logloss"
            )
        )
    
    final_model.fit(X_train, y_train)
    y_pred = final_model.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    test_acc = accuracy_score(y_test, y_pred)
    
    print(f"  Test F1: {test_f1:.4f}, Test Accuracy: {test_acc:.4f}")
    
    results[name] = {
        "pipeline": final_model,
        "cv_f1": cv_f1,
        "test_f1": test_f1,
        "test_acc": test_acc,
        "best_params": {**best_params, "n_components": n_components}
    }
    
    # Save model
    model_path = models_dir / f"{name}.pkl"
    joblib.dump(final_model, model_path)
    print(f"  ‚úì Model saved to {model_path}")
    
    # Log to MLflow
    if MLFLOW_TRACKING_URI:
        try:
            with mlflow.start_run(run_name=name):
                mlflow.log_param("model", base_name)
                mlflow.log_param("uses_pca", True)
                mlflow.log_param("uses_optuna", True)
                mlflow.log_param("n_components", n_components)
                mlflow.log_params(best_params)
                mlflow.log_metric("cv_f1", cv_f1)
                mlflow.log_metric("test_f1", test_f1)
                mlflow.log_metric("test_accuracy", test_acc)
                
                signature = infer_signature(X_train, final_model.predict(X_train))
                mlflow.sklearn.log_model(final_model, "model", signature=signature)
            print(f"  ‚úì Logged to MLflow")
        except Exception as e:
            print(f"  ‚ö†Ô∏è  MLflow logging failed: {e}")


TRAINING 4 MODELS WITH PCA (WITH OPTUNA)

üîç Optimizing LOGISTIC_WITH_PCA_OPTUNA - 20 trials
  Best CV F1: 0.8540
  Best params: {'C': 14.528246637516036, 'solver': 'lbfgs'}, n_components: 6
  Test F1: 0.8520, Test Accuracy: 0.8390
  ‚úì Model saved to /Users/kusumareddy/python_final/models/logistic_with_pca_optuna.pkl




üèÉ View run logistic_with_pca_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/bdac4ce0b14a4babb7fbabe87c4e239f
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow

üîç Optimizing RANDOMFOREST_WITH_PCA_OPTUNA - 20 trials
  Best CV F1: 0.9834
  Best params: {'n_estimators': 217, 'max_depth': 16, 'min_samples_split': 2}, n_components: 5
  Test F1: 1.0000, Test Accuracy: 1.0000
  ‚úì Model saved to /Users/kusumareddy/python_final/models/randomforest_with_pca_optuna.pkl




üèÉ View run randomforest_with_pca_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/6ca7216364de44ff8e65546ee2bbc2b4
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow

üîç Optimizing SVM_WITH_PCA_OPTUNA - 20 trials
  Best CV F1: 0.9809
  Best params: {'C': 99.18374826087415, 'kernel': 'rbf'}, n_components: 9
  Test F1: 1.0000, Test Accuracy: 1.0000
  ‚úì Model saved to /Users/kusumareddy/python_final/models/svm_with_pca_optuna.pkl




üèÉ View run svm_with_pca_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/39463c8d7b23454296c1f6cb8591ea9e
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow

üîç Optimizing XGBOOST_WITH_PCA_OPTUNA - 20 trials


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


  Best CV F1: 0.9882
  Best params: {'n_estimators': 93, 'max_depth': 10, 'learning_rate': 0.16401332399717264}, n_components: 9
  Test F1: 1.0000, Test Accuracy: 1.0000
  ‚úì Model saved to /Users/kusumareddy/python_final/models/xgboost_with_pca_optuna.pkl


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


üèÉ View run xgboost_with_pca_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/6346c417eaa04f45b9be3c89a74f2442
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow


## Results Summary

In [12]:
# Find global best model
print("\n" + "="*80)
print("GLOBAL BEST MODEL (OPTUNA EXPERIMENTS)")
print("="*80)

global_best_name = max(results, key=lambda k: results[k]["test_f1"])
global_best = results[global_best_name]

print(f"Best model: {global_best_name}")
print(f"CV F1:      {global_best['cv_f1']:.4f}")
print(f"Test F1:    {global_best['test_f1']:.4f}")
print(f"Test Acc:   {global_best['test_acc']:.4f}")

# Save best model
model_path = models_dir / "global_best_model_optuna.pkl"
joblib.dump(global_best["pipeline"], model_path)
print(f"\n‚úì Saved best model to {model_path}")

# Print summary
print("\n" + "="*80)
print("SUMMARY OF ALL 8 OPTUNA EXPERIMENTS")
print("="*80)
print(f"{'Model':<40} | {'CV F1':<8} | {'Test F1':<8} | {'Test Acc':<8}")
print("-" * 80)
for name, res in sorted(results.items(), key=lambda x: -x[1]["test_f1"]):
    print(f"{name:<40} | {res['cv_f1']:.4f}   | {res['test_f1']:.4f}   | {res['test_acc']:.4f}")

end_time = time.monotonic()
elapsed = end_time - start_time
print(f"\n‚úì Total time: {int(elapsed//60)} min {elapsed%60:.1f} sec")
print("\n‚úÖ All 8 Optuna experiments complete! Check Dagshub for tracking.")


GLOBAL BEST MODEL (OPTUNA EXPERIMENTS)
Best model: randomforest_optuna
CV F1:      0.9835
Test F1:    1.0000
Test Acc:   1.0000

‚úì Saved best model to /Users/kusumareddy/python_final/models/global_best_model_optuna.pkl

SUMMARY OF ALL 8 OPTUNA EXPERIMENTS
Model                                    | CV F1    | Test F1  | Test Acc
--------------------------------------------------------------------------------
randomforest_optuna                      | 0.9835   | 1.0000   | 1.0000
svm_optuna                               | 0.9808   | 1.0000   | 1.0000
xgboost_optuna                           | 0.9882   | 1.0000   | 1.0000
randomforest_with_pca_optuna             | 0.9834   | 1.0000   | 1.0000
svm_with_pca_optuna                      | 0.9809   | 1.0000   | 1.0000
xgboost_with_pca_optuna                  | 0.9882   | 1.0000   | 1.0000
logistic_with_pca_optuna                 | 0.8540   | 0.8520   | 0.8390
logistic_optuna                          | 0.8568   | 0.8312   | 0.8098

‚úì Total