# Heart Disease Classification - Training Without Optuna

This notebook trains 8 models (4 algorithms √ó 2 PCA conditions) using default hyperparameters:

**Algorithms:** Logistic Regression, Random Forest, SVM, XGBoost  
**Conditions:** With PCA, Without PCA  
**Metric:** F1-Score (for classification)

## Experiment Matrix (8 total experiments)

| Algorithm | No PCA | With PCA |
|-----------|--------|----------|
| Logistic Regression | ‚úì | ‚úì |
| Random Forest | ‚úì | ‚úì |
| SVM | ‚úì | ‚úì |
| XGBoost | ‚úì | ‚úì |


In [1]:
import os
import sys
import time
from pathlib import Path
from dotenv import load_dotenv

import numpy as np
import pandas as pd
import joblib

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline

import mlflow
from mlflow.models import infer_signature

# Set base folder
base_folder = Path(os.getcwd()).parent
sys.path.insert(0, str(base_folder))

print(f"Base folder: {base_folder}")
start_time = time.monotonic()

Base folder: /Users/kusumareddy/python_final


In [9]:
# Load environment variables for MLflow/Dagshub
env_path = base_folder /  ".env"
if env_path.exists():
    load_dotenv(env_path)
    print(f"‚úì Loaded environment from {env_path}")
else:
    print(f"‚ö†Ô∏è  No .env file found at {env_path}")
    print("   Create notebooks/.env with your Dagshub credentials for experiment tracking")

# Set up MLflow
MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI", "")
MLFLOW_TRACKING_USERNAME = os.getenv("MLFLOW_TRACKING_USERNAME", "")
MLFLOW_TRACKING_PASSWORD = os.getenv("MLFLOW_TRACKING_PASSWORD", "")

if MLFLOW_TRACKING_USERNAME:
    os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_TRACKING_USERNAME
if MLFLOW_TRACKING_PASSWORD:
    os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_TRACKING_PASSWORD

if MLFLOW_TRACKING_URI:
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    print(f"‚úì MLflow tracking URI: {MLFLOW_TRACKING_URI}")
else:
    print("‚ö†Ô∏è  No MLflow tracking URI configured")

‚úì Loaded environment from /Users/kusumareddy/python_final/.env
‚úì MLflow tracking URI: https://dagshub.com/kusumayanna9/python_final.mlflow


## Load Data from SQLite Database

In [10]:
# Load data from PostgreSQL database using utilities
from db_utils import load_heart_data

heart_data = load_heart_data()
print(f"  Target distribution: {heart_data['target'].value_counts().to_dict()}")
print(f"  Features: {list(heart_data.columns[1:-1])}")
heart_data.head()

  df = pd.read_sql_query(query, conn)


‚úì Loaded 1025 patients from PostgreSQL database
  Target distribution: {1: 526, 0: 499}
  Features: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']


Unnamed: 0,patient_id,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


## Split Data and Setup Preprocessing

In [11]:
# Split data
X = heart_data.drop(['patient_id', 'target'], axis=1)
y = heart_data['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

print(f"‚úì Train size: {len(X_train)}, Test size: {len(X_test)}")
print(f"  Train target distribution: {y_train.value_counts().to_dict()}")
print(f"  Test target distribution: {y_test.value_counts().to_dict()}")

‚úì Train size: 820, Test size: 205
  Train target distribution: {1: 421, 0: 399}
  Test target distribution: {1: 105, 0: 100}


In [12]:
# Import preprocessing pipeline
from classification_pipeline import build_preprocessing, FEATURE_NAMES

preprocessing = build_preprocessing()
print(f"‚úì Preprocessing pipeline created")
print(f"  Expected features: {FEATURE_NAMES}")

‚úì Preprocessing pipeline created
  Expected features: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']


## Experiment 1-4: Models WITHOUT PCA

In [13]:
print("\n" + "="*80)
print("TRAINING 4 MODELS WITHOUT PCA (DEFAULT HYPERPARAMETERS)")
print("="*80)

# Define models with default hyperparameters
models_no_pca = {
    "logistic_no_optuna": LogisticRegression(
        random_state=42, max_iter=1000, solver='lbfgs'
    ),
    "randomforest_no_optuna": RandomForestClassifier(
        random_state=42, n_estimators=100, max_depth=10, n_jobs=-1
    ),
    "svm_no_optuna": SVC(
        random_state=42, kernel='rbf', probability=True
    ),
    "xgboost_no_optuna": XGBClassifier(
        objective="binary:logistic", random_state=42, n_estimators=100,
        learning_rate=0.1, max_depth=6, use_label_encoder=False,
        eval_metric='logloss', n_jobs=-1
    )
}

results_no_pca = {}

for name, model in models_no_pca.items():
    print(f"\nüîç Training {name.upper()}...")
    
    # Create pipeline
    pipeline = make_pipeline(preprocessing, model)
    
    # Cross-validation
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="f1")
    cv_f1 = cv_scores.mean()
    
    # Train on full training set
    pipeline.fit(X_train, y_train)
    
    # Test predictions
    y_pred = pipeline.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    test_acc = accuracy_score(y_test, y_pred)
    
    print(f"  CV F1: {cv_f1:.4f} (¬±{cv_scores.std()*2:.4f})")
    print(f"  Test F1: {test_f1:.4f}, Test Accuracy: {test_acc:.4f}")
    
    # Store results
    results_no_pca[name] = {
        "pipeline": pipeline,
        "cv_f1": cv_f1,
        "test_f1": test_f1,
        "test_acc": test_acc
    }
    
    # Save model
    models_dir = base_folder / "models"
    models_dir.mkdir(exist_ok=True)
    model_path = models_dir / f"{name}.pkl"
    joblib.dump(pipeline, model_path)
    print(f"  ‚úì Model saved to {model_path}")
    
    # Log to MLflow if configured
    if MLFLOW_TRACKING_URI:
        with mlflow.start_run(run_name=name):
            mlflow.log_param("model", name.split("_")[0])
            mlflow.log_param("uses_pca", False)
            mlflow.log_param("uses_optuna", False)
            mlflow.log_metric("cv_f1", cv_f1)
            mlflow.log_metric("test_f1", test_f1)
            mlflow.log_metric("test_accuracy", test_acc)
            
            signature = infer_signature(X_train, pipeline.predict(X_train))
            mlflow.sklearn.log_model(pipeline, "model", signature=signature)
        print(f"  ‚úì Logged to MLflow")


TRAINING 4 MODELS WITHOUT PCA (DEFAULT HYPERPARAMETERS)

üîç Training LOGISTIC_NO_OPTUNA...
  CV F1: 0.8548 (¬±0.0396)
  Test F1: 0.8312, Test Accuracy: 0.8098
  ‚úì Model saved to /Users/kusumareddy/python_final/models/logistic_no_optuna.pkl




üèÉ View run logistic_no_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/820a7bbde1f44a53924094d486d6b7c1
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow

üîç Training RANDOMFOREST_NO_OPTUNA...
  CV F1: 0.9822 (¬±0.0374)
  Test F1: 1.0000, Test Accuracy: 1.0000
  ‚úì Model saved to /Users/kusumareddy/python_final/models/randomforest_no_optuna.pkl




üèÉ View run randomforest_no_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/099504031b144dfeb52d82c9c38f5c68
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow

üîç Training SVM_NO_OPTUNA...
  CV F1: 0.9166 (¬±0.0367)
  Test F1: 0.9296, Test Accuracy: 0.9268
  ‚úì Model saved to /Users/kusumareddy/python_final/models/svm_no_optuna.pkl




üèÉ View run svm_no_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/392b9e9c4e554df9985efa987e7755a2
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow

üîç Training XGBOOST_NO_OPTUNA...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  CV F1: 0.9859 (¬±0.0189)
  Test F1: 1.0000, Test Accuracy: 1.0000
  ‚úì Model saved to /Users/kusumareddy/python_final/models/xgboost_no_optuna.pkl




üèÉ View run xgboost_no_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/dd372c0fe6bf45658c343329df8c3157
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow


## Experiment 5-8: Models WITH PCA

In [None]:
print("\n" + "="*80)
print("TRAINING 4 MODELS WITH PCA (DEFAULT HYPERPARAMETERS)")
print("="*80)

# Define models with PCA (95% variance retention)
models_with_pca = {
    "logistic_with_pca_no_optuna": LogisticRegression(
        random_state=42, max_iter=1000, solver='lbfgs'
    ),
    "randomforest_with_pca_no_optuna": RandomForestClassifier(
        random_state=42, n_estimators=100, max_depth=10, n_jobs=-1
    ),
    "svm_with_pca_no_optuna": SVC(
        random_state=42, kernel='rbf', probability=True
    ),
    "xgboost_with_pca_no_optuna": XGBClassifier(
        objective="binary:logistic", random_state=42, n_estimators=100,
        learning_rate=0.1, max_depth=6, use_label_encoder=False,
        eval_metric='logloss', n_jobs=-1
    )
}

results_with_pca = {}
pca_components = 0.95  # Retain 95% of variance

for name, model in models_with_pca.items():
    print(f"\nüîç Training {name.upper()}...")
    
    # Create pipeline with PCA
    pipeline = make_pipeline(
        preprocessing,
        PCA(n_components=pca_components),
        model
    )
    
    # Cross-validation
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="f1")
    cv_f1 = cv_scores.mean()
    
    # Train on full training set
    pipeline.fit(X_train, y_train)
    
    # Check PCA components
    pca_step = pipeline.named_steps['pca']
    n_components_used = pca_step.n_components_
    explained_variance = pca_step.explained_variance_ratio_.sum()
    
    # Test predictions
    y_pred = pipeline.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    test_acc = accuracy_score(y_test, y_pred)
    
    print(f"  PCA: {n_components_used} components, {explained_variance:.3f} variance explained")
    print(f"  CV F1: {cv_f1:.4f} (¬±{cv_scores.std()*2:.4f})")
    print(f"  Test F1: {test_f1:.4f}, Test Accuracy: {test_acc:.4f}")
    
    # Store results
    results_with_pca[name] = {
        "pipeline": pipeline,
        "cv_f1": cv_f1,
        "test_f1": test_f1,
        "test_acc": test_acc,
        "pca_components": n_components_used,
        "explained_variance": explained_variance
    }
    
    # Save model
    model_path = models_dir / f"{name}.pkl"
    joblib.dump(pipeline, model_path)
    print(f"  ‚úì Model saved to {model_path}")
    
    # Log to MLflow if configured
    if MLFLOW_TRACKING_URI:
        with mlflow.start_run(run_name=name):
            mlflow.log_param("model", name.split("_")[0])
            mlflow.log_param("uses_pca", True)
            mlflow.log_param("uses_optuna", False)
            mlflow.log_param("pca_components", n_components_used)
            mlflow.log_param("explained_variance", explained_variance)
            mlflow.log_metric("cv_f1", cv_f1)
            mlflow.log_metric("test_f1", test_f1)
            mlflow.log_metric("test_accuracy", test_acc)
            
            signature = infer_signature(X_train, pipeline.predict(X_train))
            mlflow.sklearn.log_model(pipeline, "model", signature=signature)
        print(f"  ‚úì Logged to MLflow")


TRAINING 4 MODELS WITH PCA (DEFAULT HYPERPARAMETERS)

üîç Training LOGISTIC_WITH_PCA_NO_OPTUNA...
  PCA: 12 components, 0.971 variance explained
  CV F1: 0.8524 (¬±0.0435)
  Test F1: 0.8312, Test Accuracy: 0.8098
  ‚úì Model saved to /Users/kusumareddy/python_final/models/logistic_with_pca_no_optuna.pkl




üèÉ View run logistic_with_pca_no_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/38a343915c0b4d56b994a693e702c7d8
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow

üîç Training RANDOMFOREST_WITH_PCA_NO_OPTUNA...
  PCA: 12 components, 0.971 variance explained
  CV F1: 0.9811 (¬±0.0336)
  Test F1: 1.0000, Test Accuracy: 1.0000
  ‚úì Model saved to /Users/kusumareddy/python_final/models/randomforest_with_pca_no_optuna.pkl




üèÉ View run randomforest_with_pca_no_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/250f2144cee14d1ea52d2770531d3d48
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow

üîç Training SVM_WITH_PCA_NO_OPTUNA...
  PCA: 12 components, 0.971 variance explained
  CV F1: 0.9121 (¬±0.0386)
  Test F1: 0.9252, Test Accuracy: 0.9220
  ‚úì Model saved to /Users/kusumareddy/python_final/models/svm_with_pca_no_optuna.pkl




üèÉ View run svm_with_pca_no_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/45ec8bbaf73a49eda5767b23df0fc9e7
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow

üîç Training XGBOOST_WITH_PCA_NO_OPTUNA...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  PCA: 12 components, 0.971 variance explained
  CV F1: 0.9822 (¬±0.0333)
  Test F1: 1.0000, Test Accuracy: 1.0000
  ‚úì Model saved to /Users/kusumareddy/python_final/models/xgboost_with_pca_no_optuna.pkl




üèÉ View run xgboost_with_pca_no_optuna at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0/runs/7497901befe34759b2644ac622fe31fb
üß™ View experiment at: https://dagshub.com/kusumayanna9/python_final.mlflow/#/experiments/0
  ‚úì Logged to MLflow


## Results Summary

In [15]:
# Combine all results
all_results = {**results_no_pca, **results_with_pca}

# Find global best model
global_best_name = max(all_results, key=lambda k: all_results[k]["test_f1"])
global_best = all_results[global_best_name]

print("\n" + "="*80)
print("GLOBAL BEST MODEL (WITHOUT OPTUNA)")
print("="*80)
print(f"Best model: {global_best_name}")
print(f"CV F1:      {global_best['cv_f1']:.4f}")
print(f"Test F1:    {global_best['test_f1']:.4f}")
print(f"Test Acc:   {global_best['test_acc']:.4f}")

# Save best model
best_model_path = models_dir / "global_best_model.pkl"
joblib.dump(global_best["pipeline"], best_model_path)
print(f"\n‚úì Saved best model to {best_model_path}")

# Print summary table
print("\n" + "="*80)
print("SUMMARY OF ALL 8 EXPERIMENTS (WITHOUT OPTUNA)")
print("="*80)
print(f"{'Model':<40} | {'CV F1':<8} | {'Test F1':<8} | {'Test Acc':<8}")
print("-" * 80)
for name, res in sorted(all_results.items(), key=lambda x: -x[1]["test_f1"]):
    print(f"{name:<40} | {res['cv_f1']:.4f}   | {res['test_f1']:.4f}   | {res['test_acc']:.4f}")

end_time = time.monotonic()
elapsed = end_time - start_time
print(f"\n‚úì Total time: {int(elapsed//60)} min {elapsed%60:.1f} sec")
print("\n‚úÖ All 8 experiments complete! Check Dagshub for tracking.")


GLOBAL BEST MODEL (WITHOUT OPTUNA)
Best model: randomforest_no_optuna
CV F1:      0.9822
Test F1:    1.0000
Test Acc:   1.0000

‚úì Saved best model to /Users/kusumareddy/python_final/models/global_best_model.pkl

SUMMARY OF ALL 8 EXPERIMENTS (WITHOUT OPTUNA)
Model                                    | CV F1    | Test F1  | Test Acc
--------------------------------------------------------------------------------
randomforest_no_optuna                   | 0.9822   | 1.0000   | 1.0000
xgboost_no_optuna                        | 0.9859   | 1.0000   | 1.0000
randomforest_with_pca_no_optuna          | 0.9811   | 1.0000   | 1.0000
xgboost_with_pca_no_optuna               | 0.9822   | 1.0000   | 1.0000
svm_no_optuna                            | 0.9166   | 0.9296   | 0.9268
svm_with_pca_no_optuna                   | 0.9121   | 0.9252   | 0.9220
logistic_no_optuna                       | 0.8548   | 0.8312   | 0.8098
logistic_with_pca_no_optuna              | 0.8524   | 0.8312   | 0.8098

‚úì Tot