# Tools

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.svm import SVC

# Load and Split Dataset

In [2]:
# Read the dataset used for hyperparameter tuning
df = pd.read_csv("heart_diseases_final.csv")

# Features & target: `target` is the label, every other column is a feature
y = df['target']
X = df.drop(columns=['target'])

# Stratified 80% / 20% train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Training set size:", X_train.shape)
print("Testing set  size:", X_test.shape)

Training set size: (242, 15)
Testing set  size: (61, 15)


# Hyperparameter Tuning using:
* GridSearchCV
* RandomizedSearchCV

In [3]:
# One single-step pipeline per candidate classifier. The step is named "clf",
# which is why every search-space key below carries the "clf__" prefix.
pipelines = {
    "logistic": Pipeline(steps=[("clf", LogisticRegression(random_state=42, max_iter=1000))]),
    "svm": Pipeline(steps=[("clf", SVC(random_state=42, probability=True))]),
}


# Hyperparameter search space per model (keys match those of `pipelines`)
param_grids = {
    "logistic": dict(
        clf__C=[0.001, 0.01, 0.1, 1, 5, 10, 50, 100],
        clf__penalty=["l2"],
        clf__solver=["lbfgs"],
    ),
    "svm": dict(
        clf__C=[0.001, 0.01, 0.1, 1, 5, 10, 50],
        clf__kernel=["linear", "rbf", "poly"],
        clf__gamma=["scale", "auto", 0.01, 0.1, 1],
    ),
}

In [4]:
# Dictionary to store best models info.
# Maps model name -> {"baseline_accuracy", "tuned_accuracy", "saved_model"}.
best_models = {}

for model_name, pipeline in pipelines.items():
    print(f"\nTuning {model_name.upper()}...")
    # Hyperparameter Tuning
    param_grid = param_grids[model_name]

    # GridSearchCV: exhaustive 5-fold CV over every combination in the grid.
    # The search clones `pipeline` internally, so the template stays unfitted.
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(f"Best params (GridSearch): {grid_search.best_params_}")
    print(f"Best CV accuracy (GridSearch): {grid_search.best_score_:.4f}")

    # RandomizedSearchCV: 5 random draws from the same grid, reported for
    # comparison only; its result is not used for the final model.
    randomized_search = RandomizedSearchCV(
        pipeline, param_grid, n_iter=5, cv=5, scoring='accuracy', n_jobs=-1, random_state=42
    )
    randomized_search.fit(X_train, y_train)
    print(f"Best params (RandomizedSearch): {randomized_search.best_params_}")
    print(f"Best CV accuracy (RandomizedSearch): {randomized_search.best_score_:.4f}")

    # Evaluate tuned model on test set
    tuned_model = grid_search.best_estimator_  # using GridSearchCV result
    tuned_pred = tuned_model.predict(X_test)
    tuned_acc = accuracy_score(y_test, tuned_pred)
    print("\nTuned Model Test Set Evaluation:")
    print(classification_report(y_test, tuned_pred))

    # Evaluate baseline model (default hyperparameters).
    # Fit a clone rather than the template itself: `Pipeline.fit` returns `self`,
    # so fitting `pipeline` directly would mutate the entry in `pipelines` and
    # make the stored baseline an alias of that shared, mutable object.
    baseline_pipeline = clone(pipeline).fit(X_train, y_train)
    baseline_pred = baseline_pipeline.predict(X_test)
    baseline_acc = accuracy_score(y_test, baseline_pred)

    # Compare baseline vs tuned
    print(f"\n{model_name.upper()} comparison:")
    print(f"Baseline Accuracy: {baseline_acc:.4f}")
    print(f"Tuned Accuracy   : {tuned_acc:.4f}")

    # Decide which model to save (ties go to the tuned model).
    # NOTE(review): the choice is made on test-set accuracy, so the test set
    # takes part in model selection and the reported "best" score is optimistic.
    if tuned_acc >= baseline_acc:
        best_model_to_save = tuned_model
    else:
        best_model_to_save = baseline_pipeline

    # Store best model info
    best_models[model_name] = {
        "baseline_accuracy": baseline_acc,
        "tuned_accuracy": tuned_acc,
        "saved_model": best_model_to_save
    }


Tuning LOGISTIC...
Best params (GridSearch): {'clf__C': 1, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}
Best CV accuracy (GridSearch): 0.8594
Best params (RandomizedSearch): {'clf__solver': 'lbfgs', 'clf__penalty': 'l2', 'clf__C': 10}
Best CV accuracy (RandomizedSearch): 0.8552

Tuned Model Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.96      0.82      0.89        33
           1       0.82      0.96      0.89        28

    accuracy                           0.89        61
   macro avg       0.89      0.89      0.89        61
weighted avg       0.90      0.89      0.89        61


LOGISTIC comparison:
Baseline Accuracy: 0.8852
Tuned Accuracy   : 0.8852

Tuning SVM...
Best params (GridSearch): {'clf__C': 5, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}
Best CV accuracy (GridSearch): 0.8426
Best params (RandomizedSearch): {'clf__kernel': 'linear', 'clf__gamma': 'scale', 'clf__C': 1}
Best CV accuracy (RandomizedSearch): 0.8302

Tune

In [5]:
# Summary of best models: for each model report the higher of its
# baseline and tuned test accuracies.
print("\n===== Summary of Best Models to be Saved =====")
for model_name, info in best_models.items():
    best_acc = max(info[key] for key in ("baseline_accuracy", "tuned_accuracy"))
    print(f"{model_name.upper()}: Best Accuracy = {best_acc:.4f}")


===== Summary of Best Models to be Saved =====
LOGISTIC: Best Accuracy = 0.8852
SVM: Best Accuracy = 0.9180


# Model Pipeline Export
1. Save the trained model using joblib or pickle (.pkl format).
2. Ensure reproducibility by saving the full model pipeline (preprocessing + feature selection + model).

In [6]:
# Load the raw dataset
# NOTE: this rebinds `df`, `X` and `y` from the tuning section above; the
# cells below work on the raw data, not on heart_diseases_final.csv.
df = pd.read_csv("heart_diseases.csv")

# Separate features and target
X = df.drop(columns=['num'])
# Binarise the label: any value > 0 becomes 1, everything else becomes 0.
# Vectorised comparison instead of a per-row Python lambda; same values and dtype.
y = (df['num'] > 0).astype('int64')

# Define features: columns routed to the categorical and numeric
# preprocessing branches respectively
categ_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

In [7]:
# Data Preprocessing Pipeline
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transf = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories not seen during fit are ignored.
catego_transf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", numeric_transf, num_cols),
        ("categorical", catego_transf, categ_cols),
    ]
)

# --- Custom Feature Selector ---
class RFEChi2Union(BaseEstimator, TransformerMixin):
    """Keep the union of features chosen by RFE and by a chi-squared test.

    Two selectors are fitted independently on the same input:

    * recursive feature elimination (RFE) driven by a random forest, and
    * ``SelectKBest`` with the chi-squared score on min-max scaled features.

    Every column picked by at least one of them is kept, so the output has at
    most ``rfe_k + chi2_k`` columns. Columns are addressed by position, not name.

    Parameters
    ----------
    rfe_k : int, default=10
        Number of features RFE should keep. Integer values are clamped to the
        number of input columns; other values (e.g. ``None`` or a float) are
        passed to ``RFE`` unchanged.
    chi2_k : int, default=10
        Number of top chi-squared features to keep, clamped to the number of
        input columns.
    random_state : int, default=42
        Seed for the random forest used inside RFE.

    Attributes
    ----------
    rfe_features_ : ndarray of int
        Column indices selected by RFE.
    chi2_features_ : ndarray of int
        Column indices selected by the chi-squared test.
    selected_features_ : ndarray of int
        Sorted, de-duplicated union of the two index sets.
    """

    def __init__(self, rfe_k=10, chi2_k=10, random_state=42):
        self.rfe_k = rfe_k
        self.chi2_k = chi2_k
        self.random_state = random_state

    def fit(self, X, y):
        """Fit both selectors on ``X``/``y`` and record the selected column indices.

        ``X`` may be a NumPy array, a pandas DataFrame or a SciPy sparse matrix
        (as produced by a ``ColumnTransformer`` containing ``OneHotEncoder``).
        Returns ``self``.
        """
        # Use feature indices instead of column names
        n_features = X.shape[1]
        feature_idx = np.arange(n_features)

        # Clamp an integer request to the available width, mirroring the rule
        # applied to chi2_k below; None / float keep RFE's own semantics.
        rfe_k = self.rfe_k
        if isinstance(rfe_k, (int, np.integer)):
            rfe_k = min(rfe_k, n_features)

        # RFE with RandomForest
        rf = RandomForestClassifier(n_estimators=100, random_state=self.random_state)
        rfe = RFE(estimator=rf, n_features_to_select=rfe_k, step=1)
        rfe.fit(X, y)
        self.rfe_features_ = feature_idx[rfe.support_]

        # Chi^2 needs non-negative inputs, hence the rescaling to [0, 1].
        # MinMaxScaler rejects sparse matrices, so densify first when needed.
        X_dense = X.toarray() if hasattr(X, "toarray") else X
        X_scaled = MinMaxScaler().fit_transform(X_dense)

        chi2_selector = SelectKBest(score_func=chi2, k=min(self.chi2_k, n_features))
        chi2_selector.fit(X_scaled, y)
        self.chi2_features_ = feature_idx[chi2_selector.get_support()]

        # Final feature set (indices); np.unique also sorts them
        self.selected_features_ = np.unique(np.concatenate([self.rfe_features_, self.chi2_features_]))
        return self

    def transform(self, X):
        """Return only the columns of ``X`` selected during ``fit``.

        The container type of ``X`` is preserved (array, sparse matrix or
        DataFrame).
        """
        # DataFrames do not support `X[:, idx]`; use positional indexing instead.
        if hasattr(X, "iloc"):
            return X.iloc[:, self.selected_features_]
        return X[:, self.selected_features_]

# Build final pipeline: preprocessing -> feature selection -> SVM classifier.
# `probability=True` is what makes `predict_proba` available on the fitted pipeline.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("feature_selection", RFEChi2Union(rfe_k=10, chi2_k=10)),
        ("model", SVC(probability=True, random_state=42)),
    ]
)

In [8]:
# Split (stratified 80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train the whole pipeline on the training split only, so imputation,
# scaling, encoding and feature selection are all fitted without test data
pipeline.fit(X_train, y_train)

# Evaluate
print("Train Accuracy:", pipeline.score(X_train, y_train))
print("Test Accuracy:", pipeline.score(X_test, y_test))

# Save full pipeline
# NOTE: the pickle stores RFEChi2Union by reference, so the class must be
# importable/defined under the same module path wherever the file is loaded.
joblib.dump(pipeline, "svm_model.pkl")
# The pipeline has no PCA step, so the message lists only the steps actually saved.
print("✅Final pipeline (preprocessing + Feature Selection + Best model(SVM)) saved as svm_model.pkl")

Train Accuracy: 0.8842975206611571
Test Accuracy: 0.9016393442622951
✅Final pipeline (preprocessing + PCA + Feature Selection + Best model(SVM)) saved as svm_model.pkl


In [9]:
# Predictions: hard labels plus the probability of the positive class (column 1)
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# Metrics (zero_division=0 reports 0 instead of warning when a class is never predicted)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_proba)
report = classification_report(y_test, y_pred, digits=4)

# Assemble the report first, then write it to the txt file in one call
report_chunks = [
    "Model Evaluation Results\n",
    "========================\n",
    f"Accuracy: {accuracy:.4f}\n",
    f"Precision: {precision:.4f}\n",
    f"Recall: {recall:.4f}\n",
    f"F1-score: {f1:.4f}\n",
    f"ROC-AUC: {auc:.4f}\n\n",
    "Classification Report:\n",
    report,
]
with open("model_results.txt", "w") as f:
    f.writelines(report_chunks)

print("📄Results saved to model_results.txt")

📄Results saved to model_results.txt
