In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import joblib

In [2]:
data = pd.read_csv('Cardiovascular_Disease_Dataset.csv')

In [3]:
# Display basic statistics and info about the dataset
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   patientid          1000 non-null   int64  
 1   age                1000 non-null   int64  
 2   gender             1000 non-null   int64  
 3   chestpain          1000 non-null   int64  
 4   restingBP          1000 non-null   int64  
 5   serumcholestrol    1000 non-null   int64  
 6   fastingbloodsugar  1000 non-null   int64  
 7   restingrelectro    1000 non-null   int64  
 8   maxheartrate       1000 non-null   int64  
 9   exerciseangia      1000 non-null   int64  
 10  oldpeak            1000 non-null   float64
 11  slope              1000 non-null   int64  
 12  noofmajorvessels   1000 non-null   int64  
 13  target             1000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 109.5 KB
None
          patientid         age       gender    chestpain    resting

In [4]:
# Check for missing values in the dataset
print(data.isnull().sum())

patientid            0
age                  0
gender               0
chestpain            0
restingBP            0
serumcholestrol      0
fastingbloodsugar    0
restingrelectro      0
maxheartrate         0
exerciseangia        0
oldpeak              0
slope                0
noofmajorvessels     0
target               0
dtype: int64


In [6]:
data = data.drop("patientid", axis=1)


In [8]:
print(data["target"].value_counts())


target
1    580
0    420
Name: count, dtype: int64


In [9]:
X = data.drop("target", axis=1)
y = data["target"]

In [10]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [17]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

In [19]:
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1] if hasattr(model, "predict_proba") else y_pred
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)
    
    results.append([name, acc, prec, rec, f1, roc])
    print(f"\n🔹 {name} Results")
    print(classification_report(y_test, y_pred))
    print(f"ROC-AUC: {roc:.4f}")


🔹 Logistic Regression Results
              precision    recall  f1-score   support

           0       0.96      0.95      0.96        83
           1       0.97      0.97      0.97       117

    accuracy                           0.96       200
   macro avg       0.96      0.96      0.96       200
weighted avg       0.96      0.96      0.96       200

ROC-AUC: 0.9963

🔹 Random Forest Results
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        83
           1       0.98      1.00      0.99       117

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200

ROC-AUC: 0.9996

🔹 SVM Results
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        83
           1       0.98      0.98      0.98       117

    accuracy                           0.98       200
   macro avg       0.98  

In [20]:
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score", "ROC-AUC"])
print("\n📊 Model Comparison:")
print(results_df)


📊 Model Comparison:
                 Model  Accuracy  Precision    Recall  F1-Score   ROC-AUC
0  Logistic Regression     0.965   0.966102  0.974359  0.970213  0.996293
1        Random Forest     0.990   0.983193  1.000000  0.991525  0.999588
2                  SVM     0.980   0.982906  0.982906  0.982906  0.997014
3                  KNN     0.935   0.956140  0.931624  0.943723  0.981979
4    Gradient Boosting     0.970   0.982609  0.965812  0.974138  0.998146
5              XGBoost     0.965   0.966102  0.974359  0.970213  0.997940


In [21]:
# Select the best model based on the highest ROC-AUC score
best_model_row = results_df.loc[results_df["ROC-AUC"].idxmax()]
print("🏆 Best Model:")
print(best_model_row)

🏆 Best Model:
Model        Random Forest
Accuracy              0.99
Precision         0.983193
Recall                 1.0
F1-Score          0.991525
ROC-AUC           0.999588
Name: 1, dtype: object


In [22]:
joblib.dump(model, 'best_model.pkl')
print("Model saved as best_model.pkl")

Model saved as best_model.pkl
