In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [6]:
df=pd.read_csv("cleaned_data.csv")

In [8]:
# 1. Import Libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Optional: XGBoost
try:
    from xgboost import XGBClassifier
    xgb_available = True
except ImportError:
    xgb_available = False


X = df.drop(columns=['cardio'])  # Replace with your actual target
y = df['cardio']

# 3. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Define Models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Support Vector Machine': SVC(probability=True)  # Enable probability=True for ROC AUC
}

if xgb_available:
    models['XGBoost'] = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# 5. Evaluate Models
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob)
    })

# 6. Show Comparison Table
results_df = pd.DataFrame(results)
print(results_df.sort_values(by='F1 Score', ascending=False))


                    Model  Accuracy  Precision    Recall  F1 Score   ROC AUC
3       Gradient Boosting  0.739039   0.748215  0.655486  0.698788  0.794329
6                 XGBoost  0.725650   0.732877  0.638715  0.682564  0.782282
4                AdaBoost  0.732738   0.760732  0.614554  0.679874  0.789342
0     Logistic Regression  0.727619   0.754677  0.607732  0.673280  0.773992
2           Random Forest  0.705697   0.694631  0.647243  0.670100  0.763095
1           Decision Tree  0.635994   0.604313  0.613417  0.608831  0.634391
5  Support Vector Machine  0.538199   0.000000  0.000000  0.000000  0.732641


# Conclusion

This project aimed to develop an accurate and reliable machine learning model for the early detection of cardiovascular disease (CVD) based on patient health data. Multiple classification algorithms were implemented and evaluated, including Logistic Regression, Decision Tree, Random Forest, Support Vector Machine (SVM), AdaBoost, Gradient Boosting, and XGBoost. Each model was assessed using key performance metrics such as accuracy, precision, recall, F1 score, and ROC AUC. Among all models, Gradient Boosting delivered the best performance, with the highest F1 Score (0.699) and ROC AUC (0.794), indicating its strong ability to correctly identify CVD cases while maintaining a good balance between false positives and false negatives. XGBoost and AdaBoost followed closely behind, showing competitive results. Simpler models like Logistic Regression were moderately effective and can be used where interpretability is a priority. The SVM model underperformed significantly, likely due to class imbalance or lack of tuning. In conclusion, Gradient Boosting is recommended for deployment in CVD risk prediction tasks, potentially assisting healthcare professionals in early intervention and treatment planning. Further enhancements can be achieved by tuning hyperparameters, applying cross-validation, and exploring deep learning or ensemble stacking approaches.