In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import pickle

In [9]:
# Load dataset
cardio_data = pd.read_csv('cardio_cleaned.csv')



In [10]:
cardio_data.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,63,1,152,63,130,113,2,1,0,0,1,0
1,41,2,171,64,112,66,1,1,0,0,0,0
2,49,1,146,77,176,67,1,1,0,0,1,0
3,34,2,150,59,118,102,1,2,0,1,1,0
4,32,1,168,77,112,73,3,1,0,0,1,0


In [11]:
cardio_data.shape

(12946, 12)

In [12]:
cardio_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12946 entries, 0 to 12945
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   age          12946 non-null  int64
 1   gender       12946 non-null  int64
 2   height       12946 non-null  int64
 3   weight       12946 non-null  int64
 4   ap_hi        12946 non-null  int64
 5   ap_lo        12946 non-null  int64
 6   cholesterol  12946 non-null  int64
 7   gluc         12946 non-null  int64
 8   smoke        12946 non-null  int64
 9   alco         12946 non-null  int64
 10  active       12946 non-null  int64
 11  cardio       12946 non-null  int64
dtypes: int64(12)
memory usage: 1.2 MB


In [13]:
cardio_data.isnull().sum()

age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [14]:
cardio_data.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,12946.0,12946.0,12946.0,12946.0,12946.0,12946.0,12946.0,12946.0,12946.0,12946.0,12946.0,12946.0
mean,47.061563,1.49243,164.556156,69.52271,134.362506,89.68778,1.491967,1.403677,0.199367,0.150626,0.701916,0.234744
std,10.119235,0.499962,9.941084,14.639591,25.811938,17.325266,0.666802,0.666148,0.39954,0.357697,0.457434,0.423855
min,30.0,1.0,130.0,30.0,90.0,60.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,38.0,1.0,158.0,59.0,112.0,75.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,47.0,1.0,165.0,69.0,134.0,90.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,56.0,2.0,171.0,79.0,157.0,105.0,2.0,2.0,0.0,0.0,1.0,0.0
max,64.0,2.0,201.0,132.0,179.0,119.0,3.0,3.0,1.0,1.0,1.0,1.0


In [16]:
cardio_data['cardio'].value_counts()

cardio
0    9907
1    3039
Name: count, dtype: int64

In [18]:
X = cardio_data.drop(columns='cardio', axis=1)
Y = cardio_data['cardio']

In [19]:
print(X)

       age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  \
0       63       1     152      63    130    113            2     1      0   
1       41       2     171      64    112     66            1     1      0   
2       49       1     146      77    176     67            1     1      0   
3       34       2     150      59    118    102            1     2      0   
4       32       1     168      77    112     73            3     1      0   
...    ...     ...     ...     ...    ...    ...          ...   ...    ...   
12941   39       1     177      74    155     80            1     3      0   
12942   61       2     168      71    120    100            1     1      1   
12943   32       2     148      49    133     68            1     1      0   
12944   46       1     166      58    117     71            1     1      0   
12945   59       1     166     104    114     64            2     1      0   

       alco  active  
0         0       1  
1         0       0

In [20]:
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
12941    0
12942    0
12943    0
12944    0
12945    0
Name: cardio, Length: 12946, dtype: int64


In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)


In [22]:
models = {
    "RandomForest": RandomForestClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "ExtraTrees": ExtraTreesClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [23]:
# Train all models
for name, model in models.items():
    model.fit(X_train, Y_train)


Parameters: { "use_label_encoder" } are not used.



In [26]:
print("\n--- Model Performance on Test Data ---")
for name, model in models.items():
    preds = model.predict(X_train)
    print(f"\n{name}")
    print("Accuracy :", accuracy_score(Y_train, preds))
    print("F1 Score :", f1_score(Y_train, preds))
    print("Precision:", precision_score(Y_train, preds))
    print("Recall   :", recall_score(Y_train, preds))



--- Model Performance on Test Data ---

RandomForest
Accuracy : 1.0
F1 Score : 1.0
Precision: 1.0
Recall   : 1.0

GradientBoosting
Accuracy : 0.9809772112784859
F1 Score : 0.9579867775645127
Precision: 0.9946855624446412
Recall   : 0.9238996297819827

ExtraTrees
Accuracy : 1.0
F1 Score : 1.0
Precision: 1.0
Recall   : 1.0

DecisionTree
Accuracy : 1.0
F1 Score : 1.0
Precision: 1.0
Recall   : 1.0

XGBoost
Accuracy : 1.0
F1 Score : 1.0
Precision: 1.0
Recall   : 1.0


In [27]:
for name, model in models.items():
    preds = model.predict(X_test)
    print(f"\n{name}")
    print("Test Accuracy :", accuracy_score(Y_test, preds))
    print("F1 Score      :", f1_score(Y_test, preds))
    print("Precision     :", precision_score(Y_test, preds))
    print("Recall        :", recall_score(Y_test, preds))


RandomForest
Test Accuracy : 0.9756756756756757
F1 Score      : 0.9466553767993227
Precision     : 0.9755671902268761
Recall        : 0.9194078947368421

GradientBoosting
Test Accuracy : 0.9694980694980695
F1 Score      : 0.9316017316017317
Precision     : 0.9835466179159049
Recall        : 0.8848684210526315

ExtraTrees
Test Accuracy : 0.9563706563706563
F1 Score      : 0.9021645021645022
Precision     : 0.9524680073126143
Recall        : 0.8569078947368421

DecisionTree
Test Accuracy : 0.9814671814671815
F1 Score      : 0.9605263157894737
Precision     : 0.9605263157894737
Recall        : 0.9605263157894737

XGBoost
Test Accuracy : 0.9922779922779923
F1 Score      : 0.9835255354200988
Precision     : 0.9851485148514851
Recall        : 0.9819078947368421


In [36]:
# Save each model to a pickle file
for name, model in models.items():
    with open(f"{name}_model.sav", "wb") as f:
        pickle.dump(model, f)


building the predictive system

In [28]:
input_data = (54,1,150,61,130,80,3,3,0,0,1)
input_data_np = np.asarray(input_data).reshape(1, -1)

# Store model probabilities
probabilities = {}

for name, model in models.items():
    prob = model.predict_proba(input_data_np)[0][1]
    probabilities[name] = prob
    print(f"{name} - Probability of Disease: {prob:.4f}")

# Decision logic
chosen_model_name = ""
prediction = None

# CASE 1: At least one model gives prob ≥ 0.5 → choose highest probability
high_probs = {k: v for k, v in probabilities.items() if v >= 0.5}
if high_probs:
    chosen_model_name = max(high_probs, key=high_probs.get)
else:
    # CASE 2: All models < 0.5 → choose one with the lowest probability
    chosen_model_name = min(probabilities, key=probabilities.get)

# Final prediction
chosen_model = models[chosen_model_name]
final_prediction = chosen_model.predict(input_data_np)[0]

# Output
print(f"\nSelected Model: {chosen_model_name}")
print("Prediction     :", "Has cardiovascular disease" if final_prediction == 1 else "Is healthy")


RandomForest - Probability of Disease: 0.2000
GradientBoosting - Probability of Disease: 0.1626
ExtraTrees - Probability of Disease: 0.5200
DecisionTree - Probability of Disease: 0.0000
XGBoost - Probability of Disease: 0.0024

Selected Model: ExtraTrees
Prediction     : Has cardiovascular disease




In [33]:
input_data = (62,1,165,70,170,100,1,1,0,0,0)  # e.g., (52, 2, 170, 78, 145, 90, 2, 1, 0, 0, 1)
input_data_np = np.asarray(input_data).reshape(1, -1)

# Store model probabilities
probabilities = {}

for name, model in models.items():
    prob = model.predict_proba(input_data_np)[0][1]
    probabilities[name] = prob

# Decision logic
high_probs = {k: v for k, v in probabilities.items() if v >= 0.5}
if high_probs:
    chosen_model_name = max(high_probs, key=high_probs.get)
else:
    chosen_model_name = min(probabilities, key=probabilities.get)

# Final prediction
chosen_model = models[chosen_model_name]
final_prediction = chosen_model.predict(input_data_np)[0]
final_probability = probabilities[chosen_model_name]

# Output
print(f"\nSelected Model     : {chosen_model_name}")
print(f"Probability         : {final_probability:.4f}")
print("Prediction          :", "Has cardiovascular disease" if final_prediction == 1 else "Is healthy")



Selected Model     : RandomForest
Probability         : 0.0000
Prediction          : Is healthy




In [39]:
import numpy as np
import pickle

# Load all models from pickle files
model_names = ["RandomForest", "GradientBoosting", "ExtraTrees", "DecisionTree", "XGBoost"]
models = {}

for name in model_names:
    with open(f"{name}_model.sav", "rb") as f:
        models[name] = pickle.load(f)

# Input data
input_data = (54,1,170,68,130,80,3,3,0,0,1)
input_data_np = np.asarray(input_data).reshape(1, -1)

# Store model probabilities
probabilities = {}

for name, model in models.items():
    prob = model.predict_proba(input_data_np)[0][1]
    probabilities[name] = prob

# Decision logic
high_probs = {k: v for k, v in probabilities.items() if v >= 0.5}
if high_probs:
    chosen_model_name = max(high_probs, key=high_probs.get)
else:
    chosen_model_name = min(probabilities, key=probabilities.get)

# Final prediction
chosen_model = models[chosen_model_name]
final_prediction = chosen_model.predict(input_data_np)[0]
final_probability = probabilities[chosen_model_name]

# Output
print(f"\nSelected Model     : {chosen_model_name}")
print(f"Probability         : {final_probability:.4f}")
print("Prediction          :", "Has cardiovascular disease" if final_prediction == 1 else "Is healthy")



Selected Model     : DecisionTree
Probability         : 0.0000
Prediction          : Is healthy


