In [1]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the semicolon-delimited cardio_train.csv file into a DataFrame.
data = pd.read_csv(
    "../datasets/heart-disease/cardio_train.csv",
    sep=";",
)
# Show the first rows as this cell's output.
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
# Summarise column dtypes, non-null counts and memory usage of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [4]:
# Per-column count of missing values (`isna` is the canonical name; `isnull` is its alias).
data.isna().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [5]:
# Summary statistics for every numeric column.
# NOTE(review): the recorded output shows implausible blood-pressure extremes
# (negative minima, maxima of 16020 / 11000 for ap_hi / ap_lo), and no visible
# later cell filters those rows — confirm whether outlier cleaning is intended.
data.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [6]:
# Class balance of the target column. Ending the cell with the bare expression
# lets the notebook render the Series itself instead of routing it through print().
data["cardio"].value_counts()

cardio
0    35021
1    34979
Name: count, dtype: int64


In [7]:
# Drop fully identical rows. Reassigning (rather than inplace=True) keeps the
# data lineage explicit and avoids mutating a frame displayed by earlier cells.
# NOTE(review): `id` is still a column at this point, so rows that differ only
# by id are never treated as duplicates — confirm that is the intent.
data = data.drop_duplicates()

In [8]:
# Map the dataset's abbreviated column names to descriptive ones. Columns that
# keep their name (age, gender, height, weight, cholesterol, active) are
# omitted: renaming a column to itself is a no-op.
rename_map = {
    "ap_hi": "systolic_pressure",
    "ap_lo": "diastolic_pressure",
    "gluc": "glucose",
    "smoke": "smoker",
    "alco": "alcohol",
    "cardio": "heart_disease",
}
# Restrict the mapping to columns that are actually present, so the cell can be
# re-run after the rename has already been applied.
existing_cols = {old: new for old, new in rename_map.items() if old in data.columns}
# Reassign instead of mutating in place so the cell's effect is explicit.
data = data.rename(columns=existing_cols)

In [9]:
data.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'systolic_pressure',
       'diastolic_pressure', 'cholesterol', 'glucose', 'smoker', 'alcohol',
       'active', 'heart_disease'],
      dtype='object')

In [10]:
# Convert `age` from days to years (365.25 is the average year length).
# The guard makes the cell safe to re-run: after conversion no value exceeds
# the threshold, so the column is never divided a second time.
DAYS_PER_YEAR = 365.25
MAX_PLAUSIBLE_AGE_YEARS = 150
if data["age"].max() > MAX_PLAUSIBLE_AGE_YEARS:
    data["age"] = data["age"] / DAYS_PER_YEAR


In [11]:
# Remove the `id` column (presumably a row identifier rather than a predictor)
# before modelling. errors="ignore" keeps the cell re-runnable once the column
# has already been dropped instead of raising KeyError.
data = data.drop(columns="id", errors="ignore")

In [12]:
# Target vector: the heart-disease label column.
y = data["heart_disease"]
# Feature matrix: every column except the label.
X = data.drop(columns=["heart_disease"])

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Configure the CatBoost binary classifier: optimise log-loss, report accuracy.
model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="Accuracy",
    iterations=500,
    learning_rate=0.1,
    depth=6,
    random_seed=42,  # fixed seed for reproducible training
    verbose=50,  # print a progress line every 50 iterations
)

In [15]:
# Train on the training split; progress lines follow the model's `verbose` setting.
model.fit(X=X_train, y=y_train)


0:	learn: 0.7242857	total: 55.8ms	remaining: 27.8s


50:	learn: 0.7369286	total: 388ms	remaining: 3.41s
100:	learn: 0.7393214	total: 708ms	remaining: 2.8s
150:	learn: 0.7416964	total: 1.04s	remaining: 2.4s
200:	learn: 0.7438393	total: 1.38s	remaining: 2.05s
250:	learn: 0.7451607	total: 1.7s	remaining: 1.68s
300:	learn: 0.7464643	total: 2.01s	remaining: 1.33s
350:	learn: 0.7487500	total: 2.33s	remaining: 989ms
400:	learn: 0.7506607	total: 2.68s	remaining: 661ms
450:	learn: 0.7513036	total: 3.01s	remaining: 327ms
499:	learn: 0.7530893	total: 3.34s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x744cb3b2f430>

In [16]:
# Predict labels for the held-out split.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 plus overall accuracy on the test set.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.72      0.78      0.75      6988
           1       0.76      0.70      0.73      7012

    accuracy                           0.74     14000
   macro avg       0.74      0.74      0.74     14000
weighted avg       0.74      0.74      0.74     14000



In [17]:
# Persist the trained model with pickle (`pickle` and `Path` come from the
# notebook's import cell).
# NOTE: unpickling executes arbitrary code — only load this artifact from a
# trusted location.
model_path = Path("../pkl files/heartDisease.pkl")
# Create the output directory if it is missing so the dump does not fail with
# FileNotFoundError on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open("wb") as model_file:
    pickle.dump(model, model_file)