In [75]:
import pandas as pd 
import os
from sklearn.ensemble import RandomForestClassifier
import os
import csv
import datetime as dt
import numpy as np
from sklearn.metrics import classification_report
from sklearn import metrics
import matplotlib.pyplot as plt
from matplotlib import style

# Features:
1.	Age | Objective Feature | age | int (days)
2.	Height | Objective Feature | height | int (cm) |
3.	Weight | Objective Feature | weight | float (kg) |
4.	Gender | Objective Feature | gender | categorical code |
5.	Systolic blood pressure | Examination Feature | ap_hi | int |
6.	Diastolic blood pressure | Examination Feature | ap_lo | int |
7.	Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
8.	Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
9.	Smoking | Subjective Feature | smoke | binary |
10.	Alcohol intake | Subjective Feature | alco | binary |
11.	Physical activity | Subjective Feature | active | binary |
12.	Presence or absence of cardiovascular disease | Target Variable | cardio | binary |

All of the dataset values were collected at the moment of medical examination.

The Problem Statement:
To build an application to classify the patients to be healthy or suffering from cardiovascular disease based on the given attributes.

In [76]:
# import data set 
cv_data=pd.read_csv("Resources/cardio_train.csv")

In [77]:
cv_data.head()

Unnamed: 0,id,age,gender,height,weight,systolic,diastolic,cholesterol,gluc,smoke,alcohol intake,active,cardiovascular presence
0,988,22469,1,155,69.0,130,80,2,2,0,0,1,0
1,989,14648,1,163,71.0,110,70,1,1,0,0,1,1
2,990,21901,1,165,70.0,120,80,1,1,0,0,1,0
3,991,14549,2,165,85.0,120,80,1,1,1,1,1,0
4,992,23393,1,155,62.0,120,80,1,1,0,0,1,0


In [10]:
# drop id 
cv_data=cv_data.drop("id", axis=1)

In [14]:
# add bmi
cv_data['bmi'] = round(cv_data.weight/cv_data.height * 100, 2)

In [23]:
# converting age from days to years
cv_data.age=round(cv_data['age']/365, 0)

In [24]:
cv_data.head()

Unnamed: 0,age,gender,height,weight,systolic,diastolic,cholesterol,gluc,smoke,alcohol intake,active,cardiovascular presence,bmi
0,62.0,1,155,69.0,130,80,2,2,0,0,1,0,44.52
1,40.0,1,163,71.0,110,70,1,1,0,0,1,1,43.56
2,60.0,1,165,70.0,120,80,1,1,0,0,1,0,42.42
3,40.0,2,165,85.0,120,80,1,1,1,1,1,0,51.52
4,64.0,1,155,62.0,120,80,1,1,0,0,1,0,40.0


In [25]:
## target=cv_data["cardiovascular presence"]
target_names=["Absence of Cardiovascular Disease","Presence of Cardiovascular Disease"]

In [57]:
data=cv_data.drop("cardiovascular presence", axis=1)
feature_names=data.columns
data.head()

Unnamed: 0,age,gender,height,weight,systolic,diastolic,cholesterol,gluc,smoke,alcohol intake,active,bmi
0,62.0,1,155,69.0,130,80,2,2,0,0,1,44.52
1,40.0,1,163,71.0,110,70,1,1,0,0,1,43.56
2,60.0,1,165,70.0,120,80,1,1,0,0,1,42.42
3,40.0,2,165,85.0,120,80,1,1,1,1,1,51.52
4,64.0,1,155,62.0,120,80,1,1,0,0,1,40.0


In [58]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [61]:
# create random forest classifier
rf=RandomForestClassifier(n_estimators=300)
rf=rf.fit(X_train, y_train)

In [47]:
print(f"Test Score: {rf.score(X_test, y_test)}")
print(f"Training Score: {rf.score(X_train,y_train)}")

Test Score: 0.6896571626457347
Training Score: 0.9092640692640692


In [62]:
features_data=sorted(zip(rf.feature_importances_,feature_names), reverse=True)

In [63]:
features_data

[(0.1915091573032224, 'bmi'),
 (0.18501133856607382, 'systolic'),
 (0.15056361444279617, 'age'),
 (0.13886567668082142, 'height'),
 (0.129673022027835, 'weight'),
 (0.09115974417005822, 'diastolic'),
 (0.04044113558868372, 'cholesterol'),
 (0.020249740787828416, 'gender'),
 (0.018704050142959457, 'gluc'),
 (0.015825655699125418, 'active'),
 (0.009995147942410545, 'smoke'),
 (0.008001716648185435, 'alcohol intake')]

In [64]:
predictions=rf.predict(X_test)

In [70]:
predictions

array([1, 0, 0, ..., 1, 0, 1], dtype=int64)

In [74]:
# classification report
print(metrics.confusion_matrix(y_test,predictions))

print(classification_report(y_test, predictions, target_names=target_names))

[[6186 2468]
 [2545 6127]]
                                    precision    recall  f1-score   support

 Absence of Cardiovascular Disease       0.71      0.71      0.71      8654
Presence of Cardiovascular Disease       0.71      0.71      0.71      8672

                          accuracy                           0.71     17326
                         macro avg       0.71      0.71      0.71     17326
                      weighted avg       0.71      0.71      0.71     17326

