In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

FileNotFoundError: [Errno 2] No such file or directory: '../input'

First we import the dataset.

In [2]:
dataset = pd.read_csv('../input/cardio_train.csv', sep = ';')

FileNotFoundError: File b'../input/cardio_train.csv' does not exist

Now checking the various features of the dataset.

In [None]:
dataset.info()

* RangeIndex: 70000 entries, 0 to 69999
* Data columns (total 13 columns):
* id             70000 non-null int64
* age            70000 non-null int64
* gender         70000 non-null int64
* height         70000 non-null int64
* weight         70000 non-null float64
* ap_hi          70000 non-null int64
* ap_lo          70000 non-null int64
* cholesterol    70000 non-null int64
* gluc           70000 non-null int64
* smoke          70000 non-null int64
* alco           70000 non-null int64
* active         70000 non-null int64
* cardio         70000 non-null int64
* dtypes: float64(1), int64(12)

In [None]:
dataset.describe()

In [None]:
dataset.head()

As you can see:
* The age is given by days.
* Gender is denoted by 1 and 2 (Need to find what each number stands for).
* Height is in centimeters, as integer values.
* Weight is in kilograms, as float values.
* Systolic (ap_hi) and Diastolic (ap_lo) blood pressure, as integer values.
* Cholesterol and Glucose levels indicated by zone , as integer values.
* Smoking, Alcholic intake and Physical Activity as Binary values.
* Presence or absence of cardiovascular disease as Binary values.

Now let's check the affect each parameter has on the CVD.

First starting with age, by converting days to years.

In [None]:
dataset['years'] = (dataset['age']/360).round().astype(int)
dataset.head()

Now plotting a graph to show the trend with CVDs and age

In [None]:
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 8 #To change figure size
sns.countplot(x = 'years', hue = 'cardio', data = dataset)

As visible, people over the age of 55 show higher chances of being diaganosed with CVDs.

Now we find whether 1 stands for male or female.
Assuming the male population is generally taller.

In [None]:
dataset.groupby('gender')['height'].mean()

As you can see, mean height of 2 is greater than 1.

Also, assuming that men generally weigh heavier than females,

In [None]:
dataset.groupby('gender')['weight'].mean()

Mean weight of 2 > 1, hence our assumption is true.
1 -> Female
2 -> Male

Now, to check for height and weight:
If you notice,
* Id's are irrelevant.
* The maximum height in the dataset is 250 cms.
* The maximum weight in the dataset is 200 kgs.
* The minimum height in the dataset is 55 cms.
* The minimum weight in the dataset is 10 kgs.
Now considering that the minimum age is 30 years and maximum age is 66 years,  it is highly likely that these cases are special and is better to treat them as outliers.
    

In [None]:
dataset.drop(['id'], axis = 1, inplace = True)
dataset.drop(dataset[(dataset['height'] > dataset['height'].quantile(0.975)) | (dataset['height'] < dataset['height'].quantile(0.025))].index,inplace=True)
dataset.drop(dataset[(dataset['weight'] > dataset['weight'].quantile(0.975)) | (dataset['weight'] < dataset['weight'].quantile(0.025))].index,inplace=True)

In [None]:
dataset.describe()

As you can now see, the minimum and maximum values for height and weight seems reasonable, after considering these values for over a range of 2.5%<= x <= 97.5%

Now coming to the blood pressures.

* We know that the diastolic blood pressure cannot exceed the systolic blood pressure.
* Blood pressure cannot be negative.

Using these constraints, we eliminate any outliers.

In [None]:
dataset.drop(dataset[(dataset['ap_hi'] > dataset['ap_hi'].quantile(0.975)) | (dataset['ap_hi'] < dataset['ap_hi'].quantile(0.025))].index,inplace=True)
dataset.drop(dataset[(dataset['ap_lo'] > dataset['ap_lo'].quantile(0.975)) | (dataset['ap_lo'] < dataset['ap_lo'].quantile(0.025))].index,inplace=True)

In [None]:
dataset.describe()

In [None]:
dataset.head()

Now, we'll separate our dataset into two part to avoid any major changes in the main dataset.

In [None]:
X = dataset.drop(['age', 'cardio'], axis = 1)
y = dataset.iloc[:, -2]

Now it's time to separate the features and the target values and then split the dataset into training and test set.

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.25, random_state = 0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

Now let us check to see if there's any missing data.

In [None]:
dataset.isnull().values.any()

Since, there's no missing data, we proceed with Feature Scaling.

In [None]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train) 
X_test = sc_X.transform(X_test)

Let us create a class and list that will store our classifier's name, accuracy and the number of false negatives it generated.

We are considering the false negatives to be of high concern because it is better to diagnose a person with no CVD as a patient with CVD rather than the contrary.

In [None]:
class Classifier:
    def __init__(self, name, acc, falneg):
        self.name = name
        self.acc = acc
        self.falneg = falneg
    def __str__(self):
        return (f"Name of classifier: {self.name}\tAccuracy: {self.acc}\tNo. of False Negatives: {self.falneg}")
        
clf_list = []

Now let's proceed with our models.

Starting off with SVM.
Radial Basis Kernel.

In [None]:
from sklearn.svm import SVC
svc_clf = SVC(kernel = 'rbf', gamma = 'scale', random_state = 0)
svc_clf.fit(X_train, y_train)
svc_pred = svc_clf.predict(X_test)
from sklearn.metrics import accuracy_score
acc_svc = accuracy_score(y_test,svc_pred)
print(f"Accuracy for this model {acc_svc*100}")

Accuracy for this model 72.6855546687949%

Let us see the confusion matrix for this kernel.

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, svc_pred)
print(cm)

[[6241 1497]
 [2610 4688]] 
 Is the confusion matrix.

As you can see, the number of false negatives is relatively quite high, which is a concern.

Let us add the classifier to the list.

In [None]:
clf_list.append(Classifier("SVC (rbf)", round(acc_svc*100, 4), cm[1][0]))
print(clf_list[0])

Now let us see for the polynomial kernel of SVC.

In [None]:
from sklearn.svm import SVC
svc_poly_clf = SVC(kernel = 'poly', degree = 3, gamma = 'scale', random_state = 0)
svc_poly_clf.fit(X_train, y_train)
svc_poly_pred = svc_poly_clf.predict(X_test)
from sklearn.metrics import accuracy_score
acc_poly_svc = accuracy_score(y_test, svc_poly_pred)
print(f"Accuracy for this model {acc_poly_svc*100}")

* Accuracy for this model 71.60814046288907% (degree = 3).
* Accuracy for this model 70.85661080074487% (degree = 4).
* Accuracy for this model 69.61957967544559% (degree = 5).

Therefore, as we increase the degree for the polynomial SVC, it's accuracy decreases.

Let us see the confusion matrix for this kernel.

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, svc_poly_pred)
print(cm)

[[6392 1346]
 [2924 4374]]
  Is the confusion matrix.
 
 As you can see, the number of false negatives is more than that of the rbf kernel, moreover the accuracy score of rbf is greater than that of the 3rd degree polynomial model.

Hence, let us move add the polynomial kernel to the list.

In [None]:
clf_list.append(Classifier("SVC (degree = 3)", round(acc_poly_svc*100, 4), cm[1][0]))
print(clf_list[1])

Now, let us try the Naïve Bayes Model.

In [None]:
from sklearn.naive_bayes import GaussianNB
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
nb_pred = nb_clf.predict(X_test)
from sklearn.metrics import accuracy_score
acc_nb = accuracy_score(y_test,nb_pred)
print(f"Accuracy for this model {acc_nb*100}")

Accuracy for this model 71.32881085395051%

Let us see the confusion matrix for this model.

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, nb_pred)
print(cm)

[[6261 1477]
[2834 4464]]
 
Is the confusion matrix.
 
Let us add this classifier to our list.

In [None]:
clf_list.append(Classifier("Naïve Bayes", round(acc_nb*100, 4), cm[1][0]))
print(clf_list[2])

Now,  let us try the Random Forest Classification Model.

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
from sklearn.metrics import accuracy_score
acc_rf = accuracy_score(y_test,rf_pred)
print(f"Accuracy for this model {acc_rf*100}")

Accuracy for this model 69.74594306996542%, which is not as good.

Let us see the confusion matrix for this model.

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, rf_pred)
print(cm)

[[5573 2165]
[2384 4914]]

Is the confusion matrix.
    
If you observe the false negatives, it is quite good comapred to the other models.

Let us add this classifier to our list.

In [None]:
clf_list.append(Classifier("Random Forest", round(acc_rf*100, 4), cm[1][0]))
print(clf_list[3])

Now, let us try the K-Nearest Neighbours Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
kn_clf = KNeighborsClassifier(n_neighbors = 150, metric = 'minkowski', p = 2)
kn_clf.fit(X_train, y_train)
kn_pred = kn_clf.predict(X_test)
from sklearn.metrics import accuracy_score
acc_kn = accuracy_score(y_test,kn_pred)
print(f"Accuracy for this model {acc_kn*100}")

Accuracy for this model 72.29316307528599

Let us see the confusion matrix for this model.

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, kn_pred)
print(cm)

[[6235 1503]
[2663 4635]]

Is the confusion matrix.
  
The number of false negatives for this model is somewhat average.
  
Let us add this classifier to our list.

In [None]:
clf_list.append(Classifier("K-NN", round(acc_kn*100, 4), cm[1][0]))
print(clf_list[4])

Now,  let us try the Logistic Regression model.

In [None]:
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(random_state = 0, solver = 'liblinear', multi_class = 'ovr')
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
from sklearn.metrics import accuracy_score
acc_lr = accuracy_score(y_test, lr_pred)
print(f"Accuracy for this model {acc_lr*100}")

Accuracy for this model 72.41952646980579%

Let us see the confusion matrix for this model.

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, lr_pred)
print(cm)

 [[6126 1612]
 [2535 4763]]
  
Is the confusion matrix.
  
The number of false negatives for this model is somewhat decent.
  
Let us add this classifier to our list.

In [None]:
clf_list.append(Classifier("Logistic Regression", round(acc_lr*100, 4), cm[1][0]))
print(clf_list[5])

Let us take a look at the scores of all our models.

In [None]:
for model in clf_list:
    print(model)