### This problem reads a dataset and determines if a patient may develop cardiac issue or not based on different features. I used SelectKBest tool with f_classif filter to select necessary features and then used the data to train different classifier functions and checked the performance of the models.

In [5]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [39]:
df = pd.read_csv('cardio_train.csv', sep=';')
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [40]:
df.shape

(70000, 13)

In [173]:
df.isnull().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

### Seperate the features and the target

In [67]:
x = df.iloc[:,:-1] # dropped cardio column 

In [68]:
x.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,0,18393,2,168,62.0,110,80,1,1,0,0,1
1,1,20228,1,156,85.0,140,90,3,1,0,0,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0


### convert the age (given in days) feature in to yr (number of years rounded)

In [74]:
x['yr'] = (x.age/365).round().astype(int)

In [79]:
x = x.drop(['id','age'], axis=1) # dropping the unnecessary features

In [80]:
x.head()

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,yr
0,2,168,62.0,110,80,1,1,0,0,1,50
1,1,156,85.0,140,90,3,1,0,0,1,55
2,1,165,64.0,130,70,3,1,0,0,0,52
3,2,169,82.0,150,100,1,1,0,0,1,48
4,1,156,56.0,100,60,1,1,0,0,0,48


In [89]:
y = df.iloc[:,12] # only kept cardio column

In [86]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: cardio, dtype: int64

###  Feature selection process- select K number of best features using the SelectKbest tool having f_classif filtering method

In [120]:
from sklearn.feature_selection import SelectKBest # feature selection technique

In [121]:
from sklearn.feature_selection import f_classif # import score function

In [175]:
# f_classif assesses how well each feature can discriminate between the different classes in your target variable
fit_features = SelectKBest(score_func= f_classif) 
fit_features

In [176]:
fit_features.fit(x, y)

In [124]:
score_col = pd.DataFrame(fit_features.scores_, columns = ['score']) # scores each feature

In [106]:
score_col

Unnamed: 0,score
0,4.603641
1,8.197397
2,2388.777887
3,208.339524
4,303.629011
5,3599.361137
6,562.772977
7,16.790541
8,3.761355
9,89.091494


In [112]:
name_col = pd.DataFrame(x.columns, columns=['name']) # column header

In [113]:
name_col

Unnamed: 0,name
0,gender
1,height
2,weight
3,ap_hi
4,ap_lo
5,cholesterol
6,gluc
7,smoke
8,alco
9,active


In [126]:
# concatinate the two cols
new_tab = pd.concat([name_col, score_col], axis=1)

In [125]:
new_tab

Unnamed: 0,name,score
0,gender,4.603641
1,height,8.197397
2,weight,2388.777887
3,ap_hi,208.339524
4,ap_lo,303.629011
5,cholesterol,3599.361137
6,gluc,562.772977
7,smoke,16.790541
8,alco,3.761355
9,active,89.091494


In [127]:
new_tab.nlargest(8, 'score') # get top 8 features sorted by score

Unnamed: 0,name,score
10,yr,4193.661786
5,cholesterol,3599.361137
2,weight,2388.777887
6,gluc,562.772977
4,ap_lo,303.629011
3,ap_hi,208.339524
9,active,89.091494
7,smoke,16.790541


### Train the Random Forest function with processed x, y data

In [128]:
from sklearn.model_selection import train_test_split

In [129]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.30, random_state=42)

In [130]:
from sklearn.ensemble import RandomForestClassifier

In [131]:
rfc_model = RandomForestClassifier()

In [132]:
rfc_model.fit(xtrain, ytrain)

In [178]:
rfc_model.score(xtest, ytest) # score not so good

0.7065238095238096

### Train model with Decision Tree Classifier

In [134]:
from sklearn.tree import DecisionTreeClassifier

In [135]:
dtc_model = DecisionTreeClassifier()

In [136]:
dtc_model.fit(xtrain, ytrain)

In [137]:
dtc_model.score(xtest, ytest) # performance not good

0.6332380952380953

### Train model with Support Vector Classifier (from SVM)

In [138]:
from sklearn.svm import LinearSVC 

In [139]:
lsvc_model = LinearSVC() 

In [141]:
lsvc_model.fit(xtrain, ytrain)

In [142]:
lsvc_model.score(xtest, ytest) # performance not good

0.6531904761904762

In [143]:
from sklearn.svm import SVC

In [144]:
svc_model = SVC()

In [145]:
svc_model.fit(xtrain, ytrain)

In [148]:
svc_model.score(xtest, ytest)

0.7240952380952381

###  Feature selection process SelectKBest with Score_func = chi2 (not f_classif)

In [150]:
from sklearn.feature_selection import chi2 # import score function

In [155]:
fit_features = SelectKBest(score_func= chi2) # chi2 can't handle -ve data, so I scaled the x features with MinMaxScaler to remove -ve values

In [158]:
from sklearn.preprocessing import MinMaxScaler

In [159]:
mms_scaling = MinMaxScaler()

In [163]:
x_scaled = mms_scaling.fit_transform(x) # scaled the x features to remove -ve values so i can use SelectKBest(score_func=chi2)

In [164]:
 fit_features.fit(x_scaled, y)

In [172]:
# split scaled x and y data (not previously defined x data)
xtrain, xtest, ytrain, ytest = train_test_split(x_scaled, y, test_size = 0.30, random_state=42)

### After applying MinMaxScaler to scale the features, training the RandomForestClassifier function with scaled data

In [170]:
rfc_model.fit(xtrain, ytrain)

In [171]:
rfc_model.score(xtest, ytest) # performance is not that great

0.7065238095238096