In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
np.set_printoptions(suppress=True)

from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling  import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import f1_score, precision_recall_curve, fbeta_score
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import LinearSVC, SVC

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
from catboost import CatBoostClassifier, Pool

from mlxtend.plotting import plot_decision_regions
%matplotlib inline
%precision 3



'%.3f'

In [2]:
df_cardio = pd.read_csv('DATA/cardiovascular_train.csv', sep=';')

In [3]:
df_cardio.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# can only use features that I can match with the stroke data

df_cardio = df_cardio.drop(['id', 'alco', 'active', 'cholesterol'], axis = 1)

In [5]:
df_cardio.cardio.value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

In [6]:
def glucose_change(cell):
    if cell == 1:
        return 'normal'
    if cell == 2:
        return 'above_average'
    else:
        return 'high'

#### ap_hi = Systolic blood pressure, ap_lo = Diastolic blood pressure


In [7]:
# transforming all features to match the format of data from stroke dataset

df_cardio['bmi'] = df_cardio.weight // (df_cardio.height/100)**2
df_cardio.gluc = df_cardio.gluc.apply(lambda x: glucose_change(x))
df_cardio.gender = df_cardio.gender.apply(lambda x: 'Male' if x == 1 else 'Female')
df_cardio['sys_indicator'] = df_cardio.ap_hi.apply(lambda x: 1 if x >= 140 else 0)
df_cardio['dia_indicator'] = df_cardio.ap_lo.apply(lambda x: 1 if x >= 90 else 0)
df_cardio['hypertension'] = df_cardio.sys_indicator + df_cardio.dia_indicator
df_cardio.hypertension = df_cardio.hypertension.apply(lambda x: 1 if x > 0 else 0)
df_cardio.age = df_cardio.age // 365

In [8]:
df_cardio.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,gluc,smoke,cardio,bmi,sys_indicator,dia_indicator,hypertension
0,50,Female,168,62.0,110,80,normal,0,0,21.0,0,0,0
1,55,Male,156,85.0,140,90,normal,0,1,34.0,1,1,1
2,51,Male,165,64.0,130,70,normal,0,1,23.0,0,0,0
3,48,Female,169,82.0,150,100,normal,0,1,28.0,1,1,1
4,47,Male,156,56.0,100,60,normal,0,0,23.0,0,0,0


In [9]:
df_cardio = df_cardio.drop(['height', 'weight', 'ap_hi', 'ap_lo', 'sys_indicator', 'dia_indicator'], axis = 1)

In [10]:
df_cardio.head()

Unnamed: 0,age,gender,gluc,smoke,cardio,bmi,hypertension
0,50,Female,normal,0,0,21.0,0
1,55,Male,normal,0,1,34.0,1
2,51,Male,normal,0,1,23.0,0
3,48,Female,normal,0,1,28.0,1
4,47,Male,normal,0,0,23.0,0


In [11]:
df_cardio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 7 columns):
age             70000 non-null int64
gender          70000 non-null object
gluc            70000 non-null object
smoke           70000 non-null int64
cardio          70000 non-null int64
bmi             70000 non-null float64
hypertension    70000 non-null int64
dtypes: float64(1), int64(4), object(2)
memory usage: 3.7+ MB


In [12]:
df_cardio.describe()

Unnamed: 0,age,smoke,cardio,bmi,hypertension
count,70000.0,70000.0,70000.0,70000.0,70000.0
mean,52.840671,0.088129,0.4997,27.064586,0.352929
std,6.766774,0.283484,0.500003,6.101424,0.477884
min,29.0,0.0,0.0,3.0,0.0
25%,48.0,0.0,0.0,23.0,0.0
50%,53.0,0.0,0.0,26.0,0.0
75%,58.0,0.0,1.0,30.0,1.0
max,64.0,1.0,1.0,298.0,1.0


## Dummy the gender and glucose

In [13]:
df_cardio = pd.get_dummies(df_cardio)
df_cardio.head()

Unnamed: 0,age,smoke,cardio,bmi,hypertension,gender_Female,gender_Male,gluc_above_average,gluc_high,gluc_normal
0,50,0,0,21.0,0,1,0,0,0,1
1,55,0,1,34.0,1,0,1,0,0,1
2,51,0,1,23.0,0,0,1,0,0,1
3,48,0,1,28.0,1,1,0,0,0,1
4,47,0,0,23.0,0,0,1,0,0,1


## Modeling and Prediction

In [16]:
def print_classification_metrics(model, X_train, y_train, X_test, y_test):
    predictions = model.predict(X_test)
    
    print('Precision: ', precision_score(y_test, predictions))
    print('Recall: ', recall_score(y_test, predictions))
    print('F1 Score: ', f1_score(y_test, predictions))
    print('Accuracy: ', accuracy_score(y_test, predictions))
    print('ROC: ', roc_auc_score(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

In [17]:
X = df_cardio.drop('cardio', axis = 1)
y = df_cardio.cardio

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=30)

In [19]:
lr_model = LogisticRegression(solver='lbfgs', C=0.01, max_iter=300)
lr_model.fit(X_train, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
print_classification_metrics(lr_model, X_train, y_train, X_test, y_test)

Precision:  0.7592787114845938
Recall:  0.6211687195645946
F1 Score:  0.683314951945801
Accuracy:  0.7128571428571429
ROC:  0.7126219773371563
[[5643 1375]
 [2645 4337]]


## Grid Search

In [24]:
#Randomized Grid Search

param_dist = dict(C=np.linspace(0.01,1,100))
for _ in range(5):
    rand = RandomizedSearchCV(LogisticRegression(max_iter = 300, solver='lbfgs'), param_dist, cv=5, scoring='accuracy', n_iter=20)
    rand.fit(X_train, y_train)
    print(rand.best_params_, rand.best_score_)

{'C': 0.02} 0.7121607142857143
{'C': 0.15000000000000002} 0.7116785714285714
{'C': 0.08} 0.7117321428571428
{'C': 0.01} 0.712875
{'C': 0.04} 0.7119285714285715


## Other Models

In [21]:
#kNN
model_knn = KNeighborsClassifier(n_neighbors=3)
model_knn.fit(X_train, y_train)
knn_pred = model_knn.predict(X_test)

#Bernoulli Bayes
nb_bern = BernoulliNB()
nb_bern.fit(X_train, y_train)
bern_pred = nb_bern.predict(X_test)

#Gaussian Bayes
nb_gauss = GaussianNB()
nb_gauss.fit(X_train, y_train)
gauss_pred = nb_gauss.predict(X_test)

#Tree Model
model_tree = DecisionTreeClassifier(class_weight='balanced')
model_tree.fit(X_train, y_train)
tree_pred = model_tree.predict(X_test)

#Random Forest
model_forest = RandomForestClassifier(class_weight='balanced', n_estimators=100)
model_forest.fit(X_train, y_train)
forest_pred = model_forest.predict(X_test)



### Don't consider SVMs because training and grid searching will take too long

In [22]:
print(accuracy_score(y_test,knn_pred))
print(accuracy_score(y_test,bern_pred))
print(accuracy_score(y_test,gauss_pred))
print(accuracy_score(y_test,tree_pred))
print(accuracy_score(y_test,forest_pred))

0.6473571428571429
0.6920714285714286
0.6902142857142857
0.6911428571428572
0.6942857142857143


#### Random forest is performing well, so let's grid search and see if we can increase the score

In [30]:
#parameters

forest_grid_params = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
 'max_features': ['log2', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [67]:
grid_forest = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=forest_grid_params, n_iter=100, cv=5, verbose=2, random_state=30, n_jobs = -1, scoring = 'accuracy')
grid_forest.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 45.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 65.5min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'bootstrap': [True, False], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
          pre_dispatch='2*n_jobs', random_state=30, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=2)

In [68]:
grid_forest.best_params_

{'n_estimators': 2000,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

## Fit optimized tree

In [44]:
model_forest = RandomForestClassifier(n_estimators=2000, min_samples_split=10, min_samples_leaf=2, max_features='auto', max_depth=10, bootstrap=True)
model_forest.fit(X_train, y_train)
forest_pred = model_forest.predict(X_test)

In [28]:
print(accuracy_score(y_test,forest_pred))

0.7155


### Giving a test case

In [31]:
X_test.head()

Unnamed: 0,age,smoke,bmi,hypertension,gender_Female,gender_Male,gluc_above_average,gluc_high,gluc_normal
44332,50,0,37.0,1,1,0,1,0,0
3703,51,0,24.0,0,0,1,0,0,1
68011,64,0,33.0,0,0,1,0,0,1
13961,55,0,28.0,0,0,1,0,0,1
41074,59,0,38.0,0,0,1,0,0,1


In [96]:
test_case = np.array([70, 0, 25, 0, 0, 1, 0, 0, 1])

In [97]:
test_case

array([70,  0, 25,  0,  0,  1,  0,  0,  1])

In [98]:
model_forest.predict_proba(test_case.reshape(1, 9))

array([[0.391, 0.609]])

### Trying XGBoost with CatBoost

In [40]:
# initialize data
train_data = X_train

train_labels = y_train

test_data = Pool(train_data, train_labels)

model =  CatBoostClassifier(iterations=500,
                            depth=3,
                            loss_function='Logloss',
                            verbose=False,
                            random_seed=30)
# train the model
model.fit(train_data, train_labels)
# make the prediction using the resulting model
preds_class = model.predict(test_data)
preds_proba = model.predict_proba(test_data)
print("class = ", preds_class)
print("proba = ", preds_proba)

class =  [0. 0. 0. ... 0. 0. 1.]
proba =  [[0.694 0.306]
 [0.679 0.321]
 [0.735 0.265]
 ...
 [0.721 0.279]
 [0.719 0.281]
 [0.166 0.834]]


In [41]:
accuracy_score(y_test, model.predict(X_test))

0.7145

#### I would explore hypertuning the CatBoost model given more time. Would likely give a model that performs better than random forest