In [66]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [67]:
df = pd.read_csv('heart.csv')

In [68]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [69]:
df.shape

(303, 14)

In [70]:
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [72]:
print(X_train.shape)
print(X_test.shape)

(242, 13)
(61, 13)


In [73]:
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
svc = SVC()
lr = LogisticRegression()

In [74]:
# random forest
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.819672131147541

In [75]:
# gradient boosting
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
accuracy_score(y_test, y_pred)

0.7704918032786885

In [76]:
# svc
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)

0.7049180327868853

In [77]:
# logistic regression
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)

0.8852459016393442

In [78]:
# validate models
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(LogisticRegression(), X, y, cv = 10, scoring = 'accuracy'))

0.8316129032258065

# 1. GridSearchCV

In [79]:
n_estimators = [20, 60, 100, 120] # number of trees in random forest
max_features = [0.2, 0.6, 1.0] # maximum number of features to consider at every split
max_depth = [2,4,6,8] # maximum number of levels in a tree
max_samples = [0.5, .75, 1]

In [80]:
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'max_samples': max_samples
}

In [81]:
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 4, 6, 8], 'max_samples': [0.5, 0.75, 1]}


In [82]:
rf = RandomForestClassifier()

In [83]:
rf_grid = GridSearchCV(
    estimator = rf,
    param_grid = param_grid,
    cv = 5,
    verbose = 2,
    n_jobs = -1
)

In [84]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [85]:
rf_grid.best_params_

{'max_depth': 2, 'max_features': 0.2, 'max_samples': 0.75, 'n_estimators': 20}

In [86]:
rf_grid.best_score_

0.8432823129251702

# 2. RandomizedSearchCV

In [87]:
n_estimators = [20, 40, 60, 80, 100, 120] # number of trees in random forest
criterion = ['gini', 'entropy', 'log_loss'] # function to measure quality of split
max_depth = [2, 4, 6, 8, 10] # maximum depth of each tree
min_samples_split = [2, 4, 6, 8] # minimum number of samples required to split internal node
min_samples_leaf = [1,2,3,4,5] # minimum number of samples required to be at leaf node
max_features = [0.2, 0.4, 0.5, 0.6, 0.8, 1.0] # number of features to consider when looking for best split
bootstrap = [True, False], # whether to use bootstrap samples when using trees
oob_score = [True, False], # whether to use out of bags samples to estimate generalization accuracy
warm_start = [True, False], # reuse previous solution to add more estimators
max_samples = [0.25, 0.5, 0.75, 1.0] # sets the number of samples drawn from training set for each tree


In [102]:
params = {
    'n_estimators': n_estimators,
    'criterion': criterion,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'max_features': max_features,
    'bootstrap': [True, False],
    'warm_start': [True, False],
    'max_samples': max_samples,
}

In [103]:
rf = RandomForestClassifier(n_jobs=-1, random_state=23, verbose=23, oob_score=True)

In [104]:
rf_random = RandomizedSearchCV(
    estimator = rf,
    param_distributions = params,
    cv = 5,
    verbose = 1,
    n_jobs = -1,
    random_state=33
)

In [105]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
building tree 1 of 20building tree 2 of 20
building tree 3 of 20

building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  13 out of  20

In [106]:
rf_random.best_params_

{'warm_start': True,
 'n_estimators': 20,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_samples': 0.5,
 'max_features': 0.2,
 'max_depth': 4,
 'criterion': 'entropy',
 'bootstrap': True}