Load the modules

In [1]:
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split

from src import datareader, enums

Load in the data and split between training and testing

In [2]:
filename = '/Users/mhogan/Documents/heartfailurepred/heart.csv'
keys, features, labels = datareader.get_csv_features_labels(filename)
tt_splits = train_test_split(features, labels, test_size=0.33)
features_train, features_test, labels_train, labels_test = tt_splits

Normalize the data

In [3]:
normalized_features_train = preprocessing.normalize(features_train.swapaxes(0, 1)).swapaxes(0, 1)
normalized_features_test = preprocessing.normalize(features_test.swapaxes(0, 1)).swapaxes(0, 1)
normalized_features_test = normalized_features_test.reshape((len(normalized_features_test), len(keys)-1))

Define the AdaBoost model using RandomForestClassier

In [4]:
n_estimators_forest = 100
n_estimators_adaboost = 100
random_forest_classifier: RandomForestClassifier = RandomForestClassifier(n_estimators=n_estimators_forest,
                                                                          max_depth=3,
                                                                          bootstrap=True,
                                                                          verbose=False,
                                                                          n_jobs=2,
                                                                          )
bagging_classifier = BaggingClassifier(base_estimator=random_forest_classifier,
                                       n_estimators=n_estimators_adaboost)

bagging_classifier.fit(normalized_features_train, labels_train)

BaggingClassifier(base_estimator=RandomForestClassifier(max_depth=3, n_jobs=2,
                                                        verbose=False),
                  n_estimators=100)

Predict the results

In [5]:
probabilities = bagging_classifier.predict_proba(features_test)
predictions_class = bagging_classifier.predict(features_test)
print(probabilities)
print(predictions_class)

[[0.39160735 0.60839265]
 [0.66017271 0.33982729]
 [0.28820456 0.71179544]
 [0.65388791 0.34611209]
 [0.46478904 0.53521096]
 [0.66017271 0.33982729]
 [0.66017271 0.33982729]
 [0.78824402 0.21175598]
 [0.36982976 0.63017024]
 [0.35380869 0.64619131]
 [0.32223156 0.67776844]
 [0.29772584 0.70227416]
 [0.28686476 0.71313524]
 [0.29531976 0.70468024]
 [0.38830004 0.61169996]
 [0.3523465  0.6476535 ]
 [0.29772584 0.70227416]
 [0.25805863 0.74194137]
 [0.25492115 0.74507885]
 [0.34057078 0.65942922]
 [0.28820456 0.71179544]
 [0.29655972 0.70344028]
 [0.2997376  0.7002624 ]
 [0.32280042 0.67719958]
 [0.45971558 0.54028442]
 [0.66017271 0.33982729]
 [0.44866406 0.55133594]
 [0.65388791 0.34611209]
 [0.35723799 0.64276201]
 [0.45310492 0.54689508]
 [0.33783131 0.66216869]
 [0.30090275 0.69909725]
 [0.29772584 0.70227416]
 [0.64033974 0.35966026]
 [0.5130805  0.4869195 ]
 [0.53457481 0.46542519]
 [0.29655972 0.70344028]
 [0.28820456 0.71179544]
 [0.29772584 0.70227416]
 [0.27891662 0.72108338]


Calculate intermediate results

In [6]:
true_classes = enums.HeartDiseaseClassification()
true_positive = np.count_nonzero(np.logical_and(predictions_class == true_classes.HeartDisease,
                                                labels_test == true_classes.HeartDisease).astype(int))
false_positive = np.count_nonzero(np.logical_and(predictions_class == true_classes.HeartDisease,
                                                 labels_test == true_classes.Normal).astype(int))
false_negative = np.count_nonzero(np.logical_and(predictions_class == true_classes.Normal,
                                                 labels_test == true_classes.HeartDisease).astype(int))
recall = true_positive / (false_negative + true_positive)
precision = true_positive / (true_positive + false_positive)
true_negative = np.count_nonzero(np.logical_and(predictions_class == true_classes.Normal,
                                                labels_test == true_classes.Normal).astype(int))
positives = np.count_nonzero((labels_test == true_classes.HeartDisease).astype(int))
negatives = len(labels_test) - positives

Calculate metrics

In [7]:
sensitivity = float(true_positive) / positives
specificity = float(true_negative) / negatives
accuracy = (true_positive + true_negative) / (positives + negatives)
f1_score = 2. / (1./recall + 1./precision)
mcc = (true_positive * true_negative) - (false_positive * false_negative)
mcc /= np.sqrt((true_positive + false_positive) * (true_positive + false_negative) *
               (true_negative + false_positive) * (true_negative + false_negative))
print('accuracy = ', accuracy)
print('sensitivity = ', sensitivity)
print('specificity = ', specificity)
print('f1_score = ', f1_score)
print('Matthew\'s Corrleation Coefficient = ', mcc)


accuracy =  0.8283828382838284
sensitivity =  0.9239766081871345
specificity =  0.7045454545454546
f1_score =  0.8586956521739131
Matthew's Corrleation Coefficient =  0.6534607429902773
