Load the modules

In [1]:
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from src import datareader, enums

Load in the data and split between training and testing

In [2]:
filename = '/Users/mhogan/Documents/heartfailurepred/heart.csv'
keys, features, labels = datareader.get_csv_features_labels(filename)
tt_splits = train_test_split(features, labels, test_size=0.33)
features_train, features_test, labels_train, labels_test = tt_splits

Normalize the data

In [3]:
normalized_features_train = preprocessing.normalize(features_train.swapaxes(0, 1)).swapaxes(0, 1)
normalized_features_test = preprocessing.normalize(features_test.swapaxes(0, 1)).swapaxes(0, 1)
normalized_features_test = normalized_features_test.reshape((len(normalized_features_test), len(keys)-1))

Define the AdaBoost model using RandomForestClassier

In [4]:
n_estimators_adaboost = 10000
decision_classifier: DecisionTreeClassifier = DecisionTreeClassifier(criterion='entropy',
                                                                     max_depth=3)

adaboost_classifier: AdaBoostClassifier = AdaBoostClassifier(base_estimator=decision_classifier,
                                                             n_estimators=n_estimators_adaboost,
                                                             learning_rate=0.9)

adaboost_classifier.fit(normalized_features_train, labels_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                         max_depth=3),
                   learning_rate=0.9, n_estimators=10000)

Predict the results

In [5]:
predictions_class = adaboost_classifier.predict(features_test)

Calculate intermediate results

In [6]:
true_classes = enums.HeartDiseaseClassification()
true_positive = np.count_nonzero(np.logical_and(predictions_class == true_classes.HeartDisease,
                                                labels_test == true_classes.HeartDisease).astype(int))
false_positive = np.count_nonzero(np.logical_and(predictions_class == true_classes.HeartDisease,
                                                 labels_test == true_classes.Normal).astype(int))
false_negative = np.count_nonzero(np.logical_and(predictions_class == true_classes.Normal,
                                                 labels_test == true_classes.HeartDisease).astype(int))
recall = true_positive / (false_negative + true_positive)
precision = true_positive / (true_positive + false_positive)
true_negative = np.count_nonzero(np.logical_and(predictions_class == true_classes.Normal,
                                                labels_test == true_classes.Normal).astype(int))
positives = np.count_nonzero((labels_test == true_classes.HeartDisease).astype(int))
negatives = len(labels_test) - positives

Calculate metrics

In [7]:
sensitivity = float(true_positive) / positives
specificity = float(true_negative) / negatives
accuracy = (true_positive + true_negative) / (positives + negatives)
f1_score = 2. / (1./recall + 1./precision)
mcc = (true_positive * true_negative) - (false_positive * false_negative)
mcc /= np.sqrt((true_positive + false_positive) * (true_positive + false_negative) *
               (true_negative + false_positive) * (true_negative + false_negative))
print('accuracy = ', accuracy)
print('sensitivity = ', sensitivity)
print('specificity = ', specificity)
print('f1_score = ', f1_score)
print('Matthew\'s Corrleation Coefficient = ', mcc)


accuracy =  0.7788778877887789
sensitivity =  0.9620253164556962
specificity =  0.5793103448275863
f1_score =  0.8194070080862533
Matthew's Corrleation Coefficient =  0.5917906864327347
