In [1]:
# Model Selection
# ===============
# This notebook compares the scores of different model algorithms, 
# preprocessing methods, and classification methods.
#
# The binary classification models generally performed very well,
# with QDA and random forest scoring the highest on informedness.
# PCA improved the classification performance of these models 
# significantly. The ternary and multiclass models were less 
# successfull. The disparity between classification methods is 
# wide enough that the ternary and multiclass models do not need
# to be considered further without additional data.
#
# Although they scored similarly on informedness, the binary QDA
# and random forest models scored very differently on sensitivity
# and specificity. While they both scored well on both measures,
# QDA scored higher on sensitivity while random forest scored
# higher on specificity. All other things being equal, we prefer a
# model with greater sensitivity over one with greater specificity
# because the risk of underdiagnosis is greater than the risk of
# overdiagnosis. If a patient with a healthy heart is diagnosed
# with ischemic heart disease, they might undergo additional tests
# unnecessarily, which could be costly and invasive (e.g. angiography).
# If a patient with heart disease is classified as healthy, they might 
# experience a myocardial infarction that could have been prevented if 
# they had been diagnosed correctly. Although misdiagnosis confers risk
# in both cases, the latter case is associated with significantly worse
# outcomes than the former.
#
# Therefore, we choose binary QDA with PCA as our model.
#
# Copyright 2020 Jerrad M. Genson
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

scores_list <- list(
    list('random forest', 'standard scaling', 0.8602, 0.9375, 0.8182, 0.9211, 0.7392, 52.5, 0.4328, 1.0, 0.6131, 0.1613))

scores <- data.frame(do.call(rbind.data.frame, scores_list))
colnames(scores) <- c('model', 'preprocessing', 'accuracy', 'precision', 'sensitivity', 'specificity', 'informedness', 'dor', 'ami', 'outlier informedness', 'cv informedness', 'mad informedness', 'commit hash')
scores

ERROR: Error in names(x) <- value: 'names' attribute [13] must be the same length as the vector [12]


In [None]:
scores_list <- list(
    list('quadratic discriminant analysis', 'standard scaling', TRUE, 0.6452, 0.4369, 0.3596, 0.1158),
    list('quadratic discriminant analysis', 'robust scaling', TRUE, 0.6452, 0.4369, 0.3596, 0.1158),
    list('quadratic discriminant analysis', 'pca', TRUE, 0.6452, 0.4369, 0.3596, 0.1158),
    list('random forest', 'robust scaling', TRUE, 0.6559, 0.49, 0.4894, 0.1757),
    list('random forest', 'pca', TRUE, 0.6559, 0.4886, 0.4886, 0.1739),
    list('support vector machine', 'standard scaling', TRUE, 0.6129, 0.4471, 0.5394, 0.1449),
    list('support vector machine', 'robust scaling', TRUE, 0.6237, 0.4331, 0.4778, 0.1239),
    list('support vector machine', 'pca', TRUE, 0.6022, 0.4311, 0.5304, 0.1264)
)

ternary_scores <- data.frame(do.call(rbind.data.frame, scores_list))
colnames(ternary_scores) <- c('model', 'preprocessing', 'includes fbs', 'accuracy', 'hmean_precision', 'hmean_recall', 'informedness')
ternary_scores

In [None]:
scores_list <- list(
    list('quadratic discriminant analysis', 'standard scaling', TRUE, 0.4086, 0.263, 0.3246, -0.288),
    list('quadratic discriminant analysis', 'robust scaling', TRUE, 0.4839, 0.0, 0.0, -0.372),
    list('quadratic discriminant analysis', 'pca', TRUE, 0.5054, 0.2998, 0.2163, -0.2579),
    list('random forest', 'standard scaling', TRUE, 0.4731, 0.3036, 0.2507, -0.1239),
    list('random forest', 'robust scaling', TRUE, 0.4624, 0.2606, 0.16, -0.1292),
    list('random forest', 'pca', TRUE, 0.4624, 0.2444, 0.3262, -0.2455),
    list('support vector machine', 'standard scaling', TRUE, 0.4194, 0.2172, 0.1503, -0.2212),
    list('support vector machine', 'robust scaling', TRUE, 0.4194, 0.2172, 0.1503, -0.2212),
    list('support vector machine', 'pca', TRUE, 0.4624, 0.213, 0.2161, -0.05439)
)

multiclass_scores <- data.frame(do.call(rbind.data.frame, scores_list))
colnames(multiclass_scores) <- c('model', 'preprocessing', 'includes fbs', 'accuracy', 'hmean_precision', 'hmean_recall', 'informedness')
multiclass_scores