In [1]:
# Model Selection
# ===============
# This notebook compares the scores of different model algorithms, 
# preprocessing methods, and classification methods.
#
# We can see from the experiment data that naive bayes classifier with 
# informedness=0.7574 and DOR=59.63 and random forest with informedness=0.723
# and dor=38.77 were the best-performing models. Although naive bayes scored
# higher on informedness and dor, random forest did a bit better on sensitivity
# (0.8545 vs. 0.8364). Although the difference isn't very great, we prefer a
# model with higher sensitivity as the risk of mortality from myocardial infarction
# is much higher than the risk of mortality from angiography[1][2]. In addition, 
# random forest is a more reliable method for estimating the probablity associated 
# with a classification than naive bayes. Therefore, we choose random forest as 
# our model.
#
# References:
# [1] https://www.ahajournals.org/doi/10.1161/CIR.0b013e3182742cf6
# [2] https://www.nhs.uk/conditions/angiography/risks/
#
# Copyright 2020 Jerrad M. Genson
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

library(tidyverse)

GIT_ROOT <- system2('git', args=c('rev-parse', '--show-toplevel'), stdout=TRUE)
DATA <- file.path(GIT_ROOT, 'data')
MODEL_DATA <- file.path(DATA, 'model_selection.csv')

scores <- read_csv(MODEL_DATA)
scores

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  model = [31mcol_character()[39m,
  preprocessing = [31mcol_character()[39m,
  accuracy = [32mcol_double()[39m,
  precision = [32mcol_double()[39m,
  sensitivity = [32mcol_double()[39m,
  specificity = [32mcol_double()

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
rfc,isomap,0.7527,0.7308,0.8085,0.6957,0.5042,9.651,0.5077,0.6667,0.3528,0.08761,addf0d1


In [2]:
best_informedness <- scores[scores$informedness == max(scores$informedness), ]
best_informedness

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
rfc,isomap,0.7527,0.7308,0.8085,0.6957,0.5042,9.651,0.5077,0.6667,0.3528,0.08761,addf0d1


In [3]:
best_dor <- scores[scores$dor == max(scores$dor), ]
best_dor

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
rfc,isomap,0.7527,0.7308,0.8085,0.6957,0.5042,9.651,0.5077,0.6667,0.3528,0.08761,addf0d1


In [4]:
best_sensitivity <- scores[scores$sensitivity == max(scores$sensitivity), ]
best_sensitivity

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
rfc,isomap,0.7527,0.7308,0.8085,0.6957,0.5042,9.651,0.5077,0.6667,0.3528,0.08761,addf0d1


In [5]:
best_cv_informedness <- scores[scores$`cv informedness` == max(scores$`cv informedness`), ]
best_cv_informedness

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
rfc,isomap,0.7527,0.7308,0.8085,0.6957,0.5042,9.651,0.5077,0.6667,0.3528,0.08761,addf0d1


In [6]:
second_best_informedness <- scores[scores$informedness == unique(sort(scores$informedness, TRUE))[2], ]
second_best_informedness

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
,,,,,,,,,,,,


In [7]:
third_best_informedness <- scores[scores$informedness == unique(sort(scores$informedness, TRUE))[3], ]
third_best_informedness

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
,,,,,,,,,,,,
