In [1]:
# Model Selection
# ===============
# This notebook compares the scores of different model algorithms, 
# preprocessing methods, and classification methods.
#
# The two best performing models are gradient boosting and radius neighbors.
# Radius neighbors performs better on standard informedness, diagnostic odds
# ratio, and outlier informedness. However, gradient boosting performs better 
# on cross-validation. This indicates that while radius neighbors performs
# better on the limited amount of data reserved for validation, gradient
# boosting appears to benefit more from increased data, which will be
# advantageous when we combine the training and validation datasets.
# Therefore, we choose gradient boosting with robust scaling as our model.
#
# Copyright 2020 Jerrad M. Genson
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

library(tidyverse)

# Ask git for the repository root so the data path resolves regardless of the
# notebook's working directory.
GIT_ROOT <- system2(
  "git",
  args = c("rev-parse", "--show-toplevel"),
  stdout = TRUE
)
# With stdout = TRUE, a non-zero exit status is reported through a warning and
# the "status" attribute rather than an error, so check it explicitly before
# building any paths from the result.
if (length(GIT_ROOT) != 1L || !is.null(attr(GIT_ROOT, "status"))) {
  stop(
    "Could not determine the git repository root; ",
    "run this notebook from inside the repository.",
    call. = FALSE
  )
}

DATA <- file.path(GIT_ROOT, "data")
MODEL_DATA <- file.path(DATA, "model_selection.csv")
# Fail with a message that names the expected file instead of a reader error.
if (!file.exists(MODEL_DATA)) {
  stop("Model selection scores not found: ", MODEL_DATA, call. = FALSE)
}

# Load the recorded scores; the trailing expression displays the table.
scores <- read_csv(MODEL_DATA)
scores

── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


[36m──[39m [1m[1mColumn specificati

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
gbc,none,0.8172,0.7719,0.9167,0.7111,0.6278,27.08,0.6441,0.2667,0.6166,0.09018,d574e30
gbc,robust scaling,0.8387,0.8113,0.8958,0.7778,0.6736,30.1,0.6799,0.4333,0.6753,0.07386,26fd6ad
gbc,standard scaling,0.8387,0.8,0.9167,0.7556,0.6722,34.0,0.6834,0.2667,0.62,0.1186,7076219
gbc,pca,0.8065,0.7778,0.875,0.7333,0.6083,19.25,0.6161,0.4333,0.6119,0.06385,1e0e6d0
gbc,isomap,0.7634,0.7708,0.7708,0.7556,0.5264,10.4,0.5264,0.1,0.5917,0.1002,1a1bb28
gbc,locally linear embedding,0.7527,0.7358,0.8125,0.6889,0.5014,9.595,0.5061,-0.1,0.5755,0.09754,6596248
gbc,hessian lle,0.7312,0.717,0.7917,0.6667,0.4583,7.6,0.4626,0.06667,0.5755,0.1064,e46e66b
gbc,modified lle,0.8065,0.7885,0.8542,0.7556,0.6097,18.1,0.6137,0.06667,0.5528,0.1306,f905831
gbc,factor analysis,0.7634,0.75,0.8125,0.7111,0.5236,10.67,0.527,0.2381,0.645,0.1207,24f9b64
gbc,feature agglomeration,0.8387,0.8,0.9167,0.7556,0.6722,34.0,0.6834,0.4333,0.6417,0.1182,1848532


In [2]:
# Rows of `scores` whose informedness equals the column maximum
# (every tied row is kept).
best_informedness <- with(
  scores,
  scores[informedness == max(informedness), ]
)
best_informedness

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
rnc,robust scaling,0.8495,0.8269,0.8958,0.8,0.6958,34.4,0.7004,0.5,0.619,0.07159,195fd9e


In [3]:
# Rows of `scores` whose `dor` value equals the column maximum
# (every tied row is kept).
best_dor <- with(
  scores,
  scores[dor == max(dor), ]
)
best_dor

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
rnc,robust scaling,0.8495,0.8269,0.8958,0.8,0.6958,34.4,0.7004,0.5,0.619,0.07159,195fd9e


In [4]:
# Rows of `scores` whose sensitivity equals the column maximum
# (every tied row is kept).
best_sensitivity <- with(
  scores,
  scores[sensitivity == max(sensitivity), ]
)
best_sensitivity

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
gbc,none,0.8172,0.7719,0.9167,0.7111,0.6278,27.08,0.6441,0.2667,0.6166,0.09018,d574e30
gbc,standard scaling,0.8387,0.8,0.9167,0.7556,0.6722,34.0,0.6834,0.2667,0.62,0.1186,7076219
gbc,feature agglomeration,0.8387,0.8,0.9167,0.7556,0.6722,34.0,0.6834,0.4333,0.6417,0.1182,1848532


In [5]:
# Rows of `scores` whose `cv informedness` equals the column maximum
# (every tied row is kept). The column name contains a space, hence backticks.
best_cv_informedness <- with(
  scores,
  scores[`cv informedness` == max(`cv informedness`), ]
)
best_cv_informedness

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
gbc,robust scaling,0.8387,0.8113,0.8958,0.7778,0.6736,30.1,0.6799,0.4333,0.6753,0.07386,26fd6ad


In [6]:
# Rows of `scores` holding the second-highest distinct informedness value:
# sort descending, collapse duplicates, then take position 2.
second_best_informedness <- with(
  scores,
  scores[informedness == unique(sort(informedness, decreasing = TRUE))[2], ]
)
second_best_informedness

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
gbc,robust scaling,0.8387,0.8113,0.8958,0.7778,0.6736,30.1,0.6799,0.4333,0.6753,0.07386,26fd6ad


In [7]:
# Rows of `scores` holding the third-highest distinct informedness value:
# sort descending, collapse duplicates, then take position 3.
third_best_informedness <- with(
  scores,
  scores[informedness == unique(sort(informedness, decreasing = TRUE))[3], ]
)
third_best_informedness

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,mcc,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
gbc,standard scaling,0.8387,0.8,0.9167,0.7556,0.6722,34,0.6834,0.2667,0.62,0.1186,7076219
gbc,feature agglomeration,0.8387,0.8,0.9167,0.7556,0.6722,34,0.6834,0.4333,0.6417,0.1182,1848532
