In [1]:
# Model Selection
# ===============
# This notebook compares the scores of different model algorithms, 
# preprocessing methods, and classification methods.
#
# The two best performing models are gradient boosting and radius neighbors.
# Radius neighbors performs better on standard informedness, diagnostic odds
# ratio, and outlier informedness. However, gradient boosting performs better 
# on cross-validation. This indicates that, while radius neighbors performs
# better on the limited amount of data reserved for validation, gradient
# boosting appears to benefit more from increased data, which will be
# advantageous when we combine the training and validation datasets.
# Therefore, we choose gradient boosting with robust scaling as our model.
#
# Copyright 2020 Jerrad M. Genson
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

library(tidyverse)

# Resolve the repository root via git so the data path does not depend on the
# directory the notebook kernel was started in.
GIT_ROOT <- system2('git', args=c('rev-parse', '--show-toplevel'), stdout=TRUE)

# With stdout=TRUE, system2() does not raise when git exits non-zero; it
# returns whatever was captured and records the exit code in a "status"
# attribute. Fail here with a clear message instead of letting an empty or
# bogus path reach read_csv().
if (length(GIT_ROOT) != 1 || !is.null(attr(GIT_ROOT, 'status'))) {
    stop('Unable to determine the git repository root; run this notebook from inside the repository.')
}

DATA <- file.path(GIT_ROOT, 'data')
MODEL_DATA <- file.path(DATA, 'model_selection.csv')

# Load the table of model scores and display it as the cell output.
scores <- read_csv(MODEL_DATA)
scores

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  model = [31mcol_character()[39m,
  preprocessing = [31mcol_character()[39m,
  accuracy = [32mcol_double()[39m,
  precision = [32mcol_double()[39m,
  sensitivity = [32mcol_double()[39m,
  specificity = [32mcol_double()

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,ami,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
rfc,robust scaling,0.8197,0.8421,0.6667,0.9189,0.5856,22.67,,0.5,0.5143,0.2317,f8469be
rfc,standard scaling,0.8033,0.8,0.6667,0.8919,0.5586,16.5,0.2582,0.5,0.5143,0.2746,94c6579
rfc,pca,0.8033,0.7727,0.7083,0.8649,0.5732,15.54,0.2565,0.5,0.5071,0.2214,c34a598
rfc,isomap,0.7049,0.6071,0.7083,0.7027,0.411,5.74,0.1113,0.3,0.5119,0.2,cb0728e
rfc,lle,0.7377,0.6538,0.7083,0.7568,0.4651,7.556,0.1494,0.1,0.5786,0.2619,f708b98
rfc,feature agglomeration,0.7377,0.6538,0.7083,0.7568,0.4651,7.556,0.1494,0.3,0.5143,0.2,3d44e37
rfc,nca,0.6721,0.5588,0.7917,0.5946,0.3863,5.573,0.1004,0.3,0.5071,0.1667,e9bdd96
rfc,factor analysis,0.7705,0.7273,0.6667,0.8378,0.5045,10.33,0.1929,0.5,0.4476,0.2095,87cb944
etc,robust scaling,0.7541,0.6957,0.6667,0.8108,0.4775,8.571,,0.5,0.5,0.1619,9f5b916
dtc,robust scaling,0.6066,0.5,0.7083,0.5405,0.2489,2.857,,0.3,0.3333,0.1738,8b723aa


In [2]:
# Rows of `scores` whose informedness equals the maximum; ties are all kept.
# na.rm=TRUE keeps a missing score from turning the maximum into NA, and
# which() drops NA comparisons so they do not show up as all-NA rows.
best_informedness <- scores[
    which(scores$informedness == max(scores$informedness, na.rm=TRUE)),
]
best_informedness

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,ami,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
lrc,modified lle,0.8525,0.8947,0.7083,0.9459,0.6543,42.5,0.3875,0.5,0.4857,0.1762,a718db8


In [3]:
# Rows of `scores` whose dor equals the maximum; ties are all kept.
# na.rm=TRUE keeps a missing value from turning the maximum into NA, and
# which() drops NA comparisons so they do not show up as all-NA rows.
best_dor <- scores[
    which(scores$dor == max(scores$dor, na.rm=TRUE)),
]
best_dor

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,ami,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
lrc,modified lle,0.8525,0.8947,0.7083,0.9459,0.6543,42.5,0.3875,0.5,0.4857,0.1762,a718db8


In [4]:
# Rows of `scores` whose sensitivity equals the maximum; ties are all kept.
# na.rm=TRUE keeps a missing score from turning the maximum into NA, and
# which() drops NA comparisons so they do not show up as all-NA rows.
best_sensitivity <- scores[
    which(scores$sensitivity == max(scores$sensitivity, na.rm=TRUE)),
]
best_sensitivity

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,ami,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
rfc,nca,0.6721,0.5588,0.7917,0.5946,0.3863,5.573,0.1004,0.3,0.5071,0.1667,e9bdd96


In [5]:
# Rows of `scores` whose `cv informedness` equals the maximum; ties are all
# kept. The column name contains a space, hence the backticks.
# na.rm=TRUE keeps a missing score from turning the maximum into NA, and
# which() drops NA comparisons so they do not show up as all-NA rows.
best_cv_informedness <- scores[
    which(scores$`cv informedness` == max(scores$`cv informedness`, na.rm=TRUE)),
]
best_cv_informedness

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,ami,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
qda,robust scaling,0.7869,0.7895,0.625,0.8919,0.5169,13.75,,0.5,0.6571,0.1571,eb89d93
qda,feature agglomeration,0.7869,0.7895,0.625,0.8919,0.5169,13.75,0.2257,0.5,0.6571,0.1571,44d8eda


In [6]:
# Rows of `scores` whose informedness equals the second-highest distinct
# value. sort() drops NAs by default and unique() collapses ties, so index 2
# is the runner-up value. which() discards NA comparisons, so missing scores
# (or fewer than two distinct values, where the index yields NA) give no rows
# rather than all-NA rows.
second_best_informedness <- scores[
    which(scores$informedness == unique(sort(scores$informedness, decreasing=TRUE))[2]),
]
second_best_informedness

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,ami,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
lrc,robust scaling,0.8361,0.85,0.7083,0.9189,0.6273,27.52,,0.5,0.531,0.2167,87fb70c
lrc,isomap,0.8361,0.85,0.7083,0.9189,0.6273,27.52,0.3359,0.5,0.5143,0.2603,b49fe61
lrc,lle,0.8361,0.85,0.7083,0.9189,0.6273,27.52,0.3359,0.5,0.531,0.2464,1866d34
lrc,nca,0.8361,0.85,0.7083,0.9189,0.6273,27.52,0.3359,0.5,0.531,0.2917,3ad5091
qda,pca,0.8361,0.85,0.7083,0.9189,0.6273,27.52,0.3359,0.5,0.5982,0.2286,7519c68


In [7]:
# Rows of `scores` whose informedness equals the third-highest distinct
# value. sort() drops NAs by default and unique() collapses ties, so index 3
# is the third-ranked value. which() discards NA comparisons, so missing
# scores (or fewer than three distinct values, where the index yields NA)
# give no rows rather than all-NA rows.
third_best_informedness <- scores[
    which(scores$informedness == unique(sort(scores$informedness, decreasing=TRUE))[3]),
]
third_best_informedness

model,preprocessing,accuracy,precision,sensitivity,specificity,informedness,dor,ami,outlier informedness,cv informedness,mad informedness,commit hash
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
lrc,pca,0.8197,0.8095,0.7083,0.8919,0.6002,20.04,0.2931,0.5,0.5429,0.2083,1516719
