# Model evaluation

Kendra Wyant  
October 1, 2025

### Set Up Environment

In [None]:
suppressPackageStartupMessages(library(tidyverse))










In [None]:
# Read one model's per-split test metrics from `path_models`, keeping only
# the split id and the ROC AUC. The AUC column is renamed to `model_label`
# so each model ends up with its own column once the tables are joined.
# Rows are sorted by split_num.
read_split_auc <- function(file_name, model_label) {
  read_csv(here::here(path_models, file_name), show_col_types = FALSE) |>
    select(split_num, all_of(stats::setNames("roc_auc", model_label))) |>
    arrange(split_num)
}

test_metrics_full <- read_split_auc(
  "best_config_v17_kfold_full.csv",
  "full model"
)

test_metrics_baseline <- read_split_auc(
  "best_config_v17_kfold_baseline.csv",
  "baseline model"
)

test_metrics_meta <- read_split_auc(
  "best_config_v17_kfold_meta.csv",
  "metadata model"
)

test_metrics_passive <- read_split_auc(
  "best_config_v17_kfold_passive.csv",
  "passive metadata model"
)

# One row per split with one AUC column per model. The full model's splits
# define the rows; the other models are left-joined on split_num.
# fold_num / repeat_num are assigned by row position (rows are ordered by
# split_num): folds cycle 1-5 within each of 6 consecutive repeats.
test_metrics_all <- list(
  test_metrics_full,
  test_metrics_baseline,
  test_metrics_meta,
  test_metrics_passive
) |>
  purrr::reduce(left_join, by = "split_num") |>
  mutate(
    fold_num = rep(1:5, times = 6),
    # built from doubles so the column type stays <dbl>
    repeat_num = rep(c(1, 2, 3, 4, 5, 6), each = 5)
  ) |>
  select(-split_num) |>
  glimpse()


Rows: 30
Columns: 6
$ `full model`             <dbl> 0.7275479, 0.7506222, 0.7513884, 0.6362167, 0…
$ `baseline model`         <dbl> 0.7522946, 0.7764934, 0.7510941, 0.6374951, 0…
$ `metadata model`         <dbl> 0.6329102, 0.5228854, 0.6602388, 0.6770259, 0…
$ `passive metadata model` <dbl> 0.4971991, 0.5433545, 0.5767612, 0.6286495, 0…
$ fold_num                 <int> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, …
$ repeat_num               <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, …

#### Model evaluation

In [None]:

# Bayesian hierarchical comparison of the per-split AUCs across models.
# The fold/repeat columns are renamed to the `id`/`id2` names used in the
# formula: here id = fold and id2 = repeat, so `id2/id` nests fold within
# repeat. Random-intercept (common variance) form:
#   statistic ~ model + (1 | id2/id)
set.seed(101)
pp <- test_metrics_all |>
  rename(id = fold_num,
         id2 = repeat_num) |>
  perf_mod(
    formula = statistic ~ model + (1 | id2/id),
    # logit transform for skewed & bounded AUC
    transform = tidyposterior::logit_trans,
    # iterations increased from 2000 to 4000 to fix divergence issues
    iter = 4000, chains = 4, adapt_delta = .99,
    family = gaussian
  )



SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 7.2e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.72 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.173 seconds (Warm-up)
Chain 1:                1.2 seconds (Sampling)
Chain 1:                2.373 

In [None]:
# Extract the posterior draws from the fitted model. The result is used
# below through its `model` and `posterior` columns. The seed fixes the
# draw for reproducibility.
pp_tidy <- tidy(pp, seed = 123)

# Probabilities for the lower bound, median, and upper bound of the
# central 95% posterior interval.
q <- c(.025, .5, .975)

# Per-model posterior median and interval bounds, with models ordered
# for reporting (full, baseline, metadata, passive metadata).
pp_perf_tibble <- pp_tidy |>
  group_by(model) |>
  summarize(
    pp_median = quantile(posterior, probs = q[2]),
    pp_lower = quantile(posterior, probs = q[1]),
    pp_upper = quantile(posterior, probs = q[3])
  ) |>
  mutate(
    model = factor(
      model,
      levels = c(
        "full model",
        "baseline model",
        "metadata model",
        "passive metadata model"
      )
    )
  ) |>
  arrange(model)

# Save the summary table and the raw posterior draws alongside the models.
write_csv(pp_perf_tibble, here::here(path_models, "pp_perf_tibble.csv"))

write_csv(pp_tidy, here::here(path_models, "posteriors.csv"))

pp_perf_tibble


# A tibble: 4 × 4
  model                  pp_median pp_lower pp_upper
  <fct>                      <dbl>    <dbl>    <dbl>
1 full model                 0.685    0.659    0.710
2 baseline model             0.687    0.661    0.711
3 metadata model             0.625    0.597    0.651
4 passive metadata model     0.573    0.544    0.601