# Fairness analyses

Kendra Wyant  
April 2, 2025

### Set Up Environment

In [None]:

# handle conflicts
options(conflicts.policy = "depends.ok")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")


ℹ SHA-1 hash of file is "77e91675366f10788c6bcb59fa1cfc9ee0c75281"

In [None]:

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(tidyposterior))
library(kableExtra, exclude = "group_rows")
library(Rcpp, exclude = "populate")
library(brms, exclude = c("ar", "mixture"))


Loading 'brms' package (version 2.22.0). Useful instructions
can be found by typing help('brms'). A more detailed introduction
to the package is available through vignette('brms_overview').

In [None]:

devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/format_path.R?raw=true")


ℹ SHA-1 hash of file is "d1f1c542783f2e9a6ff50a400f909ba175ac618e"

ℹ SHA-1 hash of file is "6e9288d22f09da9ec15a1d5c046a0b6736ecce8b"

In [None]:
# Project paths, resolved by format_path() (sourced above from lab_support).
path_processed <- format_path("risk/data_processed/lag")
path_models_lag <- format_path("risk/models/lag")


### Read in Model Performance Metrics

In [None]:
# Read the test-set auROCs by demographic subgroup for one lag and label
# each row with its fold and repeat.
#
# lag: lag value embedded in the file name (0, 24, 72, 168, or 336).
#
# Returns the csv contents with `outer_split_num` replaced by `fold_num`
# (1-5) and `repeat_num` (1-6).
#
# Notes:
# - fold_num/repeat_num are assigned purely by row position (6 repeats of
#   5 folds = 30 rows), so rows are sorted by outer_split_num first. The
#   original code only did this for the 336 lag; the other lags silently
#   relied on the row order of the csv.
#   NOTE(review): assumes sorted outer_split_num runs repeat-major,
#   fold-minor, as the original 336 pipeline did -- confirm.
# - Exact zeros in every column are replaced by 1e-7, presumably so the
#   logit transform applied later in calc_pp() stays finite.
# - mutate() errors if the file does not have exactly 30 rows.
read_auroc_dem <- function(lag) {
  file_name <- str_c("test_auroc_6_x_5_1day_", lag, "_v3_nested_dem.csv")

  read_csv(here::here(path_models_lag, file_name), col_types = cols()) |>
    arrange(outer_split_num) |>
    mutate(across(everything(), \(x) if_else(x == 0, .0000001, x))) |>
    mutate(fold_num = rep(1:5, 6),
           repeat_num = rep(as.double(1:6), each = 5)) |>
    select(-outer_split_num)
}

# One object per lag; calc_pp() looks these up by name ("auroc_dem_<lag>").
auroc_dem_0 <- read_auroc_dem(0)
auroc_dem_24 <- read_auroc_dem(24)
auroc_dem_72 <- read_auroc_dem(72)
auroc_dem_168 <- read_auroc_dem(168)
auroc_dem_336 <- read_auroc_dem(336)


### Get Median Posterior Probabilities and contrast analyses

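Function to fit the Bayesian model for one lag and demographic variable and return the posterior draws, credible intervals, and group contrast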

In [None]:
# Fit a Bayesian model comparing auROC between the two levels of one
# demographic variable at one lag, and summarize the posterior.
#
# lag:     lag value; the data are read from the global object named
#          "auroc_dem_<lag>" (e.g. auroc_dem_24).
# dem_var: one of "sex", "income", or "race". Any other value raises an
#          error ("<dem_var> not in data") before any model is fit.
#
# Returns a list with three tibbles:
#   pp          - posterior draws per group (model, posterior, lag)
#   ci          - median and 2.5% / 97.5% quantiles of the posterior per group
#   ci_contrast - contrast of the first vs second group: probability,
#                 median difference, lower/upper bounds, and lag
calc_pp <- function (lag, dem_var) {
  # The two groups compared for each demographic variable, in contrast order
  # (first minus second).
  groups <- switch(
    dem_var,
    sex = c("male", "female"),
    income = c("above poverty", "below poverty"),
    race = c("non-hispanic white", "not white"),
    stop(dem_var, " not in data")
  )

  data <- get(str_c("auroc_dem_", lag))

  # The race file stores the majority group as `white`; relabel it so the
  # model and contrast names read "non-hispanic white".
  if (dem_var == "race") {
    data <- data |>
      rename(`non-hispanic white` = white)
  }

  # perf_mod() on a data frame expects the resampling identifiers in columns
  # whose names start with "id"; the remaining columns are treated as models.
  data <- data |>
    select(id = repeat_num, id2 = fold_num, all_of(groups))

  # Random intercepts for repeat (id) nested within fold (id2); auROCs are
  # modeled on the logit scale.
  set.seed(101)
  pp <- data |>
    perf_mod(formula = statistic ~ model + (1 | id2/id),
             transform = tidyposterior::logit_trans,
             iter = 4000, chains = 4,
             adapt_delta = .999,
             family = gaussian)

  pp_tidy <- pp |>
    tidy(seed = 123) |>
    mutate(lag = lag)

  ci <- pp_tidy |>
    group_by(model) |>
    summarize(pp_median = quantile(posterior, probs = .5),
              pp_lower = quantile(posterior, probs = .025),
              pp_upper = quantile(posterior, probs = .975)) |>
    mutate(lag = lag) |>
    arrange(model)

  # Compute the contrast once and reuse it for both the summary and the
  # median. The original called contrast_models() twice, and each call
  # evaluates its own default seed.
  contrast_draws <- pp |>
    contrast_models(list(groups[1]), list(groups[2]))

  # Strip the first "." from the contrast label so it matches the label
  # produced by summary() (presumably "a vs. b" -> "a vs b") for the join.
  ci_median_contrast <- contrast_draws |>
    group_by(contrast) |>
    summarize(median = quantile(difference, .5)) |>
    mutate(contrast = str_remove(contrast, "\\."))

  ci_contrast <- contrast_draws |>
    summary(size = 0) |>
    mutate(lag = lag) |>
    left_join(ci_median_contrast, by = c("contrast")) |>
    select(contrast, probability, median, lower, upper, lag)

  list(pp = pp_tidy, ci = ci, ci_contrast = ci_contrast)
}


sex

In [None]:
# Run the sex comparison at every lag. Each list element is the
# list(pp, ci, ci_contrast) returned by calc_pp() for that lag.
sex <- map(
  c(0, 24, 72, 168, 336),
  \(lag_i) calc_pp(lag_i, "sex")
)



SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 0.000989 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 9.89 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.08 seconds (Warm-up)
Chain 1:                1.71 seconds (Sampling)
Chain 1:                2.79 

# Posterior samples of performance
# A tibble: 80,000 × 3
   model  posterior   lag
   <chr>      <dbl> <dbl>
 1 male       0.915     0
 2 female     0.887     0
 3 male       0.920     0
 4 female     0.871     0
 5 male       0.922     0
 6 female     0.886     0
 7 male       0.918     0
 8 female     0.880     0
 9 male       0.927     0
10 female     0.869     0
# ℹ 79,990 more rows

# A tibble: 10 × 5
   model  pp_median pp_lower pp_upper   lag
   <chr>      <dbl>    <dbl>    <dbl> <dbl>
 1 female     0.880    0.857    0.899     0
 2 male       0.922    0.906    0.935     0
 3 female     0.858    0.831    0.881    24
 4 male       0.900    0.880    0.917    24
 5 female     0.844    0.818    0.868    72
 6 male       0.893    0.875    0.910    72
 7 female     0.814    0.784    0.840   168
 8 male       0.898    0.880    0.914   168
 9 female     0.786    0.754    0.814   336
10 male       0.883    0.863    0.901   336

# A tibble: 5 × 6
  contrast       probability median  lower  upper   lag
  <chr>                <dbl>  <dbl>  <dbl>  <dbl> <dbl>
1 male vs female           1 0.0418 0.0265 0.0582     0
2 male vs female           1 0.0419 0.0249 0.0597    24
3 male vs female           1 0.0486 0.0320 0.0673    72
4 male vs female           1 0.0837 0.0640 0.105    168
5 male vs female           1 0.0976 0.0766 0.120    336

income

In [None]:
# Run the income comparison at every lag. Each list element is the
# list(pp, ci, ci_contrast) returned by calc_pp() for that lag.
income <- map(
  c(0, 24, 72, 168, 336),
  \(lag_i) calc_pp(lag_i, "income")
)



SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 2.9e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.29 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.356 seconds (Warm-up)
Chain 1:                1.163 seconds (Sampling)
Chain 1:                2.51

# Posterior samples of performance
# A tibble: 80,000 × 3
   model         posterior   lag
   <chr>             <dbl> <dbl>
 1 above poverty     0.921     0
 2 below poverty     0.875     0
 3 above poverty     0.882     0
 4 below poverty     0.870     0
 5 above poverty     0.910     0
 6 below poverty     0.876     0
 7 above poverty     0.875     0
 8 below poverty     0.854     0
 9 above poverty     0.897     0
10 below poverty     0.864     0
# ℹ 79,990 more rows

# A tibble: 10 × 5
   model         pp_median pp_lower pp_upper   lag
   <chr>             <dbl>    <dbl>    <dbl> <dbl>
 1 above poverty     0.904    0.875    0.925     0
 2 below poverty     0.877    0.843    0.904     0
 3 above poverty     0.884    0.857    0.908    24
 4 below poverty     0.847    0.811    0.877    24
 5 above poverty     0.873    0.841    0.898    72
 6 below poverty     0.833    0.794    0.865    72
 7 above poverty     0.859    0.830    0.884   168
 8 below poverty     0.828    0.795    0.857   168
 9 above poverty     0.844    0.812    0.872   336
10 below poverty     0.787    0.747    0.822   336

# A tibble: 5 × 6
  contrast                       probability median   lower  upper   lag
  <chr>                                <dbl>  <dbl>   <dbl>  <dbl> <dbl>
1 above poverty vs below poverty       0.964 0.0261 0.00210 0.0513     0
2 above poverty vs below poverty       0.991 0.0368 0.0115  0.0633    24
3 above poverty vs below poverty       0.987 0.0396 0.0108  0.0696    72
4 above poverty vs below poverty       0.966 0.0313 0.00344 0.0593   168
5 above poverty vs below poverty       0.997 0.0575 0.0253  0.0903   336

race

In [None]:
# Run the race comparison at every lag. Each list element is the
# list(pp, ci, ci_contrast) returned by calc_pp() for that lag.
race <- map(
  c(0, 24, 72, 168, 336),
  \(lag_i) calc_pp(lag_i, "race")
)



SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 2.5e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.25 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 0.935 seconds (Warm-up)
Chain 1:                0.879 seconds (Sampling)
Chain 1:                1.81

# Posterior samples of performance
# A tibble: 80,000 × 3
   model              posterior   lag
   <chr>                  <dbl> <dbl>
 1 non-hispanic white     0.879     0
 2 not white              0.582     0
 3 non-hispanic white     0.961     0
 4 not white              0.804     0
 5 non-hispanic white     0.877     0
 6 not white              0.622     0
 7 non-hispanic white     0.893     0
 8 not white              0.759     0
 9 non-hispanic white     0.838     0
10 not white              0.521     0
# ℹ 79,990 more rows

# A tibble: 10 × 5
   model              pp_median pp_lower pp_upper   lag
   <chr>                  <dbl>    <dbl>    <dbl> <dbl>
 1 non-hispanic white     0.908    0.765    0.968     0
 2 not white              0.688    0.406    0.875     0
 3 non-hispanic white     0.888    0.712    0.964    24
 4 not white              0.612    0.317    0.850    24
 5 non-hispanic white     0.878    0.816    0.919    72
 6 not white              0.726    0.620    0.813    72
 7 non-hispanic white     0.864    0.818    0.900   168
 8 not white              0.752    0.680    0.814   168
 9 non-hispanic white     0.843    0.797    0.882   336
10 not white              0.733    0.664    0.795   336

# A tibble: 5 × 6
  contrast                        probability median  lower upper   lag
  <chr>                                 <dbl>  <dbl>  <dbl> <dbl> <dbl>
1 non-hispanic white vs not white       0.990  0.211 0.0517 0.421     0
2 non-hispanic white vs not white       0.992  0.267 0.0727 0.485    24
3 non-hispanic white vs not white       1.00   0.150 0.0810 0.228    72
4 non-hispanic white vs not white       1.00   0.110 0.0597 0.166   168
5 non-hispanic white vs not white       1.00   0.109 0.0624 0.159   336

Bind the posterior, pp, and contrast tibbles across sex, income, and race, and save each to csv

In [None]:
# Stack the sex, income, and race results and write one csv per result type.
# NOTE(review): posteriors_*, pp_*, and contrast_* are not created in the
# code visible here; presumably they are built from the `sex`, `income`,
# and `race` lists in cells that are not shown -- confirm.

# Posterior draws
bind_rows(posteriors_sex, posteriors_income, posteriors_race) |>
  write_csv(here::here(path_models_lag, "posteriors_dem.csv"))

# Posterior medians and intervals
bind_rows(pp_sex, pp_income, pp_race) |>
  write_csv(here::here(path_models_lag, "pp_dem_all.csv"))

# Group contrasts (also kept in memory as pp_dem_contrast)
pp_dem_contrast <- bind_rows(contrast_sex, contrast_income, contrast_race)
write_csv(pp_dem_contrast,
          here::here(path_models_lag, "pp_dem_contrast_all.csv"))
