# Fairness analyses

Kendra Wyant  
January 27, 2025

### Set Up Environment

In [None]:

# handle conflicts
options(conflicts.policy = "depends.ok")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")


ℹ SHA-1 hash of file is "77e91675366f10788c6bcb59fa1cfc9ee0c75281"

In [None]:

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(tidyposterior))
library(kableExtra, exclude = "group_rows")
library(Rcpp, exclude = "populate")
library(brms, exclude = c("ar", "mixture"))


Loading 'brms' package (version 2.22.0). Useful instructions
can be found by typing help('brms'). A more detailed introduction
to the package is available through vignette('brms_overview').

In [None]:

devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/format_path.R?raw=true")


ℹ SHA-1 hash of file is "a58e57da996d1b70bb9a5b58241325d6fd78890f"

ℹ SHA-1 hash of file is "6e9288d22f09da9ec15a1d5c046a0b6736ecce8b"

In [None]:
# Directories for processed data and fitted-model output; format_path() turns
# each relative project path into the path used on this machine.
path_processed <- format_path("studydata/risk/data_processed/lag")
path_models_lag <- format_path("studydata/risk/models/lag")


### Read in Model Performance Metrics

In [None]:
# Read the held-out AUROC file for one lag and label every row with its fold
# and repeat number.
#
# Arguments:
#   lag  lag value; it is inserted into the file name
#        "test_auroc_6_x_5_1day_<lag>_v3_nested_dem.csv" under path_models_lag.
#
# Returns the file contents as a tibble with `outer_split_num` dropped and
# `fold_num` (1-5) and `repeat_num` (1-6) added.
read_auroc_dem <- function(lag) {
  n_folds <- 5L
  n_repeats <- 6L

  file_name <- str_c("test_auroc_6_x_5_1day_", lag, "_v3_nested_dem.csv")

  auroc <- read_csv(here::here(path_models_lag, file_name),
                    col_types = cols())

  # fold_num/repeat_num are assigned purely by row position below, so the
  # file must hold exactly one row per fold x repeat
  stopifnot(nrow(auroc) == n_folds * n_repeats)

  auroc |>
    # always sort by split number first so the positional labels cannot be
    # attached to the wrong rows when a file is stored out of order
    arrange(outer_split_num) |>
    # nudge exact zeros to a tiny positive value (presumably so the logit
    # transform applied in calc_pp() stays finite -- confirm)
    mutate(across(everything(), ~if_else(.x == 0, .0000001, .x))) |>
    mutate(fold_num = rep(seq_len(n_folds), times = n_repeats),
           repeat_num = rep(seq_len(n_repeats), each = n_folds)) |>
    select(-outer_split_num)
}

# One tibble per lag; calc_pp() looks these up by name ("auroc_dem_<lag>"),
# so the object names must stay as they are.
auroc_dem_0 <- read_auroc_dem(0)
auroc_dem_24 <- read_auroc_dem(24)
auroc_dem_72 <- read_auroc_dem(72)
auroc_dem_168 <- read_auroc_dem(168)
auroc_dem_336 <- read_auroc_dem(336)


### Get Median Posterior Probabilities and Contrast Analyses

Function to compute posterior probabilities and the group contrast for one lag and one demographic variable

In [None]:
# Fit a Bayesian model (tidyposterior::perf_mod) to the per-fold AUROCs of the
# two groups of one demographic variable at one lag, then summarise each
# group's posterior and the contrast between the groups.
#
# Arguments:
#   lag      lag value; used to look up the tibble named "auroc_dem_<lag>"
#            (expected to exist in the calling environment chain) and to label
#            the output rows.
#   dem_var  one of "sex", "income", or "race".
#
# Returns a list with two tibbles:
#   ci           one row per group: pp_median, pp_lower (2.5%), pp_upper
#                (97.5%), lag
#   ci_contrast  one row for the first group vs the second: contrast,
#                probability, median, lower, upper, lag
#
# Stops with "<dem_var> not in data" for any other value of dem_var.
calc_pp <- function(lag, dem_var) {
  # Output label -> column name in the AUROC tibble. The first entry is the
  # group that is contrasted against the second.
  group_cols <- switch(
    dem_var,
    sex = c("male" = "male", "female" = "female"),
    income = c("above poverty" = "above poverty",
               "below poverty" = "below poverty"),
    race = c("non-hispanic white" = "white", "not white" = "not white"),
    stop(dem_var, " not in data")
  )
  group_names <- names(group_cols)

  # perf_mod() treats the `id*` columns as resampling identifiers and every
  # other column as a model to compare; all_of() with a named vector selects
  # and renames the group columns in one step.
  auroc <- get(str_c("auroc_dem_", lag)) |>
    select(id = repeat_num, id2 = fold_num, all_of(group_cols))

  set.seed(101)
  pp <- auroc |>
    perf_mod(formula = statistic ~ model + (1 | id2/id),
             transform = tidyposterior::logit_trans,
             iter = 4000, chains = 4,
             adapt_delta = .99,
             family = gaussian)

  pp_tidy <- pp |>
    tidy(seed = 123)

  # median and 95% interval of each group's posterior
  q <- c(.025, .5, .975)
  ci <- pp_tidy |>
    group_by(model) |>
    summarize(pp_median = quantile(posterior, probs = q[2]),
              pp_lower = quantile(posterior, probs = q[1]),
              pp_upper = quantile(posterior, probs = q[3])) |>
    mutate(lag = lag) |>
    arrange(model)

  # Draw the contrast once and reuse it, so the interval/probability and the
  # median below are computed from the same posterior differences.
  contrast_draws <- pp |>
    contrast_models(list(group_names[1]), list(group_names[2]))

  # NOTE(review): summary() is called without `prob`, so lower/upper use its
  # default interval width, which may not match the 95% quantiles used for
  # `ci` above -- confirm this is intended.
  contrast_summary <- contrast_draws |>
    summary(size = 0)

  contrast_median <- contrast_draws |>
    group_by(contrast) |>
    summarize(median = quantile(difference, .5)) |>
    # drop the first "." in the label (presumably "vs." -> "vs") so it matches
    # the label produced by summary() in the join below
    mutate(contrast = str_remove(contrast, "\\."))

  ci_contrast <- contrast_summary |>
    mutate(lag = lag) |>
    left_join(contrast_median, by = c("contrast")) |>
    select(contrast, probability, median, lower, upper, lag)

  list(ci = ci, ci_contrast = ci_contrast)
}


sex

In [None]:
# Run calc_pp() for sex at every lag; the result is a list with one
# calc_pp() output (ci + ci_contrast) per lag, in lag order.
sex <- map(c(0, 24, 72, 168, 336), \(lag) calc_pp(lag, "sex"))



SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 6.3e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.63 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.507 seconds (Warm-up)
Chain 1:                1.7 seconds (Sampling)
Chain 1:                3.207 

to find out why this is a problem and how to eliminate them.




SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 2.5e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.25 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.216 seconds (Warm-up)
Chain 1:                1.56 seconds (Sampling)
Chain 1:                2.776

# A tibble: 10 × 5
   model  pp_median pp_lower pp_upper   lag
   <chr>      <dbl>    <dbl>    <dbl> <dbl>
 1 female     0.880    0.858    0.899     0
 2 male       0.922    0.906    0.935     0
 3 female     0.858    0.831    0.882    24
 4 male       0.900    0.880    0.918    24
 5 female     0.844    0.820    0.866    72
 6 male       0.893    0.875    0.909    72
 7 female     0.814    0.786    0.840   168
 8 male       0.898    0.881    0.913   168
 9 female     0.786    0.754    0.815   336
10 male       0.883    0.863    0.902   336

# A tibble: 5 × 6
  contrast       probability median  lower  upper   lag
  <chr>                <dbl>  <dbl>  <dbl>  <dbl> <dbl>
1 male vs female        1    0.0420 0.0266 0.0577     0
2 male vs female        1.00 0.0417 0.0249 0.0598    24
3 male vs female        1    0.0490 0.0318 0.0675    72
4 male vs female        1    0.0837 0.0643 0.104    168
5 male vs female        1    0.0970 0.0763 0.119    336

income

In [None]:
# Run calc_pp() for income at every lag; the result is a list with one
# calc_pp() output (ci + ci_contrast) per lag, in lag order.
income <- map(c(0, 24, 72, 168, 336), \(lag) calc_pp(lag, "income"))



SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 2.4e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.24 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.118 seconds (Warm-up)
Chain 1:                0.847 seconds (Sampling)
Chain 1:                1.96

# A tibble: 10 × 5
   model         pp_median pp_lower pp_upper   lag
   <chr>             <dbl>    <dbl>    <dbl> <dbl>
 1 above poverty     0.903    0.867    0.930     0
 2 below poverty     0.861    0.813    0.899     0
 3 above poverty     0.881    0.847    0.907    24
 4 below poverty     0.830    0.784    0.867    24
 5 above poverty     0.871    0.835    0.902    72
 6 below poverty     0.806    0.754    0.851    72
 7 above poverty     0.861    0.826    0.890   168
 8 below poverty     0.787    0.739    0.830   168
 9 above poverty     0.842    0.801    0.876   336
10 below poverty     0.728    0.667    0.782   336

# A tibble: 5 × 6
  contrast                       probability median   lower  upper   lag
  <chr>                                <dbl>  <dbl>   <dbl>  <dbl> <dbl>
1 above poverty vs below poverty       0.978 0.0416 0.00736 0.0784     0
2 above poverty vs below poverty       0.991 0.0506 0.0159  0.0868    24
3 above poverty vs below poverty       0.998 0.0641 0.0244  0.107     72
4 above poverty vs below poverty       0.999 0.0733 0.0359  0.113    168
5 above poverty vs below poverty       1.00  0.114  0.0671  0.162    336

race

In [None]:
# Run calc_pp() for race at every lag; the result is a list with one
# calc_pp() output (ci + ci_contrast) per lag, in lag order.
race <- map(c(0, 24, 72, 168, 336), \(lag) calc_pp(lag, "race"))



SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 2.4e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.24 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.155 seconds (Warm-up)
Chain 1:                1.67 seconds (Sampling)
Chain 1:                2.825

# A tibble: 10 × 5
   model              pp_median pp_lower pp_upper   lag
   <chr>                  <dbl>    <dbl>    <dbl> <dbl>
 1 non-hispanic white     0.907    0.759    0.969     0
 2 not white              0.685    0.403    0.873     0
 3 non-hispanic white     0.889    0.706    0.963    24
 4 not white              0.609    0.322    0.843    24
 5 non-hispanic white     0.878    0.816    0.920    72
 6 not white              0.726    0.619    0.811    72
 7 non-hispanic white     0.864    0.818    0.900   168
 8 not white              0.754    0.682    0.814   168
 9 non-hispanic white     0.843    0.797    0.883   336
10 not white              0.733    0.663    0.796   336

# A tibble: 5 × 6
  contrast                        probability median  lower upper   lag
  <chr>                                 <dbl>  <dbl>  <dbl> <dbl> <dbl>
1 non-hispanic white vs not white       0.991  0.215 0.0565 0.422     0
2 non-hispanic white vs not white       0.994  0.268 0.0809 0.481    24
3 non-hispanic white vs not white       1.00   0.150 0.0810 0.227    72
4 non-hispanic white vs not white       1.00   0.109 0.0615 0.164   168
5 non-hispanic white vs not white       1      0.109 0.0637 0.160   336

Bind all pp/contrast tibbles and save

In [None]:
# Stack the per-demographic posterior summaries into one table and save it.
# NOTE(review): pp_sex, pp_income, pp_race and the contrast_* tibbles are not
# created in any cell visible here -- confirm they are built (presumably from
# the `ci` / `ci_contrast` elements of the calc_pp() results) before this runs.
bind_rows(pp_sex, pp_income, pp_race) |>
  write_csv(here::here(path_models_lag, "pp_dem_all.csv"))

# Same for the group contrasts; the combined tibble is also kept in
# pp_dem_contrast.
pp_dem_contrast <-
  bind_rows(contrast_sex, contrast_income, contrast_race) |>
  write_csv(here::here(path_models_lag, "pp_dem_contrast_all.csv"))
