# Fairness analyses

Kendra Wyant  
January 27, 2025

### Set Up Environment

In [None]:

# handle conflicts
options(conflicts.policy = "depends.ok")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")


ℹ SHA-1 hash of file is "77e91675366f10788c6bcb59fa1cfc9ee0c75281"

In [None]:

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(tidyposterior))
library(kableExtra, exclude = "group_rows")
library(Rcpp, exclude = "populate")
library(brms, exclude = c("ar", "mixture"))


Loading 'brms' package (version 2.22.0). Useful instructions
can be found by typing help('brms'). A more detailed introduction
to the package is available through vignette('brms_overview').

In [None]:

devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/format_path.R?raw=true")


ℹ SHA-1 hash of file is "a58e57da996d1b70bb9a5b58241325d6fd78890f"

ℹ SHA-1 hash of file is "6e9288d22f09da9ec15a1d5c046a0b6736ecce8b"

In [None]:
# Resolve project directories with the lab's format_path() helper
path_processed <- format_path("studydata/risk/data_processed/lag")
path_models_lag <- format_path("studydata/risk/models/lag")


### Read in Model Performance Metrics

In [None]:
# Read the held-out auROCs by demographic group for one lag.
#
# Args:
#   lag:            lag value embedded in the file name (0, 24, 72, 168, 336).
#   missing_splits: outer_split_num values absent from the file; each is added
#                   as a row with NA in every other column so the result
#                   still has one row per split.
#
# Returns a tibble with the group auROC columns plus fold_num (1-5) and
# repeat_num (1-6); outer_split_num is dropped.
read_auroc_dem <- function(lag, missing_splits = NULL) {
  file_name <- str_c("test_auroc_6_x_5_1day_", lag, "_v3_nested_dem.csv")
  auroc <- read_csv(here::here(path_models_lag, file_name),
                    col_types = cols())

  if (length(missing_splits) > 0) {
    auroc <- auroc |>
      add_row(outer_split_num = missing_splits)
  }

  # fold/repeat ids below are assigned by position, so the row count and the
  # row order both have to be right
  n_splits <- 6 * 5
  if (nrow(auroc) != n_splits) {
    stop("expected ", n_splits, " splits for lag ", lag,
         " but found ", nrow(auroc))
  }

  auroc |>
    arrange(outer_split_num) |>
    # an auROC of exactly 0 is nudged to a tiny positive value; presumably so
    # the logit transform used when modeling stays finite
    mutate(across(-outer_split_num, ~ if_else(.x == 0, .0000001, .x))) |>
    mutate(fold_num = rep(1:5, times = 6),
           repeat_num = rep(1:6, each = 5)) |>
    select(-outer_split_num)
}

# calc_pp() looks these tibbles up by name (auroc_dem_<lag>), so the object
# names must stay in sync with the lags analyzed below
auroc_dem_0 <- read_auroc_dem(0)
auroc_dem_24 <- read_auroc_dem(24)
auroc_dem_72 <- read_auroc_dem(72)
auroc_dem_168 <- read_auroc_dem(168)
# splits 12 and 28 are not in the 336 file; keep them as all-NA rows
auroc_dem_336 <- read_auroc_dem(336, missing_splits = c(12, 28))


### Get Median Posterior Probabilities and contrast analyses

Function to fit the posterior model and compute group contrasts for one lag and one demographic variable

In [None]:
# Estimate posterior auROC by demographic group, and the between-group
# contrast, for one lag.
#
# Args:
#   lag:     suffix of the global `auroc_dem_<lag>` tibble to analyze
#            (e.g., 0, 24, 72, 168, 336).
#   dem_var: "sex", "income", or "race"; any other value is an error.
#
# Returns a list with two tibbles:
#   ci:          posterior median and 2.5% / 97.5% quantiles per group.
#   ci_contrast: probability, median, and lower/upper bounds for the
#                difference between the first and second group.
calc_pp <- function (lag, dem_var) {
  # Groups to compare, in contrast order. Validated up front so a bad
  # dem_var fails before any model is fit.
  groups <- switch(dem_var,
    sex = c("female", "male"),
    income = c("below poverty", "above poverty"),
    race = c("not white", "non-hispanic white"),
    stop(dem_var, " not in data")
  )

  # The per-lag tibbles live in the calling/global environment
  auroc <- get(str_c("auroc_dem_", lag))
  if (dem_var == "race") {
    # source column is `white`; it is reported as "non-hispanic white"
    auroc <- rename(auroc, `non-hispanic white` = white)
  }
  auroc_groups <- auroc |>
    select(id = repeat_num, id2 = fold_num, all_of(groups))

  set.seed(101)
  pp <- auroc_groups |>
    perf_mod(formula = statistic ~ model + (1 | id2/id),
             transform = tidyposterior::logit_trans,
             iter = 4000, chains = 4,
             adapt_delta = .99,
             family = gaussian)

  ci <- pp |>
    tidy(seed = 123) |>
    group_by(model) |>
    summarize(pp_median = quantile(posterior, probs = .5),
              pp_lower = quantile(posterior, probs = .025),
              pp_upper = quantile(posterior, probs = .975)) |>
    mutate(lag = lag) |>
    arrange(model)

  # Draw the contrast posterior once so the interval summary and the median
  # describe the same draws; by default contrast_models() uses a new random
  # seed on every call.
  diffs <- pp |>
    contrast_models(list(groups[1]), list(groups[2]))

  ci_median_contrast <- diffs |>
    group_by(contrast) |>
    summarize(median = quantile(difference, .5)) |>
    # drop the "." from the contrast label so it matches the label produced
    # by summary() in the join below
    mutate(contrast = str_remove(contrast, "\\."))

  # NOTE(review): summary() is left at its default interval width, which may
  # differ from the 2.5% / 97.5% quantiles used for `ci` -- confirm intended.
  ci_contrast <- diffs |>
    summary(size = 0) |>
    mutate(lag = lag) |>
    left_join(ci_median_contrast, by = c("contrast")) |>
    select(contrast, probability, median, lower, upper, lag)

  list(ci = ci, ci_contrast = ci_contrast)
}


sex

In [None]:
# One calc_pp() result (ci + ci_contrast) per lag for the sex comparison
sex <- map(c(0, 24, 72, 168, 336), \(lag_i) calc_pp(lag_i, "sex"))



SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 6.2e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.62 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.249 seconds (Warm-up)
Chain 1:                1.122 seconds (Sampling)
Chain 1:                2.37

to find out why this is a problem and how to eliminate them.




SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 2.3e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.23 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.521 seconds (Warm-up)
Chain 1:                1.599 seconds (Sampling)
Chain 1:                3.12

# A tibble: 10 × 5
   model  pp_median pp_lower pp_upper   lag
   <chr>      <dbl>    <dbl>    <dbl> <dbl>
 1 female     0.880    0.857    0.899     0
 2 male       0.922    0.906    0.935     0
 3 female     0.858    0.830    0.881    24
 4 male       0.900    0.878    0.917    24
 5 female     0.844    0.818    0.866    72
 6 male       0.893    0.875    0.909    72
 7 female     0.814    0.784    0.840   168
 8 male       0.898    0.879    0.914   168
 9 female     0.784    0.750    0.814   336
10 male       0.881    0.858    0.899   336

# A tibble: 5 × 6
  contrast       probability  median   lower   upper   lag
  <chr>                <dbl>   <dbl>   <dbl>   <dbl> <dbl>
1 female vs male    0.000125 -0.0418 -0.0578 -0.0270     0
2 female vs male    0        -0.0418 -0.0603 -0.0244    24
3 female vs male    0        -0.0490 -0.0671 -0.0322    72
4 female vs male    0        -0.0838 -0.105  -0.0637   168
5 female vs male    0        -0.0968 -0.120  -0.0746   336

income

In [None]:
# One calc_pp() result (ci + ci_contrast) per lag for the income comparison
income <- map(c(0, 24, 72, 168, 336), \(lag_i) calc_pp(lag_i, "income"))



SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 2.7e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.27 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.097 seconds (Warm-up)
Chain 1:                1.595 seconds (Sampling)
Chain 1:                2.69

# A tibble: 10 × 5
   model         pp_median pp_lower pp_upper   lag
   <chr>             <dbl>    <dbl>    <dbl> <dbl>
 1 above poverty     0.903    0.867    0.930     0
 2 below poverty     0.861    0.810    0.899     0
 3 above poverty     0.881    0.847    0.908    24
 4 below poverty     0.830    0.785    0.868    24
 5 above poverty     0.870    0.833    0.901    72
 6 below poverty     0.807    0.752    0.850    72
 7 above poverty     0.860    0.826    0.889   168
 8 below poverty     0.786    0.737    0.828   168
 9 above poverty     0.842    0.797    0.879   336
10 below poverty     0.734    0.671    0.791   336

# A tibble: 5 × 6
  contrast                       probability  median   lower    upper   lag
  <chr>                                <dbl>   <dbl>   <dbl>    <dbl> <dbl>
1 below poverty vs above poverty     0.0231  -0.0418 -0.0794 -0.00687     0
2 below poverty vs above poverty     0.00975 -0.0506 -0.0861 -0.0164     24
3 below poverty vs above poverty     0.00525 -0.0637 -0.107  -0.0238     72
4 below poverty vs above poverty     0.0005  -0.0733 -0.114  -0.0363    168
5 below poverty vs above poverty     0       -0.107  -0.158  -0.0590    336

race

In [None]:
# One calc_pp() result (ci + ci_contrast) per lag for the race comparison
race <- map(c(0, 24, 72, 168, 336), \(lag_i) calc_pp(lag_i, "race"))



SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 2.8e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.28 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 4000 [  0%]  (Warmup)
Chain 1: Iteration:  400 / 4000 [ 10%]  (Warmup)
Chain 1: Iteration:  800 / 4000 [ 20%]  (Warmup)
Chain 1: Iteration: 1200 / 4000 [ 30%]  (Warmup)
Chain 1: Iteration: 1600 / 4000 [ 40%]  (Warmup)
Chain 1: Iteration: 2000 / 4000 [ 50%]  (Warmup)
Chain 1: Iteration: 2001 / 4000 [ 50%]  (Sampling)
Chain 1: Iteration: 2400 / 4000 [ 60%]  (Sampling)
Chain 1: Iteration: 2800 / 4000 [ 70%]  (Sampling)
Chain 1: Iteration: 3200 / 4000 [ 80%]  (Sampling)
Chain 1: Iteration: 3600 / 4000 [ 90%]  (Sampling)
Chain 1: Iteration: 4000 / 4000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.187 seconds (Warm-up)
Chain 1:                0.818 seconds (Sampling)
Chain 1:                2.00

# A tibble: 10 × 5
   model              pp_median pp_lower pp_upper   lag
   <chr>                  <dbl>    <dbl>    <dbl> <dbl>
 1 non-hispanic white     0.908    0.761    0.967     0
 2 not white              0.684    0.403    0.878     0
 3 non-hispanic white     0.888    0.715    0.961    24
 4 not white              0.609    0.322    0.836    24
 5 non-hispanic white     0.878    0.821    0.920    72
 6 not white              0.727    0.626    0.812    72
 7 non-hispanic white     0.864    0.817    0.899   168
 8 not white              0.753    0.680    0.812   168
 9 non-hispanic white     0.842    0.791    0.882   336
10 not white              0.724    0.650    0.788   336

# A tibble: 5 × 6
  contrast                        probability median  lower   upper   lag
  <chr>                                 <dbl>  <dbl>  <dbl>   <dbl> <dbl>
1 not white vs non-hispanic white    0.0102   -0.215 -0.425 -0.0539     0
2 not white vs non-hispanic white    0.00638  -0.270 -0.483 -0.0799    24
3 not white vs non-hispanic white    0.000125 -0.150 -0.226 -0.0804    72
4 not white vs non-hispanic white    0.000125 -0.111 -0.165 -0.0603   168
5 not white vs non-hispanic white    0.000125 -0.117 -0.171 -0.0673   336

Bind all pp/contrast tibbles and save

In [None]:
# `sex`, `income`, and `race` each hold one calc_pp() result per lag, and
# every result is list(ci, ci_contrast). Stack the matching pieces across
# all three demographic variables (sex, then income, then race) and save.
dem_results <- c(sex, income, race)

pp_dem_all <- dem_results |>
  map("ci") |>
  bind_rows()
write_csv(pp_dem_all, here::here(path_models_lag, "pp_dem_all.csv"))

pp_dem_contrast <- dem_results |>
  map("ci_contrast") |>
  bind_rows()
write_csv(pp_dem_contrast,
          here::here(path_models_lag, "pp_dem_contrast_all.csv"))
