# Algorithmic Fairness

Gaylen Fronk & Kendra Wyant  
January 14, 2025

### Set Up Environment

In [None]:
# Identifiers for this analysis run
study <- "match"
# version and cv are pasted into the auROC file name read in below
version <- "v6"
cv <- "nested"
# Outcome name; the same string is the sub-directory the auROC file is read from
y_col_name <- "pp_hybrid_wk4_outcome"

Packages for script

In [None]:
library(tidyposterior)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
✔ broom        1.0.5      ✔ rsample      1.2.0 
✔ dials        1.2.1      ✔ tune         1.1.2 
✔ infer        1.0.6      ✔ workflows    1.1.4 
✔ modeldata    1.3.0      ✔ workflowsets 1.0.1 
✔ parsnip      1.2.0      ✔ yardstick    1.3.0 
✔ recipes      1.0.10     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.

Absolute paths

In [None]:
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/format_path.R?raw=true")

ℹ SHA-1 hash of file is "a58e57da996d1b70bb9a5b58241325d6fd78890f"

Chunk Defaults

In [None]:
# Attach a max-height style attribute to rendered chunk output
knitr::opts_chunk$set(attr.output = 'style="max-height: 500px;"')

# Print tibbles in full: no limit on printed width or on number of rows
options(tibble.width = Inf, tibble.print_max = Inf)

Source

In [None]:
# EDA
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")

ℹ SHA-1 hash of file is "c045eee2655a18dc85e715b78182f176327358a7"

### Read in auROCs by demographic group for best model

In [None]:
# Read the per-split auROCs by demographic group.
# The outcome sub-directory is taken from y_col_name (set in the set-up chunk)
# rather than repeated as a literal, so the two cannot drift apart.
# NOTE(review): path_models is not defined in the visible cells — presumably
# created in a hidden chunk or sourced script; verify before running.
auroc_dem <- read_csv(file.path(path_models, y_col_name,
                                str_c("auroc_dem_", version, "_", cv, ".csv")),
                      show_col_types = FALSE) |>
  arrange(outer_split_num) |>
  # Rows are labelled by position after sorting, so this assumes exactly
  # 3 repeats x 10 folds (30 rows); mutate() errors on any other row count.
  mutate(repeat_num = rep(str_c("repeat", 1:3), each = 10),
         fold_num = rep(str_c("fold", 1:10), 3)) |>
  select(-outer_split_num) |>
  glimpse()

Rows: 30
Columns: 8
$ Female               <dbl> 0.5783251, 0.7061404, 0.6829268, 0.5608696, 0.760…
$ Male                 <dbl> 0.6911765, 0.7551669, 0.7989510, 0.7280702, 0.704…
$ `Not White`          <dbl> 0.7777778, 0.6969697, 0.7098214, 0.7163636, 0.795…
$ `White/Non-Hispanic` <dbl> 0.5805322, 0.6601562, 0.6696833, 0.6434109, 0.617…
$ `Above Poverty Line` <dbl> 0.5508658, 0.7037500, 0.7365591, 0.6637081, 0.673…
$ `Below Poverty Line` <dbl> 0.7250000, 0.6756757, 0.7138047, 0.7600000, 0.791…
$ repeat_num           <chr> "repeat1", "repeat1", "repeat1", "repeat1", "repe…
$ fold_num             <chr> "fold1", "fold2", "fold3", "fold4", "fold5", "fol…

### Get Median Posterior Probabilities and contrast analyses

Function to fit the model and summarize posteriors and the group contrast for one demographic variable

In [None]:
# Fit a tidyposterior::perf_mod() model to per-split auROCs for the two groups
# of one demographic variable and summarize the posteriors.
#
# Arguments:
#   data    - data frame with repeat_num, fold_num, and one auROC column per
#             demographic group (see the group names in the lookup below).
#   dem_var - one of "sex", "income", "race_ethnicity".
#
# Returns a list with two tibbles:
#   ci          - one row per group: model, pp_median, pp_lower, pp_upper
#                 (median and 2.5% / 97.5% quantiles of the posterior).
#   ci_contrast - one row: contrast, probability, median, lower, upper for the
#                 first-listed contrast group vs the second.
#
# Stops with "<dem_var> not in data" for any other dem_var.
calc_pp <- function(data, dem_var) {

  # Single lookup for everything that depends on dem_var:
  #   cols     - group columns, in the order they are selected
  #   contrast - the two groups passed to contrast_models(), in order
  groups <- switch(
    dem_var,
    sex = list(
      cols = c("Female", "Male"),
      contrast = c("Male", "Female")
    ),
    income = list(
      cols = c("Below Poverty Line", "Above Poverty Line"),
      contrast = c("Above Poverty Line", "Below Poverty Line")
    ),
    race_ethnicity = list(
      cols = c("Not White", "White/Non-Hispanic"),
      contrast = c("White/Non-Hispanic", "Not White")
    ),
    stop(dem_var, " not in data")
  )

  # Keep only the resampling ids (renamed to id / id2, the names used in the
  # model formula) and the two group columns
  data <- data |>
    select(id = repeat_num, id2 = fold_num, all_of(groups$cols))

  # Seed set immediately before fitting so repeated calls give the same fit
  set.seed(101)
  pp <- data |>
    perf_mod(formula = statistic ~ model + (1 | id2/id),
             transform = tidyposterior::logit_trans,
             iter = 2000, chains = 4,
             adapt_delta = .99,
             family = gaussian)

  pp_tidy <- pp |>
    tidy(seed = 123)

  # Median and 95% interval of the posterior for each group
  q <- c(.025, .5, .975)
  ci <- pp_tidy |>
    group_by(model) |>
    summarize(pp_median = quantile(posterior, probs = q[2]),
              pp_lower = quantile(posterior, probs = q[1]),
              pp_upper = quantile(posterior, probs = q[3])) |>
    arrange(model)

  # Compute the contrast once and reuse it, so the interval summary and the
  # median are derived from the same set of posterior differences
  contrast_draws <- pp |>
    contrast_models(list(groups$contrast[1]), list(groups$contrast[2]))

  # NOTE(review): summary() is called without `prob`, so lower/upper use the
  # package default interval width, which may not match the 95% interval used
  # for `ci` above — confirm this is intended.
  ci_contrast <- contrast_draws |>
    summary(size = 0)

  # The contrast label in the raw draws presumably carries a period that the
  # summary label does not ("A vs. B" vs "A vs B" — verify); drop it so the
  # join keys match
  ci_median_contrast <- contrast_draws |>
    group_by(contrast) |>
    summarize(median = quantile(difference, .5)) |>
    mutate(contrast = str_remove(contrast, "\\."))

  ci_contrast <- ci_contrast |>
    left_join(ci_median_contrast, by = c("contrast")) |>
    select(contrast, probability, median, lower, upper)

  list(ci = ci, ci_contrast = ci_contrast)
}

Sex

In [None]:
# Posterior summaries and Male vs Female contrast
sex <- calc_pp(data = auroc_dem, dem_var = "sex")


SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 0.000121 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.21 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.786 seconds (Warm-up)
Chain 1:                0.631 seconds (Sampling)
Chain 1:                2.4

# A tibble: 2 × 4
  model  pp_median pp_lower pp_upper
  <chr>      <dbl>    <dbl>    <dbl>
1 Female     0.670    0.639    0.698
2 Male       0.726    0.699    0.751

# A tibble: 1 × 5
  contrast       probability median  lower  upper
  <chr>                <dbl>  <dbl>  <dbl>  <dbl>
1 Male vs Female       0.998 0.0559 0.0261 0.0866

Income

In [None]:
# Posterior summaries and Above vs Below Poverty Line contrast
income <- calc_pp(data = auroc_dem, dem_var = "income")


SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 3e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.3 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 1.931 seconds (Warm-up)
Chain 1:                0.8 seconds (Sampling)
Chain 1:                2.731 sec

# A tibble: 2 × 4
  model              pp_median pp_lower pp_upper
  <chr>                  <dbl>    <dbl>    <dbl>
1 Above Poverty Line     0.658    0.628    0.685
2 Below Poverty Line     0.690    0.661    0.717

# A tibble: 1 × 5
  contrast                                 probability  median   lower    upper
  <chr>                                          <dbl>   <dbl>   <dbl>    <dbl>
1 Above Poverty Line vs Below Poverty Line      0.0335 -0.0331 -0.0618 -0.00397

Race/Ethnicity

In [None]:
# Posterior summaries and White/Non-Hispanic vs Not White contrast
race_eth <- calc_pp(data = auroc_dem, dem_var = "race_ethnicity")


SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 3.3e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.33 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 0.917 seconds (Warm-up)
Chain 1:                0.569 seconds (Sampling)
Chain 1:                1.48

# A tibble: 2 × 4
  model              pp_median pp_lower pp_upper
  <chr>                  <dbl>    <dbl>    <dbl>
1 Not White              0.687    0.653    0.719
2 White/Non-Hispanic     0.639    0.602    0.674

# A tibble: 1 × 5
  contrast                        probability  median   lower   upper
  <chr>                                 <dbl>   <dbl>   <dbl>   <dbl>
1 White/Non-Hispanic vs Not White      0.0105 -0.0481 -0.0839 -0.0127

Bind all pp/contrast tibbles and save

In [None]:
# calc_pp() returns a list with `ci` (per-group posterior summaries) and
# `ci_contrast` (group contrast); the results above are stored in `sex`,
# `income`, and `race_eth`, so pull the tibbles from those lists.

# Per-group posterior medians and intervals, all demographic variables stacked
bind_rows(sex$ci, income$ci, race_eth$ci) |>
  write_csv(file.path(path_models, "pp_dem_all.csv"))

# Group contrasts, all demographic variables stacked
bind_rows(sex$ci_contrast, income$ci_contrast, race_eth$ci_contrast) |>
  write_csv(file.path(path_models, "pp_dem_contrast_all.csv"))