# Preliminaries

In [None]:
suppressPackageStartupMessages(library(tidyverse))
library(patchwork)

In [None]:
# Read the workspace identifiers from environment variables. Sys.getenv()
# returns "" for any variable that is not set.
# NOTE(review): these look like cloud-workspace (Terra/AnVIL-style) variables;
# confirm against the hosting platform.
ws_namespace <- Sys.getenv("WORKSPACE_NAMESPACE")
ws_name <- Sys.getenv("WORKSPACE_NAME")
# Bucket prefix; used as the `gsutil cp` destination in the export step.
ws_bucket <- Sys.getenv("WORKSPACE_BUCKET")

In [None]:
# Make theme_bw() the default ggplot2 theme for the plots drawn below.
theme_set(theme_bw())

## Read in phenotypic and metabolomic data

* Phenotype data come from integrating inputs from dbGaP/PIC-SURE, MESA investigators, and metadata from metabolomic preprocessing.
* Genotype data come from dbGaP (Freeze 9b TOPMed-wide genotype VCF files).

In [None]:
# Show the contents of the current working directory.
list.files()

In [None]:
# Load the analysis table. `col_types = cols()` leaves every column type to be
# guessed by readr (the usual idiom for silencing the column-spec message).
analysis_df <- read_csv("analysis/analysis_df.csv", col_types=cols())
# Display the available column names.
names(analysis_df)

In [None]:
# Load the SNP information table (column types guessed by readr); its first
# rows are displayed later under "Details on the genetic variants".
snp_info_df <- read_csv("genotypes/snp_info.csv", col_types=cols())

# Explore physical activity main effects and covariate adjustments

## Preprocessing

In [None]:
# Clamp values lying outside a pair of empirical quantiles to those quantiles.
#
# Args:
#   x: Numeric vector. NA entries are ignored when the quantiles are computed
#     and are returned as NA.
#   quantiles: Length-2 vector of probabilities, c(lower, upper).
#
# Returns:
#   A vector the same length as `x` in which every value below the lower
#   quantile is replaced by that quantile and every value above the upper
#   quantile is replaced by that quantile.
#
# Side effects:
#   Reports, via message(), how many values were changed at each bound.
winsorize_quantile <- function(x, quantiles = c(0.05, 0.95)) {
  stopifnot(length(quantiles) == 2L)
  # unname() drops the percentage labels that quantile() attaches, so they
  # cannot leak into the returned vector.
  bounds <- unname(quantile(x, quantiles, na.rm = TRUE))
  message(sum(x < bounds[1], na.rm = TRUE),
          " values winsorized at the lower bound.")
  message(sum(x > bounds[2], na.rm = TRUE),
          " values winsorized at the upper bound.")
  # pmax()/pmin() are vectorized and propagate NA, so missing inputs stay
  # missing while everything else is clamped into [lower, upper].
  pmin(pmax(x, bounds[1]), bounds[2])
}

# Set values lying outside a pair of empirical quantiles to NA.
#
# Args:
#   x: Numeric vector. NA entries are ignored when the quantiles are computed
#     and are returned as NA.
#   quantiles: Length-2 vector of probabilities, c(lower, upper).
#
# Returns:
#   `x` with every value strictly below the lower quantile or strictly above
#   the upper quantile replaced by NA; all other values are unchanged.
#
# Side effects:
#   Reports, via message(), how many values were removed at each bound.
truncate_quantile <- function(x, quantiles = c(0.05, 0.95)) {
  stopifnot(length(quantiles) == 2L)
  # unname() drops the percentage labels that quantile() attaches.
  bounds <- unname(quantile(x, quantiles, na.rm = TRUE))
  message(sum(x < bounds[1], na.rm = TRUE),
          " values truncated at the lower bound.")
  message(sum(x > bounds[2], na.rm = TRUE),
          " values truncated at the upper bound.")
  # which() discards NA comparisons, so the index is safe for assignment even
  # when `x` already contains missing values.
  outside <- which(x < bounds[1] | x > bounds[2])
  x[outside] <- NA
  x
}

In [None]:
# Untransformed physical-activity (PA) columns.
raw_pa_fields <- c("mvpa", "mod_pa", "vig_pa")

# All PA variables examined below: the raw columns followed by their log,
# winsorized and truncated versions (grouped by transformation).
pa_fields <- c(
  raw_pa_fields,
  as.vector(outer(raw_pa_fields, c("_log", "_win", "_trunc"), paste0))
)

# Derive the transformed columns: log(x + 1), then winsorization and
# truncation at the 90th percentile (no lower cut-off, since the lower
# probability is 0).
analysis_df <- analysis_df %>%
  mutate(across(all_of(raw_pa_fields),
                function(v) log(v + 1),
                .names = "{.col}_log")) %>%
  mutate(across(all_of(raw_pa_fields),
                function(v) winsorize_quantile(v, c(0, 0.9)),
                .names = "{.col}_win")) %>%
  mutate(across(all_of(raw_pa_fields),
                function(v) truncate_quantile(v, c(0, 0.9)),
                .names = "{.col}_trunc"))

## Distributions

In [None]:
# Table dimensions (rows, columns), followed by a count of observed (FALSE)
# versus missing (TRUE) values in the `hdl` column.
dim(analysis_df)
table(is.na(analysis_df$hdl))

In [None]:
# Plot size for the notebook display (wide and short for two panels).
options(repr.plot.width = 16, repr.plot.height = 5)

# Histogram of winsorized MVPA, restricted to rows with non-missing raw MVPA.
mvpa_hist <- ggplot(drop_na(analysis_df, mvpa), aes(x = mvpa_win)) +
  geom_histogram(bins = 30) +
  xlab("MVPA") +
  ylab("Count")

# Penalized-spline (GAM) smooth of unadjusted HDL-C against winsorized MVPA,
# on the same subset of rows.
mvpa_smooth <- ggplot(drop_na(analysis_df, mvpa),
                      aes(x = mvpa_win, y = hdl)) +
  geom_smooth(
    method = "gam",
    formula = y ~ s(x, bs = "cs"),
    se = TRUE,
    na.rm = TRUE
  ) +
  xlab("MVPA") +
  ylab("HDL-C - unadjusted")

# Combine the two panels with patchwork.
mvpa_hist + mvpa_smooth

In [None]:
# a <- analysis_df %>%
# mutate(HDL_P_calc = H1P + H2P + H3P + H4P + H5P + H6P + H7P)

# a %>%
# select(mesa_id, matches("H.P")) %>%
# pivot_longer(-mesa_id, names_to = "subfraction", values_to = "concentration") %>%
# group_by(subfraction) %>%
# summarise(m = mean(concentration, na.rm = TRUE)) %>%
# ggplot(aes(x = "", y = m, fill = subfraction)) +
# geom_bar(stat = "identity", width = 1) +
# coord_polar("y")

## Main effects

In [None]:
# Fit a linear model of an outcome on an exposure plus covariates and return
# the coefficient row for the exposure.
#
# Args:
#   y: Name (string) of the outcome column in `df`.
#   e: Name (string) of the exposure column in `df`.
#   covars: Character vector of covariate terms; may be empty or NULL, in
#     which case the model is `y ~ e`.
#   df: Data frame holding all referenced columns.
#   std: If TRUE, center and scale both `y` and `e` before fitting, so the
#     estimate is a standardized effect.
#
# Returns:
#   The broom::tidy() row (term, estimate, std.error, statistic, p.value) whose
#   `term` equals `e`.
fit_main_effect_model <- function(y, e, covars, df, std = TRUE) {
  if (std) {
    # scale() returns a one-column matrix; as.numeric() keeps plain vectors in
    # the data frame without changing the fitted model.
    df[[y]] <- as.numeric(scale(df[[y]]))
    df[[e]] <- as.numeric(scale(df[[e]]))
  }
  # Drop empty strings so that "no covariates" never yields a dangling "+".
  covars <- covars[nzchar(covars)]
  model_formula <- reformulate(c(e, covars), response = y)
  lm(model_formula, data = df) %>%
    broom::tidy() %>%
    filter(term == e)
}

# Minimal adjustment set shared by every model.
basic_covars <- c("site", "gender_f0m1", "age")

# Nested covariate sets for the sensitivity analysis. Each set extends the
# previous one; the last two add either genetic PCs or race on top of the
# SES + health/lifestyle ("HL") set.
covar_sets <- local({
  with_ses <- c(basic_covars, "ses_score", "income_cat")
  with_ses_hl <- c(with_ses,
                   "drinks_per_week", "smoking", "ahei_score", "dash_score")
  list(
    basic = basic_covars,
    add_ses = with_ses,
    add_ses_HL = with_ses_hl,
    add_ses_HL_gPC = c(with_ses_hl, sprintf("gPC%d", 1:5)),
    add_ses_HL_race = c(with_ses_hl, "race")
  )
})

In [None]:
# Fill in missing covariate values so that models with larger covariate sets
# do not lose samples:
#   * income_cat: missing values become their own "Missing" category
#   * smoking: missing values are assigned "NEVER"
#   * continuous SES / lifestyle scores: missing values get the column median
analysis_df <- analysis_df %>%
  mutate(
    income_cat = ifelse(is.na(income_cat), "Missing", income_cat),
    smoking = ifelse(is.na(smoking), "NEVER", smoking)
  ) %>%
  mutate(across(
    all_of(c("ses_score", "drinks_per_week", "ahei_score", "dash_score")),
    function(v) ifelse(is.na(v), median(v, na.rm = TRUE), v)
  ))

In [None]:
# Fit one standardized main-effect model of log-HDL for every combination of
# exposure (each PA variable, plus the SNP) and covariate set, then stack the
# resulting coefficient rows into one table.
main_effect_sensitivity_res_df <- expand_grid(
  e = c(pa_fields, "rs295849"),
  y = "hdl_log",
  covar_set = names(covar_sets)
) %>%
  mutate(lm_fit = pmap(
    list(y, e, covar_set),
    function(outcome, exposure, set_name) {
      fit_main_effect_model(outcome, exposure,
                            covar_sets[[set_name]], analysis_df)
    }
  )) %>%
  unnest(lm_fit)

In [None]:
# Plot size for the notebook display.
options(repr.plot.width = 16, repr.plot.height = 5)

# Compare the standardized main-effect estimate (with a normal-approximation
# 95% CI) across covariate sets, one panel per exposure.
main_effect_sensitivity_res_df %>%
  # The exposure filter must name a variable that was actually fitted:
  # "pa_win" is not among the fitted exposures, so the PA panel was silently
  # dropped. "mvpa_win" is the PA variable used in the other MVPA analyses.
  filter(e %in% c("mvpa_win", "rs295849"),
         y == "hdl_log") %>%
  mutate(l95 = estimate - 1.96 * std.error,
         u95 = estimate + 1.96 * std.error,
         # Keep covariate sets in their order of definition on the x axis.
         covar_set = factor(covar_set, levels = names(covar_sets))) %>%
  ggplot(aes(x = covar_set, y = estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = l95, ymax = u95), width = 0.2) +
  geom_hline(yintercept = 0, color = "gray") +
  # `scales` spelled out in full rather than relying on partial matching.
  facet_wrap(vars(e), scales = "free_y", nrow = 1) +
  labs(x = "Covariate set",
       y = "Standardized PA or SNP main effect estimate (95% CI)",
       title = "Main effects in the full MESA dataset")

It appears that both gPCs and race variables have some effect on PA effect estimates. Given this, and the multi-population nature of this dataset, we will include 5 gPCs in subsequent models (in addition to PA x gPC interaction terms for GxE tests).

In [None]:
# Covariate set used in all subsequent models: basic + SES + health/lifestyle
# + 5 genetic PCs.
covars <- covar_sets[["add_ses_HL_gPC"]]

In [None]:
# Bar chart of the test statistic for each alternative PA variable under the
# SES + HL + gPC covariate set, ordered from largest to smallest.
# NOTE(review): `statistic` from broom::tidy() on an lm fit is a t-statistic;
# the "Z-statistic" axis label is kept as originally written.
main_effect_sensitivity_res_df %>%
  filter(e != "rs295849",
         covar_set == "add_ses_HL_gPC") %>%
  arrange(desc(statistic)) %>%
  # Freeze the sorted order as the factor level order so bars stay sorted.
  mutate(e = fct_inorder(e)) %>%
  ggplot(aes(x = e, y = statistic)) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_hline(yintercept = 0, color = "gray") +
  facet_wrap(~y, nrow = 2) +
  # The x axis shows PA variables, not covariate sets.
  labs(x = "PA variable", y = "Z-statistic",
       title = "Significance of main effects for alternative PA variables") +
  theme(axis.text.x = element_text(angle = 30, hjust = 0.9))

It also appears that vigorous PA has a substantially stronger association with HDL-C than the "intentional PA" variable used in the CHARGE Phase I meta-analysis.

In [None]:
# Test whether the MVPA main effect on (untransformed, unstandardized) HDL-C
# differs by sex, adjusting for the SES + HL + gPC covariates.
main_effect_sex_int_res <- broom::tidy(
  lm(
    hdl ~ mvpa_win * gender_f0m1 +
      site + age +
      ses_score + income_cat +
      drinks_per_week + smoking + ahei_score + dash_score +
      gPC1 + gPC2 + gPC3 + gPC4 + gPC5,
    data = analysis_df
  )
)

# Show only the MVPA, sex and MVPA-by-sex coefficient rows.
filter(main_effect_sex_int_res, str_detect(term, "mvpa|gender"))

In [None]:
# Winsorized PA exposures used in the interaction tests, with matching
# human-readable labels (same order) for plot axes.
primary_pa_fields <- c("mvpa_win", "mod_pa_win", "vig_pa_win")
primary_pa_fields_clean <- c(
  "Moderate + vigorous PA",
  "Moderate PA",
  "Vigorous PA"
)

# Test for the primary interactions

Can we reproduce in MESA the interactions found in the original CHARGE GLI meta-analyses?

## Previously reported GxEs

SNPs come from the CHARGE GLI Phase I PA-lipids paper: Kilpeläinen et al. 2019, *Nat. Commun.* (https://doi.org/10.1038/s41467-018-08008-w).

Physical activity was coded as a binary variable. HDL-C was log-transformed prior to analysis.

Details on the genetic variants:

In [None]:
# Preview the first rows of the SNP information table.
head(snp_info_df)

Details on the previously reported GxE effects:

In [None]:
# Previously reported PA x SNP interaction effects on log-HDL: effect allele,
# effect allele frequency (EAF), interaction beta and its standard error.
# The insertion variant rs141588480 is listed for reference but excluded.
gli_info_df <- tibble(
  SNP = c("rs2862183", "rs295849", "rs141588480"),
  exposure = "pa",
  outcome = "hdl_log",
  effect_allele = c("T", "T", "Ins"),
  EAF = c(0.22, 0.38, 0.95),
  beta_int = c(-0.014, 0.009, -0.054),
  se_int = c(0.003, 0.002, 0.010)
) %>%
  filter(SNP != "rs141588480")

gli_info_df

## Replication of the primary GxEs in MESA

In [None]:
# Fit a linear gene-by-environment interaction model and return the exposure,
# SNP and exposure-by-SNP coefficient rows.
#
# Args:
#   y: Name (string) of the outcome column in `df`.
#   snp: Name (string) of the genotype column in `df`.
#   e: Name (string) of the exposure column in `df`.
#   covars: Character vector of covariate terms; may be empty, "" or NULL.
#   df: Data frame holding all referenced columns.
#   std: If TRUE, center and scale `y` and `e` before fitting.
#   e_by_gPC: If TRUE, add an exposure-by-gPC interaction for every covariate
#     whose name contains "gPC".
#
# Returns:
#   The broom::tidy() rows for `e`, `snp` and `e:snp`, plus an `EAF_topmed`
#   column holding half the mean of the non-missing genotype values.
#   NOTE(review): that is an allele frequency only if `snp` is coded as a
#   0-2 effect-allele dosage -- confirm the genotype coding.
test_gxe <- function(y, snp, e, covars, df, std = TRUE, e_by_gPC = TRUE) {
    if (std) {
        # scale() returns a one-column matrix; as.numeric() keeps plain vectors
        # in the data frame without changing the fitted model.
        df[[y]] <- as.numeric(scale(df[[y]]))
        df[[e]] <- as.numeric(scale(df[[e]]))
    }
    # Treat "", character(0) and NULL alike: no covariates.
    covars <- covars[nzchar(covars)]
    if (e_by_gPC) {
        gPCs <- grep("gPC", covars, value = TRUE)
        # paste0() with a zero-length vector would produce the invalid term
        # "<e> * ", so only add interactions when gPC covariates exist.
        if (length(gPCs) > 0) {
            covars <- c(covars, paste0(e, " * ", gPCs))
        }
    }
    rhs <- paste(c(paste0(e, " * ", snp), covars), collapse = " + ")
    form_str <- paste0(y, " ~ ", rhs)
    # Ignore missing genotypes so a single NA does not turn the frequency
    # into NA (the previous sum()/(2 * nrow) form did).
    eaf <- mean(df[[snp]], na.rm = TRUE) / 2
    sumstats <- lm(as.formula(form_str), data = df) %>%
        broom::tidy() %>%
        filter(term %in% c(e, snp, paste0(e, ":", snp))) %>%
        mutate(EAF_topmed = eaf)
    sumstats
}

In [None]:
# Outcomes: log HDL-C, small/medium/large HDL particle concentrations, HDL-C,
# and the seven HDL subspecies (H1P-H7P).
all_y <- c(
  "hdl_log",
  paste0(c("S", "M", "L"), "_HDL_P"),
  "HDL_C",
  sprintf("H%dP", 1:7)
)

# rs295849 x PA interaction for every outcome / PA-variable pair, everyone.
pa_subtype_gxe_res_df_all <- expand_grid(y = all_y, e = primary_pa_fields) %>%
  mutate(lm_res = map2(
    y, e,
    function(outcome, exposure) {
      test_gxe(outcome, "rs295849", exposure, covars, analysis_df)
    }
  )) %>%
  unnest(lm_res)

# Same models restricted to rows with gender_f0m1 == 0 (labelled "female").
pa_subtype_gxe_res_df_female <- expand_grid(y = all_y, e = primary_pa_fields) %>%
  mutate(lm_res = map2(
    y, e,
    function(outcome, exposure) {
      test_gxe(outcome, "rs295849", exposure, covars,
               filter(analysis_df, gender_f0m1 == 0))
    }
  )) %>%
  unnest(lm_res)

# Stack both subgroups and keep only the interaction-term rows.
pa_subtype_gxe_res_df <- bind_rows(
  all = pa_subtype_gxe_res_df_all,
  female = pa_subtype_gxe_res_df_female,
  .id = "subgroup"
) %>%
  filter(str_detect(term, ":rs295849"))

In [None]:
# Plot size for the notebook display.
options(repr.plot.width = 16, repr.plot.height = 5)

# Interaction estimates (with normal-approximation 95% CIs) by PA variable and
# subgroup, one panel per outcome, excluding the H1P-H7P subspecies.
pa_subtype_gxe_res_df %>%
  # Same subspecies pattern as the H1P-H7P plot; the looser "H.P" would also
  # match any other name containing H<char>P.
  filter(!grepl("H[1-7]P", y)) %>%
  mutate(l95 = estimate - 1.96 * std.error,
         u95 = estimate + 1.96 * std.error,
         e = factor(e, levels = primary_pa_fields,
                    labels = primary_pa_fields_clean)) %>%
  ggplot(aes(x = e, y = estimate, color = subgroup)) +
  geom_point(position = position_dodge(width = 0.3)) +
  geom_errorbar(aes(ymin = l95, ymax = u95), width = 0.2,
                position = position_dodge(width = 0.3)) +
  geom_hline(yintercept = 0, color = "gray") +
  # `scales` spelled out in full rather than relying on partial matching.
  facet_wrap(~y, nrow = 2, scales = "fixed") +
  labs(x = "", y = "Standardized interaction effect estimate (95% CI)") +
  theme(axis.text.x = element_text(angle = 30, hjust = 0.9))

In [None]:
# MVPA x rs295849 interaction estimates for the seven HDL subspecies
# (H1P-H7P), with normal-approximation 95% CIs, colored by subgroup.
pa_subtype_gxe_res_df %>%
  filter(e == "mvpa_win", str_detect(y, "H[1-7]P")) %>%
  mutate(
    l95 = estimate - 1.96 * std.error,
    u95 = estimate + 1.96 * std.error
  ) %>%
  ggplot(aes(x = y, y = estimate, color = subgroup)) +
  geom_point(position = position_dodge(width = 0.3)) +
  geom_errorbar(
    aes(ymin = l95, ymax = u95),
    width = 0.2,
    position = position_dodge(width = 0.3)
  ) +
  geom_hline(yintercept = 0, color = "gray") +
  xlab("") +
  ylab("Standardized interaction effect estimate (95% CI)") +
  theme(axis.text.x = element_text(angle = 30, hjust = 0.9))

### Additional notes on HDL fractions

In [None]:
# Reference table: particle-size range of each HDL size class.
hdl_fraction_tbl <- tibble(
  subfraction = c("small", "medium", "large"),
  size_range = c("7.3-8.2 nm", "8.2-9.4 nm", "9.4-14 nm")
)
hdl_fraction_tbl

In [None]:
# Mean of every HDL_P-type column, expressed as a fraction of the mean of the
# total "HDL_P" column. Columns whose name contains "_lp3" are left out.
hdl_subfraction_fraction_df <- analysis_df %>%
  summarise(across(contains("HDL_P"), function(v) mean(v, na.rm = TRUE))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "subfraction",
    values_to = "value"
  ) %>%
  filter(!str_detect(subfraction, "_lp3")) %>%
  transmute(
    subfraction,
    frac_of_HDL_P = value / value[subfraction == "HDL_P"]
  )

# Pairwise-complete Pearson correlations among HDL_C and the HDL_P columns.
hdl_subfraction_cor_mat <- cor(
  select(analysis_df, HDL_C, contains("HDL_P")),
  use = "pairwise.complete.obs"
)

# Keep each size-specific (*_HDL_P) column's correlation with HDL_C.
hdl_subfraction_cor_df <-
  as.data.frame(hdl_subfraction_cor_mat[, "HDL_C", drop = FALSE]) %>%
  rownames_to_column("subfraction") %>%
  rename(HDL_C_corr = HDL_C) %>%
  filter(str_detect(subfraction, "_HDL_P"))

# One row per size-specific subfraction: share of HDL_P and HDL_C correlation.
mesa_hdl_subfraction_extra_df <- inner_join(
  hdl_subfraction_fraction_df,
  hdl_subfraction_cor_df,
  by = "subfraction"
)

### Race-specificity of the interaction?

In [None]:
# rs295849 x PA interaction models fitted separately within each value of
# `race`, for every outcome / PA-variable pair (all participants).
pa_subtype_gxe_byRace_res_df_all <- expand_grid(
  y = c("hdl_log", "S_HDL_P", "M_HDL_P", "L_HDL_P", "HDL_C"),
  e = primary_pa_fields,
  r = unique(analysis_df$race)
) %>%
  mutate(lm_res = pmap(
    list(y, e, r),
    function(outcome, exposure, race_group) {
      test_gxe(outcome, "rs295849", exposure, covars,
               filter(analysis_df, race == race_group))
    }
  )) %>%
  unnest(lm_res)

# Same, restricted to rows with gender_f0m1 == 0 (labelled "female") and a
# smaller set of outcomes.
pa_subtype_gxe_byRace_res_df_female <- expand_grid(
  y = c("hdl_log", "M_HDL_P", "HDL_C"),
  e = primary_pa_fields,
  r = unique(analysis_df$race)
) %>%
  mutate(lm_res = pmap(
    list(y, e, r),
    function(outcome, exposure, race_group) {
      test_gxe(outcome, "rs295849", exposure, covars,
               filter(analysis_df, gender_f0m1 == 0, race == race_group))
    }
  )) %>%
  unnest(lm_res)

# Stack both subgroups and keep only the interaction-term rows.
pa_subtype_gxe_byRace_res_df <- bind_rows(
  all = pa_subtype_gxe_byRace_res_df_all,
  female = pa_subtype_gxe_byRace_res_df_female,
  .id = "subgroup"
) %>%
  filter(str_detect(term, ":rs295849"))

In [None]:
# Taller plot for the outcome-by-race panel grid.
options(repr.plot.width = 16, repr.plot.height = 15)

# Race-stratified interaction estimates (with normal-approximation 95% CIs) by
# PA variable and subgroup, one panel per outcome / race combination.
pa_subtype_gxe_byRace_res_df %>%
  mutate(l95 = estimate - 1.96 * std.error,
         u95 = estimate + 1.96 * std.error) %>%
  ggplot(aes(x = e, y = estimate, color = subgroup)) +
  geom_point(position = position_dodge(width = 0.3)) +
  geom_errorbar(aes(ymin = l95, ymax = u95), width = 0.2,
                position = position_dodge(width = 0.3)) +
  geom_hline(yintercept = 0, color = "gray") +
  # `scales` spelled out in full rather than relying on partial matching.
  facet_wrap(vars(y, r), ncol = 4, scales = "free") +
  labs(x = "", y = "Standardized interaction effect estimate (95% CI)")

# Export 

Relevant R objects are exported for use in creating manuscript figures and tables.

In [None]:
# Make sure the output directory exists; saveRDS() does not create it.
dir.create("manuscript", showWarnings = FALSE, recursive = TRUE)

# Reduced analysis table: identifiers, basic covariates, the SNP, the raw and
# winsorized PA columns (names ending in "pa" / "pa_win") and all HDL columns.
analysis_df %>%
  select(mesa_id, gender_f0m1, age, bmi, rs295849,
         matches(".*pa$"), matches(".*pa_win$"), contains("hdl")) %>%
  saveRDS("manuscript/mesa_analysis_df.rds")

# Main-effect sensitivity results (all exposures x covariate sets).
saveRDS(main_effect_sensitivity_res_df,
        "manuscript/mesa_pa_hdl_res_df.rds")

# MVPA-by-sex interaction model coefficients.
saveRDS(main_effect_sex_int_res,
        "manuscript/mesa_pa_hdl_sexInt_res_df.rds")

# GxE interaction results by PA subtype and subgroup.
saveRDS(pa_subtype_gxe_res_df,
        "manuscript/mesa_gxe_exploration_df.rds")

# Race-stratified GxE interaction results.
saveRDS(pa_subtype_gxe_byRace_res_df,
        "manuscript/mesa_gxe_exploration_byRace_df.rds")

# HDL subfraction shares and correlations with HDL-C.
saveRDS(mesa_hdl_subfraction_extra_df,
        "manuscript/mesa_hdl_subfraction_extra_df.rds")

# Copy everything to the workspace bucket. An empty bucket would make the
# destination the local path "/manuscript/", so refuse to run in that case.
if (!nzchar(ws_bucket)) {
  stop("WORKSPACE_BUCKET is not set; not uploading manuscript/ files.",
       call. = FALSE)
}
gsutil_status <- system(
  paste0("gsutil cp manuscript/* ", ws_bucket, "/manuscript/")
)
# system() returns the command's exit status; surface failures instead of
# silently discarding them.
if (gsutil_status != 0) {
  warning("gsutil cp exited with status ", gsutil_status, call. = FALSE)
}