# Define independent sets from multi-context colocalization and single-context fine-mapping

Date: Nov 18, 2025

We extracted 95% CS from the single-context fine-mapping analysis and 95% CoS and uCoS from the multi-context colocalization analysis within the APOE TAD region (chr19:41840000-47960000). Cohorts included: ROSMAP, MSBB, KNIGHT, and MIGA.

**Related files (original)**

- Single-context fine-mapping results, directly extracted from top loci tables from SuSiE 95% CS: `xqtl_only_APOE_all_cohorts_SuSiE.rds`
- Multi-context colocalization results, directly extracted from top loci tables from ColocBoost 95% CoS and uCoS: `xqtl_only_APOE_all_cohorts_ColocBoost.rds`

**Analysis streamline**

We applied two levels of merging criteria to ensure that the sets (CS/CoS) are independent in the subsequent imputation analysis. This step was necessary to avoid the confounding effects of imputation in regions of high LD.

**Conclusion**

- Before merging: There are 943 95% CS from single-context fine-mapping analysis and 341 95% CoS and uCoS from multi-context colocalization analysis from 4 xQTL cohorts.
- After merging:
  1. Overlapping variants: we merged two sets if they share at least one variant. There are 543 xQTL-only sets after merging; however, we only consider 429 sets in the following analysis, because the variants in the remaining sets are not present in the matched ADSP LD reference panel, so we can neither check the between purity nor do the imputation analysis for them.
  2. Between purity (min-between-purity > 0.8): we merged sets from step 1 if their between purity is > 0.8, even when they share no variants. 415 sets remain.

**Related files (results)**

- Independent sets after merging: `xqtl_only_APOE_all_cohorts_merged_cos_cs_after_between_purity.rds`

## 1. Merging criteria 1: overlapping variants

In [5]:
library(tidyverse)
library(data.table)
library(vroom)
library(vctrs)
library(susieR)
library(matrixStats)
library(pecotmr)
library(colocboost)
source("merge_coloc_also_within_loci.R")

In [6]:
# Load the ColocBoost (multi-context) and SuSiE (single-context) top-loci
# tables and keep the rows whose `start` lies inside the analysis window.
# NOTE(review): the window used here (42346101-46842901) is narrower than the
# TAD region quoted in the introduction (41840000-47960000) -- confirm which
# one is intended.
APOE_summary <- filter(
    readRDS("xqtl_only_APOE_all_cohorts_ColocBoost.rds"),
    start >= 42346101,
    start <= 46842901
)
SuSiE_95_APOE <- filter(
    readRDS("xqtl_only_APOE_all_cohorts_SuSiE.rds"),
    start >= 42346101,
    start <= 46842901
)
# One identifier per SuSiE credible set: the event ID joined by "_" to the
# value of the `cs_coverage_0.95` column.
SuSiE_95_APOE$identifier <- paste0(
    SuSiE_95_APOE$event_ID,
    "_",
    SuSiE_95_APOE$cs_coverage_0.95
)

In [7]:
# Number of distinct ColocBoost sets, then of distinct SuSiE credible sets,
# inside the analysis window.
length(unique(APOE_summary$identifier))
length(unique(SuSiE_95_APOE$identifier))

In [8]:
## Collect, for every ColocBoost set, its variants, their VCP values and the
## outcomes (events) it involves.
cos <- unique(APOE_summary$identifier)
# Passed as `threshold` to get_merge_pairwise_idx() when the sets are merged.
threshold <- 0.0001
# Rows of the table that belong to each set, computed once and reused below.
# Iterating over `cos` itself (rather than 1:length(cos)) returns empty lists
# instead of two bogus empty sets when the table has no rows.
cos_rows <- lapply(cos, function(id) which(APOE_summary$identifier == id))
flat_cos <- lapply(cos_rows, function(pos) APOE_summary$variant_ID[pos])
flat_cos_vcp <- lapply(cos_rows, function(pos) as.numeric(APOE_summary$vcp[pos]))
# `event_ID` entries are split on "; " so that every individual outcome is
# listed once per set.
flat_coloc_outcome <- lapply(cos_rows, function(pos) {
    events <- unique(APOE_summary$event_ID[pos])
    unique(unlist(strsplit(events, "; ")))
})

In [10]:
## Collect, for every SuSiE credible set, its variants, their PIP values and
## the outcomes (events) it involves.
cs <- unique(SuSiE_95_APOE$identifier)
# Rows of the table that belong to each credible set, computed once and reused
# below. Iterating over `cs` itself (rather than 1:length(cs)) returns empty
# lists instead of two bogus empty sets when the table has no rows.
cs_rows <- lapply(cs, function(id) which(SuSiE_95_APOE$identifier == id))
flat_cs <- lapply(cs_rows, function(pos) SuSiE_95_APOE$variant_ID[pos])
flat_cs_pip <- lapply(cs_rows, function(pos) as.numeric(SuSiE_95_APOE$PIP[pos]))
# `event_ID` entries are split on "; " so that every individual outcome is
# listed once per set.
flat_susie_outcome <- lapply(cs_rows, function(pos) {
    events <- unique(SuSiE_95_APOE$event_ID[pos])
    unique(unlist(strsplit(events, "; ")))
})

In [12]:
## Pool the sets of both methods (SuSiE first, then ColocBoost) together with
## their per-variant PIP/VCP values and outcomes.
all_cos_cs <- append(flat_cs, flat_cos)
all_cos_cs_pip <- append(flat_cs_pip, flat_cos_vcp)
all_outcome <- append(flat_susie_outcome, flat_coloc_outcome)
# get_merge_pairwise_idx() comes from merge_coloc_also_within_loci.R;
# presumably it returns, per merged group, the positions of the pooled sets
# that share variants -- confirm against that file.
merge_pairwise_idx <- get_merge_pairwise_idx(
    all_cos_cs,
    all_cos_cs_pip,
    threshold = threshold
)
length(merge_pairwise_idx)

In [13]:
## Build the merged sets: one entry per group in `merge_pairwise_idx`.
# Every group is collapsed with lapply() instead of a 1:length() loop that
# grows its results with c(): an empty `merge_pairwise_idx` now gives empty
# results instead of a subscript error, and the global `cos` (the ColocBoost
# identifiers) is no longer overwritten inside the loop.
merged_groups <- lapply(merge_pairwise_idx, function(p.merge) {
    # - outcomes of all member sets, each listed once
    outcomes <- unique(unlist(all_outcome[p.merge]))
    # - variants of all member sets with their PIP/VCP values
    snps <- unlist(all_cos_cs[p.merge])
    vcps <- unlist(all_cos_cs_pip[p.merge])
    stopifnot(length(snps) == length(vcps))
    # - keep every variant once, with the largest value seen in any member set
    unique_snps <- unique(snps)
    max_vcp <- vapply(unique_snps, function(snp) {
        max(vcps[snps == snp], na.rm = TRUE)
    }, numeric(1), USE.NAMES = FALSE)
    list(
        set = unique_snps,
        vcp = max_vcp,
        outcome = paste0(outcomes, collapse = "; ")
    )
})
final_set <- lapply(merged_groups, `[[`, "set")
final_pip_vcp <- lapply(merged_groups, `[[`, "vcp")
final_Outcome <- vapply(merged_groups, `[[`, character(1), "outcome")
# sprintf() returns character(0) for zero groups, whereas
# paste0("ind_set_", 1:length(x)) would not.
names(final_set) <-
    names(final_pip_vcp) <-
    names(final_Outcome) <-
    sprintf("ind_set_%d", seq_along(merged_groups))

In [17]:
# Preview the outcome labels of the first merged sets; head() does not pad the
# result with NA when fewer than three sets exist.
head(final_Outcome, 3)

### Overlapped with ADSP LD reference panel (no need to rerun)

In [18]:
# Load the LD matrix of the analysis window from the LD reference panel meta
# file.
ldblock <- load_LD_matrix(
    "ld_meta_file_apoe.tsv",
    region = data.frame(chr = "chr19", start = 42346101, end = 46842901)
)
LD <- ldblock$combined_LD_matrix
# The matrix column names get a "chr" prefix before they are matched against
# the variant IDs of the sets.
variants_LD <- paste0("chr", colnames(LD))
# For every merged set, the positions in `LD` of those of its variants that
# are present in the panel (computed once and reused for the between-set
# purity). Unmatched variants are dropped.
final_set_matched <- lapply(final_set, function(set_variants) {
    idx <- match(set_variants, variants_LD)
    idx[!is.na(idx)]
})
# Sets without any variant in the panel. lengths() always returns an integer
# vector, whereas sapply() returns list() for empty input and makes which()
# fail.
pos <- which(lengths(final_set_matched) == 0)
length(pos)

Rows: 1 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): #chrom, path
dbl (2): start, end

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [19]:
# Drop the sets that have no variant in the LD reference panel.
# `x[-pos]` returns an EMPTY object when `pos` is empty (nothing to drop),
# which would silently discard every set; select the sets to keep by position
# instead.
keep <- setdiff(seq_along(final_set), pos)
final_set_matched <- final_set_matched[keep]
final_set <- final_set[keep]
final_pip_vcp <- final_pip_vcp[keep]
final_Outcome <- final_Outcome[keep]
# The four objects must stay aligned: all lengths below should be equal.
length(final_set_matched)
length(final_set)
length(final_pip_vcp)
length(final_Outcome)

## 2. Merging criteria 2: between purity

In [20]:
# Number of sets that enter the between-purity step; the outer parentheses
# make the assigned value print.
(ncos <- length(final_set))

In [None]:
# Pairwise "between purity" of the sets: the minimum absolute LD value over
# all pairs made of one variant from set i and one from set j. The matrix is
# symmetric and its diagonal stays 0.
between_purity_matrix <- matrix(0, nrow = ncos, ncol = ncos)
# seq_len() keeps the loops empty when there are fewer than two sets;
# 1:(ncos - 1) would iterate over c(1, 0) and index out of range.
for (i in seq_len(max(ncos - 1L, 0L))) {
    for (j in seq.int(i + 1L, ncos)) {
        idx_i <- final_set_matched[[i]]
        idx_j <- final_set_matched[[j]]
        # drop = FALSE keeps a matrix even when a set has a single variant
        purity_stats <- min(abs(LD[idx_i, idx_j, drop = FALSE]))
        between_purity_matrix[i, j] <- between_purity_matrix[j, i] <- purity_stats
    }
}

In [21]:
# Flag the pairs of sets whose between purity exceeds 0.8.
is.between <- between_purity_matrix > 0.8
# For every set, a ";"-joined sorted list of its own index plus the indices of
# its flagged partners. seq_len() and vapply() keep this well defined (an
# empty character vector) when there are no sets, where 1:nrow() and sapply()
# would index out of range or change the return type.
temp <- vapply(seq_len(nrow(is.between)), function(x) {
    partners <- which(is.between[x, ])
    paste0(sort(c(x, partners)), collapse = ";")
}, character(1))
# merge_sets() comes from merge_coloc_also_within_loci.R; presumably it unions
# the groups that share an index -- confirm against that file.
temp <- merge_sets(temp)
# Back from ";"-joined strings to numeric index vectors, one per merged group.
potential_merged <- lapply(temp, function(x) as.numeric(unlist(strsplit(x, ";"))))
length(potential_merged)

In [22]:
## Build the sets after the between-purity merge: one entry per group in
## `potential_merged`.
# Every group is collapsed with lapply() instead of a 1:length() loop that
# grows its results with c(), so an empty `potential_merged` gives empty
# results instead of a subscript error.
merged_groups_after <- lapply(potential_merged, function(p.merge) {
    # - outcomes of all member sets. `final_Outcome` holds "; "-joined labels,
    #   so they are split before de-duplication; otherwise an outcome shared
    #   by two member sets would be repeated in the merged label.
    outcomes <- unique(unlist(
        strsplit(final_Outcome[p.merge], "; ", fixed = TRUE),
        use.names = FALSE
    ))
    # - variants of all member sets with their PIP/VCP values
    snps <- unlist(final_set[p.merge], use.names = FALSE)
    vcps <- unlist(final_pip_vcp[p.merge], use.names = FALSE)
    stopifnot(length(snps) == length(vcps))
    # - keep every variant once, with the largest value seen in any member set
    unique_snps <- unique(snps)
    max_vcp <- vapply(unique_snps, function(snp) {
        max(vcps[snps == snp], na.rm = TRUE)
    }, numeric(1), USE.NAMES = FALSE)
    list(
        set = unique_snps,
        vcp = max_vcp,
        outcome = paste0(outcomes, collapse = "; ")
    )
})
final_set_after <- lapply(merged_groups_after, `[[`, "set")
final_pip_vcp_after <- lapply(merged_groups_after, `[[`, "vcp")
final_Outcome_after <- vapply(merged_groups_after, `[[`, character(1), "outcome")
# sprintf() returns character(0) for zero groups, whereas
# paste0(prefix, 1:length(x)) would not.
names(final_set_after) <-
    names(final_pip_vcp_after) <-
    names(final_Outcome_after) <-
    sprintf("ind_set_after_between_purity_%d", seq_along(merged_groups_after))

In [23]:
# Number of sets left after the between-purity merge.
length(final_set_after)

In [None]:
# Bundle the merged sets, their per-variant PIP/VCP values and their outcome
# labels into one list and write it to disk.
merged_info <- list()
merged_info[["final_set"]] <- final_set_after
merged_info[["final_pip_vcp"]] <- final_pip_vcp_after
merged_info[["final_Outcome"]] <- final_Outcome_after
saveRDS(
    object = merged_info,
    file = "xqtl_only_APOE_all_cohorts_merged_cos_cs_after_between_purity.rds"
)