# Fine-mapping of AD-related risk loci in Black/African American summary statistics
* Project: Cross-ancestry PAR
* Version: R/4.4
* Status: Complete
* Last Updated: 13-FEB-2025

## Notebook overview
* Extract chromosome and base pair positions from summary statistics for selected loci
* Perform fine-mapping and save results

In [1]:
library("data.table")
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("snpStats")
library("robustbase")
library(ggplot2)
library(tidyr)
#devtools::install_github("chr1swallace/coloc")
library("coloc")
library("tidyverse")
library("readr")

This is coloc version 5.2.3

── [1mAttaching core tidyverse packages[22m ────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ──────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m     masks [34mdata.table[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m      masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m       masks [34mdata.table[39m::first()
[31m✖[39m [34mlubridate[39m::[32mhour()[39m    masks [34mdata.table[39m::hour()
[31m✖[39m [34mlubridate[39m::[32misoweek()[39m masks [34mdata.table[39m::isoweek()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m         masks 

In [3]:
## Read the Kunkle et al. ADGC African American meta-analysis summary statistics.
## The first row of the file holds the column names.
df0 <- fread(
    "{WORK_DIR}/AD/summary_stats/diverse_ancestry/NG00100_Kunkle2021/Kunkle2020_ADGC_AA_META_Model1_SummaryStats.withAlleleFreqs_REFORMATTED.txt",
    header = TRUE
)

In [4]:
# Preview the first six rows to confirm the column layout
head(df0, n = 6L)

Chr,Pos,MarkerName,Effect_allele,Non_Effect_allele,Beta,SE,Pvalue,Effect_allele_Freq
<int>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
10,100000012,10:100000012,A,G,0.1734,0.1515,0.2524,0.018
10,100000122,10:100000122,A,T,0.3347,0.2294,0.1445,0.0083
10,100000354,10:100000354,T,C,0.6226,1.017,0.5404,0.9989
10,100000588,10:100000588,T,C,-0.0061,0.1033,0.9531,0.9603
10,100000625,10:100000625,A,G,0.0059,0.0415,0.8873,0.7182
10,100000645,10:100000645,A,C,0.004,0.0491,0.9343,0.823


In [5]:
# Add alias columns under the names used by the downstream steps.
# The source columns are copied, not renamed, so both names stay in df0.
column_aliases <- c(
    Effect = "Beta",
    StdErr = "SE",
    `P-value` = "Pvalue",
    CHR = "Chr",
    BP = "Pos"
)
for (alias in names(column_aliases)) {
    df0[[alias]] <- df0[[column_aliases[[alias]]]]
}

In [6]:
## Extract the variants lying within 100 kb up- and downstream of each GWAS hit
# One row per lead variant: locus label, rsID, chromosome and base-pair position.
# Loci with two lead variants (APOE, TREM2) have two rows and receive the union
# of both windows. `locus` becomes the prefix of the `<locus>_sumstats` object
# created below (HLA stands for HLA-DRB1).
lead_variants <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "
locus    rsid         chr  pos
APOE     rs429358     19   45411941
APOE     rs7412       19   45412079
BIN1     rs6733839    2    127135234
TREM2    rs143442484  6    41161469
TREM2    rs75932628   6    41161514
CD2AP    rs7767350    6    47517390
FERMT2   rs17125924   14   52924962
ABCA7    rs12151021   19   1050875
COBL     rs112404845  7    51578022
AKAP9    rs149979685  7    91732110
WWOX     rs62039712   16   79355857
ALCAM    rs2633682    3    104409208
GPC6     rs9516245    13   94159800
RBFOX1   rs79537509   16   8288401
EDEM1    rs168193     3    5302077
VRK3     rs3745495    19   50524332
IGF1R    rs570487962  15   97992685
API5     rs569584007  11   43166842
SLC10A2  rs16961023   13   103663945
CR1      rs4844610    1    207802552
INPP5D   rs10933431   2    233981912
HLA      rs78738018   6    32575406
NYAP1    rs12539172   7    100091795
PTK2B    rs73223431   8    27219987
CLU      rs9331896    8    27467686
ECHDC3   rs7920721    10   11720308
SPI1     rs3740688    11   47380340
MS4A2    rs7933202    11   59936926
PICALM   rs3851179    11   85868640
SORL1    rs11218343   11   121435587
SLC24A4  rs12881735   14   92932828
ADAM10   rs593742     15   59045774
IQCK     rs7185636    16   19808163
ACE      rs138190086  17   61538148
CASS4    rs6024870    20   54997568
ADAMTS1  rs2830500    21   28156856
SIPA1L2  rs115684722  1    232376163
WDR70    rs184179037  5    37483940
ACER3    rs115816806  11   76541840
PIK3C2G  rs75739461   12   18471546
")

# Half-width of the window around each lead variant, in base pairs
flank_bp <- 100000L

for (locus in unique(lead_variants$locus)) {
    leads <- lead_variants[lead_variants$locus == locus, ]

    # TRUE for rows of df0 inside the window of at least one lead variant.
    # Both window bounds are exclusive.
    in_window <- rep(FALSE, nrow(df0))
    for (i in seq_len(nrow(leads))) {
        in_window <- in_window | (
            df0$CHR == leads$chr[i] &
                df0$BP > leads$pos[i] - flank_bp &
                df0$BP < leads$pos[i] + flank_bp
        )
    }

    # which() discards rows whose CHR/BP comparison is NA, as subset() would
    assign(paste0(locus, "_sumstats"), df0[which(in_window), ])
}

In [7]:
## Loci to process; each label is the prefix of a `<label>_sumstats` object
genes <- c(
    "APOE", "BIN1", "TREM2", "CD2AP", "FERMT2", "ABCA7", "COBL", "AKAP9",
    "WWOX", "ALCAM", "GPC6", "RBFOX1", "EDEM1", "VRK3", "IGF1R", "API5",
    "SLC10A2", "CR1", "INPP5D", "HLA", "NYAP1", "PTK2B", "CLU", "ECHDC3",
    "SPI1", "MS4A2", "PICALM", "SORL1", "SLC24A4", "ADAM10", "IQCK", "ACE",
    "CASS4", "ADAMTS1", "SIPA1L2", "WDR70", "ACER3", "PIK3C2G"
)

In [8]:
# Write the extracted variants of every locus to
# "<gene>_ad_variants_afr.tab" (tab-separated) in the PAR directory.
for (gene in genes) {
    sumstats_name <- paste0(gene, "_sumstats")

    # get0() returns NULL when the object does not exist, so a missing locus
    # reaches the warning below instead of stopping the loop with an error.
    gene_sumstats <- get0(sumstats_name, ifnotfound = NULL)

    if (is.data.frame(gene_sumstats)) {
        write_tsv(gene_sumstats, paste0("{WORK_DIR}/PAR/", gene, "_ad_variants_afr.tab"))
    } else {
        warning("No data frame found for ", gene, call. = FALSE)
    }
}

In [9]:
## Loci to process; each label is the prefix of the per-locus file names
genes <- c(
    "APOE", "BIN1", "TREM2", "CD2AP", "FERMT2", "ABCA7", "COBL", "AKAP9",
    "WWOX", "ALCAM", "GPC6", "RBFOX1", "EDEM1", "VRK3", "IGF1R", "API5",
    "SLC10A2", "CR1", "INPP5D", "HLA", "NYAP1", "PTK2B", "CLU", "ECHDC3",
    "SPI1", "MS4A2", "PICALM", "SORL1", "SLC24A4", "ADAM10", "IQCK", "ACE",
    "CASS4", "ADAMTS1", "SIPA1L2", "WDR70", "ACER3", "PIK3C2G"
)

In [10]:
# Convert each locus file into the four-column table used for fine-mapping:
# SNP (marker ID), beta (effect), P (p-value), varbeta (squared standard error).
for (gene in genes) {
    variants_path <- paste0("{WORK_DIR}/PAR/", gene, "_ad_variants_afr.tab")
    coloc_input_path <- paste0("{WORK_DIR}/PAR/", gene, "_Kunkle_2020.csv")

    locus_variants <- fread(variants_path, header = TRUE, sep = "\t")

    # Keep only the first row seen for each MarkerName
    is_repeat <- duplicated(locus_variants$MarkerName)
    locus_variants <- locus_variants[!is_repeat, ]

    coloc_input <- data.table(
        SNP = locus_variants$MarkerName,
        beta = locus_variants$Effect,
        P = locus_variants$`P-value`,
        varbeta = locus_variants$StdErr^2
    )

    # The file is tab-delimited even though its name ends in ".csv"
    fwrite(coloc_input, file = coloc_input_path, na = "NA", quote = FALSE, row.names = FALSE, sep = "\t")
}

In [11]:
## Loci to fine-map; each label is the prefix of the per-locus file names
genes <- c(
    "APOE", "BIN1", "TREM2", "CD2AP", "FERMT2", "ABCA7", "COBL", "AKAP9",
    "WWOX", "ALCAM", "GPC6", "RBFOX1", "EDEM1", "VRK3", "IGF1R", "API5",
    "SLC10A2", "CR1", "INPP5D", "HLA", "NYAP1", "PTK2B", "CLU", "ECHDC3",
    "SPI1", "MS4A2", "PICALM", "SORL1", "SLC24A4", "ADAM10", "IQCK", "ACE",
    "CASS4", "ADAMTS1", "SIPA1L2", "WDR70", "ACER3", "PIK3C2G"
)

In [12]:
# Study-level constants shared by every locus (Kunkle et al. 2020)
N <- 8006     # total sample size: 2,784 AD cases + 5,222 controls
s <- 0.348    # proportion of samples that are cases (2,784 / 8,006)
type <- "cc"  # case-control trait

# Fine-map every locus with coloc::finemap.abf() and save, per locus, the
# fine-mapping output side by side with the input summary statistics.
for (gene in genes) {
    # Per-locus input table (SNP, beta, P, varbeta); tab-delimited despite ".csv"
    input_file <- paste0("{WORK_DIR}/PAR/", gene, "_Kunkle_2020.csv")
    output <- fread(input_file, header = TRUE, sep = "\t")

    # Check if output has 0 rows
    if (nrow(output) == 0) {
        cat("No rows in output for gene: ", gene, ". Skipping...\n")
        next  # Skip to the next gene in the loop
    }

    # A variant with a missing effect, or a missing or non-positive variance,
    # cannot be fine-mapped. Drop such rows up front so that one bad row does
    # not abort the run for all remaining loci.
    usable <- is.finite(output$beta) & is.finite(output$varbeta) & output$varbeta > 0
    if (!all(usable)) {
        cat("Dropping ", sum(!usable), " unusable variant(s) for gene: ", gene, "\n")
        output <- output[usable, ]
    }
    if (nrow(output) == 0) {
        cat("No rows in output for gene: ", gene, ". Skipping...\n")
        next
    }

    # Create dataset for fine-mapping
    dataset <- list(
        snp = as.character(output$SNP),
        beta = output$beta,
        varbeta = output$varbeta,
        N = N,
        s = s,
        type = type
    )

    # p1 is the prior probability that a SNP is associated with the trait;
    # it is not a p-value threshold.
    results <- finemap.abf(dataset = dataset, p1 = 1e-04)

    # The last row of `results` is not a SNP (presumably coloc's "null" model
    # of no causal variant -- confirm against the coloc docs), so it is removed
    # before joining. head(, -1) is also safe when only one row is returned.
    snp_results <- head(results, -1)

    # Check if results has 0 rows
    if (nrow(snp_results) == 0) {
        cat("No results returned for gene: ", gene, ". Skipping...\n")
        next  # Skip to the next gene in the loop
    }

    # Align input rows to result rows by SNP ID rather than trusting position,
    # so a dropped or reordered SNP cannot silently pair the wrong statistics.
    row_in_output <- match(as.character(snp_results$snp), output$SNP)
    if (nrow(snp_results) != nrow(output) || anyNA(row_in_output)) {
        warning(
            "Fine-mapping results do not line up with the input SNPs for gene: ",
            gene, ". Skipping...",
            call. = FALSE
        )
        next
    }

    # Combine the results with the original output
    combo <- cbind(snp_results, output[row_in_output, ])

    # Subset results where SNP.PP > 0.2
    # NOTE: `hits` is not written to disk; only the full table is saved below.
    hits <- subset(combo, SNP.PP > 0.2)

    # Save the full per-locus table to a CSV file
    final_output_file <- paste0("{WORK_DIR}/PAR/", gene, "_results_fine_map_Kunkle.csv")
    fwrite(combo, file = final_output_file, na = "NA", quote = FALSE, row.names = FALSE, sep = ",")
}

“minimum p value is: 0.0097851
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 3.0146e-05
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 0.0045868
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 0.00020415
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the

In [13]:
# Directory containing the per-gene fine-mapping result files
input_directory <- "{WORK_DIR}/PAR/"  # Replace with your directory path

# List every "<gene>_results_fine_map_Kunkle.csv" file in the directory.
# `pattern` is a regular expression, so the dot is escaped to match literally.
file_list <- list.files(input_directory, pattern = "_results_fine_map_Kunkle\\.csv$", full.names = TRUE)

if (length(file_list) == 0) {
  warning(
    "No *_results_fine_map_Kunkle.csv files found in ", input_directory,
    call. = FALSE
  )
}

# Initialize an empty list to store the results, keyed by gene
results_list <- list()

# Loop through each file
for (file in file_list) {
  # Extract the gene name from the file name
  # (remove the "_results_fine_map_Kunkle.csv" suffix)
  gene <- gsub("_results_fine_map_Kunkle\\.csv$", "", basename(file))

  # Read the CSV file
  data <- read.csv(file)

  # Select the SNP with the highest SNP.PP value; ties are all kept, so a
  # gene can contribute more than one row
  best_snp <- data %>%
    slice_max(SNP.PP, n = 1, with_ties = TRUE) %>%
    mutate(gene = gene)           # Add the gene name

  # Append to the results list
  results_list[[gene]] <- best_snp
}

# Combine all results into a single dataframe
final_results <- bind_rows(results_list)

# Export the results to a CSV file (relative path: current working directory)
output_file <- "top_snp_per_gene_afr_ad.csv"  # Desired output file name
write.csv(final_results, output_file, row.names = FALSE)

# Print the first few rows of the final results
print(head(final_results))

          V.        z.        r.    lABF.         snp prior       SNP.PP
1 0.00462400  4.520588 0.8963786 8.025565  19:1001777 1e-04 0.1264281712
2 0.00568516  3.283820 0.8755578 3.678819 11:76583856 1e-04 0.0037792522
3 0.00385641  2.479871 0.9120674 1.588906 17:61599813 1e-04 0.0005046325
4 0.00295936 -2.650735 0.9311126 1.933543 15:59042452 1e-04 0.0007055228
5 0.01020100  2.896040 0.7967969 2.544611 21:28103644 1e-04 0.0013419609
6 0.04251844 -3.117362 0.4847401 2.023797  7:91825845 1e-04 0.0007676780
          SNP    beta         P    varbeta    gene
1  19:1001777  0.3074 6.115e-06 0.00462400   ABCA7
2 11:76583856  0.2476 1.029e-03 0.00568516   ACER3
3 17:61599813  0.1540 1.313e-02 0.00385641     ACE
4 15:59042452 -0.1442 8.026e-03 0.00295936  ADAM10
5 21:28103644  0.2925 3.788e-03 0.01020100 ADAMTS1
6  7:91825845 -0.6428 1.821e-03 0.04251844   AKAP9
