# Fine-mapping of AD-related risk loci in East Asian summary statistics
* Project: Cross-ancestry PAR
* Version: R/4.4
* Status: Complete
* Last Updated: 13-FEB-2025

## Notebook overview
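* Extract the variants within 100 kb up- and downstream of the lead variant at each selected locus from the summary statistics
* Perform fine-mapping and save results
* Report the variant with the highest posterior probability at each locus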

In [35]:
library("data.table")
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("snpStats")
library("robustbase")
library(ggplot2)
library(tidyr)
#devtools::install_github("chr1swallace/coloc")
library("coloc")
library("tidyverse")
library("readr")

In [37]:
## Read the GWAS summary statistics into a data.table.
## The first line of the file holds the column names.
df0 <- fread(
    "{WORK_DIR}/AD/summary_stats/diverse_ancestry/Shigemizu_2021/NCGG_AD_GWAS2.txt",
    header = TRUE  # spelled out: `T` is an ordinary variable and can be reassigned
)

In [38]:
# Preview the first rows to check the column names and types that were read.
head(df0)

CHR,SNP,BP,A1,A2,NMISS,NMISS_A,NMISS_U,MAF_A,MAF_U,OR,SE,L95,U95,STAT,P,Info_NCGG,Info_Niigata
<int>,<chr>,<int>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,rs3094315,752566,G,A,8035,3961,4074,0.1471,0.1523,0.9691,0.04579,0.886,1.06,-0.6845,0.4937,1.0,0.999
1,rs3115860,753405,C,A,7936,3921,4015,0.1489,0.1549,0.9668,0.04571,0.884,1.057,-0.7385,0.4602,0.966,0.989
1,rs2073813,753541,A,G,7723,3797,3926,0.3741,0.3835,0.9759,0.03514,0.911,1.045,-0.6944,0.4874,0.969,0.979
1,rs3131969,754182,A,G,7724,3795,3929,0.374,0.3837,0.9746,0.03514,0.9097,1.044,-0.7322,0.464,0.969,0.978
1,rs3131968,754192,A,G,7724,3795,3929,0.374,0.3837,0.9746,0.03514,0.9097,1.044,-0.7322,0.464,0.969,0.978
1,rs4372192,876499,A,G,7959,3917,4042,0.04353,0.04181,1.038,0.08228,0.8836,1.22,0.4561,0.6483,0.98,0.907


In [39]:
# Add the columns used by the later cells (the original columns are kept):
#   Effect     - natural log of the odds ratio, log(OR)
#   P-value    - copy of P
#   StdErr     - copy of SE
#   MarkerName - "CHR:BP" identifier built from chromosome and position
df0[, `:=`(
    Effect = log(OR),
    `P-value` = P,
    StdErr = SE,
    MarkerName = paste(CHR, BP, sep = ":")
)]

In [40]:
## Extract the variants lying within 100 kb up- and downstream of each GWAS hit.
## Window bounds are exclusive: hit - 100,000 < BP < hit + 100,000.
## Each subset is stored as <LOCUS>_sumstats.

# APOE: rs429358 (chr19:45,411,941) and rs7412 (chr19:45,412,079).
# The two 100 kb windows overlap, so their union is one interval:
# 45,311,941 - 45,512,079
APOE_sumstats <- subset(df0, CHR == 19 & BP > 45311941 & BP < 45512079)
# FAM47E: chr4:77,138,460 -> 77,038,460 - 77,238,460
FAM47E_sumstats <- subset(df0, CHR == 4 & BP > 77038460 & BP < 77238460)
# PAPOLG: chr2:61,020,078 -> 60,920,078 - 61,120,078
PAPOLG_sumstats <- subset(df0, CHR == 2 & BP > 60920078 & BP < 61120078)
# RAB3C: chr5:58,089,867 -> 57,989,867 - 58,189,867
RAB3C_sumstats <- subset(df0, CHR == 5 & BP > 57989867 & BP < 58189867)
# BANK1: chr4:102,551,570 -> 102,451,570 - 102,651,570
BANK1_sumstats <- subset(df0, CHR == 4 & BP > 102451570 & BP < 102651570)
# LINC01867: chr2:52,608,484 -> 52,508,484 - 52,708,484
LINC01867_sumstats <- subset(df0, CHR == 2 & BP > 52508484 & BP < 52708484)
# LINC00899: chr22:46,440,007 -> 46,340,007 - 46,540,007
LINC00899_sumstats <- subset(df0, CHR == 22 & BP > 46340007 & BP < 46540007)
# LOC101928516: chr6:74,763,411 -> 74,663,411 - 74,863,411
LOC101928516_sumstats <- subset(df0, CHR == 6 & BP > 74663411 & BP < 74863411)
# PICALM rs3851179: chr11:85,868,640 (window midpoint) -> 85,768,640 - 85,968,640
PICALM_sumstats <- subset(df0, CHR == 11 & BP > 85768640 & BP < 85968640)
# BIN1 rs7561528: chr2:127,889,637 -> 127,789,637 - 127,989,637
BIN1_sumstats <- subset(df0, CHR == 2 & BP > 127789637 & BP < 127989637)
# CLU rs1532276: chr8:27,466,157 -> 27,366,157 - 27,566,157
CLU_sumstats <- subset(df0, CHR == 8 & BP > 27366157 & BP < 27566157)
# CR1 rs679515: chr1:207,750,568 -> 207,650,568 - 207,850,568
CR1_sumstats <- subset(df0, CHR == 1 & BP > 207650568 & BP < 207850568)
# MS4A4A rs1582763: chr11:60,021,948 -> 59,921,948 - 60,121,948
MS4A4A_sumstats <- subset(df0, CHR == 11 & BP > 59921948 & BP < 60121948)
# SORL1 rs117807585: chr11:121,456,061 -> 121,356,061 - 121,556,061
SORL1_sumstats <- subset(df0, CHR == 11 & BP > 121356061 & BP < 121556061)
# MADD rs67472071: chr11:47,391,745 -> 47,291,745 - 47,491,745
MADD_sumstats <- subset(df0, CHR == 11 & BP > 47291745 & BP < 47491745)
# HLA-DRA rs4335021: chr6:32,386,619 -> 32,286,619 - 32,486,619
HLA_sumstats <- subset(df0, CHR == 6 & BP > 32286619 & BP < 32486619)
# CD2AP rs9473119: chr6:47,450,618 -> 47,350,618 - 47,550,618
CD2AP_sumstats <- subset(df0, CHR == 6 & BP > 47350618 & BP < 47550618)
# MTSS1L rs7195572: chr16:70,701,411 -> 70,601,411 - 70,801,411
MTSS1L_sumstats <- subset(df0, CHR == 16 & BP > 70601411 & BP < 70801411)
# EPHA1 rs11762262: chr7:143,107,876 -> 143,007,876 - 143,207,876
EPHA1_sumstats <- subset(df0, CHR == 7 & BP > 143007876 & BP < 143207876)
# ADAMTS1 rs2830489: chr21:28,148,191 -> 28,048,191 - 28,248,191
ADAMTS1_sumstats <- subset(df0, CHR == 21 & BP > 28048191 & BP < 28248191)
# SLC24A4 rs11160069: chr14:92,933,893 -> 92,833,893 - 93,033,893
SLC24A4_sumstats <- subset(df0, CHR == 14 & BP > 92833893 & BP < 93033893)
# CLEC3B rs7618668 (proxy rs7626571): chr3:45,097,509 -> 44,997,509 - 45,197,509
CLEC3B_sumstats <- subset(df0, CHR == 3 & BP > 44997509 & BP < 45197509)
# EFL1 rs905450: chr15:82,444,437 -> 82,344,437 - 82,544,437
EFL1_sumstats <- subset(df0, CHR == 15 & BP > 82344437 & BP < 82544437)
# LACTB2 rs13252043: chr8:71,551,628 -> 71,451,628 - 71,651,628
LACTB2_sumstats <- subset(df0, CHR == 8 & BP > 71451628 & BP < 71651628)
# ELL rs10405479: chr19:18,563,880 -> 18,463,880 - 18,663,880
ELL_sumstats <- subset(df0, CHR == 19 & BP > 18463880 & BP < 18663880)
# FAM155A rs9520713: chr13:108,672,385 -> 108,572,385 - 108,772,385
FAM155A_sumstats <- subset(df0, CHR == 13 & BP > 108572385 & BP < 108772385)
# FERMT2 rs74825460: chr14:53,390,015 -> 53,290,015 - 53,490,015
FERMT2_sumstats <- subset(df0, CHR == 14 & BP > 53290015 & BP < 53490015)
# ZCWPW1 rs34919929: chr7:100,012,334 -> 99,912,334 - 100,112,334
ZCWPW1_sumstats <- subset(df0, CHR == 7 & BP > 99912334 & BP < 100112334)
# NTM rs9787911: chr11:131,769,402 -> 131,669,402 - 131,869,402
NTM_sumstats <- subset(df0, CHR == 11 & BP > 131669402 & BP < 131869402)
# OR2B2 rs1497525 (proxy rs1497526): chr6:27,883,269 -> 27,783,269 - 27,983,269
OR2B2_sumstats <- subset(df0, CHR == 6 & BP > 27783269 & BP < 27983269)
# C1S rs7311672: chr12:7,165,114 -> 7,065,114 - 7,265,114
C1S_sumstats <- subset(df0, CHR == 12 & BP > 7065114 & BP < 7265114)
# TSPOAP rs2526376: chr17:56,427,142 -> 56,327,142 - 56,527,142
TSPOAP_sumstats <- subset(df0, CHR == 17 & BP > 56327142 & BP < 56527142)
# TSPAN14 rs10748526: chr10:82,273,079 -> 82,173,079 - 82,373,079
TSPAN14_sumstats <- subset(df0, CHR == 10 & BP > 82173079 & BP < 82373079)

In [41]:
## Locus labels to process; each label is the prefix of a <label>_sumstats object.
genes <- c(
    "APOE", "FAM47E", "PAPOLG", "RAB3C", "BANK1", "LINC01867", "LINC00899",
    "LOC101928516", "PICALM", "BIN1", "CLU", "CR1", "MS4A4A", "SORL1", "MADD",
    "HLA", "CD2AP", "MTSS1L", "EPHA1", "ADAMTS1", "SLC24A4", "CLEC3B", "EFL1",
    "LACTB2", "ELL", "FAM155A", "FERMT2", "ZCWPW1", "NTM", "OR2B2", "C1S",
    "TSPOAP", "TSPAN14"
)

In [42]:
# Write the summary statistics of every locus to a tab-separated file named
# "<gene>_ad_variants_eas.tab" inside {WORK_DIR}/PAR/.
for (gene in genes) {
    sumstats_name <- paste0(gene, "_sumstats")

    # get() raises an error for a missing object, so existence has to be tested
    # first; otherwise the warning below could never be reached.
    if (!exists(sumstats_name) || !is.data.frame(get(sumstats_name))) {
        warning(paste("No data frame found for", gene))
        next
    }

    gene_sumstats <- get(sumstats_name)
    # The directory string already ends in "/", so no extra separator is added
    # (this matches the path the next cell reads from).
    write_tsv(gene_sumstats, paste0("{WORK_DIR}/PAR/", gene, "_ad_variants_eas.tab"))
}

In [43]:
## Locus labels to process; each label is the prefix of the per-locus file names.
genes <- c(
    "APOE", "FAM47E", "PAPOLG", "RAB3C", "BANK1", "LINC01867", "LINC00899",
    "LOC101928516", "PICALM", "BIN1", "CLU", "CR1", "MS4A4A", "SORL1", "MADD",
    "HLA", "CD2AP", "MTSS1L", "EPHA1", "ADAMTS1", "SLC24A4", "CLEC3B", "EFL1",
    "LACTB2", "ELL", "FAM155A", "FERMT2", "ZCWPW1", "NTM", "OR2B2", "C1S",
    "TSPOAP", "TSPAN14"
)

In [44]:
# Turn each per-locus extract into the four-column table consumed by the
# fine-mapping step: SNP, beta, P and varbeta (the squared standard error).
for (gene in genes) {
    locus_path <- paste0("{WORK_DIR}/PAR/", gene, "_ad_variants_eas.tab")
    finemap_path <- paste0("{WORK_DIR}/PAR/", gene, "_Shigemizu_2021.csv")

    locus_sumstats <- fread(locus_path, header = TRUE, sep = "\t")

    # Keep the first row of every MarkerName and, in the same step, pick and
    # rename the needed columns; varbeta is StdErr squared.
    finemap_input <- locus_sumstats[
        !duplicated(MarkerName),
        list(
            SNP = MarkerName,
            beta = Effect,
            P = `P-value`,
            varbeta = StdErr^2
        )
    ]

    # Note: the file carries a .csv extension but is written tab-separated.
    fwrite(finemap_input, file = finemap_path, na = "NA", quote = FALSE, row.names = FALSE, sep = "\t")
}

In [45]:
## Locus labels to fine-map; each label is the prefix of the per-locus file names.
genes <- c(
    "APOE", "FAM47E", "PAPOLG", "RAB3C", "BANK1", "LINC01867", "LINC00899",
    "LOC101928516", "PICALM", "BIN1", "CLU", "CR1", "MS4A4A", "SORL1", "MADD",
    "HLA", "CD2AP", "MTSS1L", "EPHA1", "ADAMTS1", "SLC24A4", "CLEC3B", "EFL1",
    "LACTB2", "ELL", "FAM155A", "FERMT2", "ZCWPW1", "NTM", "OR2B2", "C1S",
    "TSPOAP", "TSPAN14"
)

In [46]:
# Fine-map every locus with coloc's finemap.abf() and save, per locus, the
# fine-mapping output side by side with the input statistics.

# Study-level constants (Shigemizu et al 2021): 3,962 AD cases and
# 4,074 controls, i.e. 8,036 samples in total.
N <- 3962 + 4074  # 8,036 (the value was previously mistyped as 8006)
s <- 0.493        # proportion of cases, approximately 3,962 / 8,036
type <- "cc"      # case-control study

for (gene in genes) {
    input_file <- paste0("{WORK_DIR}/PAR/", gene, "_Shigemizu_2021.csv")
    # The input has a .csv extension but is tab-separated.
    output <- fread(input_file, header = TRUE, sep = "\t")

    # Nothing to fine-map when the locus has no variants
    if (nrow(output) == 0) {
        cat("No rows in output for gene: ", gene, ". Skipping...\n")
        next  # Skip to the next gene in the loop
    }

    # Dataset in the list format expected by coloc
    dataset <- list(
        snp = output$SNP,
        beta = output$beta,
        varbeta = output$varbeta,
        N = N,
        s = s,
        type = type
    )

    results <- finemap.abf(
        dataset = dataset,
        p1 = 1e-04  # prior probability that a SNP is associated with the trait
    )

    # Check if results has 0 rows
    if (nrow(results) == 0) {
        cat("No results returned for gene: ", gene, ". Skipping...\n")
        next  # Skip to the next gene in the loop
    }

    # The last row of the finemap.abf() output is not one of the input SNPs
    # (it is the entry for "no causal variant"), so it is dropped.
    # head(x, -1) stays correct even when only one row is present, unlike
    # 1:(nrow(x) - 1).
    snp_results <- head(results, -1)

    # cbind() pairs rows purely by position, so make sure both tables list the
    # same SNPs in the same order before combining them.
    if (!identical(as.character(snp_results$snp), as.character(output$SNP))) {
        stop("finemap.abf() rows do not match the input SNPs for gene: ", gene,
             call. = FALSE)
    }
    combo <- cbind(snp_results, output)

    # SNPs with posterior probability > 0.2; kept for interactive inspection,
    # the file written below contains all SNPs.
    hits <- subset(combo, SNP.PP > 0.2)

    # Save the combined table as a comma-separated file
    final_output_file <- paste0("{WORK_DIR}/PAR/", gene, "_results_fine_map_Shigemizu.csv")
    fwrite(combo, file = final_output_file, na = "NA", quote = FALSE, row.names = FALSE, sep = ",")
}

“minimum p value is: 1.0668e-06
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 1.7167e-06
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 2.7786e-06
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 3.29e-06
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the

In [47]:
# Collect, for every locus, the SNP(s) with the highest posterior probability
# (SNP.PP) from the per-locus fine-mapping files and write them to one CSV.

# Directory containing the per-locus fine-mapping results
input_directory <- "{WORK_DIR}/PAR/"  # Replace with your directory path

# Regular expression for the result files "<gene>_results_fine_map_Shigemizu.csv".
# The dot is escaped so that it only matches a literal "."; the same pattern
# is reused to strip the suffix and recover the gene label.
result_suffix <- "_results_fine_map_Shigemizu\\.csv$"
file_list <- list.files(input_directory, pattern = result_suffix, full.names = TRUE)

# One data frame per locus, named by gene label
results_list <- lapply(file_list, function(result_path) {
  gene_label <- sub(result_suffix, "", basename(result_path))

  read.csv(result_path) %>%
    slice_max(SNP.PP, n = 1) %>%  # ties are kept, so several rows may be returned
    mutate(gene = gene_label)     # record which locus the row belongs to
})
names(results_list) <- sub(result_suffix, "", basename(file_list))

# Combine all results into a single dataframe
final_results <- bind_rows(results_list)

# Export the results to a CSV file (written to the current working directory)
output_file <- "top_snp_per_gene_eas_ad.csv"  # Desired output file name
write.csv(final_results, output_file, row.names = FALSE)

# Print the first few rows of the final results
print(head(final_results))

           V.        z.        r.      lABF.         snp prior       SNP.PP
1 0.031364410 -2.933541 0.5605035   2.000689 21:28109583 1e-04 0.0007538447
2 0.002442336 24.697185 0.9424552 285.998125 19:45411941 1e-04 1.0000000000
3 0.022891690 -4.686539 0.6360141   6.479274 4:102551570 1e-04 0.0397722719
4 0.002139062  2.469646 0.9492380   1.404470 2:127910564 1e-04 0.0004175672
5 0.004239312 -2.445557 0.9041732   1.531210  12:7169661 1e-04 0.0004653143
6 0.002286752 -2.564193 0.9459227   1.651091  6:47380533 1e-04 0.0005290675
          SNP       beta          P     varbeta    gene
1 21:28109583 -0.5195301  3.360e-03 0.031364410 ADAMTS1
2 19:45411941  1.2205349 1.090e-134 0.002442336    APOE
3 4:102551570 -0.7090733  2.764e-06 0.022891690   BANK1
4 2:127910564  0.1142211  1.319e-02 0.002139062    BIN1
5  12:7169661 -0.1592302  1.446e-02 0.004239312     C1S
6  6:47380533 -0.1226197  1.031e-02 0.002286752   CD2AP
