# Fine-mapping of AD-related risk loci in Latino summary statistics
* Project: Cross-ancestry PAR
* Version: R/4.4
* Status: Complete
* Last Updated: 13-FEB-2025

## Notebook overview
* Extract the summary statistics within ±100 kb of the lead variant at each selected AD risk locus
* Fine-map each locus with `coloc::finemap.abf()`, save the per-locus results, and collect the top SNP per locus

In [12]:
library("data.table")
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("snpStats")
library("robustbase")
library(ggplot2)
library(tidyr)
#devtools::install_github("chr1swallace/coloc")
library("coloc")
library("tidyverse")
library("readr")

In [13]:
# Show the working directory; the final summary file is written to a relative
# path and therefore lands here.
getwd()

In [14]:
## Read the genome-wide summary statistics into df0.
# Going by the file name, this is the MR-MEGA-formatted table with
# multi-allelic variants and indels removed and a MAF >= 0.01 filter applied.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
df0 <- fread("{WORK_DIR}/trans_ethnic_AD/format_stats/CarHisp_for_MRMEGA.no_multiAllelics_indels.MAF_0.01.txt", header = TRUE)

In [15]:
# Preview the first rows to check the column names and types that were read.
head(df0)

MARKERNAME,CHROMOSOME,POSITION,EA,NEA,EAF,OR,OR_95U,OR_95L,N,P
<chr>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
1:48824,1,48824,C,T,0.0176655,0.795203,1.407862,0.4491546,2240,0.431702
1:54490,1,54490,A,G,0.0431574,0.853678,1.255188,0.580603,2240,0.421182
1:60351,1,60351,G,A,0.0248211,0.864445,1.395185,0.5356029,2240,0.550887
1:64931,1,64931,A,G,0.0245975,0.821548,1.334293,0.5058416,2240,0.426951
1:66861,1,66861,T,C,0.0449463,0.868805,1.244424,0.6065636,2240,0.442988
1:80346,1,80346,G,C,0.0677549,0.951718,1.277693,0.7089081,2240,0.741926


In [18]:
# Add the columns used by the later steps (the original columns are kept):
#   Effect     - log odds ratio
#   P-value    - copy of P
#   StdErr     - standard error of the log OR, recovered from the 95% CI
#                (CI width on the log scale divided by 2 * 1.96)
#   MarkerName, CHR, BP - aliases of MARKERNAME, CHROMOSOME, POSITION
invisible(df0[, `:=`(
    Effect = log(OR),
    `P-value` = P,
    StdErr = (log(OR_95U) - log(OR_95L)) / (2 * 1.96),
    MarkerName = MARKERNAME,
    CHR = CHROMOSOME,
    BP = POSITION
)])

In [20]:
## EXTRACT LOCI - 100KB up & downstream from GWAS hits
# Helper: rows of df0 on chromosome `chrom` whose BP lies strictly between
# (lowest lead position - flank) and (highest lead position + flank).
# `lead_bp` may hold several lead variants; the window then spans all of them.
extract_locus <- function(chrom, lead_bp, flank = 100000) {
    lower <- min(lead_bp) - flank
    upper <- max(lead_bp) + flank
    subset(df0, CHR == chrom & BP > lower & BP < upper)
}

# Each object is named <LOCUS>_sumstats so it can be looked up by name later.
# NOTE(review): ADAMTS1 is listed without an rsID while ANK3 carries rs2830489;
# the rsID may belong to ADAMTS1 - confirm the labels against the source GWAS.

# APOE: rs429358 (19:45,411,941) and rs7412 (19:45,412,079); the two 100 kb
# windows overlap, so their union is one window spanning both variants
APOE_sumstats <- extract_locus(19, c(45411941, 45412079))
# ABCA7: rs12151021, 19:1,050,874
ABCA7_sumstats <- extract_locus(19, 1050874)
# ACE: rs4311, 17:61,560,763
ACE_sumstats <- extract_locus(17, 61560763)
# ADAMTS1: 21:28,148,191
ADAMTS1_sumstats <- extract_locus(21, 28148191)
# ANK3: rs2830489, 10:61,784,928
ANK3_sumstats <- extract_locus(10, 61784928)
# APH1B: rs117618017, 15:63,569,902
APH1B_sumstats <- extract_locus(15, 63569902)
# APP: rs2154481, 21:27,473,875
APP_sumstats <- extract_locus(21, 27473875)
# BIN1: rs6733839, 2:127,892,810
BIN1_sumstats <- extract_locus(2, 127892810)
# CLNK/HS3ST1: rs10939105, 4:11,023,682
CLNK_sumstats <- extract_locus(4, 11023682)
# CLU: rs1532278, 8:27,466,315
CLU_sumstats <- extract_locus(8, 27466315)
# CR1: rs679515, 1:207,750,568
CR1_sumstats <- extract_locus(1, 207750568)
# CTSB: rs1065712, 8:11,702,122
CTSB_sumstats <- extract_locus(8, 11702122)
# DOC2A: rs1140239, 16:30,021,402
DOC2A_sumstats <- extract_locus(16, 30021402)
# FAM157C/PRDM7: rs56407236, 16:90,170,095
FAM157C_sumstats <- extract_locus(16, 90170095)
# FBXO33: rs74745468, 14:39,887,676
FBXO33_sumstats <- extract_locus(14, 39887676)
# FERMT2: rs2025632, 14:53,294,319
FERMT2_sumstats <- extract_locus(14, 53294319)
# GRN: rs5848, 17:42,430,244
GRN_sumstats <- extract_locus(17, 42430244)
# HS3ST5: rs9374457, 6:114,679,925
HS3ST5_sumstats <- extract_locus(6, 114679925)
# IDUA: rs4690221, 4:983,809
IDUA_sumstats <- extract_locus(4, 983809)
# IGH gene cluster: rs74093831, 14:106,200,003
IGH_sumstats <- extract_locus(14, 106200003)
# KAT8: rs78924645, 16:31,154,358
KAT8_sumstats <- extract_locus(16, 31154358)
# LILRB2: rs12984029, 19:54,769,366
LILRB2_sumstats <- extract_locus(19, 54769366)
# MAF: rs434626, 16:79,606,520
MAF_sumstats <- extract_locus(16, 79606520)
# MS4A gene cluster: rs7232, 11:59,940,599
MS4A_sumstats <- extract_locus(11, 59940599)
# PICALM: rs3851179, 11:85,868,640
PICALM_sumstats <- extract_locus(11, 85868640)
# PLCG2: rs9931998, 16:81,772,536
PLCG2_sumstats <- extract_locus(16, 81772536)
# PLEKHA1: rs2292626, 10:124,186,714
PLEKHA1_sumstats <- extract_locus(10, 124186714)
# RASGEF1C: rs113706587, 5:179,628,150
RASGEF1C_sumstats <- extract_locus(5, 179628150)
# REXO1: rs732310, 19:1,833,338
REXO1_sumstats <- extract_locus(19, 1833338)
# SCIMP/RABEP1: rs57402520, 17:5,139,807
SCIMP_sumstats <- extract_locus(17, 5139807)
# SHARPIN: rs34173062, 8:145,158,607
SHARPIN_sumstats <- extract_locus(8, 145158607)
# SLC24A4: rs12590654, 14:92,938,855
SLC24A4_sumstats <- extract_locus(14, 92938855)
# SNX31: rs1693551, 8:101,675,584
SNX31_sumstats <- extract_locus(8, 101675584)
# SORL1: rs117807585, 11:121,456,061
SORL1_sumstats <- extract_locus(11, 121456061)
# TRANK1: rs9867455, 3:36,953,424
TRANK1_sumstats <- extract_locus(3, 36953424)
# TSPOAP1: rs2526378, 17:56,404,349
TSPOAP1_sumstats <- extract_locus(17, 56404349)
# UMAD1: rs6943429, 7:785,694
UMAD1_sumstats <- extract_locus(7, 785694)
# WDR12/ICA1L: rs149163995, 2:203,777,226
WDR12_sumstats <- extract_locus(2, 203777226)

In [21]:
## Loci to export; each name must have a matching <name>_sumstats object
genes <- c(
    "APOE", "ABCA7", "ACE", "ADAMTS1", "ANK3", "APH1B", "APP", "BIN1",
    "CLNK", "CLU", "CR1", "CTSB", "DOC2A", "FAM157C", "FBXO33", "FERMT2",
    "GRN", "HS3ST5", "IDUA", "IGH", "KAT8", "LILRB2", "MAF", "MS4A",
    "PICALM", "PLCG2", "PLEKHA1", "RASGEF1C", "REXO1", "SCIMP", "SHARPIN",
    "SLC24A4", "SNX31", "SORL1", "TRANK1", "TSPOAP1", "UMAD1", "WDR12"
)

In [22]:
# Write each locus' summary statistics to a tab-separated file
# {WORK_DIR}/PAR/<gene>_ad_variants_lat.tab.
for (gene in genes) {
    # get0() returns NULL when no object called <gene>_sumstats exists, so a
    # missing locus triggers the warning below instead of aborting the whole loop
    # (plain get() would raise an error before the check could run).
    gene_sumstats <- get0(paste0(gene, "_sumstats"), ifnotfound = NULL)
    if (is.data.frame(gene_sumstats)) {
        # Single "/" after PAR so the path matches the one used when the file is read back.
        write_tsv(gene_sumstats, paste0("{WORK_DIR}/PAR/", gene, "_ad_variants_lat.tab"))
    } else {
        warning(paste("No data frame found for", gene))
    }
}

In [23]:
## Loci to reformat; each name must have a <name>_ad_variants_lat.tab file
genes <- c(
    "APOE", "ABCA7", "ACE", "ADAMTS1", "ANK3", "APH1B", "APP", "BIN1",
    "CLNK", "CLU", "CR1", "CTSB", "DOC2A", "FAM157C", "FBXO33", "FERMT2",
    "GRN", "HS3ST5", "IDUA", "IGH", "KAT8", "LILRB2", "MAF", "MS4A",
    "PICALM", "PLCG2", "PLEKHA1", "RASGEF1C", "REXO1", "SCIMP", "SHARPIN",
    "SLC24A4", "SNX31", "SORL1", "TRANK1", "TSPOAP1", "UMAD1", "WDR12"
)

In [24]:
# Convert each locus table into the four columns needed for fine-mapping
# (SNP, beta, P, varbeta) and save it as {WORK_DIR}/PAR/<gene>_Lake_2023.csv.
# Despite the .csv extension the file is tab-separated.
for (gene in genes) {
    input_file <- paste0("{WORK_DIR}/PAR/", gene, "_ad_variants_lat.tab")
    output_file <- paste0("{WORK_DIR}/PAR/", gene, "_Lake_2023.csv")

    locus_stats <- fread(input_file, header = TRUE, sep = "\t")

    # Keep only the first row for every MarkerName
    locus_stats <- unique(locus_stats, by = "MarkerName")

    # Pick and rename the columns in one step; varbeta is the squared standard error
    output <- locus_stats[, .(
        SNP = MarkerName,
        beta = Effect,
        P = `P-value`,
        varbeta = StdErr^2
    )]

    fwrite(output, file = output_file, na = "NA", quote = FALSE, row.names = FALSE, sep = "\t")
}

In [25]:
## Loci to fine-map; each name must have a <name>_Lake_2023.csv file
genes <- c(
    "APOE", "ABCA7", "ACE", "ADAMTS1", "ANK3", "APH1B", "APP", "BIN1",
    "CLNK", "CLU", "CR1", "CTSB", "DOC2A", "FAM157C", "FBXO33", "FERMT2",
    "GRN", "HS3ST5", "IDUA", "IGH", "KAT8", "LILRB2", "MAF", "MS4A",
    "PICALM", "PLCG2", "PLEKHA1", "RASGEF1C", "REXO1", "SCIMP", "SHARPIN",
    "SLC24A4", "SNX31", "SORL1", "TRANK1", "TSPOAP1", "UMAD1", "WDR12"
)

In [26]:
# Fine-map every locus with coloc::finemap.abf() and write the per-SNP results,
# joined to the input columns, to {WORK_DIR}/PAR/<gene>_results_fine_map_Lake.csv.
for (gene in genes) {
    input_file <- paste0("{WORK_DIR}/PAR/", gene, "_Lake_2023.csv")
    output <- fread(input_file, header = TRUE, sep = "\t")

    # Nothing to fine-map when the locus window contained no variants
    if (nrow(output) == 0) {
        cat("No rows in output for gene: ", gene, ". Skipping...\n")
        next  # Skip to the next gene in the loop
    }

    SNP <- output$SNP
    beta <- output$beta
    varbeta <- output$varbeta
    # Study-level inputs for the case-control ('cc') model
    N <- 2274  # total sample size: 1,095 cases + 1,179 controls (Lake et al 2023)
    # NOTE(review): the N column in the head(df0) output reads 2240 - confirm that
    # 2274 is the sample size that matches these summary statistics.
    s <- 0.482  # proportion of cases (1095 / 2274)
    type <- 'cc'

    # Per-SNP vectors plus study-level scalars, in the list layout coloc expects
    dataset <- list(
        snp = SNP,
        beta = beta,
        varbeta = varbeta,
        N = N,
        s = s,
        type = type)

    # p1 is the prior probability that a SNP is associated with the trait
    # (it is not a p-value threshold).
    results <- finemap.abf(
        dataset = dataset,
        p1 = 1e-04
    )

    # Check if results has 0 rows
    if (nrow(results) == 0) {
        cat("No results returned for gene: ", gene, ". Skipping...\n")
        next  # Skip to the next gene in the loop
    }

    # The last row of the finemap.abf() result is the "null" (no causal SNP)
    # entry; head(, -1L) drops it and stays safe when only one row is present,
    # unlike 1:(nrow(results) - 1).
    snp_results <- head(results, -1L)

    # Align the input rows to the result rows by SNP id rather than trusting
    # that both tables are in the same order.
    row_idx <- match(snp_results$snp, output$SNP)
    if (anyNA(row_idx)) {
        stop("finemap.abf() returned SNPs that are absent from the input for ", gene,
             call. = FALSE)
    }
    combo <- cbind(snp_results, output[row_idx, ])

    # SNPs with posterior probability > 0.2; kept in the session for inspection
    # only - this subset is not written to disk.
    hits <- subset(combo, SNP.PP > 0.2)

    # Save the full per-SNP table (comma-separated)
    final_output_file <- paste0("{WORK_DIR}/PAR/", gene, "_results_fine_map_Lake.csv")
    fwrite(combo, file = final_output_file, na = "NA", quote = FALSE, row.names = FALSE, sep = ",")
}

“minimum p value is: 7.8536e-05
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 0.029846
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 0.0010264
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 0.0012185
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the v

In [27]:
# Folder that holds the per-gene fine-mapping tables
input_directory <- "{WORK_DIR}/PAR/"  # Replace with your directory path

# Full paths of every file whose name ends in "_results_fine_map_Lake.csv"
file_list <- list.files(input_directory, pattern = "_results_fine_map_Lake.csv$", full.names = TRUE)

# Gene label for each file: the file name with the suffix stripped
gene_labels <- sub("_results_fine_map_Lake\\.csv$", "", basename(file_list))

# For each file keep the row(s) with the highest SNP.PP (ties are all kept)
# and tag them with the gene label; one list element per gene.
results_list <- Map(
  function(path, label) {
    fine_map <- read.csv(path)
    top_rows <- slice_max(fine_map, SNP.PP, n = 1)
    mutate(top_rows, gene = label)
  },
  file_list,
  gene_labels
)
names(results_list) <- gene_labels

# Stack the per-gene rows into one data frame
final_results <- bind_rows(results_list)

# Write the summary table (relative path, so it goes to the working directory)
output_file <- "top_snp_per_gene_lat_ad.csv"  # Desired output file name
write.csv(final_results, output_file, row.names = FALSE)

# Show the first few rows of the summary
print(head(final_results))

           V.        z.        r.     lABF.         snp prior       SNP.PP
1 0.042194911  3.948823 0.4866481  3.460805  19:1100550 1e-04 0.0031912971
2 0.012879299  2.172129 0.7564397  1.078301 17:61586471 1e-04 0.0002970457
3 0.011601875 -2.563746 0.7751656  1.801306 21:28123425 1e-04 0.0006158356
4 0.011030671 -3.234525 0.7838423  3.334466 10:61879846 1e-04 0.0028022552
5 0.007990143 -2.280541 0.8335045  1.271080 15:63657737 1e-04 0.0003611486
6 0.009074449  7.519604 0.8150881 22.200412 19:45411941 1e-04 0.9999355772
          SNP       beta           P     varbeta    gene
1  19:1100550  0.8111435 7.85388e-05 0.042194911   ABCA7
2 17:61586471  0.2465085 2.98485e-02 0.012879299     ACE
3 21:28123425 -0.2761462 1.03554e-02 0.011601875 ADAMTS1
4 10:61879846 -0.3397125 1.21843e-03 0.011030671    ANK3
5 15:63657737 -0.2038521 2.25757e-02 0.007990143   APH1B
6 19:45411941  0.7163167 5.49416e-14 0.009074449    APOE
