# Fine-mapping of PD-related risk loci in Latino summary statistics
* Project: Cross-ancestry PAR
* Version: R/4.4
* Status: Complete
* Last Updated: 13-FEB-2025

## Notebook overview
* Extract chromosome and base pair positions from summary statistics for selected loci
* Perform fine-mapping and save results

In [1]:
library("data.table")
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("snpStats")
library("robustbase")
library(ggplot2)
library(tidyr)
devtools::install_github("chr1swallace/coloc")
library("coloc")
library("tidyverse")
library("readr")

Skipping install of 'coloc' from a github remote, the SHA1 (fd1c0351) has not changed since last install.
  Use `force = TRUE` to force installation

This is coloc version 5.2.3

── [1mAttaching core tidyverse packages[22m ────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ──────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m     masks [34mdata.table[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m      masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m       masks [34mdata.table[39m::first()
[31m✖[39m [34mlubridate[39m::[32mhour()[39m    masks [34md

In [2]:
## Read the AMR (Latino) per-cohort summary statistics; the first line of the
## file holds the column names.
df0 <- fread("{WORK_DIR}/Sumstat/per-cohort/AMR.final.txt", header = TRUE)
# Preview the first rows to confirm the expected columns were parsed
head(df0)

MARKERNAME,CHROMOSOME,POSITION,EA,NEA,EAF,BETA,SE,OR,OR_95U,OR_95L,N,NMISS,P
<chr>,<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>
chr1:662622,1,662622,A,G,0.065474,-0.1875751,0.2100987,0.8289669,1.251344,0.5491583,1481,1487.856,0.3719668
chr1:666249,1,666249,T,C,0.07331668,0.26633693,0.1809179,1.3051747,1.860669,0.9155208,1481,1487.856,0.1409824
chr1:668394,1,668394,A,AG,0.01880756,-0.17170011,0.4917997,0.8422317,2.208309,0.3212205,1481,1487.856,0.7269946
chr1:676118,1,676118,T,C,0.03893248,0.04431366,0.2828777,1.0453102,1.81985,0.6004195,1481,1487.856,0.8755183
chr1:693625,1,693625,C,T,0.02188589,-0.31083938,0.3586972,0.7328316,1.480244,0.3628064,1481,1487.856,0.3861729
chr1:693731,1,693731,G,A,0.06359318,-0.17158118,0.2129181,0.8423319,1.278565,0.554937,1481,1487.856,0.4203263


In [3]:
# Add aliases under the column names the later cells rely on
# (MarkerName, StdErr, Effect, P-value, CHR, BP). The original upper-case
# columns are kept alongside the new ones.
df0 <- df0 %>%
    mutate(
        MarkerName = MARKERNAME,
        StdErr = SE,
        Effect = BETA,
        `P-value` = P,
        CHR = CHROMOSOME,
        BP = POSITION
    )

In [14]:
## EXTRACT CHRS
# Return the rows of `sumstats` located on chromosome `chrom` whose base-pair
# position lies strictly between `bp_lo` and `bp_hi` (both bounds excluded).
extract_locus <- function(sumstats, chrom, bp_lo, bp_hi) {
    subset(sumstats, CHR == chrom & BP > bp_lo & BP < bp_hi)
}

# GBA1: chr1:155,234,452-155,244,627 (no padding added)
GBA1_sumstats <- extract_locus(df0, 1, 155234452, 155244627)
# LRRK2: chr12:40,340,400 +/- 100 kb
LRRK2_sumstats <- extract_locus(df0, 12, 40240400, 40440400)
# SNCA: chr4:89,722,606 +/- 100 kb
SNCA_sumstats <- extract_locus(df0, 4, 89622606, 89822606)
# MAPT: chr17:45,974,480 +/- 100 kb
MAPT_sumstats <- extract_locus(df0, 17, 45874480, 46074480)

In [15]:
## Run for genes
# Loci to export; each entry is looked up as a `<gene>_sumstats` object,
# and entries are processed in this order.
genes <- c("GBA1", "MAPT", "LRRK2", "SNCA")

In [16]:
# Write the extracted summary statistics of every locus to a tab-separated
# file named <gene>_variants_lat.tab.
for (gene in genes) {
    # get0() returns NULL for an undefined name instead of raising an error
    # (as get() would), so a missing locus reaches the warning branch below.
    gene_sumstats <- get0(paste0(gene, "_sumstats"))

    if (is.data.frame(gene_sumstats)) {
        # The directory string already ends in "/", so no extra separator
        # is inserted before the file name.
        write_tsv(gene_sumstats, paste0("{WORK_DIR}/PAR/", gene, "_variants_lat.tab"))
    } else {
        warning(paste("No data frame found for", gene))
    }
}

In [17]:
## Run for genes
# Loci whose `<gene>_variants_lat.tab` files are converted to the
# SNP / beta / P / varbeta layout.
genes <- c("GBA1", "MAPT", "LRRK2", "SNCA")

In [18]:
# Turn each per-locus variant file into the four-column table
# (SNP, beta, P, varbeta) that the fine-mapping step reads.
for (gene in genes) {
    input_file <- paste0("{WORK_DIR}/PAR/", gene, "_variants_lat.tab")
    output_file <- paste0("{WORK_DIR}/PAR/", gene, "_Loesch_2021.csv")

    # Load the tab-separated variants of this locus
    locus_variants <- fread(input_file, header = TRUE, sep = "\t")

    output <- locus_variants %>%
        # Keep only the first row seen for each MarkerName
        filter(!duplicated(MarkerName)) %>%
        # varbeta is the variance of the effect, i.e. the squared standard error
        transmute(
            SNP = MarkerName,
            beta = Effect,
            P = `P-value`,
            varbeta = StdErr^2
        )

    # Despite the .csv extension the file is written tab-separated
    fwrite(output, file = output_file, sep = "\t", na = "NA", quote = FALSE, row.names = FALSE)
}

In [19]:
## Run for genes
# Loci to fine-map; one `<gene>_Loesch_2021.csv` input is read per entry.
genes <- c("GBA1", "MAPT", "LRRK2", "SNCA")

In [20]:
# Fine-map every locus with finemap.abf() and save the per-SNP results next to
# the input summary statistics in <gene>_results_fine_map_Loesch.csv.
for (gene in genes) {
    input_file <- paste0("{WORK_DIR}/PAR/", gene, "_Loesch_2021.csv")
    output <- fread(input_file, header = TRUE, sep = "\t")

    # Check if output has 0 rows
    if (nrow(output) == 0) {
        cat("No rows in output for gene: ", gene, ". Skipping...\n")
        next  # Skip to the next gene in the loop
    }

    # Case-control dataset for fine-mapping.
    #   N    total sample size: 807 cases + 690 controls (Loesch et al 2021)
    #   s    proportion of samples that are cases (807 / 1497, rounded)
    #   type "cc" marks a case-control trait
    # The columns read by fread() are already plain vectors, so they are
    # passed through as-is.
    dataset <- list(
        snp = output$SNP,
        beta = output$beta,
        varbeta = output$varbeta,
        N = 1497,
        s = 0.539,
        type = "cc"
    )

    # p1 is the prior probability that a single SNP is causal; it is not a
    # p-value threshold.
    results <- finemap.abf(dataset = dataset, p1 = 1e-04)

    # finemap.abf() returns one extra trailing row after the per-SNP rows
    # (the no-causal-variant entry), so drop the last row. head(., -1L) also
    # behaves when that trailing row is the only one present.
    snp_results <- head(results, -1L)

    # Check if any per-SNP results remain
    if (nrow(snp_results) == 0) {
        cat("No results returned for gene: ", gene, ". Skipping...\n")
        next  # Skip to the next gene in the loop
    }

    # cbind() pairs rows purely by position, so confirm that both tables list
    # the same SNPs in the same order before combining them.
    stopifnot(identical(as.character(snp_results$snp), as.character(output$SNP)))

    # Combine the results with the original output
    combo <- cbind(snp_results, output)

    # Save the results to a CSV file
    final_output_file <- paste0("{WORK_DIR}/PAR/", gene, "_results_fine_map_Loesch.csv")
    fwrite(combo, file = final_output_file, na = "NA", quote = FALSE, row.names = FALSE, sep = ",")
}

“minimum p value is: 0.45266
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 0.012797
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 0.0052142
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 0.0020453
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varb

In [21]:
# Define the directory containing the CSV files
input_directory <- "{WORK_DIR}/PAR/"  # Replace with your directory path

# List every "<gene>_results_fine_map_Loesch.csv" file in the directory.
# `pattern` is a regular expression, so the dot is escaped to match a literal
# "." rather than any character. Every matching file in the directory is
# picked up, whichever run produced it.
file_list <- list.files(
  input_directory,
  pattern = "_results_fine_map_Loesch\\.csv$",
  full.names = TRUE
)

# Initialize an empty list to store the results
results_list <- list()

# Loop through each file
for (result_file in file_list) {
  # Extract the gene name from the file name
  # (remove the "_results_fine_map_Loesch.csv" suffix)
  gene_name <- gsub("_results_fine_map_Loesch\\.csv$", "", basename(result_file))

  # Read the CSV file
  fine_map <- read.csv(result_file)

  # Select the SNP with the highest SNP.PP value; ties yield several rows
  best_snp <- fine_map %>%
    slice_max(SNP.PP, n = 1) %>%
    mutate(gene = gene_name)  # Add the gene name

  # Append to the results list
  results_list[[gene_name]] <- best_snp
}

# Combine all results into a single dataframe
final_results <- bind_rows(results_list)

# Export the results to a CSV file (relative path: current working directory)
output_file <- "top_snp_per_gene_lat.csv"  # Desired output file name
write.csv(final_results, output_file, row.names = FALSE)

# Print the first few rows of the final results
print(head(final_results))

          V.         z.        r.       lABF.            snp prior       SNP.PP
1 0.01595388  2.5630248 0.7148744  1.72062664 chr6:111871411 1e-04 5.596847e-04
2 0.23006075  0.1466591 0.1481148 -0.07855887 chr1:155237103 1e-04 9.250008e-05
3 0.03281979  2.7934910 0.5493012  1.74478332 chr12:40385910 1e-04 5.800107e-04
4 0.01035702 -2.3988447 0.7943282  1.49472645 chr17:45874509 1e-04 4.533845e-04
5 0.02563770  2.4515184 0.6094059  1.36120403  chr4:89667591 1e-04 3.928434e-04
             SNP       beta           P    varbeta  gene
1 chr6:111871411  0.3237323 0.010376463 0.01595388   FYN
2 chr1:155237103  0.0703445 0.883401139 0.23006075  GBA1
3 chr12:40385910  0.5060753 0.005214248 0.03281979 LRRK2
4 chr17:45874509 -0.2441291 0.016446890 0.01035702  MAPT
5  chr4:89667591  0.3925316 0.014225491 0.02563770  SNCA
