# Fine-mapping of PD-related risk loci in African/African Admixed summary statistics
* Project: Cross-ancestry PAR
* Version: R/4.4
* Status: Complete
* Last Updated: 13-FEB-2025

## Notebook overview
* Extract chromosome and base pair positions from summary statistics for selected loci
* Perform fine-mapping and save results

In [60]:
library("data.table")
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("snpStats")
library("robustbase")
library(ggplot2)
library(tidyr)
devtools::install_github("chr1swallace/coloc")
library("coloc")
library("tidyverse")
library("readr")

Skipping install of 'coloc' from a github remote, the SHA1 (fd1c0351) has not changed since last install.
  Use `force = TRUE` to force installation



In [61]:
## Load the AFR/AAC meta-GWAS summary statistics
df0 <- fread(
  "{WORK_DIR}/PD/summary_stats/AFR_AAC_metaGWAS_MAF0.05_hg38_noindels_full_with23andMe.tab",
  header = TRUE
)

## Duplicate the source columns under the short names used by the later cells.
## Names on the left are the new columns; values on the right are the columns
## they are copied from. The originals are kept.
column_aliases <- c(
  CHR = "chromosome",
  BP = "base_pair_location",
  MarkerName = "variant_id",
  StdErr = "standard_error",
  Effect = "beta",
  `P-value` = "p_value",
  ID = "rsid"
)
for (alias in names(column_aliases)) {
  data.table::set(df0, j = alias, value = df0[[column_aliases[[alias]]]])
}

In [62]:
# Preview the first rows to confirm the aliased columns (CHR, BP, MarkerName,
# StdErr, Effect, P-value, ID) were appended to the table
head(df0)

chromosome,base_pair_location,effect_allele,other_allele,beta,standard_error,effect_allele_frequency,p_value,variant_id,ref_allele,⋯,HetDf,HetPVal,rsid,CHR,BP,MarkerName,StdErr,Effect,P-value,ID
<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,⋯,<int>,<dbl>,<chr>,<chr>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<chr>
1,66861,T,C,-0.1072,0.4023,0.0724,0.7899,chr1:66861:C:T,C,⋯,0,1,rs28375825,1,66861,chr1:66861:C:T,0.4023,-0.1072,0.7899,rs28375825
1,80346,C,G,0.4608,0.3456,0.8338,0.1824,chr1:80346:C:G,C,⋯,0,1,rs376665626,1,80346,chr1:80346:C:G,0.3456,0.4608,0.1824,rs376665626
1,595259,A,G,-0.0456,0.4097,0.0597,0.9114,chr1:595259:G:A,G,⋯,0,1,rs201764041,1,595259,chr1:595259:G:A,0.4097,-0.0456,0.9114,rs201764041
1,664938,A,G,-0.1109,0.2264,0.939,0.6242,chr1:664938:A:G,A,⋯,0,1,rs536144132,1,664938,chr1:664938:A:G,0.2264,-0.1109,0.6242,rs536144132
1,665098,A,G,-0.0315,0.3587,0.1262,0.9301,chr1:665098:G:A,G,⋯,0,1,rs114979547,1,665098,chr1:665098:G:A,0.3587,-0.0315,0.9301,rs114979547
1,665115,C,G,0.0437,0.2225,0.0632,0.8443,chr1:665115:G:C,G,⋯,0,1,rs147241137,1,665115,chr1:665115:G:C,0.2225,0.0437,0.8443,rs147241137


In [63]:
## EXTRACT LOCI
# Return the rows of `sumstats` on chromosome `chrom` whose position lies
# strictly between `lower_bp` and `upper_bp` (both bounds are exclusive).
# Rows where the comparison is NA are dropped.
extract_locus <- function(sumstats, chrom, lower_bp, upper_bp) {
  inside <- sumstats[["CHR"]] == chrom &
    sumstats[["BP"]] > lower_bp &
    sumstats[["BP"]] < upper_bp
  sumstats[which(inside), ]
}

# GBA1  - chromosome 1: 155,234,452 - 155,244,627
GBA1_sumstats <- extract_locus(df0, 1, 155234452, 155244627)
# FYN   - chromosome 6: 111,660,332 - 111,873,452
FYN_sumstats <- extract_locus(df0, 6, 111660332, 111873452)
# LRRK2 - chromosome 12: 40,196,744 - 40,369,285
LRRK2_sumstats <- extract_locus(df0, 12, 40196744, 40369285)
# SNCA  - chromosome 4: 89,700,345 - 89,838,315
SNCA_sumstats <- extract_locus(df0, 4, 89700345, 89838315)

In [64]:
## Loci to process. The next cell looks up an object named "<gene>_sumstats"
## for every entry, so each name here must match one of the subsets created above.
genes <- c("GBA1", "FYN", "LRRK2", "SNCA")

In [65]:
# Write the summary statistics of every locus to
# {WORK_DIR}/PAR/<gene>_variants_afr.tab (tab-separated).
for (gene in genes) {
    # get0() returns NULL instead of raising an error when "<gene>_sumstats"
    # does not exist, so a missing locus reaches the warning branch below
    # rather than aborting the whole loop.
    gene_sumstats <- get0(paste0(gene, "_sumstats"), ifnotfound = NULL)
    if (is.data.frame(gene_sumstats)) {
        # The directory prefix already ends in "/", so no extra separator is
        # added; this yields the same path the reading cell builds.
        write_tsv(gene_sumstats, paste0("{WORK_DIR}/PAR/", gene, "_variants_afr.tab"))
    } else {
        warning(paste("No data frame found for", gene))
    }
}

In [66]:
## Loci to process. The next cell reads "<gene>_variants_afr.tab" for every
## entry and writes the reformatted "<gene>_Rizig_2023_afr.csv".
genes <- c("GBA1", "FYN", "LRRK2", "SNCA")

In [67]:
# Reshape each locus table into the five columns used for fine-mapping:
# SNP, beta, P, varbeta (= StdErr^2) and rsID.
for (gene in genes) {
    input_file <- paste0("{WORK_DIR}/PAR/", gene, "_variants_afr.tab")
    output_file <- paste0("{WORK_DIR}/PAR/", gene, "_Rizig_2023_afr.csv")

    # Load the tab-separated locus table
    locus <- fread(input_file, header = TRUE, sep = "\t")

    # Keep only the first row seen for every MarkerName
    locus <- unique(locus, by = "MarkerName")

    # Build the output columns directly, squaring the standard error to get
    # the variance of the effect estimate
    output <- locus[, .(
        SNP = MarkerName,
        beta = Effect,
        P = `P-value`,
        varbeta = StdErr^2,
        rsID = ID
    )]

    # The file keeps its .csv name but is written tab-separated
    fwrite(output, file = output_file, na = "NA", quote = FALSE, row.names = FALSE, sep = "\t")
}

In [68]:
## Loci to fine-map. The next cell reads "<gene>_Rizig_2023_afr.csv" for every
## entry and writes "<gene>_results_fine_map_Rizig_afr.csv".
genes <- c("GBA1", "FYN", "LRRK2", "SNCA")

In [69]:
# Fine-map every locus with coloc's single-causal-variant approximate Bayes
# factor method and save the per-SNP posterior probabilities next to the
# input statistics in {WORK_DIR}/PAR/<gene>_results_fine_map_Rizig_afr.csv.
for (gene in genes) {
    input_file <- paste0("{WORK_DIR}/PAR/", gene, "_Rizig_2023_afr.csv")
    # The input carries a .csv extension but is tab-separated
    output <- fread(input_file, header = TRUE, sep = "\t")

    # Nothing to fine-map when the locus window contained no variants
    if (nrow(output) == 0) {
        cat("No rows in output for gene: ", gene, ". Skipping...\n")
        next  # Skip to the next gene in the loop
    }

    N <- 197918  # 1488 PD cases vs 197918 total (1488 cases, 196430 controls - Rizig et al 2023)
    s <- 0.008   # proportion of cases (1488 / 197918, rounded)
    type <- 'cc' # case-control study

    # Dataset in the list layout expected by coloc
    dataset <- list(
        snp = output$SNP,
        beta = output$beta,
        varbeta = output$varbeta,
        N = N,
        s = s,
        type = type)

    # p1 is the prior probability that a SNP is associated with the trait
    # (it is not a p-value threshold); 1e-04 is the coloc default.
    results <- finemap.abf(
        dataset = dataset,
        p1 = 1e-04
    )

    # finemap.abf() appends one extra row with snp == "null" (the "no causal
    # variant" hypothesis). Remove it by name rather than by position, so the
    # step stays correct even when it is the only row returned.
    results <- results[results$snp != "null", , drop = FALSE]

    # Check if any per-SNP results remain
    if (nrow(results) == 0) {
        cat("No results returned for gene: ", gene, ". Skipping...\n")
        next  # Skip to the next gene in the loop
    }

    # Combine the results with the original statistics. Rows are matched on
    # the SNP identifier so a result can never be paired with another
    # variant's statistics; when both tables are already in the same order
    # this is identical to a plain column bind.
    combo <- cbind(results, output[match(results$snp, output$SNP), ])

    # Save the results to a CSV file
    final_output_file <- paste0("{WORK_DIR}/PAR/", gene, "_results_fine_map_Rizig_afr.csv")
    fwrite(combo, file = final_output_file, na = "NA", quote = FALSE, row.names = FALSE, sep = ",")
}

“minimum p value is: 0.00011795
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”
“minimum p value is: 0.0064364
If this is what you expected, this is not a problem.
If this is not as small as you expected, please check you supplied var(beta) and not sd(beta) for the varbeta argument. If that's not the explanation, please check the 02_data vignette.”


In [76]:
# Directory holding the per-gene fine-mapping results
input_directory <- "{WORK_DIR}/PAR/"  # Replace with your directory path

# Every file whose name ends in "_results_fine_map_Rizig_afr.csv"
file_list <- list.files(input_directory, pattern = "_results_fine_map_Rizig_afr.csv$", full.names = TRUE)

# Read one fine-mapping result file and return its row(s) with the highest
# SNP.PP (ties are all kept), tagged with the gene name taken from the file
# name by stripping the "_results_fine_map_Rizig_afr.csv" suffix.
read_top_snp <- function(path) {
  gene <- gsub("_results_fine_map_Rizig_afr\\.csv$", "", basename(path))
  fine_map <- read.csv(path)
  top_rows <- slice_max(fine_map, SNP.PP, n = 1)
  mutate(top_rows, gene = gene)
}

# One block of top rows per file, stacked into a single data frame
final_results <- bind_rows(lapply(file_list, read_top_snp))

# Export the combined table (written relative to the current working directory)
output_file <- "top_snp_per_gene_afr.csv"  # Desired output file name
write.csv(final_results, output_file, row.names = FALSE)

# Show the first few rows of the combined table
print(head(final_results))

          V.        z.        r.     lABF.                snp prior      SNP.PP
1 0.00345744 -3.850340 0.9204408  5.557195 chr6:111873293:G:C 1e-04 0.022360604
2 0.00346921 -7.629881 0.9201916 25.520453 chr1:155235878:G:T 1e-04 0.999999872
3 0.00304704 -2.597826 0.9292160  1.811439 chr12:40296904:C:A 1e-04 0.000621984
4 0.00310249  5.003591 0.9280206 10.301237  chr4:89837238:G:C 1e-04 0.333758744
                 SNP    beta         P    varbeta      rsID  gene
1 chr6:111873293:G:C -0.2264 1.174e-04 0.00345744 rs1057979   FYN
2 chr1:155235878:G:T -0.4494 2.397e-14 0.00346921 rs3115534  GBA1
3 chr12:40296904:C:A -0.1434 9.374e-03 0.00304704 rs7957057 LRRK2
4  chr4:89837238:G:C  0.2787 5.722e-07 0.00310249 rs2301135  SNCA
