# Review of fine-mapping
* Project: Cross-ancestry PAR
* Version: python/3.10
* Status: Complete
* Last Updated: 13-FEB-2025

## Notebook overview
* Combine results of fine-mapping analysis
* Add summary statistics needed for PAR calculations
* Calculate population attributable risk (PAR) for variants with posterior probability > 0.8 and save results

In [1]:
# Import packages
import os
import numpy as np
import pandas as pd
import sys
import openpyxl
import seaborn as sns
import matplotlib.pyplot as plt
import glob

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## Extract all fine map results and combine

In [None]:
# Collect every per-gene fine-mapping result file, keep the variants with
# posterior probability above 0.8, tag each row with the gene and data source
# parsed from the filename, and combine everything into `final_df`.

# Define path and filename pattern
# NOTE: `dir` shadows the builtin of the same name; the variable name is kept
# so that any other cell referring to it keeps working.
dir = '{WORK_DIR}/PAR/'
file_pattern = os.path.join(dir, '*_results_fine_map_*.csv')

# List to store dataframes
filtered_dfs = []

# Glob once and reuse the listing for both the report and the loop
files = glob.glob(file_pattern)
if not files:
    print("No files matched the pattern. Review your directory/file pattern.")
else:
    print(f"Found files: {files}")

# Loop over results files
for file in files:
    print(f"Processing file: {file}")
    df = pd.read_csv(file)

    # We only want SNPs with SNP.PP > 80%
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of `df` (avoids SettingWithCopyWarning)
    filtered_df = df[df['SNP.PP'] > 0.8].copy()

    # Extract gene, ancestry from filename:
    # first '_'-separated token is the gene, last token (minus '.csv') is the source
    base_name = os.path.basename(file)
    name_parts = base_name.split('_')
    gene_name = name_parts[0]
    ancestry = name_parts[-1].replace('.csv','')

    # Columns for gene, ancestry
    filtered_df['Gene'] = gene_name
    filtered_df['Ancestry'] = ancestry

    filtered_dfs.append(filtered_df)

# Combine all filtered dataframes into one large dataframe for processing
if filtered_dfs:
    final_df = pd.concat(filtered_dfs, ignore_index=True)
    # Save the combined dataframe to a new csv
    final_df.to_csv('finemap_results_with_highpp.csv', index=False)
    # Display the final dataframe (a bare expression inside an if-block is not
    # rendered by the notebook, so print it explicitly)
    print(final_df.head(25))
else:
    print("No filtered dataframes to combine.")
    # Keep `final_df` defined (empty, with the columns this cell relies on) so
    # the sort below does not fail with a NameError
    final_df = pd.DataFrame(columns=['SNP.PP', 'Gene', 'Ancestry'])

# Sort by ancestry and posterior probability
final_df = final_df.sort_values(by=['Ancestry', 'SNP.PP'], ascending=[True, False])

In [None]:
# At this point the 'Ancestry' column still holds the data-source label taken
# from each results filename. Translate that label into a disease and a
# human-readable ancestry, then export.

# Data sources grouped by the disease they studied
sources_by_disease = {
    'AD': ['Bellenguez', 'Kunkle', 'Shigemizu', 'Lake'],
    'PD': ['Nalls', 'afr', 'Foo', 'Loesch'],
}
disease_mapping = {
    source: disease
    for disease, sources in sources_by_disease.items()
    for source in sources
}

# Data sources grouped by ancestry
sources_by_ancestry = {
    'European': ['Bellenguez', 'Nalls'],
    'African/African Admixed': ['Kunkle', 'afr'],
    'East Asian': ['Foo', 'Shigemizu'],
    'Latino': ['Lake', 'Loesch'],
}
ancestry_mapping = {
    source: ancestry_label
    for ancestry_label, sources in sources_by_ancestry.items()
    for source in sources
}

# Derive both columns from the same source labels: Disease via lookup
# (unknown sources become NaN), Ancestry via replacement (unknown sources
# are left unchanged)
source_labels = final_df['Ancestry']
final_df['Disease'] = source_labels.map(disease_mapping)
final_df['Ancestry'] = source_labels.replace(ancestry_mapping)

# Order rows by ancestry then disease, and call the variant column MarkerName
final_df = (
    final_df
    .sort_values(by=['Ancestry', 'Disease'])
    .rename(columns={'SNP':'MarkerName'})
)

# Export results to csv
final_df.to_csv('{WORK_DIR}/PAR/results/finemap_results_with_80pp.csv')

### Read in summary statistics

In [8]:
def _read_tsv(path, **read_kwargs):
    """Read a tab-separated file whose first row holds the column names."""
    return pd.read_csv(path, sep='\t', header=0, **read_kwargs)


# AD files
ad_eur = _read_tsv(
    '{WORK_DIR}/AD/summary_stats/Bellenguez_2022/noProxy/EADB-minus-UKB_Nov2022.tsv.gz',
    low_memory=False,
)
ad_afr = _read_tsv(
    '{WORK_DIR}/AD/summary_stats/diverse_ancestry/NG00100_Kunkle2021/Kunkle2020_ADGC_AA_META_Model1_SummaryStats.withAlleleFreqs_REFORMATTED.txt',
    low_memory=False,
)
ad_lat = _read_tsv(
    '{WORK_DIR}/trans_ethnic_AD/format_stats/CarHisp_for_MRMEGA.no_multiAllelics_indels.MAF_0.01.txt',
    low_memory=False,
)
ad_eas = _read_tsv(
    '{WORK_DIR}/AD/summary_stats/diverse_ancestry/Shigemizu_2021/NCGG_AD_GWAS2.txt',
    low_memory=False,
)

# PD files
# The European and African files carry their variant identifier under a
# different header ('ID' / 'rsid'); both are renamed to 'SNP' on load.
# Note the European file is read with pandas' default low_memory setting.
pd_eur = _read_tsv(
    '{WORK_DIR}/PD/summary_stats/META5_all_with_rsid_hg38.txt',
).rename(columns={'ID':'SNP'})
pd_afr = _read_tsv(
    '{WORK_DIR}/PD/summary_stats/AFR_AAC_metaGWAS_MAF0.05_hg38_noindels_full_with23andMe.tab',
    low_memory=False,
).rename(columns={'rsid':'SNP'})
pd_lat = _read_tsv('{WORK_DIR}/Sumstat/per-cohort/AMR.final.txt', low_memory=False)
pd_eas = _read_tsv('{WORK_DIR}/Sumstat/per-cohort/EAS.final.txt', low_memory=False)

### Prepare AD summary statistics for calculations

In [9]:
# Bring the European AD summary statistics into the shared column layout.

# Split 'MarkerName' once and reuse the pieces for 'Chromosome' and 'Position'
# (the first piece has its 'chr' prefix removed). Both columns stay strings.
marker_parts = ad_eur['MarkerName'].str.split(':')
ad_eur['Chromosome'] = marker_parts.str[0].str.replace('chr', '', regex=False)
ad_eur['Position'] = marker_parts.str[1]

# Add ancestry column
# Done before the column selection so nothing is assigned into a subset of
# another frame (avoids SettingWithCopyWarning)
ad_eur['Ancestry'] = 'European'

# Keep only the columns shared by all summary-statistics frames
ad_eur = ad_eur[['MarkerName','Chromosome','Position','effect_allele','other_allele','effect_allele_frequency','beta','p_value','Ancestry']]
ad_eur.head()

Unnamed: 0,MarkerName,Chromosome,Position,effect_allele,other_allele,effect_allele_frequency,beta,p_value
0,chr11:12541586:A:G,11,12541586,A,G,0.3748,0.0007,0.94860
1,chr8:102803998:G:A,8,102803998,A,G,0.0026,-0.1579,0.19910
2,chr13:55101557:T:C,13,55101557,T,C,0.9856,-0.0252,0.56260
3,chr7:17885109:C:G,7,17885109,C,G,0.9989,0.5638,0.01901
4,chr3:97729697:A:G,3,97729697,A,G,0.0183,-0.0126,0.81340
...,...,...,...,...,...,...,...,...
21101592,chr14:32118446:G:A,14,32118446,A,G,0.0011,0.3314,0.17720
21101593,chr1:160763389:C:A,1,160763389,A,C,0.0006,0.2565,0.40880
21101594,chr6:90702181:A:T,6,90702181,A,T,0.7871,-0.0151,0.23820
21101595,chr9:17691435:C:T,9,17691435,T,C,0.0376,-0.0175,0.55210


In [11]:
# Bring the African/African Admixed AD summary statistics into the shared
# column layout.

# Rename columns
ad_afr = ad_afr.rename(columns={'Chr':'Chromosome','Pos':'Position','Effect_allele':'effect_allele','Non_Effect_allele':'other_allele','Beta':'beta','Pvalue':'p_value',
                                'Effect_allele_Freq':'effect_allele_frequency'})

# Add ancestry column
# Done before the column selection so nothing is assigned into a subset of
# another frame (avoids SettingWithCopyWarning)
ad_afr['Ancestry'] = 'African/African Admixed'

# Keep only the columns shared by all summary-statistics frames
ad_afr = ad_afr[['MarkerName','Chromosome','Position','effect_allele','other_allele','effect_allele_frequency','beta','p_value','Ancestry']]
ad_afr.head()

Unnamed: 0,MarkerName,Chromosome,Position,effect_allele,other_allele,effect_allele_frequency,beta,p_value
0,10:100000012,10,100000012,A,G,0.0180,0.1734,0.2524
1,10:100000122,10,100000122,A,T,0.0083,0.3347,0.1445
2,10:100000354,10,100000354,T,C,0.9989,0.6226,0.5404
3,10:100000588,10,100000588,T,C,0.9603,-0.0061,0.9531
4,10:100000625,10,100000625,A,G,0.7182,0.0059,0.8873
...,...,...,...,...,...,...,...,...
27724500,9:99999366,9,99999366,T,G,0.9970,0.8480,0.1425
27724501,9:99999388,9,99999388,A,C,0.9736,0.1170,0.3396
27724502,9:99999468,9,99999468,A,G,0.8633,0.0523,0.3623
27724503,9:99999608,9,99999608,T,C,0.0156,-0.1658,0.3139


In [13]:
# Bring the East Asian AD summary statistics into the shared column layout.

# Create MarkerName column as '<CHR>:<BP>'
ad_eas['MarkerName'] = ad_eas['CHR'].astype(str) + ':' + ad_eas['BP'].astype(str)

# Rename columns
# NOTE(review): 'MAF_A' is used as the effect-allele frequency — confirm that
# it is the frequency of A1 in the population the PAR should refer to.
ad_eas = ad_eas.rename(columns={'CHR':'Chromosome','BP':'Position','A1':'effect_allele','A2':'other_allele','P':'p_value',
                                'MAF_A':'effect_allele_frequency'})

# Convert the odds ratio to beta (natural log of OR)
ad_eas['beta'] = np.log(ad_eas['OR'])

# Add ancestry column
# Done before the column selection so nothing is assigned into a subset of
# another frame (avoids SettingWithCopyWarning)
ad_eas['Ancestry'] = 'East Asian'

# Keep only the columns shared by all summary-statistics frames
ad_eas = ad_eas[['MarkerName','Chromosome','Position','effect_allele','other_allele','effect_allele_frequency','beta','p_value','Ancestry']]
ad_eas.head()

Unnamed: 0,MarkerName,Chromosome,Position,effect_allele,other_allele,effect_allele_frequency,beta,p_value
0,1:752566,1,752566,G,A,0.147100,-0.031387,0.4937
1,1:753405,1,753405,C,A,0.148900,-0.033764,0.4602
2,1:753541,1,753541,A,G,0.374100,-0.024395,0.4874
3,1:754182,1,754182,A,G,0.374000,-0.025728,0.4640
4,1:754192,1,754192,A,G,0.374000,-0.025728,0.4640
...,...,...,...,...,...,...,...,...
4852952,22:51185740,22,51185740,ACATTGT,A,0.008116,-0.229162,0.2024
4852953,22:51185743,22,51185743,A,AAGC,0.008116,-0.229162,0.2024
4852954,22:51189403,22,51189403,G,A,0.027350,-0.037702,0.7084
4852955,22:51208005,22,51208005,G,GTC,0.008122,-0.177812,0.3272


In [15]:
# Bring the Latino AD summary statistics into the shared column layout.

# Rename columns
ad_lat = ad_lat.rename(columns={'MARKERNAME':'MarkerName','CHROMOSOME':'Chromosome','POSITION':'Position','EA':'effect_allele','NEA':'other_allele','P':'p_value',
                                'EAF':'effect_allele_frequency'})

# Convert the odds ratio to beta (natural log of OR)
ad_lat['beta'] = np.log(ad_lat['OR'])

# Add ancestry column
# Done before the column selection so nothing is assigned into a subset of
# another frame (avoids SettingWithCopyWarning)
ad_lat['Ancestry'] = 'Latino'

# Keep only the columns shared by all summary-statistics frames
ad_lat = ad_lat[['MarkerName','Chromosome','Position','effect_allele','other_allele','effect_allele_frequency','beta','p_value','Ancestry']]
ad_lat.head()

Unnamed: 0,MarkerName,Chromosome,Position,effect_allele,other_allele,effect_allele_frequency,beta,p_value
0,1:48824,1,48824,C,T,0.017666,-0.229158,0.431702
1,1:54490,1,54490,A,G,0.043157,-0.158201,0.421182
2,1:60351,1,60351,G,A,0.024821,-0.145668,0.550887
3,1:64931,1,64931,A,G,0.024598,-0.196565,0.426951
4,1:66861,1,66861,T,C,0.044946,-0.140637,0.442988
...,...,...,...,...,...,...,...,...
12466227,22:51242036,22,51242036,C,G,0.030188,0.092679,0.666683
12466228,22:51242137,22,51242137,A,T,0.013417,0.003723,0.992015
12466229,22:51242271,22,51242271,G,C,0.115385,0.184028,0.113002
12466230,22:51242557,22,51242557,T,A,0.063283,0.139475,0.364707


### Prepare PD summary statistics for PAR calculations

In [None]:
# Bring the European PD summary statistics into the shared column layout.

# Rename columns
# The 'SNP' identifier becomes 'MarkerName'; the file's own 'MarkerName'
# column is set aside as 'MarkerName_old' and dropped by the selection below.
pd_eur = pd_eur.rename(columns={'MarkerName':'MarkerName_old','SNP':'MarkerName','Allele1':'effect_allele','Allele2':'other_allele','Freq1':'effect_allele_frequency','Effect':'beta',
                                'P-value':'p_value','Chr':'Chromosome','BP':'Position'})

# Uppercase for allele columns
# These assignments and the ancestry one are done before the column selection
# so nothing is assigned into a subset of another frame (avoids
# SettingWithCopyWarning)
pd_eur['effect_allele'] = pd_eur['effect_allele'].str.upper()
pd_eur['other_allele'] = pd_eur['other_allele'].str.upper()

# Add ancestry column
pd_eur['Ancestry'] = 'European'

# Keep only the columns shared by all summary-statistics frames
pd_eur = pd_eur[['MarkerName','Chromosome','Position','effect_allele','other_allele','effect_allele_frequency','beta','p_value','Ancestry']]
pd_eur.head()

In [20]:
# Bring the African/African Admixed PD summary statistics into the shared
# column layout.

# Rename columns
pd_afr = pd_afr.rename(columns={'chromosome':'Chromosome','base_pair_location':'Position','variant_id':'MarkerName'})

# Add ancestry column
# Done before the column selection so nothing is assigned into a subset of
# another frame (avoids SettingWithCopyWarning)
pd_afr['Ancestry'] = 'African/African Admixed'

# Keep only the columns shared by all summary-statistics frames
pd_afr = pd_afr[['MarkerName','Chromosome','Position','effect_allele','other_allele','effect_allele_frequency','beta','p_value','Ancestry']]
pd_afr.head()

Unnamed: 0,MarkerName,Chromosome,Position,effect_allele,other_allele,effect_allele_frequency,beta,p_value
0,chr1:66861:C:T,1,66861,T,C,0.0724,-0.1072,0.789900
1,chr1:80346:C:G,1,80346,C,G,0.8338,0.4608,0.182400
2,chr1:595259:G:A,1,595259,A,G,0.0597,-0.0456,0.911400
3,chr1:664938:A:G,1,664938,A,G,0.9390,-0.1109,0.624200
4,chr1:665098:G:A,1,665098,A,G,0.1262,-0.0315,0.930100
...,...,...,...,...,...,...,...,...
9339045,chrX:156025429:T:C,X,156025429,T,C,0.9476,0.2412,0.291900
9339046,chrX:156025488:G:A,X,156025488,A,G,0.0565,0.0024,0.992000
9339047,chrX:156030815:T:G,X,156030815,T,G,0.4469,0.2225,0.085630
9339048,chrX:156030858:G:T,X,156030858,T,G,0.4391,-0.4200,0.001933


In [22]:
# Bring the East Asian PD summary statistics into the shared column layout.

# Rename columns
pd_eas = pd_eas.rename(columns={'CHROMOSOME':'Chromosome','POSITION':'Position','EA':'effect_allele','NEA':'other_allele','P':'p_value',
                                'EAF':'effect_allele_frequency','BETA':'beta','MARKERNAME':'MarkerName'})

# Add ancestry column
# Done before the column selection so nothing is assigned into a subset of
# another frame (avoids SettingWithCopyWarning)
pd_eas['Ancestry'] = 'East Asian'

# Keep only the columns shared by all summary-statistics frames
pd_eas = pd_eas[['MarkerName','Chromosome','Position','effect_allele','other_allele','effect_allele_frequency','beta','p_value','Ancestry']]
pd_eas.head()

Unnamed: 0,MarkerName,Chromosome,Position,effect_allele,other_allele,effect_allele_frequency,beta,p_value
0,chr1:794332,1,794332,A,G,0.139423,0.0013,0.96910
1,chr1:832359,1,832359,T,C,0.031250,-0.0932,0.18680
2,chr1:834056,1,834056,C,A,0.028846,-0.0766,0.32270
3,chr1:834263,1,834263,T,C,0.028846,-0.0846,0.28530
4,chr1:834956,1,834956,A,G,0.026442,-0.0704,0.36590
...,...,...,...,...,...,...,...,...
5629375,chr22:51158499,22,51158499,G,T,0.512019,-0.0029,0.90000
5629376,chr22:51164109,22,51164109,C,G,0.560096,0.0059,0.80760
5629377,chr22:51164115,22,51164115,T,C,0.562500,0.0030,0.90240
5629378,chr22:51181685,22,51181685,A,G,0.694712,0.0484,0.07563


In [24]:
# Bring the Latino PD summary statistics into the shared column layout.

# Rename columns
pd_lat = pd_lat.rename(columns={'CHROMOSOME':'Chromosome','POSITION':'Position','EA':'effect_allele','NEA':'other_allele','P':'p_value',
                                'EAF':'effect_allele_frequency','BETA':'beta','MARKERNAME':'MarkerName'})

# Add ancestry column
# Done before the column selection so nothing is assigned into a subset of
# another frame (avoids SettingWithCopyWarning)
pd_lat['Ancestry'] = 'Latino'

# Keep only the columns shared by all summary-statistics frames
pd_lat = pd_lat[['MarkerName','Chromosome','Position','effect_allele','other_allele','effect_allele_frequency','beta','p_value','Ancestry']]
pd_lat.head()

Unnamed: 0,MarkerName,Chromosome,Position,effect_allele,other_allele,effect_allele_frequency,beta,p_value
0,chr1:662622,1,662622,A,G,0.065474,-0.187575,0.371967
1,chr1:666249,1,666249,T,C,0.073317,0.266337,0.140982
2,chr1:668394,1,668394,A,AG,0.018808,-0.171700,0.726995
3,chr1:676118,1,676118,T,C,0.038932,0.044314,0.875518
4,chr1:693625,1,693625,C,T,0.021886,-0.310839,0.386173
...,...,...,...,...,...,...,...,...
8668488,chr22:51235959,22,51235959,C,T,0.085745,-0.080132,0.643497
8668489,chr22:51235979,22,51235979,A,G,0.281600,-0.046489,0.668448
8668490,chr22:51236013,22,51236013,AT,A,0.230838,0.031699,0.778325
8668491,chr22:51237063,22,51237063,C,T,0.307650,0.125151,0.241345


### Create dataframe for calculations with summary statistics

In [27]:
# Attach summary statistics (alleles, frequency, beta, p-value) to each
# fine-mapped variant.

# Concatenate the summary statistics
summary_stats = pd.concat([ad_eur,ad_afr,ad_eas,ad_lat,pd_eur,pd_afr,pd_eas,pd_lat], ignore_index=True)

# Merge final_df with summ stats on MarkerName and Ancestry
# NOTE(review): the key does not include the disease, so an AD and a PD entry
# sharing the same MarkerName and Ancestry would both match — confirm the
# MarkerName formats keep the two apart.
n_rows_before = len(final_df)
final_df = final_df.merge(summary_stats, on=['MarkerName','Ancestry'], how='left')

# A left merge keeps every variant; more rows than before means some variant
# matched several summary-statistics rows and is now duplicated
if len(final_df) > n_rows_before:
    print(f"Warning: merge grew final_df from {n_rows_before} to {len(final_df)} rows; "
          "check summary statistics for duplicated MarkerName/Ancestry pairs.")

# Drop unnecessary column
# ('beta_x' is the beta already present in final_df, 'beta_y' the one from
# the summary statistics; only the latter is kept)
final_df = final_df.drop(columns='beta_x')

# Rename column
final_df = final_df.rename(columns={'beta_y':'beta'})

# Export to csv
# Directory name corrected from 'reuslts' to 'results', matching the other exports
final_df.to_csv('{WORK_DIR}/PAR/results/80_pp_par_sumstats.csv')

### PAR calculations

In [31]:
# Compute the population attributable risk (PAR) of each variant with respect
# to its risk-increasing allele.

# Negative beta -> the other allele is the risk allele.
# One mask drives both the frequency flip and the risk-allele choice, so the
# two columns always agree (rows with beta == 0 or missing beta are not flipped).
effect_is_protective = final_df['beta'] < 0

# Negative beta -> flipped alleles, new maf = 1 - maf
final_df['eaf_risk'] = final_df['effect_allele_frequency'].mask(
    effect_is_protective, 1 - final_df['effect_allele_frequency'])

# Absolute value of beta
final_df['beta_abs'] = final_df['beta'].abs()

# Risk allele
final_df['risk_allele'] = np.where(effect_is_protective, final_df['other_allele'], final_df['effect_allele'])

# Calculate the new odds ratio with beta_abs
final_df['odds_ratio_new'] = np.exp(final_df['beta_abs'])

# Calculate the population attributable risk:
# PAR = MAF*(OR - 1) / (1 + MAF*(OR - 1))
excess_risk = final_df['eaf_risk'] * (final_df['odds_ratio_new'] - 1)
final_df['par'] = excess_risk / (1 + excess_risk)

# Replacing values -> CRHR1 is always MAPT, GBAP1 -> GBA1
final_df['Gene'] = final_df['Gene'].replace({'CRHR1':'MAPT', 'GBAP1':'GBA1'})

# Export results to csv
final_df.to_csv('{WORK_DIR}/PAR/results/80_pp_par_results.csv')