# Calculate PAR for PD-related targets
* Project: Cross-ancestry PAR analysis
* Version: Python/3.9
* Status: Complete
* Last Updated: 13-FEB-2025

## Notebook Overview
* Calculate population attributable risk for each target
* Generate table with summary statistics and PAR

In [53]:
# Import packages
import os
import glob
import numpy as np
import pandas as pd
import sys
import openpyxl
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date

### Import most recent processed data

In [None]:
# Directory that holds the processed PD data files
path = '{WORK_DIR}/PD/processed/'

# Collect every CSV in that directory, ordered newest-first by modification time
files = sorted(
    glob.glob(os.path.join(path, '*.csv')),
    key=os.path.getmtime,
    reverse=True,
)

# Load the newest CSV into `df`, or report that nothing was found
# (in the latter case `df` is not created by this cell)
if not files:
    print('No files found in the specified path.')
else:
    d = files[0]
    df = pd.read_csv(d)
    print(f'Most recent data file: {d}')
    print(df)

### Calculate population attributable risk for each target

In [55]:
# A negative beta means the effect allele is protective, so the risk allele is
# the other allele. One mask drives both the frequency flip and the allele
# swap so the two derived columns always agree with each other (previously a
# beta of exactly 0 flipped the frequency but kept the effect allele).
is_flipped = df['beta'] < 0

# Negative beta -> flipped alleles, risk-allele frequency = 1 - effect-allele frequency
df['eaf_risk'] = df['effect_allele_frequency'].mask(is_flipped, 1 - df['effect_allele_frequency'])

# Absolute value of beta (effect size expressed for the risk allele)
df['beta_abs'] = df['beta'].abs()

# Risk allele: the other allele when flipped, otherwise the effect allele
df['risk_allele'] = np.where(is_flipped, df['other_allele'], df['effect_allele'])

# Odds ratio for the risk allele, OR = exp(|beta|), so it is always >= 1
df['odds_ratio_new'] = np.exp(df['beta_abs'])

# Population attributable risk: PAR = MAF(OR - 1) / (1 + MAF(OR - 1))
excess_risk = df['eaf_risk'] * (df['odds_ratio_new'] - 1)
df['par'] = excess_risk / (1 + excess_risk)

In [56]:
# Harmonise locus labels: CRHR1 is always reported as MAPT, and GBAP1 as GBA1
df['Locus'] = df['Locus'].replace({'CRHR1': 'MAPT', 'GBAP1': 'GBA1'})

In [26]:
# European rows only
eur = df.loc[df['ancestry'] == 'European']

# Up to 20 European rows with the largest PAR (displayed as the cell output)
top_eur = eur.nlargest(20, 'par')
top_eur

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry,eaf_risk,beta_abs,risk_allele,odds_ratio_new,par
79,MAPT,rs62053943,17,45666837,T,C,-0.27,3.5840000000000003e-68,0.1552,European,0.8448,0.27,C,1.309964,0.207518
80,MAPT,rs117615688,17,45720942,A,G,-0.2324,6.711e-16,0.067,European,0.933,0.2324,G,1.261624,0.196203
21,GAK,rs873786,4,931588,T,C,-0.1731,1.793e-21,0.0988,European,0.9012,0.1731,C,1.188985,0.145528
37,HLA-DRB5,rs112485576,6,32610995,A,C,-0.1676,6.96e-28,0.163,European,0.837,0.1676,C,1.182464,0.132488
42,GS1-124K5.11,rs76949143,7,66544864,A,T,-0.1432,1.426e-08,0.0507,European,0.9493,0.1432,T,1.153961,0.127517
20,MCCC1,rs10513789,3,183042285,T,G,0.1485,1.221e-34,0.8112,European,0.8112,0.1485,T,1.160093,0.11494
28,SNCA,rs356182,4,89704960,A,G,-0.2774,3.89e-154,0.6278,European,0.3722,0.2774,G,1.319694,0.106337
74,CHD9,rs10221156,16,52935514,A,G,-0.1156,1.082e-10,0.0932,European,0.9068,0.1156,G,1.122547,0.100012
68,VPS13C,rs2251086,15,61705186,T,C,-0.1186,6.077e-18,0.1417,European,0.8583,0.1186,C,1.125919,0.097535
9,KCNS3,rs76116224,2,17966582,A,T,0.1104,1.266e-08,0.9042,European,0.9042,0.1104,A,1.116725,0.095467


In [27]:
# African/African Admixed rows only
afr = df.loc[df['ancestry'] == 'African/African Admixed']

# Up to 20 African/African Admixed rows with the largest PAR (displayed as the cell output)
top_afr = afr.nlargest(20, 'par')
top_afr

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry,eaf_risk,beta_abs,risk_allele,odds_ratio_new,par
93,RAB29,rs11557080,1,205768611,A,G,-0.3481,0.154,0.0503,African/African Admixed,0.9497,0.3481,G,1.416374,0.283375
133,GBF1,rs10748818,10,102255522,A,G,0.187,0.01233,0.8696,African/African Admixed,0.8696,0.187,A,1.205627,0.151689
105,MCCC1,rs10513789,3,183042285,T,G,0.1727,0.006314,0.8122,African/African Admixed,0.8122,0.1727,T,1.188509,0.132778
166,GBA1,rs3115534,1,155235878,T,G,-0.4494,2.397e-14,0.7816,African/African Admixed,0.2184,0.4494,G,1.567371,0.110252
148,VPS13C,rs2251086,15,61705186,T,C,-0.1192,0.01957,0.3778,African/African Admixed,0.6222,0.1192,C,1.126595,0.073016
157,FAM171A2,rs850738,17,44357262,A,G,-0.1049,0.07846,0.2905,African/African Admixed,0.7095,0.1049,G,1.1106,0.072761
160,DNAH17,rs666463,17,78429399,A,T,0.0889,0.1856,0.8391,African/African Admixed,0.8391,0.0889,A,1.092971,0.072367
112,SNCA,rs356182,4,89704960,A,G,-0.2188,2.481e-05,0.6818,African/African Admixed,0.3182,0.2188,G,1.244582,0.072207
159,BRIP1,rs61169879,17,61840005,T,C,-0.079,0.4237,0.0792,African/African Admixed,0.9208,0.079,C,1.082204,0.070367
122,FYN,rs997368,6,111922088,A,G,0.2066,0.0002744,0.3057,African/African Admixed,0.3057,0.2066,A,1.229491,0.065556


In [28]:
# East Asian rows only
ea = df.loc[df['ancestry'] == 'East Asian']

# Up to 20 East Asian rows with the largest PAR (displayed as the cell output)
top_ea = ea.nlargest(20, 'par')
top_ea

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry,eaf_risk,beta_abs,risk_allele,odds_ratio_new,par
192,HLA-DRB5,rs112485576,6,32578772,A,C,-0.1605,0.0002972,0.069711,East Asian,0.930288,0.1605,C,1.174098,0.139386
227,ASXL3,rs1941685,18,31304318,T,G,0.1539,5.9e-05,0.884615,East Asian,0.884615,0.1539,T,1.166374,0.128295
169,RAB29,rs11557080,1,205737739,A,G,0.2126,2.91e-20,0.519231,East Asian,0.519231,0.2126,A,1.23689,0.109528
215,VPS13C,rs2251086,15,61997385,C,T,0.128,5.56e-05,0.829327,East Asian,0.829327,0.128,C,1.136553,0.101727
170,ITPKB,rs4653767,1,226916078,C,T,-0.1305,3.18e-07,0.264423,East Asian,0.735577,0.1305,T,1.139398,0.093002
187,CLCN3,rs62333164,4,170583157,A,G,-0.1028,0.03004,0.060096,East Asian,0.939904,0.1028,G,1.10827,0.092364
205,DLG2,rs12283611,11,83487277,A,C,-0.1187,5.82e-06,0.269231,East Asian,0.730769,0.1187,C,1.126032,0.084333
216,SETD1A,rs11150601,16,30977799,A,G,0.092,0.0258,0.949519,East Asian,0.949519,0.092,A,1.096365,0.08383
179,MCCC1,rs10513789,3,182760073,G,T,-0.1901,1.1e-16,0.608173,East Asian,0.391827,0.1901,T,1.209371,0.075817
183,SCARB2,rs6825004,4,77110365,G,C,-0.1192,4.38e-07,0.379808,East Asian,0.620192,0.1192,C,1.126595,0.072798


In [29]:
# Latino rows only
lat = df.loc[df['ancestry'] == 'Latino']

# Up to 20 Latino rows with the largest PAR (displayed as the cell output)
top_lat = lat.nlargest(20, 'par')
top_lat

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry,eaf_risk,beta_abs,risk_allele,odds_ratio_new,par
300,MAPT,rs117615688,17,43798308,A,G,-0.74073,0.0002291157,0.045562,Latino,0.954438,0.74073,G,2.097466,0.511591
238,KCNS3,rs76116224,2,18147848,T,A,-0.359278,0.02684601,0.073345,Latino,0.926655,0.359278,A,1.432296,0.286015
256,SNCA,rs356182,4,90626111,A,G,-0.460053,2.484767e-08,0.555666,Latino,0.444334,0.460053,G,1.584159,0.206073
294,CHD9,rs10221156,16,52969426,A,G,-0.246284,0.1094036,0.075788,Latino,0.924212,0.246284,G,1.279263,0.205149
277,RNF141,rs7938782,11,10558777,G,A,-0.246127,0.0707713,0.105757,Latino,0.894243,0.246127,A,1.279062,0.199712
289,VPS13C,rs2251086,15,61997385,C,T,0.236127,0.04489795,0.855848,Latino,0.855848,0.236127,C,1.266335,0.185629
291,SETD1A,rs11150601,16,30977799,A,G,0.278549,0.0009806977,0.558559,Latino,0.558559,0.278549,A,1.321212,0.152122
252,LCORL,rs34025766,4,17968811,A,T,-0.181487,0.1131676,0.162728,Latino,0.837272,0.181487,T,1.198999,0.14282
299,MAPT,rs62053943,17,43744203,T,C,-0.157566,0.3039939,0.081547,Latino,0.918453,0.157566,C,1.170658,0.135503
284,MBNL2,rs4771268,13,97865021,C,T,0.164418,0.1597881,0.849848,Latino,0.849848,0.164418,C,1.178706,0.131849


## T-test

In [57]:
from scipy import stats
from scipy.stats import ttest_ind

In [58]:
# Sample size (N) of each study, keyed by ancestry label
# (each ancestry comes from a different study)
study_mapping = {
    'European': 1_142_025,
    'African/African Admixed': 3_645,
    'East Asian': 31_575,
    'Latino': 1_497,
}

# Attach the study sample size to every row via its ancestry label
df['N'] = df['ancestry'].map(study_mapping)

In [29]:
# All rows at the LRRK2 locus, across ancestries (displayed as the cell output)
df_lrrk2 = df.loc[df['Locus'] == 'LRRK2']
df_lrrk2

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry,eaf_risk,beta_abs,risk_allele,odds_ratio_new,par,N
57,LRRK2,rs76904798,12,40220632,T,C,0.1439,1.524e-28,0.1444,European,0.1444,0.1439,T,1.154769,0.02186,1142025
58,LRRK2,rs34637584,12,40340400,A,G,2.4289,3.61e-148,0.0015,European,0.0015,2.4289,A,11.346394,0.015282,1142025
138,LRRK2,rs76904798,12,40220632,T,C,0.0902,0.271,0.0925,African/African Admixed,0.0925,0.0902,T,1.094393,0.008656,3645
207,LRRK2,rs76904798,12,40614434,T,C,-0.02,0.707,0.045673,East Asian,0.954327,0.02,C,1.020201,0.018914,31575
280,LRRK2,rs76904798,12,40614434,T,C,0.024496,0.8114177,0.198712,Latino,0.198712,0.024496,T,1.024798,0.004904,1497


In [44]:
# Split the LRRK2 rows by ancestry
df_lrrk2_eur = df_lrrk2.loc[df_lrrk2['ancestry'] == 'European']
df_lrrk2_afr = df_lrrk2.loc[df_lrrk2['ancestry'] == 'African/African Admixed']
df_lrrk2_eas = df_lrrk2.loc[df_lrrk2['ancestry'] == 'East Asian']
df_lrrk2_lat = df_lrrk2.loc[df_lrrk2['ancestry'] == 'Latino']

# Independent two-sample t-tests of European PAR against each other ancestry.
# NOTE(review): each group holds only one or two LRRK2 rows in the data shown,
# so these tests have very few degrees of freedom; interpret with care.
for label, other in (
    ('EUR v AA: ', df_lrrk2_afr),
    ('EUR v EAS: ', df_lrrk2_eas),
    ('EUR v LAT: ', df_lrrk2_lat),
):
    print(label, stats.ttest_ind(df_lrrk2_eur['par'], other['par']))

EUR v AA:  TtestResult(statistic=1.7406513159867456, pvalue=0.3319696007159764, df=1.0)
EUR v EAS:  TtestResult(statistic=-0.060180655238289575, pvalue=0.9617339566167049, df=1.0)
EUR v LAT:  TtestResult(statistic=2.3993578304609198, pvalue=0.2513923225565754, df=1.0)


In [47]:
# All rows at the GBA1 locus, across ancestries
df_gba1 = df.loc[df['Locus'] == 'GBA1']

# Split the GBA1 rows by ancestry
df_gba1_eur = df_gba1.loc[df_gba1['ancestry'] == 'European']
df_gba1_afr = df_gba1.loc[df_gba1['ancestry'] == 'African/African Admixed']
df_gba1_eas = df_gba1.loc[df_gba1['ancestry'] == 'East Asian']
df_gba1_lat = df_gba1.loc[df_gba1['ancestry'] == 'Latino']

# Independent two-sample t-tests of European PAR against each other ancestry.
# NOTE(review): the recorded output is NaN with df=0, which suggests too few
# GBA1 rows per ancestry for the test to be defined; confirm before reporting.
for label, other in (
    ('EUR v AA: ', df_gba1_afr),
    ('EUR v EAS: ', df_gba1_eas),
    ('EUR v LAT: ', df_gba1_lat),
):
    print(label, stats.ttest_ind(df_gba1_eur['par'], other['par']))

EUR v AA:  TtestResult(statistic=nan, pvalue=nan, df=0.0)
EUR v EAS:  TtestResult(statistic=nan, pvalue=nan, df=0.0)
EUR v LAT:  TtestResult(statistic=nan, pvalue=nan, df=0.0)


In [50]:
from itertools import combinations

In [52]:
# Distinct ancestry labels (one per study), in order of first appearance
studies = df['ancestry'].unique()

# Independent two-sample t-test on PAR for every unordered pair of ancestries
results = []
for first, second in combinations(studies, 2):
    outcome = ttest_ind(
        df.loc[df['ancestry'] == first, 'par'],
        df.loc[df['ancestry'] == second, 'par'],
    )
    results.append({
        'Category1': first,
        'Category2': second,
        'T-statistic': outcome.statistic,
        'P-value': outcome.pvalue,
    })

# One row per ancestry pair
results_df = pd.DataFrame(results)

print(results_df)

                 Category1                Category2  T-statistic   P-value
0                 European  African/African Admixed     2.300592  0.022666
1                 European               East Asian     1.998342  0.047451
2                 European                   Latino    -2.232804  0.026891
3  African/African Admixed               East Asian    -0.348012  0.728354
4  African/African Admixed                   Latino    -3.492760  0.000624
5               East Asian                   Latino    -3.147543  0.002007


### Generate tables with summary statistics and PAR

In [32]:
# Stack the per-ancestry top-20 tables (no p-value filtering applied)
per_ancestry_top = [top_eur, top_afr, top_ea, top_lat]
top_all = pd.concat(per_ancestry_top)

In [35]:
# Keep only rows that are nominally significant (p < 0.05)
df_filt = df.loc[df['p_value'] < 0.05]

# Per ancestry: subset the filtered rows, then take up to 20 with the largest PAR
eur_filt = df_filt.loc[df_filt['ancestry'] == 'European']
top_eur_filt = eur_filt.nlargest(20, 'par')

afr_filt = df_filt.loc[df_filt['ancestry'] == 'African/African Admixed']
top_afr_filt = afr_filt.nlargest(20, 'par')

ea_filt = df_filt.loc[df_filt['ancestry'] == 'East Asian']
top_ea_filt = ea_filt.nlargest(20, 'par')

lat_filt = df_filt.loc[df_filt['ancestry'] == 'Latino']
top_lat_filt = lat_filt.nlargest(20, 'par')

# Stack the filtered per-ancestry top-20 tables
top_all_filt = pd.concat([
    top_eur_filt,
    top_afr_filt,
    top_ea_filt,
    top_lat_filt,
])

### Export results to csv

In [None]:
# Current date in YYYY-MM-DD.
# The notebook imports `date` (from datetime import date), not `datetime`,
# so `datetime.now()` would raise NameError; `date.today()` gives the same stamp.
today = date.today().isoformat()

# Save path
# NOTE(review): '{WORK_DIR}' is a literal placeholder here (this is not an
# f-string) — presumably it is substituted before the notebook is run; confirm.
path = '{WORK_DIR}/PD/results/'

# Tables to export, each paired with its output filename:
# full results, top 20 per ancestry, and top 20 per ancestry after the p-value filter
dfs = [
    (df, f'{today}_PD_PAR_results.csv'),
    (top_all, f'{today}_PD_PAR_results_top20.csv'),
    (top_all_filt, f'{today}_PD_PAR_results_top20_filtered.csv'),
]

# Export each dataframe to CSV at the specific path.
# The loop variable is deliberately not named `df`, so the notebook-level
# results table is not rebound to the last exported table.
for table, filename in dfs:
    full_path = os.path.join(path, filename)
    table.to_csv(full_path, index=False, header=True)
    print(f'Results exported to {full_path}')