# Prepare AD Summary Statistics
* Project: Cross-ancestry PAR
* Version: Python/3.9
* Status: Complete
* Last Updated: 13-FEB-2025

## Notebook Overview
* Calculate population attributable risk for each target
* Generate table with summary statistics and PAR

In [2]:
# Import packages
import os
import glob
import numpy as np
import pandas as pd
import sys
import openpyxl
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date

### Import most recent processed data

In [None]:
# Directory holding the processed AD summary-statistics CSVs
path = '{WORK_DIR}/AD/processed/'

# Every CSV in that directory
files = glob.glob(os.path.join(path, '*.csv'))

# Order newest-first by modification time so files[0] is the latest export
files.sort(key=os.path.getmtime, reverse=True)

# Load the most recently modified file into `df`.
# NOTE: when no CSV is found only a message is printed and `df` is left
# undefined, so the cells below cannot run until the path is corrected.
if files:
    d = files[0]
    df = pd.read_csv(d)
    print(f'Most recent file read: {d}')
    print(df)
else:
    print('No files found in the specified path.')

### Calculate population attributable risk for each target

In [4]:
# Orient every variant to its risk-increasing allele.
# A negative beta means the reported effect allele is protective, so the
# other allele is the risk allele and its frequency is 1 - EAF.
# The same mask drives both `eaf_risk` and `risk_allele`, so the two columns
# always describe the same allele (rows with beta == 0 keep the effect allele
# and its frequency; their PAR is 0 either way because OR == 1).
flipped = df['beta'] < 0

# Frequency of the risk allele
df['eaf_risk'] = df['effect_allele_frequency'].mask(flipped, 1 - df['effect_allele_frequency'])

# Effect size on the risk-allele scale (always non-negative)
df['beta_abs'] = df['beta'].abs()

# Risk allele
df['risk_allele'] = np.where(flipped, df['other_allele'], df['effect_allele'])

# Odds ratio for the risk allele: OR = exp(|beta|)
df['odds_ratio_new'] = np.exp(df['beta_abs'])

# Population attributable risk:
#   PAR = p * (OR - 1) / (1 + p * (OR - 1)),  p = risk-allele frequency
excess_risk = df['eaf_risk'] * (df['odds_ratio_new'] - 1)
df['par'] = excess_risk / (1 + excess_risk)

In [4]:
# European-ancestry variants only
eur = df.loc[df['ancestry'] == 'European']

# Up to 20 variants with the highest PAR, largest first
top_eur = eur.nlargest(20, 'par')
top_eur

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry,eaf_risk,beta_abs,risk_allele,odds_ratio_new,par
30,APOE,rs1065853,19,44909976,T,G,-0.6908,2.85e-174,0.0589,European,0.9411,0.6908,G,1.995311,0.483654
6,CASS4,rs113221226,20,56449045,A,G,-0.1264,1.89e-11,0.0807,European,0.9193,0.1264,G,1.134736,0.110212
15,HLA-DRB1;HLA-DQA1,rs35472547,6,32592593,T,G,-0.0869,4.923e-09,0.1479,European,0.8521,0.0869,G,1.090788,0.071805
8,CLU,rs1532278,8,27608798,T,C,-0.1092,1.048e-25,0.3768,European,0.6232,0.1092,C,1.115385,0.067084
29,APOE,rs1081105,19,44909698,A,C,-0.9599,3.8499999999999997e-295,0.9582,European,0.0418,0.9599,C,2.611435,0.063107
27,TSPAN14,rs7922621,10,80491788,A,C,0.0802,3.99e-10,0.8017,European,0.8017,0.0802,A,1.083504,0.062745
24,SLC24A4,rs36026988,14,92472038,T,C,0.0802,1.567e-10,0.7905,European,0.7905,0.0802,T,1.083504,0.061922
20,PICALM;EED,rs9787874,11,86149263,A,G,0.1047,3.381e-25,0.5291,European,0.5291,0.1047,A,1.110377,0.055178
19,MS4A6A,rs2278867,11,60175636,A,T,0.0875,2.783e-17,0.6034,European,0.6034,0.0875,A,1.091442,0.052291
14,EPHA1-AS1,rs11771145,7,143413669,A,G,-0.0748,1.81e-12,0.3459,European,0.6541,0.0748,G,1.077669,0.048347


In [5]:
# Black/African American-ancestry variants only
afr = df.loc[df['ancestry'] == 'Black/African American']

# Up to 20 variants with the highest PAR, largest first
top_afr = afr.nlargest(20, 'par')
top_afr

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry,eaf_risk,beta_abs,risk_allele,odds_ratio_new,par
40,APOE,rs7412,19,45412079,T,C,-0.4585,2.544e-12,0.1059,Black/African American,0.8941,0.4585,C,1.5817,0.342148
39,APOE,rs429358,19,45411941,T,C,-0.8237,2.591e-60,0.7542,Black/African American,0.2458,0.8237,C,2.278916,0.239172
38,TSPAN14,rs7922621,10,82251544,A,C,0.1577,0.03986,0.93,Black/African American,0.93,0.1577,A,1.170815,0.137081
37,TRANK1,rs9867455,3,36953424,A,T,-0.1071,0.007598,0.326,Black/African American,0.674,0.1071,T,1.113046,0.070798
34,PICALM,rs9787874,11,85860305,A,G,0.0851,0.02684,0.5445,Black/African American,0.5445,0.0851,A,1.088826,0.046134
32,CD2AP,rs7738044,6,47469273,A,G,-0.1012,0.01092,0.6669,Black/African American,0.3331,0.1012,G,1.106498,0.034259
31,BIN1,rs34779859,2,127892768,T,G,0.143,0.01698,0.2221,Black/African American,0.2221,0.143,T,1.15373,0.033016
36,SPI1,rs4434960,11,47454551,A,T,-0.1187,0.01149,0.7978,Black/African American,0.2022,0.1187,T,1.126032,0.02485
35,SCIMP/RABEP1,rs57402520,17,5139807,A,G,0.1443,0.006655,0.1479,Black/African American,0.1479,0.1443,A,1.155231,0.022443
33,MME,rs61762319,3,154801978,A,G,-0.8326,0.03729,0.9935,Black/African American,0.0065,0.8326,G,2.299289,0.008375


In [6]:
# East Asian-ancestry variants only
ea = df.loc[df['ancestry'] == 'East Asian']

# Up to 20 variants with the highest PAR, largest first
top_ea = ea.nlargest(20, 'par')
top_ea

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry,eaf_risk,beta_abs,risk_allele,odds_ratio_new,par
46,APOE,rs7412,19,45412079,T,C,-0.514666,2.545e-08,0.02701,East Asian,0.97299,0.514666,C,1.67308,0.395734
45,APOE,rs429358,19,45411941,C,T,1.220535,1.0899999999999999e-134,0.2428,East Asian,0.2428,1.220535,C,3.389,0.367108
44,SORL1,rs3781837,11,121448972,C,T,-0.250643,2.964e-10,0.2139,East Asian,0.7861,0.250643,T,1.284852,0.182954
41,CLU,rs1532278,8,27466315,T,C,-0.128515,0.0005621,0.2608,East Asian,0.7392,0.128515,C,1.137139,0.092042
43,PICALM,rs9787874,11,85860305,G,A,-0.138917,3.576e-05,0.4058,East Asian,0.5942,0.138917,A,1.149029,0.081349
42,KANSL1,rs7225002,17,44189067,A,G,0.079735,0.01649,0.5067,East Asian,0.5067,0.079735,A,1.083,0.040359


In [7]:
# Latino-ancestry variants only
ch = df.loc[df['ancestry'] == 'Latino']

# Up to 20 variants with the highest PAR, largest first
top_ch = ch.nlargest(20, 'par')
top_ch

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry,eaf_risk,beta_abs,risk_allele,odds_ratio_new,par
49,APOE,rs429358,19,45411941,C,T,0.716317,5.49416e-14,0.210868,Latino,0.210868,0.716317,C,2.04688,0.180834
48,PICALM,rs9787874,11,85860305,G,A,-0.184856,0.0162685,0.447227,Latino,0.552773,0.184856,A,1.203045,0.100912
47,ABCA7,rs12151021,19,1050874,A,G,0.184236,0.0199669,0.337433,Latino,0.337433,0.184236,A,1.2023,0.063901


## T-test

In [5]:
from scipy import stats
from scipy.stats import ttest_ind
from itertools import combinations

# Distinct ancestry labels, in order of first appearance
studies = df['ancestry'].unique()

# PAR values for each ancestry, selected once rather than once per pair
par_by_ancestry = {label: df.loc[df['ancestry'] == label, 'par'] for label in studies}

# Independent two-sample t-test on PAR for every unordered pair of ancestries.
# NOTE(review): ttest_ind is called with its defaults (equal variances assumed)
# and the p-values are not corrected for multiple comparisons - confirm this is
# intended given the very different group sizes.
results = []
for first, second in combinations(studies, 2):
    outcome = ttest_ind(par_by_ancestry[first], par_by_ancestry[second])
    results.append({
        'Category1': first,
        'Category2': second,
        'T-statistic': outcome.statistic,
        'P-value': outcome.pvalue,
    })

# One row per ancestry pair
results_df = pd.DataFrame(results)

print(results_df)

                Category1               Category2  T-statistic   P-value
0                European  Black/African American    -1.319935  0.194552
1                European              East Asian    -3.276091  0.002379
2                European                  Latino    -1.267900  0.213982
3  Black/African American              East Asian    -1.474253  0.162543
4  Black/African American                  Latino    -0.283200  0.782282
5              East Asian                  Latino     0.827194  0.435415


### Generate tables with summary statistics and PAR

In [8]:
# Stack the per-ancestry top-20 tables (no p-value filter applied)
unfiltered_tops = [top_eur, top_afr, top_ea, top_ch]
top_all = pd.concat(unfiltered_tops)

In [None]:
# Keep only variants with p < 0.05, then repeat the per-ancestry top-20
# selection on that subset
df_filt = df.loc[df['p_value'] < 0.05]
ancestry_filt = df_filt['ancestry']

# European
eur_filt = df_filt.loc[ancestry_filt == 'European']
top_eur_filt = eur_filt.nlargest(20, 'par')

# Black/African American
afr_filt = df_filt.loc[ancestry_filt == 'Black/African American']
top_afr_filt = afr_filt.nlargest(20, 'par')

# East Asian
ea_filt = df_filt.loc[ancestry_filt == 'East Asian']
top_ea_filt = ea_filt.nlargest(20, 'par')

# Latino
lat_filt = df_filt.loc[ancestry_filt == 'Latino']
top_lat_filt = lat_filt.nlargest(20, 'par')

# Combined table, in the same ancestry order as the unfiltered one
filtered_tops = [top_eur_filt, top_afr_filt, top_ea_filt, top_lat_filt]
top_all_filt = pd.concat(filtered_tops)

### Export results to csv

In [None]:
# Current date in YYYY-MM-DD.
# Uses `date`, the only name this notebook imports from the datetime module.
today = date.today().strftime('%Y-%m-%d')

# Save path
path = '{WORK_DIR}/AD/results/'

# (dataframe, filename) pairs to export
dfs = [(df, f'{today}_AD_PAR_results.csv'),
       (top_all, f'{today}_AD_PAR_results_top20.csv'),
       (top_all_filt, f'{today}_AD_PAR_results_top20_filtered.csv')]

# Export each dataframe to CSV at the save path.
# The loop variable is deliberately not called `df`, so the full results
# table stays bound to `df` after this cell runs.
for frame, filename in dfs:
    full_path = os.path.join(path, filename)
    frame.to_csv(full_path, index=False, header=True)
    print(f'Results exported to {full_path}')