# Prepare AD Summary Statistics
* Project: Cross-ancestry PAR analysis
* Version: Python/3.12
* Status: COMPLETE
* Last Updated: 20-OCT-2025

## Notebook Overview
* Import datasets for each ancestry
* Clean and filter p < 0.05
* Select top hits for each ancestry and generate dataset for calculations

In [1]:
# Import packages
from datetime import datetime
import os
import glob
import numpy as np
import pandas as pd
import sys
import openpyxl
import seaborn as sns
import matplotlib.pyplot as plt

### European data from Bellenguez
Annotated differently from top hits summary stats, so matching chromosome/position will not work. Will need to annotate using ANNOVAR before merging with top hits.

In [2]:
# Without proxies
df_eur = pd.read_csv('{WORK_DIR}/noProxy/EADB-minus-UKB_Nov2022.tsv.gz', sep='\t', header=0)
df_eur.head()

Unnamed: 0,MarkerName,p_value,effect_allele,other_allele,effect_allele_frequency,beta,standard_error,n_cases,n_controls,het_isq,het_pvalue
0,chr11:12541586:A:G,0.9486,A,G,0.3748,0.0007,0.0106,36659,63137,32.7,0.1371
1,chr8:102803998:G:A,0.1991,A,G,0.0026,-0.1579,0.123,32228,42115,0.0,0.7297
2,chr13:55101557:T:C,0.5626,T,C,0.9856,-0.0252,0.0436,36227,62057,9.0,0.3599
3,chr7:17885109:C:G,0.01901,C,G,0.9989,0.5638,0.2404,23331,28992,0.0,0.6406
4,chr3:97729697:A:G,0.8134,A,G,0.0183,-0.0126,0.0532,36131,56626,0.0,0.9542


In [3]:
eur = df_eur[df_eur['p_value'] < 0.00000005].drop_duplicates(subset=['MarkerName'])

# Split the MarkerName column into different columns
eur[['Chromosome', 'Position', 'Ref', 'Alt']] = eur['MarkerName'].str.split(':', expand=True)
eur['Chromosome'] = eur['Chromosome'].str.replace('chr', '')

# Prep annovar input 
df_annovar_eur = pd.DataFrame()
df_annovar_eur['Chr'] = eur['Chromosome']
df_annovar_eur['Start'] = eur['Position']
df_annovar_eur['End'] = eur['Position']
df_annovar_eur['Ref'] = eur['Ref']
df_annovar_eur['Alt'] = eur['Alt']
df_annovar_eur

Unnamed: 0,Chr,Start,End,Ref,Alt
1591,6,32689933,32689933,A,G
6824,19,44946776,44946776,C,T
14557,11,60327769,60327769,T,C
23540,11,86069406,86069406,T,C
27778,19,44749457,44749457,ATAT,A
...,...,...,...,...,...
20999262,19,44829875,44829875,T,G
21069925,19,45134987,45134987,C,T
21072633,8,27603975,27603975,C,A
21085415,17,63476980,63476980,C,T


In [5]:
# Export ANNOVAR input file
df_annovar_eur.to_csv('{WORK_DIR}/EUR_no_proxy_for_annovar.txt', sep=' ', index=False, header=None)

#### Run ANNOVAR to annotate

In [19]:
%%bash

module load annovar

table_annovar.pl EUR_no_proxy_for_annovar.txt $ANNOVAR_DATA/hg38 \
-buildver hg38 \
-arg '-splicing 15',,, \
-remove \
-protocol refGene,avsnp150,ljb26_all,gnomad312_genome \
-operation g,f,f,f \
-nastring . \
-polish

[+] Loading annovar 2020-06-08 on cn4272 
-----------------------------------------------------------------
NOTICE: Processing operation=g protocol=refGene

NOTICE: Running with system command <annotate_variation.pl -geneanno -buildver hg38 -dbtype refGene -outfile EUR_no_proxy_for_annovar.txt.refGene -exonsort -nofirstcodondel EUR_no_proxy_for_annovar.txt /fdb/annovar/current/hg38 -splicing 15>
NOTICE: Output files are written to EUR_no_proxy_for_annovar.txt.refGene.variant_function, EUR_no_proxy_for_annovar.txt.refGene.exonic_variant_function
NOTICE: Reading gene annotation from /fdb/annovar/current/hg38/hg38_refGene.txt ... Done with 88819 transcripts (including 21511 without coding sequence annotation) for 28307 unique genes
NOTICE: Processing next batch with 2936 unique variants in 2936 input lines
NOTICE: Reading FASTA sequences from /fdb/annovar/current/hg38/hg38_refGeneMrna.fa ... Done with 108 sequences
NOTICE: Variants with invalid input format are written to EUR_no_proxy_for

#### ANNOVAR output

In [6]:
# Read in ANNOVAR output
annovar_output_eur = pd.read_csv('{WORK_DIR}/AD/EUR_no_proxy_for_annovar.txt.hg38_multianno.txt', sep="\t")
annovar_output_eur.head()

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,gnomad312_AF_afr,gnomad312_AF_ami,gnomad312_AF_amr,gnomad312_AF_asj,gnomad312_AF_eas,gnomad312_AF_fin,gnomad312_AF_mid,gnomad312_AF_nfe,gnomad312_AF_oth,gnomad312_AF_sas
0,6,32689933,32689933,A,G,intergenic,HLA-DQB1;HLA-DQA2,dist=23276;dist=51458,.,.,...,0.0449,0.1629,0.2200,0.2053,0.1295,0.1245,0.1384,0.1158,0.1116,0.1021
1,19,44946776,44946776,C,T,ncRNA_intronic,APOC4-APOC2,.,.,.,...,0.5836,0.4346,0.5655,0.4398,0.5610,0.4893,0.3861,0.4547,0.4909,0.5810
2,11,60327769,60327769,T,C,intronic,MS4A6E,.,.,.,...,0.2732,0.2852,0.3920,0.2998,0.4075,0.5789,0.2025,0.3583,0.3319,0.1916
3,11,86069406,86069406,T,C,intronic,PICALM,.,.,.,...,0.8912,0.6527,0.7834,0.7597,0.6900,0.8276,0.7229,0.8061,0.7955,0.7417
4,19,44749457,44749457,ATAT,A,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,.


In [7]:
# Format and clean columns
annovar_output_eur['MarkerName'] = 'chr' + annovar_output_eur['Chr'].astype(str) + ':' + annovar_output_eur['Start'].astype(str) + ':' + annovar_output_eur['Ref'].astype(str) + ':' + annovar_output_eur['Alt'].astype(str)

annovar_output_formatted_eur = annovar_output_eur[['MarkerName', 'avsnp150', 'Chr', 'Start', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene']]
annovar_output_formatted_eur.head()

Unnamed: 0,MarkerName,avsnp150,Chr,Start,Ref,Alt,Func.refGene,Gene.refGene
0,chr6:32689933:A:G,rs9275207,6,32689933,A,G,intergenic,HLA-DQB1;HLA-DQA2
1,chr19:44946776:C:T,rs9304644,19,44946776,C,T,ncRNA_intronic,APOC4-APOC2
2,chr11:60327769:T:C,rs12803929,11,60327769,T,C,intronic,MS4A6E
3,chr11:86069406:T:C,rs3016327,11,86069406,T,C,intronic,PICALM
4,chr19:44749457:ATAT:A,.,19,44749457,ATAT,A,.,.


In [8]:
# Merge annotated output with original stats
merge_stats_eur = pd.merge(eur, annovar_output_formatted_eur, how = 'left', on='MarkerName')
merge_stats_eur.columns

Index(['MarkerName', 'p_value', 'effect_allele', 'other_allele',
       'effect_allele_frequency', 'beta', 'standard_error', 'n_cases',
       'n_controls', 'het_isq', 'het_pvalue', 'Chromosome', 'Position',
       'Ref_x', 'Alt_x', 'avsnp150', 'Chr', 'Start', 'Ref_y', 'Alt_y',
       'Func.refGene', 'Gene.refGene'],
      dtype='object')

In [9]:
# Subset columns
merge_stats_eur_subset = merge_stats_eur[['Gene.refGene','avsnp150','Chromosome','Position','effect_allele','other_allele','beta','p_value','effect_allele_frequency']].rename(columns={
    'Gene.refGene':'Locus','avsnp150':'SNP'})
merge_stats_eur_subset.head()

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency
0,HLA-DQB1;HLA-DQA2,rs9275207,6,32689933,A,G,0.0997,1.296e-08,0.9022
1,APOC4-APOC2,rs9304644,19,44946776,T,C,0.1022,4.8010000000000007e-23,0.4508
2,MS4A6E,rs12803929,11,60327769,T,C,-0.0588,3.016e-08,0.6467
3,PICALM,rs3016327,11,86069406,T,C,-0.1028,1.578e-15,0.1956
4,.,.,19,44749457,A,ATAT,-0.1291,6.791e-26,0.2867


In [10]:
# Add a column for ancestry
merge_stats_eur_subset['ancestry'] = 'European'

### European *APOE* data from Kunkle 2019

In [11]:
df_k = pd.read_csv('{WORK_DIR}/Kunkle_etal_2019_IGAP_Summary_statistics.with_allelefreqs.txt', delim_whitespace=True, low_memory=False)
df_k.head()

  df_k = pd.read_csv('/data/CARD/AD/summary_stats/Kunkle_2019/Kunkle_etal_2019_IGAP_Summary_statistics.with_allelefreqs.txt', delim_whitespace=True, low_memory=False)


Unnamed: 0,Chromosome,Position,MarkerName,Effect_allele,Non_Effect_allele,Beta,SE,Pvalue,Effect_allele_freq,Test_MarkerName
0,1,100000012,rs10875231,T,G,-0.0026,0.0168,0.8758,0.2406,1:100000012:G_T
1,1,100000827,rs6678176,T,C,0.0008,0.0156,0.9574,0.2926,1:100000827:C_T
2,1,100000843,rs78286437,T,C,-0.0136,0.033,0.6792,0.9477,1:100000843:C_T
3,1,100000989,chr1:100000989:I,A,ATC,-0.0099,0.0343,0.7731,0.9511,1:100000989:A_ATC
4,1,100001138,rs144406489,A,G,-0.0061,0.0612,0.9204,0.9769,1:100001138:A_G


In [12]:
# Clean up dataframe
df_eur_apoe = df_k[(df_k.MarkerName == 'rs429358') | (df_k.MarkerName == 'rs7412')]
df_eur_apoe['ancestry'] = 'European'
df_eur_apoe['Locus'] = 'APOE'

df_eur_apoe = df_eur_apoe[['Locus', 'MarkerName', 'Chromosome', 'Position', 'Effect_allele', 'Non_Effect_allele', 'Beta', 'Pvalue',
                          'Effect_allele_freq', 'ancestry']].rename(columns={
    'MarkerName':'SNP','Non_Effect_allele':'other_allele','Effect_allele':'effect_allele','Beta':'beta',
    'Pvalue':'p_value','Effect_allele_freq':'effect_allele_frequency'})
df_eur_apoe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eur_apoe['ancestry'] = 'European'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eur_apoe['Locus'] = 'APOE'


Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry
10828822,APOE,rs429358,19,45411941,T,C,-1.2017,0.0,0.7841,European
10828823,APOE,rs7412,19,45412079,T,C,-0.4673,6.401e-53,0.0736,European


### Black/African American data from Kunkle

In [14]:
df_afr = pd.read_csv('{WORK_DIR}/Kunkle2020_ADGC_AA_META_Model1_SummaryStats.withAlleleFreqs_REFORMATTED.txt', sep='\t', header=0, low_memory=False)
df_afr.head()

Unnamed: 0,Chr,Pos,MarkerName,Effect_allele,Non_Effect_allele,Beta,SE,Pvalue,Effect_allele_Freq
0,10,100000012,10:100000012,A,G,0.1734,0.1515,0.2524,0.018
1,10,100000122,10:100000122,A,T,0.3347,0.2294,0.1445,0.0083
2,10,100000354,10:100000354,T,C,0.6226,1.017,0.5404,0.9989
3,10,100000588,10:100000588,T,C,-0.0061,0.1033,0.9531,0.9603
4,10,100000625,10:100000625,A,G,0.0059,0.0415,0.8873,0.7182


In [15]:
# Add a column for ancestry
df_afr['ancestry'] = 'Black/African American'

# Subset and rename columns
afr = df_afr[['Chr','Pos','Effect_allele','Non_Effect_allele','Beta','Pvalue','Effect_allele_Freq','ancestry']].rename(columns={
    'Chr':'Chromosome','Pos':'Position','Effect_allele':'effect_allele','Non_Effect_allele':'other_allele',
    'Beta':'beta','Pvalue':'p_value','Effect_allele_Freq':'effect_allele_frequency'})

# Filter p < 0.05
#afr = afr[afr['p_value'] < 0.05]

### East Asian data from Shigemizu

In [17]:
df_ea = pd.read_csv('{WORK_DIR}/NCGG_AD_GWAS2.txt', sep='\t', header=0, low_memory=False)
df_ea.head()

Unnamed: 0,CHR,SNP,BP,A1,A2,NMISS,NMISS_A,NMISS_U,MAF_A,MAF_U,OR,SE,L95,U95,STAT,P,Info_NCGG,Info_Niigata
0,1,rs3094315,752566,G,A,8035,3961,4074,0.1471,0.1523,0.9691,0.04579,0.886,1.06,-0.6845,0.4937,1.0,0.999
1,1,rs3115860,753405,C,A,7936,3921,4015,0.1489,0.1549,0.9668,0.04571,0.884,1.057,-0.7385,0.4602,0.966,0.989
2,1,rs2073813,753541,A,G,7723,3797,3926,0.3741,0.3835,0.9759,0.03514,0.911,1.045,-0.6944,0.4874,0.969,0.979
3,1,rs3131969,754182,A,G,7724,3795,3929,0.374,0.3837,0.9746,0.03514,0.9097,1.044,-0.7322,0.464,0.969,0.978
4,1,rs3131968,754192,A,G,7724,3795,3929,0.374,0.3837,0.9746,0.03514,0.9097,1.044,-0.7322,0.464,0.969,0.978


In [18]:
# Add a column for ancestry
df_ea['ancestry'] = 'East Asian'

# Subset and rename columns
ea = df_ea[['CHR','BP','SNP','A1','A2','OR','P','MAF_A','ancestry']].rename(columns={
    'CHR':'Chromosome','BP':'Position','A1':'effect_allele','A2':'other_allele',
    'P':'p_value','MAF_A':'effect_allele_frequency'})

# Filter p < 0.05
#ea = ea[ea['p_value'] < 0.05]

# beta = ln(odds_ratio)
ea['beta'] = np.log(ea['OR'])

### Caribbean/Hispanic data

In [19]:
df_ch = pd.read_csv('{WORK_DIR}/CarHisp_for_MRMEGA.no_multiAllelics_indels.MAF_0.01.txt', sep='\t', header=0, low_memory=False)
df_ch.head()

Unnamed: 0,MARKERNAME,CHROMOSOME,POSITION,EA,NEA,EAF,OR,OR_95U,OR_95L,N,P
0,1:48824,1,48824,C,T,0.017666,0.795203,1.407862,0.449155,2240,0.431702
1,1:54490,1,54490,A,G,0.043157,0.853678,1.255188,0.580603,2240,0.421182
2,1:60351,1,60351,G,A,0.024821,0.864445,1.395185,0.535603,2240,0.550887
3,1:64931,1,64931,A,G,0.024598,0.821548,1.334293,0.505842,2240,0.426951
4,1:66861,1,66861,T,C,0.044946,0.868805,1.244424,0.606564,2240,0.442988


In [20]:
# Add a column for ancestry
df_ch['ancestry'] = 'Latino'

# beta = ln(odds_ratio)
df_ch['beta'] = np.log(df_ch['OR'])

# Subset and rename columns
ch = df_ch[['CHROMOSOME','POSITION','EA','NEA','beta','P','EAF','ancestry']].rename(columns={
    'CHROMOSOME':'Chromosome','POSITION':'Position','EA':'effect_allele','NEA':'other_allele',
    'P':'p_value','EAF':'effect_allele_frequency'})

# Filter p < 0.05
#ch = ch[ch['p_value'] < 0.05]

### Select top hits from each ancestry

In [21]:
# top hits from multi-ancestry GWAS meta-analysis
top_hits = pd.read_excel('{WORK_DIR}/AD_stats.xlsx', sheet_name='S2b').rename(columns={'SNP, random effects':'SNP',
                                                                                                   'Chromosome, random effects':'Chromosome',
                                                                                                   'Position, random effects':'Position'})
top_hits.head()

Unnamed: 0,Locus,SNP,Chromosome,Position,"Effect allele, random effects","Reference allele, random effects","P, random effects","beta, random effects","SE, random effects",I2,...,"SE, Caribbean Hispanic","P, Bellenguez et al. 2022 and FinnGen R6 (EXMORE)","Beta, Bellenguez et al. 2022 and FinnGen R6 (EXMORE)","SE, Bellenguez et al. 2022 and FinnGen R6 (EXMORE)","P, Kunkle et al. 2021","Beta, Kunkle et al. 2021","SE, Kunkle et al. 2021","P, Shigemizu et al. 2021","Beta, Shigemizu et al. 2021","SE, Shigemizu et al. 2021"
0,ABCA7,rs12151021,19,1050874,A,G,3.9000000000000004e-33,0.1028,0.008572,0.0,...,0.079176,3.99e-32,0.1043,0.0088,0.1748,0.0536,0.0395,,,
1,ACE,rs4311,17,61560763,T,C,2.19e-16,-0.062,0.007551,0.0,...,0.079062,3.11e-16,-0.063,0.0077,0.3329,-0.041,0.0424,,,
2,ADAM10,rs653765,15,59042012,T,C,2.57e-12,0.057,0.008144,0.0,...,,3.05e-12,0.0592,0.0085,0.7464,0.0133,0.0411,0.6829,0.020783,0.050324
3,ADAMTS4,rs11265557,1,161106354,T,G,2.45e-09,-0.048,0.008047,0.0,...,,9.7e-09,-0.0482,0.0084,0.9681,0.0026,0.0662,,,
4,ANK3,rs7068231,10,61784928,T,G,6.14e-10,-0.0484,0.007823,0.0,...,0.078698,3.47e-10,-0.0501,0.008,0.7864,0.012,0.0441,,,


#### European top hits

In [22]:
# Merge with AD top hits
eur_top = pd.merge(top_hits, merge_stats_eur_subset, on='SNP', how='inner')
eur_top = eur_top[['Locus_y','SNP','Chromosome_y','Position_y','effect_allele','other_allele','beta',
                   'p_value','effect_allele_frequency','ancestry']].rename(columns={'Locus_y':'Locus',
                                                                                    'Chromosome_y':'Chromosome',
                                                                                    'Position_y':'Position'})

In [23]:
# Add in APOE
eur_final = pd.concat([eur_top, df_eur_apoe])

# only keep rs429358 <> rs1081105, rs7412 <> rs1065853
condition = eur_final['Locus'] == 'APOE'
snps_to_keep = ['rs429358','rs7412']
filtered_subset = eur_final.loc[condition & eur_final['SNP'].isin(snps_to_keep)]
eur_final = pd.concat([eur_final[~condition], filtered_subset])
eur_final

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry
0,ABCA7,rs12151021,19,1050875,A,G,0.1226,7.691999999999999e-30,0.3391,European
1,ACE,rs4311,17,63483402,T,C,-0.062,8.077e-10,0.4911,European
2,ADAM10,rs653765,15,58749813,T,C,0.0657,6.408e-09,0.7223,European
3,APH1B,rs75763893,15,63279621,T,C,0.1171,4.584e-15,0.1462,European
4,CBLC,rs80168591,19,44781370,A,G,0.8483,4.272e-65,0.0145,European
5,BIN1;CYP27C1,rs34779859,2,127135192,T,G,0.1259,9.07e-30,0.3013,European
6,CASS4,rs113221226,20,56449045,A,G,-0.1264,1.89e-11,0.0807,European
7,CD2AP,rs7738044,6,47501537,A,G,-0.068,2.102e-09,0.7274,European
8,CLU,rs1532278,8,27608798,T,C,-0.1092,1.048e-25,0.3768,European
9,LINC02059;MIR4280,rs58233521,5,87079937,T,C,0.0737,2.66e-08,0.1836,European


#### Black/African American top hits

In [24]:
# Merge with AD top hits
afr_top = pd.merge(top_hits, afr, on=['Chromosome','Position'], how='inner')
afr_top = afr_top[['Locus','SNP','Chromosome','Position','effect_allele','other_allele',
                   'beta','p_value','effect_allele_frequency','ancestry']]

In [25]:
# Add in APOE
afr_apoe1 = afr[(afr['Chromosome'] == 19) & (afr['Position'] == 45411941)]
afr_apoe2 = afr[(afr['Chromosome'] == 19) & (afr['Position'] == 45412079)]
afr_apoe = pd.concat([afr_apoe1, afr_apoe2])
afr_apoe['Locus'] = 'APOE'
afr_apoe['SNP'] = afr_apoe['Position'].map({45411941: 'rs429358', 45412079: 'rs7412'})

afr_final = pd.concat([afr_top, afr_apoe])

#### East Asian top hits

In [26]:
# Merge with AD top hits
ea_top = pd.merge(top_hits, ea, on='SNP', how='inner')
ea_top = ea_top[['Locus','SNP','Chromosome_y','Position_y','effect_allele','other_allele',
                   'beta','p_value','effect_allele_frequency','ancestry']].rename(columns={'Chromosome_y':'Chromosome',
                                                                                           'Position_y':'Position'})

In [27]:
# Add in APOE
ea_apoe = ea[(ea['SNP'] == 'rs429358') | (ea['SNP'] == 'rs7412')].drop(columns='OR')
ea_apoe['Locus'] = 'APOE'
ea_final = pd.concat([ea_top, ea_apoe])

#### Latino top hits

In [28]:
# Merge with top hits
ch_top = pd.merge(top_hits, ch, on=['Chromosome','Position'], how='inner')
ch_top = ch_top[['Locus','SNP','Chromosome','Position','effect_allele','other_allele',
                   'beta','p_value','effect_allele_frequency','ancestry']]

In [29]:
# Add in APOE
ch_apoe1 = ch[(ch['Chromosome'] == 19) & (ch['Position'] == 45411941)]
ch_apoe2 = ch[(ch['Chromosome'] == 19) & (ch['Position'] == 45412079)]
ch_apoe = pd.concat([ch_apoe1, ch_apoe2])
ch_apoe['Locus'] = 'APOE'
ch_apoe['SNP'] = ch_apoe['Position'].map({45411941: 'rs429358', 45412079: 'rs7412'})

ch_final = pd.concat([ch_top, ch_apoe])

### Generate dataset for analysis

In [30]:
# Concat all dataframes
df_calcs = pd.concat([eur_final, afr_final, ea_final, ch_final])

In [None]:
# Current date in YYYY-MM-DD 
today = datetime.now().strftime('%Y-%m-%d')

# Save path
path = '{WORK_DIR}/AD/processed/'

# Filename and path
file = f'{path}_AD_PAR_data_for_analysis_pval.csv'
full_path = os.path.join(path, file)

# Export the dataframe to CSV
df_calcs.to_csv(full_path, index=False, header=True)

print(f'Data exported as {file}')