# Prepare PD Summary Statistics
* Project: Cross-ancestry PAR analysis
* Version: Python/3.9
* Status: COMPLETE
* Last Updated: 02-FEB-2025

## Notebook Overview
* Load list of 90 risk loci associated with PD from Nalls publication
* Import summary statistics from ancestry-specific GWAS
* Select top hits for each ancestry and generate dataset for calculations
* Identify known risk variants, population-specific variants in summary stats

In [1]:
# Import packages
from datetime import datetime
import os
import glob
import numpy as np
import pandas as pd
import sys
import openpyxl
import seaborn as sns
import matplotlib.pyplot as plt

### Load list of 90 risk loci from Nalls paper

In [2]:
df_nalls = pd.read_excel('{WORK_DIR}/90risk_loci.xlsx').rename(columns={'CHR':'Chromosome','BP':'Position', 'Nearest Gene':'Locus'})
df_nalls.head()

Unnamed: 0,SNP,Chromosome,Position,Locus,QTL Nominated Gene (nearest QTL),Effect allele,Other allele,Effect allele frequency,"Beta, all studies","SE, all studies",...,"Freq1, new studies","Beta, new studies","StdErr, new studies","P, new studies","I2, new studies",Passes pooled 23andMe QC,Known GWAS locus within 1MB,Failed final filtering and QC,Locus within 250KB,Locus Number
0,rs114138760,1,154898185,PMVK,,c,g,0.0112,0.2812,0.0478,...,0.0109,0.1997,0.0843,0.01779,0.0,T,1,0,1,1
1,rs35749011,1,155135036,KRTCAP2,,a,g,0.0169,0.6068,0.0342,...,0.0177,0.6798,0.0615,2.16e-28,0.0,T,1,0,1,1
2,rs76763715,1,155205634,GBAP1,GBAP1,t,c,0.9953,-0.7467,0.0765,...,0.9941,-0.6693,0.1269,1.34e-07,0.0,F,1,0,1,1
3,rs6658353,1,161469054,FCGR2A,FCGR2A,c,g,0.5011,0.065,0.0094,...,0.5105,0.0662,0.0171,0.0001101,46.4,T,0,0,0,2
4,rs11578699,1,171719769,VAMP4,VAMP4,t,c,0.1949,-0.0704,0.012,...,0.1947,-0.0663,0.0226,0.003313,13.4,T,0,0,0,3


### Summary statistics for each ancestry

#### European from Nalls et al. 2019

In [3]:
df_eur = pd.read_csv('{WORK_DIR}/summary_stats/META5_all_with_rsid_hg38.txt', sep='\t', header=0).rename(columns={
    'ID':'SNP'})
df_eur.head()

Unnamed: 0,MarkerName,Allele1,Allele2,Freq1,FreqSE,MinFreq,MaxFreq,Effect,StdErr,P-value,Direction,HetISq,HetChiSq,HetDf,HetPVal,freqSpan,SNP,Chr,BP
0,chr10:98240868,a,g,0.5665,0.0067,0.5525,0.5943,0.011,0.0095,0.2476,+++++++-++----+-+,0.0,15.14,16,0.5144,0.0418,rs7899632,10,98240868
1,chr10:98240888,a,c,0.7953,0.008,0.7208,0.8111,-0.0091,0.0116,0.4295,-+++-------+-+-+-,0.0,12.625,16,0.6999,0.0903,rs61875309,10,98240888
2,chr10:98242110,t,c,0.014,0.0017,0.0044,0.0178,-0.0152,0.0649,0.8147,+??+--?++??+-++--,0.0,8.126,11,0.7019,0.0134,rs150203744,10,98242110
3,chr10:98242642,a,g,0.0018,0.0005,0.0014,0.0043,-0.1331,0.1778,0.4541,-?+--?+??+??-?+??,0.0,4.328,7,0.7413,0.0029,rs8181398,10,98242642
4,chr10:98242707,t,c,0.9868,0.002,0.9792,0.9912,0.0347,0.0742,0.6396,+???--?++??+---++,0.0,8.593,10,0.5711,0.012,rs111551711,10,98242707


In [4]:
# Merge summary stats with 90 risk loci from Nalls
eur = pd.merge(df_nalls, df_eur, how='inner', on='SNP')

# Add a column for ancestry
eur['ancestry'] = 'European'

# Subset and rename columns
eur = eur[['Locus','SNP','Chr','BP','Allele1','Allele2','Effect','P-value','Freq1','ancestry']].rename(columns={
    'ID':'SNP','Chr':'Chromosome','BP':'Position','Allele1':'effect_allele','Allele2':'other_allele',
    'Effect':'beta','P-value':'p_value','Freq1':'effect_allele_frequency'})

# Change alleles to be uppercase
eur['effect_allele'] = eur['effect_allele'].str.upper()
eur['other_allele'] = eur['other_allele'].str.upper()

# Filter p < 0.05
eur = eur[eur['p_value'] < 0.05]
eur

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry
0,PMVK,rs114138760,1,154925709,C,G,0.2812,4.193000e-09,0.0112,European
1,KRTCAP2,rs35749011,1,155162560,A,G,0.6068,1.720000e-70,0.0169,European
2,GBAP1,rs76763715,1,155235843,T,C,-0.7467,1.592000e-22,0.9953,European
3,FCGR2A,rs6658353,1,161499264,C,G,0.0650,6.099000e-12,0.5011,European
4,VAMP4,rs11578699,1,171750629,T,C,-0.0704,4.468000e-09,0.1949,European
...,...,...,...,...,...,...,...,...,...,...
85,RIT2,rs12456492,18,43093415,A,G,-0.0983,3.798000e-23,0.6816,European
86,MEX3C,rs8087969,18,51157219,T,G,-0.0578,1.412000e-08,0.5496,European
87,SPPL2B,rs55818311,19,2341049,T,C,-0.0696,4.180000e-10,0.6937,European
88,CRLS1,rs77351827,20,6025395,T,C,0.0802,8.867000e-09,0.1275,European


#### African/African Admixed from Rizig et al. 2023

In [5]:
df_afr = pd.read_csv('{WORK_DIR}/summary_stats/AFR_AAC_metaGWAS_MAF0.05_hg38_noindels_full_with23andMe.tab', sep='\t', header=0, low_memory=False).rename(columns={
    'rsid':'SNP'})
df_afr.head()

Unnamed: 0,chromosome,base_pair_location,effect_allele,other_allele,beta,standard_error,effect_allele_frequency,p_value,variant_id,ref_allele,direction,HetISq,HetChiSq,HetDf,HetPVal,SNP
0,1,66861,T,C,-0.1072,0.4023,0.0724,0.7899,chr1:66861:C:T,C,??-?,0.0,0.0,0,1.0,rs28375825
1,1,80346,C,G,0.4608,0.3456,0.8338,0.1824,chr1:80346:C:G,C,??+?,0.0,0.0,0,1.0,rs376665626
2,1,595259,A,G,-0.0456,0.4097,0.0597,0.9114,chr1:595259:G:A,G,??-?,0.0,0.0,0,1.0,rs201764041
3,1,664938,A,G,-0.1109,0.2264,0.939,0.6242,chr1:664938:A:G,A,???-,0.0,0.0,0,1.0,rs536144132
4,1,665098,A,G,-0.0315,0.3587,0.1262,0.9301,chr1:665098:G:A,G,??-?,0.0,0.0,0,1.0,rs114979547


In [6]:
# Merge summary stats with 90 risk loci from Nalls
afr = pd.merge(df_nalls, df_afr, how='inner', on='SNP')

# A/AA GBA variant not in df_nalls
afr_gba = df_afr[df_afr.SNP == 'rs3115534']
afr_gba['Locus'] = 'GBA1'
afr = pd.concat([afr, afr_gba])

# Add a column for ancestry
afr['ancestry'] = 'African/African Admixed'

# Subset and rename columns
afr = afr[['Locus','SNP','chromosome','base_pair_location','effect_allele','other_allele','beta','p_value',
              'effect_allele_frequency','ancestry']].rename(columns={'chromosome':'Chromosome',
                                                                     'base_pair_location':'Position'})

# Get African-specific GBA1 variant
afr_gba = df_afr[df_afr.SNP == 'rs3115534']

# Filter p < 0.05
afr = afr[afr['p_value'] < 0.05]
afr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  afr_gba['Locus'] = 'GBA1'


Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry
10,LINC00693,rs6808178,3,28664199,T,C,0.2227,0.009219,0.1211,African/African Admixed
15,MCCC1,rs10513789,3,183042285,T,G,0.1727,0.006314,0.8122,African/African Admixed
18,LCORL,rs34025766,4,17967188,A,T,-0.1061,0.03149,0.4289,African/African Admixed
22,SNCA,rs356182,4,89704960,A,G,-0.2188,2.481e-05,0.6818,African/African Admixed
29,LOC100131289,rs4140646,6,27771022,A,G,0.1077,0.04796,0.2516,African/African Admixed
31,RIMS1,rs12528068,6,71778059,T,C,0.1037,0.03324,0.4097,African/African Admixed
32,FYN,rs997368,6,111922088,A,G,0.2066,0.0002744,0.3057,African/African Admixed
39,SH3GL2,rs13294100,9,17579692,T,G,-0.1849,0.002815,0.7672,African/African Admixed
43,GBF1,rs10748818,10,102255522,A,G,0.187,0.01233,0.8696,African/African Admixed
47,IGSF9B,rs3802920,11,133917106,T,G,0.3455,0.00467,0.0729,African/African Admixed


In [7]:
# checking for SNPs reviewers might ask for:
# LRRK2: R1441G (rs33939927), R1628P (rs33949390), G2385R (rs34778348), G2019S (rs34637584)
list_check_snps = ['rs33939927','rs33949390','rs34778348','rs34637584']
check_snps_eur = df_eur[df_eur['SNP'].isin(list_check_snps)]
check_snps_afr = df_afr[df_afr['SNP'].isin(list_check_snps)]

#### East Asian from Foo et al. 2020

In [8]:
df_ea = pd.read_csv('{WORK_DIR}/summary_stats/EAS.final.txt', sep='\t', header=0, low_memory=False)
df_ea.head()

Unnamed: 0,MARKERNAME,CHROMOSOME,POSITION,EA,NEA,EAF,BETA,SE,OR,OR_95U,OR_95L,N,NMISS,P
0,chr1:794332,1,794332,A,G,0.139423,0.0013,0.03356,1.001301,1.069378,0.937558,31575,21168.408424,0.9691
1,chr1:832359,1,832359,T,C,0.03125,-0.0932,0.0706,0.911011,1.046213,0.793282,31575,21168.408424,0.1868
2,chr1:834056,1,834056,C,A,0.028846,-0.0766,0.077458,0.92626,1.078119,0.795792,31575,21168.408424,0.3227
3,chr1:834263,1,834263,T,C,0.028846,-0.0846,0.079177,0.91888,1.073138,0.786795,31575,21168.408424,0.2853
4,chr1:834956,1,834956,A,G,0.026442,-0.0704,0.077861,0.932021,1.08568,0.800109,31575,21168.408424,0.3659


In [9]:
# check for LRRK2 G2385R (rs34778348)
df_ea[df_ea.MARKERNAME == 'chr12:40757328']

Unnamed: 0,MARKERNAME,CHROMOSOME,POSITION,EA,NEA,EAF,BETA,SE,OR,OR_95U,OR_95L,N,NMISS,P
3151430,chr12:40757328,12,40757328,A,G,0.036058,0.6634,,1.941382,,,31575,21168.408424,4.68e-24


In [10]:
# Rename columns and merge with risk loci
df_ea = df_ea.rename(columns={'CHROMOSOME':'Chromosome','POSITION':'Position'})
ea = pd.merge(df_nalls, df_ea, how='inner', on=['Chromosome','Position'])

# Add in reported top GWAS hits from Foo
list_ea_snps = ['chr1:155210185','chr5:75599208','chr7:70750493']
ea_snps = df_ea[df_ea['MARKERNAME'].isin(list_ea_snps)]

# Fill in missing rsID's and locus names
ea_snps['SNP'] = ea_snps['MARKERNAME'].map({'chr1:155210185':'rs146532106','chr5:75599208':'rs246814',
                                            'chr7:70750493':'rs9638616'})
ea_snps['Locus'] = ea_snps['MARKERNAME'].map({'chr1:155210185':'GBA1','chr5:75599208':'SV2C',
                                              'chr7:70750493':'WBSCR17'})
ea = pd.concat([ea,ea_snps])

# Add a column for ancestry
ea['ancestry'] = 'East Asian'

# Subset and rename columns
ea = ea[['Locus','SNP','Chromosome','Position','EA','NEA','BETA','P','EAF','ancestry']].rename(columns={
    'EA':'effect_allele','NEA':'other_allele','BETA':'beta',
    'P':'p_value','EAF':'effect_allele_frequency'})

# Filter p < 0.05
ea = ea[ea['p_value'] < 0.05]
ea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ea_snps['SNP'] = ea_snps['MARKERNAME'].map({'chr1:155210185':'rs146532106','chr5:75599208':'rs246814',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ea_snps['Locus'] = ea_snps['MARKERNAME'].map({'chr1:155210185':'GBA1','chr5:75599208':'SV2C',


Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry
2,RAB29,rs11557080,1,205737739,A,G,0.2126,2.91e-20,0.519231,East Asian
3,ITPKB,rs4653767,1,226916078,C,T,-0.1305,3.18e-07,0.264423,East Asian
10,MED12L,rs11707416,3,151108965,A,T,-0.0645,0.04794,0.120192,East Asian
12,MCCC1,rs10513789,3,182760073,G,T,-0.1901,1.1e-16,0.608173,East Asian
13,TMEM175,rs34311866,4,951947,C,T,0.152,7.03e-06,0.127404,East Asian
14,BST1,rs4698412,4,15737348,A,G,0.1105,1.51e-06,0.360577,East Asian
15,LCORL,rs34025766,4,17968811,A,T,-0.0854,0.008808,0.144231,East Asian
16,SCARB2,rs6825004,4,77110365,G,C,-0.1192,4.38e-07,0.379808,East Asian
20,CLCN3,rs62333164,4,170583157,A,G,-0.1028,0.03004,0.060096,East Asian
23,C5orf24,rs11950533,5,134199105,A,C,-0.0557,0.03602,0.293269,East Asian


#### Latino from Loesch et al. 2021

In [11]:
df_lat = pd.read_csv('{WORK_DIR}/summary_stats/AMR.final.txt', sep='\t', header=0, low_memory=False)
df_lat.head()

Unnamed: 0,MARKERNAME,CHROMOSOME,POSITION,EA,NEA,EAF,BETA,SE,OR,OR_95U,OR_95L,N,NMISS,P
0,chr1:662622,1,662622,A,G,0.065474,-0.187575,0.210099,0.828967,1.251344,0.549158,1481,1487.855711,0.371967
1,chr1:666249,1,666249,T,C,0.073317,0.266337,0.180918,1.305175,1.860669,0.915521,1481,1487.855711,0.140982
2,chr1:668394,1,668394,A,AG,0.018808,-0.1717,0.4918,0.842232,2.208309,0.32122,1481,1487.855711,0.726995
3,chr1:676118,1,676118,T,C,0.038932,0.044314,0.282878,1.04531,1.81985,0.60042,1481,1487.855711,0.875518
4,chr1:693625,1,693625,C,T,0.021886,-0.310839,0.358697,0.732832,1.480244,0.362806,1481,1487.855711,0.386173


In [12]:
# GBA stats are in 23andMe data
df_lat23 = pd.read_csv('{WORK_DIR}/summary_stats/23AMR.formatted.txt.gz', sep='\t', header=0, low_memory=False)

In [13]:
# Get GBA1 from 23andMe datafame
lat_gba = df_lat23[df_lat23.MARKERNAME == 'chr1:155205634']
df_lat = pd.concat([df_lat,lat_gba])

# Chromsome, position columns are objects -> convert to int before merging
df_lat['CHROMOSOME'] = pd.to_numeric(df_lat['CHROMOSOME'], errors='coerce')
# Some values were strings, so drop the na's 
df_lat = df_lat.dropna(subset=['CHROMOSOME'])
df_lat['CHROMOSOME'] = df_lat['CHROMOSOME'].astype(int)
df_lat['POSITION'] = pd.to_numeric(df_lat['POSITION'], errors='coerce')
df_lat = df_lat.rename(columns={'CHROMOSOME':'Chromosome','POSITION':'Position'})

# Merge with 90 risk loci
lat = pd.merge(df_lat, df_nalls, how='inner', on=['Chromosome', 'Position'])

# Add a column for ancestry
lat['ancestry'] = 'Latino'

# beta = ln(odds_ratio)
lat['beta'] = np.log(lat['OR'])

# Subset and rename columns
lat = lat[['Locus','SNP','Chromosome','Position','EA','NEA','beta','P','EAF','ancestry']].rename(columns={
    'EA':'effect_allele','NEA':'other_allele',
    'P':'p_value','EAF':'effect_allele_frequency'})

# Filter p < 0.05
lat = lat[lat['p_value'] < 0.05]
lat

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry
6,KCNS3,rs76116224,2,18147848,T,A,-0.359278,0.02684601,0.073345,Latino
7,KCNIP3,rs2042477,2,96000943,T,A,0.195749,0.02921862,0.614394,Latino
11,LINC00693,rs6808178,3,28705690,C,T,-0.228445,0.03913275,0.810031,Latino
13,KPNA1,rs55961674,3,122196892,T,C,0.272241,0.0413819,0.118806,Latino
24,SNCA,rs356182,4,90626111,A,G,-0.460053,2.484767e-08,0.555666,Latino
27,ELOVL7,rs1867598,5,60137959,G,A,0.292834,0.02627049,0.107321,Latino
37,BIN3,rs2280104,8,22525980,C,T,-0.214305,0.02515152,0.758349,Latino
42,ITGA8,rs896435,10,15557406,T,C,0.192646,0.02817975,0.673742,Latino
57,VPS13C,rs2251086,15,61997385,C,T,0.236127,0.04489795,0.855848,Latino
59,SETD1A,rs11150601,16,30977799,A,G,0.278549,0.0009806977,0.558559,Latino


In [14]:
# Concat dataframes
df_out = pd.concat([eur,afr,ea,lat])
df_out

Unnamed: 0,Locus,SNP,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry
0,PMVK,rs114138760,1,154925709,C,G,0.281200,4.193000e-09,0.011200,European
1,KRTCAP2,rs35749011,1,155162560,A,G,0.606800,1.720000e-70,0.016900,European
2,GBAP1,rs76763715,1,155235843,T,C,-0.746700,1.592000e-22,0.995300,European
3,FCGR2A,rs6658353,1,161499264,C,G,0.065000,6.099000e-12,0.501100,European
4,VAMP4,rs11578699,1,171750629,T,C,-0.070400,4.468000e-09,0.194900,European
...,...,...,...,...,...,...,...,...,...,...
37,BIN3,rs2280104,8,22525980,C,T,-0.214305,2.515152e-02,0.758349,Latino
42,ITGA8,rs896435,10,15557406,T,C,0.192646,2.817975e-02,0.673742,Latino
57,VPS13C,rs2251086,15,61997385,C,T,0.236127,4.489795e-02,0.855848,Latino
59,SETD1A,rs11150601,16,30977799,A,G,0.278549,9.806977e-04,0.558559,Latino


In [15]:
# Current date in YYYY-MM-DD 
today = datetime.now().strftime('%Y-%m-%d')

# Path to save data
path = '{WORK_DIR}/PD/processed_p_val_filtered/'

# Filenames and paths
file = f'{today}_PD_PAR_data_for_analysis_pval.csv'
full_path = os.path.join(path, file)

# Export data to csv
df_out.to_csv(full_path, index=False, header=True)

print(f'DataFrame exported as {file}')

### Select top hits for each ancestry and generate dataset for analysis

#### Based on previously identified risk loci
Added on June 10th, 2024

##### Nalls

In [15]:
df_nalls = pd.read_excel('{WORK_DIR}/90risk_loci.xlsx').rename(columns={'CHR':'Chromosome','BP':'Position', 'Nearest Gene':'Locus'})
df_nalls.head()

Unnamed: 0,SNP,Chromosome,Position,Locus,QTL Nominated Gene (nearest QTL),Effect allele,Other allele,Effect allele frequency,"Beta, all studies","SE, all studies",...,"Freq1, new studies","Beta, new studies","StdErr, new studies","P, new studies","I2, new studies",Passes pooled 23andMe QC,Known GWAS locus within 1MB,Failed final filtering and QC,Locus within 250KB,Locus Number
0,rs114138760,1,154898185,PMVK,,c,g,0.0112,0.2812,0.0478,...,0.0109,0.1997,0.0843,0.01779,0.0,T,1,0,1,1
1,rs35749011,1,155135036,KRTCAP2,,a,g,0.0169,0.6068,0.0342,...,0.0177,0.6798,0.0615,2.16e-28,0.0,T,1,0,1,1
2,rs76763715,1,155205634,GBAP1,GBAP1,t,c,0.9953,-0.7467,0.0765,...,0.9941,-0.6693,0.1269,1.34e-07,0.0,F,1,0,1,1
3,rs6658353,1,161469054,FCGR2A,FCGR2A,c,g,0.5011,0.065,0.0094,...,0.5105,0.0662,0.0171,0.0001101,46.4,T,0,0,0,2
4,rs11578699,1,171719769,VAMP4,VAMP4,t,c,0.1949,-0.0704,0.012,...,0.1947,-0.0663,0.0226,0.003313,13.4,T,0,0,0,3


In [None]:
# Merge summary stats with 90 risk loci from Nalls
eur_risk = pd.merge(eur, df_nalls, how='inner', on='SNP')
eur_risk = eur_risk[['Locus', 'SNP', 'Chromosome_x', 'Position_x', 'effect_allele', 'other_allele',
       'beta', 'p_value', 'effect_allele_frequency', 'ancestry']].rename(columns={'Chromosome_x':'Chromosome',
                                                                                  'Position_x':'Position'})

##### Rizig

In [None]:
# Genome-wide significant SNPs identified in the African and African Admixed GWAS meta-analysis with frequency metrics
df_rizig_gwsig = pd.read_excel('{WORK_DIR}/Rizig_Genomewide_signif.xlsx', sheet_name='Tab 2 ST2').rename(columns={'Gene.refGene':'Locus',
                                                                                                       'rsID':'SNP'})

# Add in LRRK2 for comparison (had a high p-val in summary stats, so was filtered out)
list_afr_snps = ['rs76904798','rs34637584']
afr_snps = df_afr[df_afr['rsid'].isin(list_afr_snps)]
afr_snps = afr_snps[['rsid','chromosome','base_pair_location','effect_allele','other_allele','beta',
                     'effect_allele_frequency','p_value','ancestry']].rename(columns={'chromosome':'Chromosome',
                                                                                      'base_pair_location':'Position','rsid':'SNP'})
afr_snps['Locus'] = 'LRRK2'

# Merge summary stats with 90 risk loci from Nalls
df_rizig = pd.concat([df_rizig_gwsig, df_nalls])
afr_risk = pd.merge(afr, df_rizig, how='inner', on='SNP')
afr_risk = afr_risk[['Locus', 'SNP', 'Chromosome_x', 'Position_x', 'effect_allele', 'other_allele',
       'beta', 'p_value', 'effect_allele_frequency', 'ancestry']].rename(columns={'Chromosome_x':'Chromosome',
                                                                                  'Position_x':'Position'})
afr_risk = pd.concat([afr_risk,afr_snps])

##### Foo

In [None]:
# Merge with 90 risk loci
ea_risk = pd.merge(ea, df_nalls, how='inner', on=['Chromosome', 'Position'])

# Add in reported top GWAS hits from Foo
list_ea_snps = ['chr1:155210185','chr5:75599208','chr1:205756484','chr1:226846712','chr3:182735211','chr4:77101068','chr4:90682474','chr6:112151452',
             'chr7:70750493','chr11:83510117','chr12:40387749','chr12:40757328','chr18:40678235']
ea_snps = df_ea[df_ea['MARKERNAME'].isin(list_ea_snps)]

# Fill in missing rsID's and locus names
ea_snps['SNP'] = ea_snps['MARKERNAME'].map({'chr1:155210185':'rs146532106','chr5:75599208':'rs246814',
                                            'chr1:205756484':'rs6679073','chr1:226846712':'rs16846351',
                                            'chr3:182735211':'rs2292056','chr4:77101068':'rs3816248',
                                            'chr4:90682474':'rs6826785','chr6:112151452':'rs1887316',
                                            'chr7:70750493':'rs9638616','chr11:83510117':'rs12278023',
                                            'chr12:40387749':'rs141336855','chr18:40678235':'rs4130047',
                                            'chr12:40757328':'rs34778348'})
ea_snps['Locus'] = ea_snps['MARKERNAME'].map({'chr1:155210185':'GBA','chr5:75599208':'SV2C',
                                              'chr1:205756484':'PARK16','chr1:226846712':'ITPKB',
                                              'chr3:182735211':'MCCC1','chr4:77101068':'FAM47E-SCARB2',
                                              'chr4:90682474':'SNCA','chr6:112151452':'FYN',
                                              'chr7:70750493':'WBSCR17','chr11:83510117':'DLG2',
                                              'chr12:40387749':'LRRK2','chr18:40678235':'RIT2',
                                              'chr12:40757328':'LRRK2'})

# Subset and rename columns
ea_snps = ea_snps[['Locus','SNP','CHROMOSOME','POSITION','EA','NEA','beta','P','EAF','ancestry']].rename(columns={
    'CHROMOSOME':'Chromosome','POSITION':'Position','EA':'effect_allele','NEA':'other_allele',
    'P':'p_value','EAF':'effect_allele_frequency'})

# Combine with top hits
ea_risk = pd.concat([ea_risk, ea_snps])
ea_risk = ea_risk[['Locus', 'SNP', 'Chromosome', 'Position', 'effect_allele', 'other_allele',
       'beta', 'p_value', 'effect_allele_frequency', 'ancestry']]

##### Loesch

In [None]:
# Chromsome, position columns are objects -> convert to int before merging
lat['Chromosome'] = pd.to_numeric(lat['Chromosome'], errors='coerce')
# Some values were strings, so drop the na's 
lat = lat.dropna(subset=['Chromosome'])
lat['Chromosome'] = lat['Chromosome'].astype(int)
lat['Position'] = pd.to_numeric(lat['Position'], errors='coerce')

# Merge with 90 risk loci
lat_risk = pd.merge(lat, df_nalls, how='inner', on=['Chromosome', 'Position'])

# Add in reported top GWAS hits from Loesch
list_lat_snps = ['chr1:155205634','chr1:155135036','chr1:155206167','chr4:90626111','chr4:90626111','chr4:90744993','chr4:90471245',
                'chr4:90819961','chr3:196357126','chr17:44051846','chr4:951947']
lat_snps = df_lat[df_lat['MARKERNAME'].isin(list_lat_snps)]

# Fill in missing rsID's and locus names
lat_snps['SNP'] = lat_snps['MARKERNAME'].map({'chr1:155205634':'rs76763715','chr1:155135036':'rs35749011',
                                              'chr1:155206167':'rs2230288','chr4:90626111':'rs356182',
                                              'chr4:90643757':'rs356225','chr12:40734202':'rs34637584',
                                              'chr4:90744993':'rs6830166','chr4:90471245':'rs2870004',
                                              'chr4:90819961':'rs763443','chr3:196357126':'rs78820950',
                                              'chr17:44051846':'rs1800547','chr4:951947':'rs34311866'})
lat_snps['Locus'] = lat_snps['MARKERNAME'].map({'chr1:155205634':'GBA','chr1:155135036':'GBA',
                                                'chr1:155206167':'GBA','chr4:90626111':'SNCA',
                                                'chr4:90643757':'SNCA','chr12:40734202':'LRRK2',
                                                'chr4:90744993':'SNCA','chr4:90471245':'SNCA',
                                                'chr4:90819961':'MMRN1','chr3:196357126':'NRROS',
                                                'chr17:44051846':'MAPT','chr4:951947':'TMEM175'})

# Subset and rename columns
lat_snps = lat_snps[['Locus','SNP','CHROMOSOME','POSITION','EA','NEA','beta','P','EAF','ancestry']].rename(columns={
    'CHROMOSOME':'Chromosome','POSITION':'Position','EA':'effect_allele','NEA':'other_allele',
    'P':'p_value','EAF':'effect_allele_frequency'})
lat_snps

In [None]:
# Add in GBA data from 23andMe
gba_snps = ['chr1:155206167','chr1:155135036','chr1:155205634']
lat23_gba = df_lat23[df_lat23['MARKERNAME'].isin(gba_snps)]
# Fill in missing locus, ancestry, beta, and rsID's
lat23_gba['Locus'] = 'GBA'
lat23_gba['ancestry'] = 'Latino'
lat23_gba['beta'] = np.log(lat23_gba['OR'])
lat23_gba['SNP'] = lat23_gba['MARKERNAME'].map({'chr1:155206167':'rs2230288','chr1:155135036':'rs35749011','chr1:155205634':'rs76763715'})
lat23_gba

In [None]:
# Add in LRRK2 data from 23andMe
lat23_lrrk2 = df_lat23[df_lat23['MARKERNAME'] == 'chr12:40610864']
# Fill in missing locus, ancestry, beta, and rsID's
lat23_lrrk2['Locus'] = 'LRRK2'
lat23_lrrk2['ancestry'] = 'Latino'
lat23_lrrk2['beta'] = np.log(lat23_lrrk2['OR'])
lat23_lrrk2['SNP'] = 'rs11175620'
lat23_lrrk2

In [None]:
lat23_snps = pd.concat([lat23_gba, lat23_lrrk2])
lat23_snps

In [None]:
# Subset and rename columns
lat23_snps = lat23_snps[['Locus','SNP','CHROMOSOME','POSITION','EA','NEA','beta','P','EAF','ancestry']].rename(columns={
    'CHROMOSOME':'Chromosome','POSITION':'Position','EA':'effect_allele','NEA':'other_allele',
    'P':'p_value','EAF':'effect_allele_frequency'})
lat23_snps

In [None]:
# Concat all Latino data
lat_risk = pd.concat([lat_risk, lat_snps, lat23_snps])
lat_risk = lat_risk[['Locus', 'SNP', 'Chromosome', 'Position', 'effect_allele', 'other_allele',
       'beta', 'p_value', 'effect_allele_frequency', 'ancestry']]
lat_risk

In [216]:
# Concat all dataframes
df_calcs = pd.concat([eur_risk, afr_risk, ea_risk, lat_risk])

In [None]:
# Save path
path = '{WORK_DIR}/PD/processed/'

# Filenames and path
file = f'{today}_PD_PAR_data_for_analysis.csv'
full_path = os.path.join(path, file)

# Export the dataframe to CSV
df_calcs.to_csv(full_path, index=False, header=True)

print(f'Data exported as {file}')

## Checking other SNPs of interest

In [26]:
# All pops
list_check_all_pops = ['chr1:155236376','chr1:155236246','chr1:155235252','chr12:40310434','chr12:40340400','chr1:155206167','chr1:155206037','chr1:155205043','chr12:40704236','chr12:40734202']
## Latino
lat_snps = df_lat[df_lat['MARKERNAME'].isin(list_check_all_pops)]
lat23_snps = df_lat23[df_lat23['MARKERNAME'].isin(list_check_all_pops)]
## East Asian
ea_snps = df_ea[df_ea['MARKERNAME'].isin(list_check_all_pops)]
## European
eur_snps = df_eur[df_eur['MarkerName'].isin(list_check_all_pops)]
## African/African Admixed
pattern = '|'.join(list_check_all_pops)
afr_snps = df_afr[df_afr['variant_id'].str.contains(pattern, case=False, na=False)]

# EAS only
list_check_ea_only = ['chr12:40363526','chr12:40320043','chr12:40757328','chr12:40713845']
ea_only_snps = df_ea[df_ea['MARKERNAME'].isin(list_check_ea_only)]

In [36]:
lat23_snps = lat23_snps[['Chromosome','Position','effect_allele','other_allele','beta','p_value','effect_allele_frequency']]
lat23_snps['ancestry'] = 'Latino'
eur_snps = eur_snps[['SNP','Chromosome','Position','effect_allele','other_allele','beta','p_value','effect_allele_frequency']]
eur_snps['ancestry'] = 'European'
ea_only_snps = ea_only_snps[['Chromosome','Position','EA','NEA','BETA','P','EAF']].rename(columns={
    'EA':'effect_allele','NEA':'other_allele',
    'BETA':'beta','P':'p_value','EAF':'effect_allele_frequency'})
ea_only_snps['ancestry'] = 'East Asian'
check_snps_all = pd.concat([lat23_snps,eur_snps,ea_only_snps])

In [62]:
check_snps_all

Unnamed: 0,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry,SNP
934915,1,155206037,G,A,-0.583414,0.0078148,0.99459,Latino,
934916,1,155206167,T,C,0.80083,9.57191e-06,0.00589,Latino,
1439210,12,40340400,a,g,2.4289,3.61e-148,0.0015,European,rs34637584
4191451,1,155236246,a,g,0.3619,1.42e-07,0.0109,European,rs75548401
4191452,1,155236376,t,c,0.6357,9.124999999999999e-48,0.0162,European,rs2230288
3151430,12,40757328,A,G,0.6634,4.68e-24,0.036058,East Asian,


In [37]:
# Adding rsID, Locus, and Protein change for these variants
check_snps_all['SNP'] = check_snps_all['Position'].map({155206037:'rs75548401',155206167:'rs2230288',
                                              155236246:'rs75548401',155236376:'rs2230288',
                                              40757328:'rs34778348',40340400:'rs34637584'})
check_snps_all['Locus'] = check_snps_all['Position'].map({155206037:'GBA1',155206167:'GBA1',
                                              155236246:'GBA1',155236376:'GBA1',
                                              40757328:'LRRK2',40340400:'LRRK2'})
check_snps_all['Protein change'] = check_snps_all['Position'].map({155206037:'T369M',155206167:'E326K',
                                              155236246:'T369M',155236376:'E326K',
                                              40757328:'G2385R',40340400:'G2019S'})

In [38]:
# Some alleles are lowercase -> convert to uppercase
check_snps_all['effect_allele'] = check_snps_all['effect_allele'].str.upper()
check_snps_all['other_allele'] = check_snps_all['other_allele'].str.upper()
check_snps_all

Unnamed: 0,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry,SNP,Locus,Protein change
934915,1,155206037,G,A,-0.583414,0.0078148,0.99459,Latino,rs75548401,GBA1,T369M
934916,1,155206167,T,C,0.80083,9.57191e-06,0.00589,Latino,rs2230288,GBA1,E326K
1439210,12,40340400,A,G,2.4289,3.61e-148,0.0015,European,rs34637584,LRRK2,G2019S
4191451,1,155236246,A,G,0.3619,1.42e-07,0.0109,European,rs75548401,GBA1,T369M
4191452,1,155236376,T,C,0.6357,9.124999999999999e-48,0.0162,European,rs2230288,GBA1,E326K
3151430,12,40757328,A,G,0.6634,4.68e-24,0.036058,East Asian,rs34778348,LRRK2,G2385R


In [39]:
# Negative beta -> flipped alleles, new maf = 1 - maf
check_snps_all['eaf_risk'] = check_snps_all['effect_allele_frequency'].where(check_snps_all['beta'] > 0, 1 - check_snps_all['effect_allele_frequency'])

# Absolute value of beta
check_snps_all['beta_abs'] = check_snps_all['beta'].abs()

# Risk allele
check_snps_all['risk_allele'] = np.where(check_snps_all['beta'] < 0, check_snps_all['other_allele'], check_snps_all['effect_allele'])

# Calculate the new odds ratio with beta_abs
check_snps_all['odds_ratio_new'] = np.exp(check_snps_all['beta_abs'])

#Calculate the population attributable risk (PAR = MAF(OR - 1)/1 + MAF(OR - 1))
check_snps_all['par'] = (check_snps_all['eaf_risk']*(check_snps_all['odds_ratio_new']-1)) / (1 + check_snps_all['eaf_risk']*(check_snps_all['odds_ratio_new']-1))

In [40]:
check_snps_all

Unnamed: 0,Chromosome,Position,effect_allele,other_allele,beta,p_value,effect_allele_frequency,ancestry,SNP,Locus,Protein change,eaf_risk,beta_abs,risk_allele,odds_ratio_new,par
934915,1,155206037,G,A,-0.583414,0.0078148,0.99459,Latino,rs75548401,GBA1,T369M,0.00541,0.583414,A,1.792146,0.004267
934916,1,155206167,T,C,0.80083,9.57191e-06,0.00589,Latino,rs2230288,GBA1,E326K,0.00589,0.80083,T,2.227389,0.007177
1439210,12,40340400,A,G,2.4289,3.61e-148,0.0015,European,rs34637584,LRRK2,G2019S,0.0015,2.4289,A,11.346394,0.015282
4191451,1,155236246,A,G,0.3619,1.42e-07,0.0109,European,rs75548401,GBA1,T369M,0.0109,0.3619,A,1.436055,0.004731
4191452,1,155236376,T,C,0.6357,9.124999999999999e-48,0.0162,European,rs2230288,GBA1,E326K,0.0162,0.6357,T,1.888344,0.014187
3151430,12,40757328,A,G,0.6634,4.68e-24,0.036058,East Asian,rs34778348,LRRK2,G2385R,0.036058,0.6634,A,1.941382,0.03283


In [41]:
# Save path
path = '{WORK_DIR}/PD/known_variants/'

# Filename and path
file = f'{today}_PD_PAR_lrrk2_gba1.csv'
full_path = os.path.join(path, file)

# Export the dataframe to CSV
check_snps_all.to_csv(full_path, index=False, header=True)

print(f'Data exported as {today}')