## Imports

In [39]:
import os
import subprocess
import pandas as pd

import warnings
# NOTE(review): with no category/module arguments this silences every warning
# for the rest of the kernel session, including any that pandas would emit
# from the data-manipulation cells below. Consider narrowing the filter to a
# specific category once the cells have been checked with warnings enabled.
warnings.filterwarnings('ignore')

## Get ADPD SNPs

In [40]:
# Load the stage-3 LD-buddies table (tab-separated) and list the columns it
# provides, so later cells can be read against the available fields.
ld_buddies_tsv = '/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/snps_final/191121_ld_buddies_table_stage3.tsv'
adpd_snps = pd.read_csv(ld_buddies_tsv, sep='\t')
display(adpd_snps.columns)

Index(['chr', 'pos', 'r2_with_ld_tag', 'ld_tag_chr', 'ld_tag_pos',
       'source_gwas', 'source', 'snp_id', 'locus_num', 'number_ld_buddies',
       'ld_tag_locus', 'rsid', 'chrom_hg19', 'snp_pos_hg19', 'file',
       'effect_allele', 'noneffect_allele', 'direction', 'pvalue', 'has_coloc',
       'direct_atac_overlap_narrow_tissue_regions',
       'containing_atac_tissues_narrow_tissue_regions',
       'nearest_atac_tissue_narrow_tissue_regions',
       'start_narrow_tissue_regions', 'end_narrow_tissue_regions',
       'dist_narrow_tissue_regions',
       'direct_atac_overlap_broad_tissue_regions',
       'containing_atac_tissues_broad_tissue_regions',
       'nearest_atac_tissue_broad_tissue_regions',
       'start_broad_tissue_regions', 'end_broad_tissue_regions',
       'dist_broad_tissue_regions', 'direct_atac_overlap_single_cell',
       'containing_atac_tissues_single_cell',
       'nearest_atac_tissue_single_cell', 'start_single_cell',
       'end_single_cell', 'dist_single_cel

## Make hg19 SNP BED files

In [41]:
def _write_hg19_bed(snps, out_path, chr_prefix):
    """Build a 3-column hg19 BED table from the SNP table and write it to disk.

    Parameters
    ----------
    snps : pandas.DataFrame
        Table with `chrom_hg19` and `snp_pos_hg19` columns.
    out_path : str
        Destination of the tab-separated, header-less BED file.
    chr_prefix : bool
        If True the chromosome is written as 'chr<N>', otherwise as '<N>'.

    Returns
    -------
    pandas.DataFrame
        The sorted, de-duplicated table with columns
        ['chrom_hg19', 'start', 'snp_pos_hg19'], keeping the row labels of
        `snps`.
    """
    # Rows without an hg19 chromosome or position cannot form an interval;
    # dropping on both columns also keeps the integer casts below from
    # failing on a missing position.
    located = snps.dropna(subset=['chrom_hg19', 'snp_pos_hg19'])

    # The cast goes through int first so a float-typed chromosome such as 1.0
    # is written as '1' rather than '1.0'.
    chrom = located['chrom_hg19'].astype(int).astype(str)
    if chr_prefix:
        chrom = 'chr' + chrom
    end = located['snp_pos_hg19'].astype(int)

    # Build a fresh frame instead of assigning columns onto a slice of `snps`.
    bed = (
        pd.DataFrame({'chrom_hg19': chrom, 'start': end - 1, 'snp_pos_hg19': end})
        .sort_values(by=['chrom_hg19', 'start', 'snp_pos_hg19'])
        .drop_duplicates()
    )
    bed.to_csv(out_path, sep='\t', index=False, header=False)
    return bed


# 'chr'-prefixed chromosome names.
snps_bed_hg19 = _write_hg19_bed(
    adpd_snps,
    '/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allele_bias/snps/hg19_snps.bed',
    chr_prefix=True,
)
display(snps_bed_hg19)

# Same intervals with bare chromosome numbers (no 'chr' prefix).
nochr_snps_bed_hg19 = _write_hg19_bed(
    adpd_snps,
    '/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allele_bias/snps/nochr_hg19_snps.bed',
    chr_prefix=False,
)

Unnamed: 0,chrom_hg19,start,snp_pos_hg19
2334,chr1,40287822,40287823
2335,chr1,40287916,40287917
2336,chr1,40287931,40287932
2337,chr1,40288463,40288464
2338,chr1,40290896,40290897
...,...,...,...
13768,chr9,34036528,34036529
13769,chr9,34039704,34039705
13770,chr9,34040773,34040774
13771,chr9,34046390,34046391


## Get hg38 Coordinates for 1KG intersected SNPs

In [48]:
# Read the header-less, tab-separated VCF records and keep positional columns
# 2-7 under readable names, then order by rsid/ref and drop exact duplicates.
vcf_field_names = {2: 'rsid', 3: 'ref', 4: 'alt', 5: 'score', 6: 'qc', 7: 'info'}
hg19_vcf = (
    pd.read_csv(
        '/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allele_bias/snps/1KG_ADPD_hg19_snps.vcf',
        header=None,
        sep='\t',
    )[list(vcf_field_names)]
    .rename(columns=vcf_field_names)
    .sort_values(by=['rsid', 'ref'])
    .drop_duplicates()
)
hg19_vcf

Unnamed: 0,rsid,ref,alt,score,qc,info
1230,.;esv3626682,T,"<CN0>,<CN2>",100,PASS,"AC=0,1;AF=0,0.000199681;AN=5008;CS=DUP_gs;END=..."
9680,DUP_uwash_chr5_60140563_60179087,A,<CN0>,100,PASS,AC=0;AF=0;AN=5008;CS=DUP_uwash;END=60179087;NS...
0,esv3585792,G,<CN0>,100,PASS,"AC=1;AF=0.000199681;AN=5008;CIEND=0,500;CIPOS=..."
45,esv3585794,C,<CN0>,100,PASS,"AC=1;AF=0.000199681;AN=5008;CIEND=-23,24;CIPOS..."
64,esv3587615,T,<CN2>,100,PASS,"AC=1;AF=0.000199681;AN=5008;CIEND=-150,150;CIP..."
...,...,...,...,...,...,...
8127,rs9984928,G,A,100,PASS,AC=1660;AF=0.33147;AN=5008;NS=2504;DP=16963;EA...
7588,rs9989786,A,G,100,PASS,AC=3719;AF=0.742612;AN=5008;NS=2504;DP=19926;E...
8852,rs9994159,C,T,100,PASS,AC=599;AF=0.119609;AN=5008;NS=2504;DP=20517;EA...
9357,rs9995651,C,G,100,PASS,AC=310;AF=0.061901;AN=5008;NS=2504;DP=13879;EA...


In [49]:
# Narrow the SNP table to the coordinate columns and rsid, ordered by position
# with exact duplicate rows removed.
# Note: this rebinds `adpd_snps`, so the other columns (e.g. `chrom_hg19`) are
# no longer available under this name after the cell has run.
adpd_snps = (
    adpd_snps.loc[:, ['chr', 'pos', 'rsid']]
    .sort_values(by=['chr', 'pos'])
    .drop_duplicates()
)
adpd_snps

Unnamed: 0,chr,pos,rsid
2334,1,39822151,rs34640847
2335,1,39822245,rs36015266
2336,1,39822260,rs61779808
2337,1,39822792,rs61779809
2338,1,39825225,rs72666941
...,...,...,...
10156,21,37517450,rs2236688
10157,21,37517672,rs2835776
10158,21,37519947,rs11701836
10159,21,37527140,rs7280075


In [53]:
# Attach the VCF alleles/INFO to the SNP coordinates by rsid (inner join, so
# SNPs without a matching VCF record are not carried forward).
merged = adpd_snps.merge(hg19_vcf, on='rsid')
merged['chr'] = 'chr' + merged['chr'].astype('str')
merged['start'] = merged['pos'] - 1

# Keep only records whose ref and alt are both a single character; this
# removes indels, multi-allelic entries ("A,G") and symbolic alleles ("<CN0>").
is_single_base = (merged['ref'].str.len() < 2) & (merged['alt'].str.len() < 2)
merged = merged.loc[is_single_base]

# Work on an explicit copy so the column assignment below never targets a
# view of `merged`.
hg38_bed = merged[['chr', 'start', 'pos', 'rsid', 'ref', 'alt', 'info']].copy()

# Replace the full INFO string with just the EUR_AF value (the text between
# 'EUR_AF=' and the next ';').
eur_af = hg38_bed['info'].str.extract(r'EUR_AF=([^;]*)', expand=False)
if eur_af.isna().any():
    missing_rsids = hg38_bed.loc[eur_af.isna(), 'rsid'].unique().tolist()
    raise ValueError(
        'INFO field has no EUR_AF entry for %d record(s), e.g. rsid(s): %s'
        % (int(eur_af.isna().sum()), missing_rsids[:5])
    )
hg38_bed['info'] = eur_af

hg38_bed = hg38_bed.sort_values(by=['chr', 'pos']).drop_duplicates()
hg38_bed.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allele_bias/snps/hg38_snps_1KG_ADPD.bed', header=False, index=False, sep='\t')