In [1]:
import pandas as pd

variants = pd.read_csv("data/variants.tsv", sep='\t')
variants.head()

6346


Unnamed: 0,#dbSNP_hg38_chr,dbSNP_hg38_position,Top SNP,P-value,LocusName,RA 1(Reported Allele 1),nonref_allele,nonref_effect,OR_nonref,nearest_gene_symb,Study type,Study Design,Pubmed PMID,Population_map,Cohort_simple3,Sample size,Analysis group,Phenotype,Phenotype-derived,most_severe_consequence
0,chr1,6434683,rs12074379,0.00726,ESPN,T,T,NR,,ESPN,SNP-based,Disease risk,30636644,Caucasian,"ADGC, CHS, CHARGE, HRS",10191,Plan 3 (only females),AD,AD,intron_variant
1,chr1,6434683,rs12074379,8.509999999999999e-40,NR,T,T,NR,,ESPN,SNP-based,eQTL,30636644,Caucasian,"ADGC, CHS, CHARGE, HRS",10191,Plan 3 (only females),ESPN (ILMN_1806710) expression,Expression,intron_variant
2,chr1,8708071,rs112053331,0.0009,RERE,NR,NR,NR,,RERE,SNP-based,Cross phenotype,30010129,Caucasian,IGAP,54162,All,AD,AD,intron_variant
3,chr1,8708071,rs112053331,0.08392,,NR,NR,NR,,RERE,Gene-based,Cross phenotype,30010129,Caucasian,IGAP,54162,All,AD,AD,intron_variant
4,chr1,11487007,rs2379135,0.0156,PTCHD2,NR,NR,NR,,DISP3,SNP-based,Endophenotype,22245343,Caucasian,ADNI,757,All,MRI,Imaging,intron_variant


In [2]:
# order variants by p-value and select the top 150 variants
top_variants = variants.sort_values('P-value')
top_variants['Top SNP']

2206    rs17182607
3374     rs2075650
1182    rs11602981
4870    rs62341097
1886     rs1629316
           ...    
4629    rs11711889
5418    rs10271466
5419    rs10271466
6163     rs6477258
6164     rs6477258
Name: Top SNP, Length: 6346, dtype: object

In [3]:
cohort = pd.read_csv("data/cohort_tabular.csv")
# filter where snps are empty
cohort = cohort[cohort['rs2075650'].notna() & cohort['DIAGNOSIS'].notna() & cohort['MMSCORE'].notna()]
cohort

Unnamed: 0,subject_id,visit,PTEDUCAT,DIAGNOSIS,MMSCORE,BCPREDX,subject_age,PTGENDER,rs2075650,rs11136000,...,rs11767557,rs2899472,rs6583826,rs11568563,rs2830500,rs1800795,rs7185636,rs3818361,rs7741604,rs4735340
948,041_S_4004,sc,14.0,2.0,30.0,,66.80,2.0,AA,CC,...,CT,CC,AG,TT,CC,CG,TT,AG,AC,TT
949,098_S_0172,m60,18.0,1.0,29.0,,70.63,2.0,AA,CT,...,TT,CC,GG,TT,AC,GG,TT,GG,AA,AT
950,012_S_4012,sc,16.0,2.0,29.0,,68.89,2.0,AA,CT,...,TT,CC,AG,TT,AC,CG,CT,GG,AA,AT
951,116_S_4010,sc,18.0,1.0,29.0,,70.80,2.0,AA,CT,...,CT,CC,AG,GG,AC,GG,CT,GG,AA,TT
952,023_S_1190,m48,13.0,1.0,29.0,,76.55,2.0,AA,CT,...,CT,CC,AA,TT,AC,CC,TT,GG,AA,AT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15233,033_S_6352,m78,,2.0,28.0,,71.42,1.0,AA,CT,...,TT,CC,AA,TT,,GG,,GG,,
15241,941_S_6080,m84,,2.0,30.0,,76.90,2.0,AA,TT,...,CT,AC,AG,TT,,CC,,GG,,
15245,941_S_6499,m78,,1.0,26.0,,69.52,2.0,AA,CT,...,TT,CC,GG,TT,,GG,,AG,,
15248,007_S_2394,m162,,2.0,29.0,,67.49,1.0,AG,CT,...,TT,AC,AG,TT,AC,CG,TT,GG,AA,AT


In [4]:
# get unique subject ids
unique_subjects = cohort['subject_id'].unique()
len(unique_subjects)

1129

In [9]:
# get snp files in data/snps
import glob

snp_files = glob.glob("data/snps/*.vcf")
subjects_with_snps = []

for file in snp_files:
    subject_id = file.split("/")[-1].replace("_SNPs.vcf", "")
    if subject_id in unique_subjects:
        subjects_with_snps.append(subject_id)

print(f"Number of subjects with snps: {len(subjects_with_snps)}")

Number of subjects with snps: 22


In [30]:
top_k_snps = 1500
top_snps = top_variants['Top SNP'].head(top_k_snps).tolist()
top_snps = set(top_snps)

In [43]:
import vcfpy
snp_sets = []
n_worker = 8

def get_snps_from_vcf(vcf_file, top_snps):
    matching_snps = []

    reader = vcfpy.Reader.from_path(vcf_file)
    # check if snp is in top snps
    for record in reader:
        # check if any id is in the top snps
        if not set(record.ID).isdisjoint(top_snps):
            matching_snps.append(record.ID[0])
    snp_sets.append(set(matching_snps))

for vcf_file in snp_files:
    get_snps_from_vcf(vcf_file, top_snps)

ERROR! Session/line number was not unique in database. History logging moved to new session 219


In [45]:
# get the intersection of all snp sets
common_snps = snp_sets[0].intersection(*snp_sets)
len(common_snps)

53

In [46]:
union_snps = snp_sets[0].union(*snp_sets)
len(union_snps)

905