In [13]:
import numpy as np

# First, create GWAS bed file

In [8]:
sites = set()
with open('../data/gwas_catalog_v1.0-associations_e98_r2020-02-08.tsv', 'r') as f:
    next(f) # skip header
    for line in f:
        pieces = line.strip().split('\t')
        if pieces[12] == '' or 'x' in pieces[12]:
            pass
        elif ';' in pieces[12]:
            sites.update([(pieces[11], int(pos)) for pos in pieces[12].split(';')])
        else:
            sites.add((pieces[11], int(pieces[12])))
print('Sites', len(sites))

Sites 111564


In [10]:
sites = sorted(sites)
with open('../data/gwas_catalog_sites.bed', 'w+') as f:
    for site in sites:
        f.write('chr%s:%d-%d\n' % (site[0], site[1]-1, site[1]))

# Now, quantify overlaps

In [18]:
chroms = [str(x) for x in range(1, 23)]

def fraction_of_sites(data_dir, ref_version):
    overlapping_sites = 0
    total_sites = 0
    for chrom in chroms:
        print(chrom)

        # pull gwas sites
        sites = set()
        with open('../data/gwas_catalog_sites%s.bed' % ref_version, 'r') as f:
            for line in f:
                pieces = line.strip().split(':')
                if pieces[0][3:] == chrom:
                    sites.add(int(pieces[1].split('-')[1]))
        sites = sorted(sites)

        # pull snp positions
        pos_data = np.load('%s/chr.%s.gen.coordinates.npy' % (data_dir, chrom))
        is_snp = pos_data[:, 2].astype(bool)
        is_pass = pos_data[:, 3].astype(bool)
        #is_pass = np.ones(is_snp.shape, dtype=bool)
        snp_positions = pos_data[:, 1]
        print('Sites pulled from vcf:', snp_positions.shape[0])

        snp_positions = snp_positions[is_snp & is_pass]
        print('Removed %d sites that are not bi-allelic SNPs' % np.sum(~is_snp))
        print('Removed %d sites that do not pass GATK' % np.sum(is_snp & ~is_pass))

        print('Final matrix', snp_positions.shape)

        overlapping_sites += np.sum(np.isin(sites, snp_positions))
        total_sites += len(sites)

    return overlapping_sites/total_sites

In [19]:
data_dir = '../split_gen_ihart'
ref_version = '37'

ihart_p = fraction_of_sites('../split_gen_ihart', '37')
spark_exome_p = fraction_of_sites('../split_gen_spark_exome', '38')
spark_p = fraction_of_sites('../split_gen_spark', '38')


1
Sites pulled from vcf: 6256374
Removed 521542 sites that are not bi-allelic SNPs
Removed 277807 sites that do not pass GATK
Final matrix (5457025,)
2
Sites pulled from vcf: 6686571
Removed 555889 sites that are not bi-allelic SNPs
Removed 175037 sites that do not pass GATK
Final matrix (5955645,)
3
Sites pulled from vcf: 5454928
Removed 450460 sites that are not bi-allelic SNPs
Removed 68971 sites that do not pass GATK
Final matrix (4935497,)
4
Sites pulled from vcf: 5513425
Removed 455548 sites that are not bi-allelic SNPs
Removed 122178 sites that do not pass GATK
Final matrix (4935699,)
5
Sites pulled from vcf: 5017956
Removed 412536 sites that are not bi-allelic SNPs
Removed 83225 sites that do not pass GATK
Final matrix (4522195,)
6
Sites pulled from vcf: 4737122
Removed 405438 sites that are not bi-allelic SNPs
Removed 57838 sites that do not pass GATK
Final matrix (4273846,)
7
Sites pulled from vcf: 4575691
Removed 385370 sites that are not bi-allelic SNPs
Removed 153013 sites

Sites pulled from vcf: 18845
Removed 227 sites that are not bi-allelic SNPs
Removed 480 sites that do not pass GATK
Final matrix (18138,)
16
Sites pulled from vcf: 20428
Removed 238 sites that are not bi-allelic SNPs
Removed 367 sites that do not pass GATK
Final matrix (19823,)
17
Sites pulled from vcf: 18949
Removed 397 sites that are not bi-allelic SNPs
Removed 332 sites that do not pass GATK
Final matrix (18220,)
18
Sites pulled from vcf: 17820
Removed 100 sites that are not bi-allelic SNPs
Removed 723 sites that do not pass GATK
Final matrix (16997,)
19
Sites pulled from vcf: 14202
Removed 157 sites that are not bi-allelic SNPs
Removed 228 sites that do not pass GATK
Final matrix (13817,)
20
Sites pulled from vcf: 14888
Removed 93 sites that are not bi-allelic SNPs
Removed 292 sites that do not pass GATK
Final matrix (14503,)
21
Sites pulled from vcf: 8654
Removed 61 sites that are not bi-allelic SNPs
Removed 399 sites that do not pass GATK
Final matrix (8194,)
22
Sites pulled from

In [26]:
ihart_chip_p = fraction_of_sites('../split_gen_ihart_chip', '37')
ssc_p = fraction_of_sites('../split_gen_ssc', '37')


1
Sites pulled from vcf: 49067
Removed 9414 sites that are not bi-allelic SNPs
Removed 137 sites that do not pass GATK
Final matrix (39516,)
2
Sites pulled from vcf: 41971
Removed 6716 sites that are not bi-allelic SNPs
Removed 111 sites that do not pass GATK
Final matrix (35144,)
3
Sites pulled from vcf: 35363
Removed 5912 sites that are not bi-allelic SNPs
Removed 104 sites that do not pass GATK
Final matrix (29347,)
4
Sites pulled from vcf: 29203
Removed 4092 sites that are not bi-allelic SNPs
Removed 92 sites that do not pass GATK
Final matrix (25019,)
5
Sites pulled from vcf: 29463
Removed 4528 sites that are not bi-allelic SNPs
Removed 97 sites that do not pass GATK
Final matrix (24838,)
6
Sites pulled from vcf: 34243
Removed 5039 sites that are not bi-allelic SNPs
Removed 136 sites that do not pass GATK
Final matrix (29068,)
7
Sites pulled from vcf: 27498
Removed 4299 sites that are not bi-allelic SNPs
Removed 112 sites that do not pass GATK
Final matrix (23087,)
8
Sites pulled 

In [24]:
print(ihart_p, spark_exome_p, spark_p)

0.9190769319971782 0.05058414598822053 0.15546007531138362


In [27]:
print(ihart_chip_p, ssc_p)

0.14385249466085562 0.9262376668180632
