In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di

display(HTML("<style>.container { width:100% !important; }</style>"))

import hail as hl

hl.init(
    tmp_dir='/net/ascratch/people/plggosborcz/gosborcz-hail',
    spark_conf={'spark.driver.memory': '30G', 'spark.executor.memory': '30G'},
    default_reference='GRCh38') 



2023-02-27 20:00:45.724 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.1.3
SparkUI available at http://ac0089:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.105-acd89e80c345
LOGGING: writing to /net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/analysis/burden-and-family/hail-20230227-2000-0.2.105-acd89e80c345.log


In [2]:
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
hl.plot.output_notebook()


import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain

from bokeh.plotting import output_notebook, show, figure
from bokeh.palettes import viridis

output_notebook() 

## 1. Get number of variants with CADD > 30

In [3]:
mt = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-anno.mt')

In [4]:
pheno = hl.import_table(
    '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/pheno/GTS-coded-corrected-june-2021.csv',
    impute = True,
    key='ID',
    delimiter = ',',
    quote ="\""
)

2023-02-27 11:42:52.741 Hail: INFO: Reading table to impute column types
2023-02-27 11:42:53.417 Hail: INFO: Finished type imputation
  Loading field 'ID' as type str (imputed)
  Loading field 'family' as type str (imputed)
  Loading field 'sex' as type str (imputed)
  Loading field 'kinship' as type str (imputed)
  Loading field 'disease' as type str (imputed)
  Loading field 'phenotype' as type str (imputed)
  Loading field 'add_pheno' as type str (imputed)
  Loading field 'heavy_tics' as type str (imputed)
  Loading field 'heavy_tics_familial' as type str (imputed)
  Loading field 'GTS_ASD_group' as type str (imputed)
  Loading field 'nonCTD' as type str (imputed)


In [5]:
mt = mt.annotate_cols(phenotypes = pheno[mt.s])

In [6]:
fams = list(set(mt.phenotypes.family.collect()))

In [7]:
fams.sort()

In [8]:
mts = []
for f in fams:
    fam = hl.read_matrix_table(
        '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-filtered'+f+'.mt'
    )
    mts.append(fam)

In [None]:
i = 20

while len(mts) > 1:
    i = i+1
    res = mts.pop(0).union_cols(mts.pop(0),
                                row_join_type='outer')
    res.write('/net/ascratch/people/plggosborcz/temp_'+str(i)+'.mt')
    res = hl.read_matrix_table('/net/ascratch/people/plggosborcz/temp_'+str(i)+'.mt') 
    mts.append(res)



In [None]:
mt = mts[0]

In [None]:
mt.write('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-joined.mt')

## 2. Get probs of a cadd > 30 variant occuring in each of the families

In [3]:
mt = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-joined.mt')

In [4]:
mt.count()

(4476239, 124)

In [16]:
mafs = mt.aggregate_rows(
    hl.agg.filter((mt.cadd.score_phred > 30) & (mt.gnomad_v_3_1.freq.AF[2]<0.01),
                  hl.agg.collect(mt.gnomad_v_3_1.freq.AF[2])
                 )
)



In [18]:
len(mafs)

67

In [None]:
#calculate probability of any variant with CADD > 30 and MAF < 0.01%. occuring in one family

In [23]:
np.sum(mafs) #but this overcounts joined events we have to deduct at least all the cases when they are present together

0.09318025229243268

In [42]:
anti_sum = []

for maf in mafs:
    anti_sum.append(1-maf)

In [47]:
1 - np.prod(anti_sum)

0.08916132722517234

In [44]:
#calculate probablities for all pairs and sum them

probs = []

for idx, m in enumerate(mafs):

    
    for i in range(idx+1,len(mafs)): 
        prob = m*mafs[i]
        probs.append(prob)

In [39]:
np.sum(probs)

0.004132811296717947

In [40]:
np.sum(mafs) - np.sum(probs)

0.08904744099571474

## 2. Get probs of 1% CADD > 10 variant occuring in each of the families

In [3]:
mt = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-joined.mt')

In [4]:
mt.count()

(4476239, 124)

In [5]:
mafs = mt.aggregate_rows(
    hl.agg.filter((mt.cadd.score_phred > 10) & (mt.gnomad_v_3_1.freq.AF[2]<0.01),
                  hl.agg.collect(mt.gnomad_v_3_1.freq.AF[2])
                 )
)



In [6]:
len(mafs)

6717

In [7]:
#calculate probability of any variant with MAF < 0.01%. occuring in one family

In [10]:
anti_sum = []

for maf in mafs:
    anti_sum.append(1-maf)

1 - np.prod(anti_sum)

## 3. Get probs of ultrarare, rare and uncommon variants occuring in one person in each of the genes from the list

In [3]:
mt = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-joined.mt')

In [4]:
genes_found = hl.import_table('all_genes_dups.csv', 
                       delimiter=';',
                       impute=True)

2023-02-27 20:01:22.170 Hail: INFO: Reading table to impute column types
2023-02-27 20:01:22.888 Hail: INFO: Finished type imputation
  Loading field 'locus' as type str (imputed)
  Loading field 'alleles' as type str (imputed)
  Loading field 'within_gene' as type str (imputed)
  Loading field 'cadd' as type float64 (imputed)
  Loading field 'gnomad_v3_nfe_af' as type float64 (imputed)
  Loading field 'family_non_ref' as type str (imputed)


In [5]:
genes_lists = hl.import_table('gts_gene_results.csv', 
                       delimiter=',',
                       impute=True)

2023-02-27 20:01:23.390 Hail: INFO: Reading table to impute column types
2023-02-27 20:01:23.728 Hail: INFO: Finished type imputation
  Loading field 'UR_2' as type str (imputed)
  Loading field 'UR_3' as type str (imputed)
  Loading field 'UR_4' as type str (imputed)
  Loading field 'R_3' as type str (imputed)
  Loading field 'R_4' as type str (imputed)
  Loading field 'U_4' as type str (imputed)
  Loading field 'U_5' as type str (imputed)
  Loading field 'U_6' as type str (imputed)
  Loading field 'U_7' as type str (imputed)


In [6]:
for_calcs = []

for n in range(2,5):
    print("UR_"+str(n))
    for_calcs.append(
    genes_lists["UR_"+str(n)].collect()
    )

for n in range(3,5):
    print("R_"+str(n))
    for_calcs.append(
    genes_lists["R_"+str(n)].collect()
    )

for n in range(4,8):
    print("U_"+str(n))
    for_calcs.append(
    genes_lists["U_"+str(n)].collect()
    )

UR_2
UR_3
UR_4
R_3
R_4
U_4
U_5
U_6
U_7


In [7]:
for_calcs = hl.flatten(hl.literal(for_calcs))

In [8]:
mt = mt.explode_rows(mt.within_gene)

In [9]:
mt = mt.filter_rows(
    hl.any(
        hl.literal(for_calcs).contains(mt.within_gene)
    )
)

In [35]:
mt.write('/net/ascratch/people/plggosborcz/gosborcz-hail/exploted-genes.mt', overwrite = True)

2023-02-27 11:49:09.565 Hail: INFO: wrote matrix table with 30357 rows and 124 columns in 50 partitions to /net/ascratch/people/plggosborcz/gosborcz-hail/exploted-genes.mt


In [10]:
mt = hl.read_matrix_table('/net/ascratch/people/plggosborcz/gosborcz-hail/exploted-genes.mt')

In [11]:
mafs = mt.aggregate_rows(
    hl.agg.group_by(
        mt.within_gene,
        hl.agg.filter(
            (mt.cadd.score_phred > 10) & (mt.gnomad_v_3_1.freq.AF[2]<0.05),
            hl.agg.collect(mt.gnomad_v_3_1.freq.AF[2])
        )
    )
)



In [12]:
final_mafs = {gene:
      mafs.get(gene)
      for gene in hl.eval(for_calcs)
      }

In [13]:
for key in final_mafs.keys():
    if final_mafs[key] == None:
        final_mafs[key] = [0.00001]   
    elif len(final_mafs[key]) == 0:
        final_mafs[key] = [0.00001]    

In [14]:
for gene in genes_lists['UR_2'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

for gene in genes_lists['UR_3'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

for gene in genes_lists['UR_4'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

for gene in genes_lists['R_3'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

for gene in genes_lists['R_4'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

for gene in genes_lists['U_4'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

for gene in genes_lists['U_5'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

for gene in genes_lists['U_6'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

for gene in genes_lists['U_7'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

0.07900376349569216
0.006169585042434256
0.14827309508682818
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
0.08440483169625557
0.2805628354870734
0.08389165070053883
0.10937174523250537
0.05727786755797648
0.08355627111080732
0.27432050849167045
0.09008870678895287
0.020081738261150783
0.004411437537158558
0.29572633593132325
9.99999999995449e-06
0.08630780792094517
0.4105368257558051
0.056958319287831016
0.27912457319413875
0.20535785406594487
0.14866664876660873
0.07033212676877287
0.04167401658140768
0.06695099341235111
7.351860020587964e-05
0.2602923176870958
0.1439098878283227
0.002337341604680576
0.11733790762923768
0.00221947849604609
9.99999999995449e-06
0.007364614570470995
0.085995452703994
0.12936751663502466
0.1179964840465576
0.2335659129356633
0.43373274304379417
0.043884139080668016
0.04835234583786907
0.05600398739918788
0.09681351136407867
0.027935942760726085
0.0
9.99999999995449e-06
0.06182375864774081
0.07340865574358257
9.99999

In [40]:
ur = genes_found.filter(genes_found.gnomad_v3_nfe_af < 0.001)
ur = ur.annotate(within_gene = ur.within_gene.split(","))
ur = ur.explode(ur.within_gene)

ur = ur.key_by(ur.family_non_ref, ur.within_gene)
ur = ur.distinct()

In [41]:
fams_ur = ur.group_by(ur.within_gene).aggregate(
    fams_non_ref = hl.agg.collect(ur.family_non_ref))

In [43]:
fams_dict = {
    "A":0.0625,
    "B":0.0156,
    "C":0.0078,
    "D":0.0156,
    "E":0.0625,
    "F":0.0078,
    "G":0.25,
    "H":0.125,
    "I":0.0626,
    "J":0.25,
    "R":0.125,
    "S":0.125,
    "T":0.5,
    "U":0.0625,
    "W":0.000976,
    "X":0.125,
    "Y":0.03125,
}

In [51]:
fams_ur = fams_ur.annotate(
    p_seg = hl.product(hl.map(lambda x: hl.dict(fams_dict).get(x), fams_ur.fams_non_ref))
)

In [52]:
fams_ur.filter(hl.len(fams_ur.fams_non_ref) == 2).show(n_rows=150)

2023-02-27 21:05:33.892 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-02-27 21:05:34.185 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""AC007402.1""","[""E"",""J""]",0.0156
"""AC007656.2""","[""U"",""X""]",0.00781
"""AC008691.1""","[""T"",""U""]",0.0313
"""AC046130.1""","[""G"",""J""]",0.0625
"""AC090912.1""","[""E"",""J""]",0.0156
"""AC090912.2""","[""E"",""J""]",0.0156
"""AC092844.1""","[""S"",""X""]",0.0156
"""AC092957.1""","[""H"",""I""]",0.00783
"""ADAMTSL1""","[""A"",""Y""]",0.00195
"""ADGRL3""","[""H"",""J""]",0.0313


In [53]:
fams_ur.filter(hl.len(fams_ur.fams_non_ref) == 3).show(n_rows=150)

2023-02-27 21:10:18.416 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-02-27 21:10:18.628 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""ANKS1B""","[""T"",""U"",""X""]",0.00391
"""EBF1""","[""J"",""U"",""X""]",0.00195
"""ESRRG""","[""G"",""H"",""X""]",0.00391
"""OPCML""","[""H"",""S"",""T""]",0.00781
"""PKP4""","[""F"",""J"",""T""]",0.000975
"""PRKG1""","[""E"",""H"",""I""]",0.000489
"""PTPRD""","[""G"",""T"",""X""]",0.0156
"""RASAL2""","[""H"",""J"",""X""]",0.00391
"""TPRG1""","[""D"",""H"",""J""]",0.000488
"""WWOX""","[""I"",""J"",""X""]",0.00196


In [54]:
fams_ur.filter(hl.len(fams_ur.fams_non_ref) == 4).show(n_rows=150)

2023-02-27 21:14:03.170 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-02-27 21:14:03.388 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""MACROD2""","[""D"",""G"",""I"",""T""]",0.000122
"""ROBO2""","[""A"",""H"",""R"",""X""]",0.000122


In [55]:
r = genes_found.filter(genes_found.gnomad_v3_nfe_af < 0.01)
r = r.annotate(within_gene = r.within_gene.split(","))
r = r.explode(r.within_gene)

r = r.key_by(r.family_non_ref, r.within_gene)
r = r.distinct()

In [56]:
fams_r = r.group_by(r.within_gene).aggregate(
    fams_non_ref = hl.agg.collect(r.family_non_ref))

In [57]:
fams_r = fams_r.annotate(
    p_seg = hl.product(hl.map(lambda x: hl.dict(fams_dict).get(x), fams_r.fams_non_ref))
)

In [58]:
fams_r.filter(hl.len(fams_r.fams_non_ref) == 3).show(n_rows=150)

2023-02-27 21:18:20.787 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-02-27 21:18:21.020 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""AC007656.2""","[""G"",""U"",""X""]",0.00195
"""AC011246.1""","[""H"",""J"",""W""]",3.05e-05
"""AC058822.1""","[""G"",""J"",""S""]",0.00781
"""AC079298.3""","[""H"",""J"",""X""]",0.00391
"""ADAMTSL1""","[""A"",""G"",""Y""]",0.000488
"""ADGRL2""","[""A"",""C"",""I""]",3.05e-05
"""ADK""","[""D"",""J"",""X""]",0.000488
"""AL356534.1""","[""G"",""H"",""J""]",0.00781
"""ALK""","[""J"",""R"",""U""]",0.00195
"""ANKS1B""","[""T"",""U"",""X""]",0.00391


In [59]:
fams_r.filter(hl.len(fams_r.fams_non_ref) == 4).show(n_rows=150)

2023-02-27 21:21:35.744 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-02-27 21:21:35.977 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""AL157944.1""","[""A"",""G"",""I"",""Y""]",3.06e-05
"""CDH23""","[""D"",""E"",""I"",""J""]",1.53e-05
"""EBF1""","[""J"",""T"",""U"",""X""]",0.000977
"""GNAS""","[""J"",""U"",""W"",""X""]",1.91e-06
"""KAT6B""","[""H"",""I"",""J"",""X""]",0.000245
"""LRMDA""","[""H"",""I"",""J"",""X""]",0.000245
"""MACROD2""","[""D"",""G"",""I"",""T""]",0.000122
"""RARB""","[""D"",""H"",""J"",""T""]",0.000244
"""ROBO2""","[""A"",""H"",""R"",""X""]",0.000122
"""SEMA3A""","[""E"",""H"",""J"",""R""]",0.000244


In [64]:
u = genes_found.filter(genes_found.gnomad_v3_nfe_af < 0.05)
u = u.annotate(within_gene = u.within_gene.split(","))
u = u.explode(u.within_gene)

u = u.key_by(u.family_non_ref, u.within_gene)
u = u.distinct()

In [66]:
fams_u = u.group_by(u.within_gene).aggregate(
    fams_non_ref = hl.agg.collect(u.family_non_ref))

In [67]:
fams_u = fams_u.annotate(
    p_seg = hl.product(hl.map(lambda x: hl.dict(fams_dict).get(x), fams_u.fams_non_ref))
)

In [68]:
fams_u.filter(hl.len(fams_u.fams_non_ref) == 4).show(n_rows=150)

2023-02-27 21:26:22.993 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-02-27 21:26:23.267 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""AC007402.1""","[""E"",""F"",""J"",""X""]",1.52e-05
"""AC079298.3""","[""H"",""J"",""R"",""X""]",0.000488
"""AC087379.1""","[""G"",""H"",""J"",""X""]",0.000977
"""AC093865.1""","[""E"",""G"",""S"",""X""]",0.000244
"""ADAMTSL1""","[""A"",""G"",""T"",""Y""]",0.000244
"""ADGRL2""","[""A"",""C"",""I"",""Y""]",9.54e-07
"""ADGRL3""","[""G"",""H"",""J"",""S""]",0.000977
"""ADK""","[""D"",""E"",""J"",""X""]",3.05e-05
"""AL022068.1""","[""E"",""G"",""T"",""X""]",0.000977
"""AL035078.4""","[""A"",""H"",""J"",""X""]",0.000244


In [69]:
fams_u.filter(hl.len(fams_u.fams_non_ref) == 5).show(n_rows=150)

2023-02-27 21:30:32.797 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-02-27 21:30:33.074 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""AC109466.1""","[""F"",""J"",""T"",""U"",""X""]",7.62e-06
"""AC126121.3""","[""A"",""D"",""G"",""H"",""J""]",7.62e-06
"""AL137230.2""","[""E"",""G"",""H"",""T"",""U""]",6.1e-05
"""AL162726.3""","[""F"",""G"",""J"",""S"",""X""]",7.62e-06
"""AL589740.1""","[""D"",""J"",""T"",""U"",""Y""]",3.81e-06
"""BTBD9""","[""D"",""G"",""H"",""T"",""X""]",3.05e-05
"""CACNA1C""","[""G"",""H"",""S"",""T"",""X""]",0.000244
"""CACNA2D3""","[""A"",""D"",""I"",""J"",""T""]",7.63e-06
"""CDH23""","[""D"",""E"",""H"",""I"",""J""]",1.91e-06
"""COL25A1""","[""G"",""H"",""J"",""X"",""Y""]",3.05e-05


In [70]:
fams_u.filter(hl.len(fams_u.fams_non_ref) == 6).show(n_rows=150)

2023-02-27 21:33:49.846 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-02-27 21:33:50.095 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""NTM""","[""A"",""E"",""S"",""T"",""U"",""X""]",1.91e-06


In [71]:
fams_u.filter(hl.len(fams_u.fams_non_ref) == 7).show(n_rows=150)

2023-02-27 21:37:21.335 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-02-27 21:37:21.614 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""MACROD2""","[""D"",""G"",""I"",""J"",""T"",""W"",""X""]",3.72e-09
