In [2]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di

display(HTML("<style>.container { width:100% !important; }</style>"))

import hail as hl

hl.init(
    tmp_dir='/net/ascratch/people/plggosborcz/gosborcz-hail',
    spark_conf={'spark.driver.memory': '30G', 'spark.executor.memory': '30G'},
    default_reference='GRCh38') 



2023-03-01 21:22:45.801 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.1.3
SparkUI available at http://ac0713:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.105-acd89e80c345
LOGGING: writing to /net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/analysis/burden-and-family/hail-20230301-2122-0.2.105-acd89e80c345.log


In [3]:
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
hl.plot.output_notebook()


import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain

from bokeh.plotting import output_notebook, show, figure
from bokeh.palettes import viridis

output_notebook() 

## 1. Get number of variants with CADD > 30

In [3]:
mt = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-anno.mt')

In [4]:
pheno = hl.import_table(
    '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/pheno/GTS-coded-corrected-june-2021.csv',
    impute = True,
    key='ID',
    delimiter = ',',
    quote ="\""
)

2023-02-27 11:42:52.741 Hail: INFO: Reading table to impute column types
2023-02-27 11:42:53.417 Hail: INFO: Finished type imputation
  Loading field 'ID' as type str (imputed)
  Loading field 'family' as type str (imputed)
  Loading field 'sex' as type str (imputed)
  Loading field 'kinship' as type str (imputed)
  Loading field 'disease' as type str (imputed)
  Loading field 'phenotype' as type str (imputed)
  Loading field 'add_pheno' as type str (imputed)
  Loading field 'heavy_tics' as type str (imputed)
  Loading field 'heavy_tics_familial' as type str (imputed)
  Loading field 'GTS_ASD_group' as type str (imputed)
  Loading field 'nonCTD' as type str (imputed)


In [5]:
mt = mt.annotate_cols(phenotypes = pheno[mt.s])

In [6]:
fams = list(set(mt.phenotypes.family.collect()))

In [7]:
fams.sort()

In [8]:
mts = []
for f in fams:
    fam = hl.read_matrix_table(
        '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-filtered'+f+'.mt'
    )
    mts.append(fam)

In [None]:
i = 20

while len(mts) > 1:
    i = i+1
    res = mts.pop(0).union_cols(mts.pop(0),
                                row_join_type='outer')
    res.write('/net/ascratch/people/plggosborcz/temp_'+str(i)+'.mt')
    res = hl.read_matrix_table('/net/ascratch/people/plggosborcz/temp_'+str(i)+'.mt') 
    mts.append(res)



In [None]:
mt = mts[0]

In [None]:
mt.write('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-joined.mt')

## 2. Get probs of a cadd > 30 variant occuring in each of the families

In [3]:
mt = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-joined.mt')

In [4]:
mt.count()

(4476239, 124)

In [16]:
mafs = mt.aggregate_rows(
    hl.agg.filter((mt.cadd.score_phred > 30) & (mt.gnomad_v_3_1.freq.AF[2]<0.01),
                  hl.agg.collect(mt.gnomad_v_3_1.freq.AF[2])
                 )
)



In [18]:
len(mafs)

67

In [None]:
#calculate probability of any variant with CADD > 30 and MAF < 0.01%. occuring in one family

In [23]:
np.sum(mafs) #but this overcounts joined events we have to deduct at least all the cases when they are present together

0.09318025229243268

In [42]:
anti_sum = []

for maf in mafs:
    anti_sum.append(1-maf)

In [47]:
1 - np.prod(anti_sum)

0.08916132722517234

In [44]:
#calculate probablities for all pairs and sum them

probs = []

for idx, m in enumerate(mafs):

    
    for i in range(idx+1,len(mafs)): 
        prob = m*mafs[i]
        probs.append(prob)

In [39]:
np.sum(probs)

0.004132811296717947

In [40]:
np.sum(mafs) - np.sum(probs)

0.08904744099571474

## 2. Get probs of 1% CADD > 10 variant occuring in each of the families

In [3]:
mt = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-joined.mt')

In [4]:
mt.count()

(4476239, 124)

In [5]:
mafs = mt.aggregate_rows(
    hl.agg.filter((mt.cadd.score_phred > 10) & (mt.gnomad_v_3_1.freq.AF[2]<0.01),
                  hl.agg.collect(mt.gnomad_v_3_1.freq.AF[2])
                 )
)



In [6]:
len(mafs)

6717

In [7]:
#calculate probability of any variant with MAF < 0.01%. occuring in one family

In [10]:
anti_sum = []

for maf in mafs:
    anti_sum.append(1-maf)

1 - np.prod(anti_sum)

## 3. Get probs of ultrarare, rare and uncommon variants occuring in one person in each of the genes from the list

In [4]:
mt = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-joined.mt')

In [38]:
df = pd.read_excel('all_genes_dups.xlsx')

In [43]:
df = df[['locus', 'alleles', 'within_gene', 'gnomad_v3_nfe_af', 'family_non_ref']]

In [61]:
df['within_gene'] = df['within_gene'].str.replace('[', '').str.replace(']', '').str.replace("\"", "")

  df['within_gene'] = df['within_gene'].str.replace('[', '').str.replace(']', '').str.replace("\"", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['within_gene'] = df['within_gene'].str.replace('[', '').str.replace(']', '').str.replace("\"", "")


In [62]:
df

Unnamed: 0,locus,alleles,within_gene,gnomad_v3_nfe_af,family_non_ref
0,chr12:101319592,"[""G"",""A""]",UTP20,0.0,G
1,chr12:1885984,"[""G"",""A""]",CACNA2D4,0.0,H
2,chr3:9751135,"[""T"",""C""]",OGG1,0.0,I
3,chr10:96370115,"[""C"",""G""]",TLL2,0.0,X
4,chr12:34024062,"[""G"",""A""]","AC046130.1,ALG10",0.0,J
...,...,...,...,...,...
8277,chr8:9675439,"[""T"",""C""]",TNKS,0.0,U
8278,chr5:38378453,"[""G"",""C""]",EGFLAM,0.0,W
8279,chr4:162057445,"[""A"",""G""]",FSTL5,0.0,T
8280,chr4:88737338,"[""T"",""C""]",FAM13A,0.0,T


In [63]:
df.to_csv("all_genes_dups.csv", sep = ";")

In [64]:
genes_found = hl.import_table('all_genes_dups.csv',
                              delimiter = ';',
                              impute=True)

2023-03-01 22:11:53.097 Hail: INFO: Reading table to impute column types
2023-03-01 22:11:53.358 Hail: INFO: Finished type imputation
  Loading field '' as type int32 (imputed)
  Loading field 'locus' as type str (imputed)
  Loading field 'alleles' as type str (imputed)
  Loading field 'within_gene' as type str (imputed)
  Loading field 'gnomad_v3_nfe_af' as type float64 (imputed)
  Loading field 'family_non_ref' as type str (imputed)


In [65]:
genes_found.show()

Unnamed: 0_level_0,locus,alleles,within_gene,gnomad_v3_nfe_af,family_non_ref
int32,str,str,str,float64,str
0,"""chr12:101319592""","""""[""""G"""",""""A""""]""""","""UTP20""",0.0,"""G"""
1,"""chr12:1885984""","""""[""""G"""",""""A""""]""""","""CACNA2D4""",0.0,"""H"""
2,"""chr3:9751135""","""""[""""T"""",""""C""""]""""","""OGG1""",0.0,"""I"""
3,"""chr10:96370115""","""""[""""C"""",""""G""""]""""","""TLL2""",0.0,"""X"""
4,"""chr12:34024062""","""""[""""G"""",""""A""""]""""","""AC046130.1,ALG10""",0.0,"""J"""
5,"""chr11:66592559""","""""[""""G"""",""""A""""]""""","""CCDC87""",0.0,"""A"""
6,"""chr1:116608098""","""""[""""C"""",""""G""""]""""","""IGSF3""",0.0,"""T"""
7,"""chr5:1238025""","""""[""""G"""",""""A""""]""""","""SLC6A18""",0.0,"""G"""
8,"""chr10:70752197""","""""[""""G"""",""""A""""]""""","""ADAMTS14""",0.0,"""E"""
9,"""chr1:227747768""","""""[""""C"""",""""T""""]""""","""SNAP47""",0.0,"""T"""


In [57]:
hl.eval(hl.str("\"\"[\"\"UTP20\"\"]\"\"").strip())

'""[""UTP20""]""'

In [18]:
genes_lists = hl.import_table('gts_gene_results.csv', 
                       delimiter=',',
                       impute=True)

2023-03-01 21:50:49.071 Hail: INFO: Reading table to impute column types
2023-03-01 21:50:49.316 Hail: INFO: Finished type imputation
  Loading field 'UR_2' as type str (imputed)
  Loading field 'UR_3' as type str (imputed)
  Loading field 'UR_4' as type str (imputed)
  Loading field 'R_3' as type str (imputed)
  Loading field 'R_4' as type str (imputed)
  Loading field 'U_4' as type str (imputed)
  Loading field 'U_5' as type str (imputed)
  Loading field 'U_6' as type str (imputed)
  Loading field 'U_7' as type str (imputed)


In [19]:
for_calcs = []

for n in range(2,5):
    print("UR_"+str(n))
    for_calcs.append(
    genes_lists["UR_"+str(n)].collect()
    )

for n in range(3,5):
    print("R_"+str(n))
    for_calcs.append(
    genes_lists["R_"+str(n)].collect()
    )

for n in range(4,8):
    print("U_"+str(n))
    for_calcs.append(
    genes_lists["U_"+str(n)].collect()
    )

UR_2
UR_3
UR_4
R_3
R_4
U_4
U_5
U_6
U_7


In [20]:
for_calcs = hl.flatten(hl.literal(for_calcs))

In [21]:
mt = mt.explode_rows(mt.within_gene)

In [22]:
mt = mt.filter_rows(
    hl.any(
        hl.literal(for_calcs).contains(mt.within_gene)
    )
)

In [35]:
mt.write('/net/ascratch/people/plggosborcz/gosborcz-hail/exploted-genes.mt', overwrite = True)

2023-02-27 11:49:09.565 Hail: INFO: wrote matrix table with 30357 rows and 124 columns in 50 partitions to /net/ascratch/people/plggosborcz/gosborcz-hail/exploted-genes.mt


In [23]:
mt = hl.read_matrix_table('/net/ascratch/people/plggosborcz/gosborcz-hail/exploted-genes.mt')

In [24]:
mafs = mt.aggregate_rows(
    hl.agg.group_by(
        mt.within_gene,
        hl.agg.filter(
            (mt.cadd.score_phred > 10) & (mt.gnomad_v_3_1.freq.AF[2]<0.05),
            hl.agg.collect(mt.gnomad_v_3_1.freq.AF[2])
        )
    )
)



In [25]:
final_mafs = {gene:
      mafs.get(gene)
      for gene in hl.eval(for_calcs)
      }

In [26]:
for key in final_mafs.keys():
    if final_mafs[key] == None:
        final_mafs[key] = [0.00001]   
    elif len(final_mafs[key]) == 0:
        final_mafs[key] = [0.00001]    

In [12]:
for gene in genes_lists['UR_2'].collect():
    anti_sum = []
    
    for maf in final_mafs[gene]:
        if maf < 0.001:
            anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

0.0
0.001014268768879334
0.0
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
0.0
0.0006467527603951639
0.0008826385006913684
0.00016212070755883268
8.823010411151966e-05
0.0005293027906019265
0.00030875654165907207
4.409819197415121e-05
0.0
1.470199064956823e-05
0.0
9.99999999995449e-06
0.0006466516376404385
0.0
0.0
0.00030901191849497245
0.0008672009438861483
0.0
0.0
0.0
0.0
7.351860020587964e-05
0.0
0.00017638758231419693
0.0
0.000984888575292553
0.0
9.99999999995449e-06
0.0
0.0
1.4699829481990001e-05
0.0
0.0012797634819851078
4.413215320686614e-05
0.0004263326570815851
0.0008529219381191444
0.0
0.0
0.0003675783146221967
0.0
9.99999999995449e-06
0.0009413702821169379
0.0
9.99999999995449e-06
0.00011796974076150324
0.00042637026581981985
0.0001323023549819613
0.0
0.0003233487168935456
0.0
0.0
0.0
0.0
0.0
0.0
0.0007642338555597927
0.0
0.0001911064270210261
0.0
0.0002794364208607014
9.99999999995449e-06
0.0
9.99999999995449e-06
0.000502774886488977
0.

In [13]:
for gene in genes_lists['UR_3'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        if maf < 0.001:
            anti_sum.append(1-maf)
    print(1 - np.prod(anti_sum))

0.0
5.878549174065384e-05
0.0002942214899376294
2.939598760220985e-05
0.0
0.0008525970199977362
0.0022046073416095435
9.99999999995449e-06
0.0
0.0016015357996583823
0.0025419135881612487
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.9999999999544

In [14]:
for gene in genes_lists['UR_4'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        if maf < 0.001:
            anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

0.00010292391333066586
0.0016015025561502139
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999

In [15]:
for gene in genes_lists['R_3'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        if maf < 0.01:
            anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

0.006169585042434256
0.018828524660445267
0.015830892727067947
9.99999999995449e-06
0.010481877159442599
0.030635519983802317
1.4701126106242413e-05
0.012749870785920492
0.011898497529950869
0.02081838452393736
0.04263108874264021
0.02019282576295134
0.010127873253263031
0.05974573218449597
0.0
0.015647672862870277
0.009884969550763456
0.018731555497794594
0.0024695555039576655
0.026874292543033418
0.007883686002235546
0.0
0.031264294737708864
0.016154928730183316
0.012842695360488543
0.021200322708156416
0.0
9.99999999995449e-06
0.016801611766908087
0.014280435368054567
0.012799380801312155
0.01122166633569377
0.010583682759898294
0.02932806849774061
0.01762135268943621
0.009863295604880196
0.010682144434712604
0.05787389955400157
0.0
0.0
0.0018965567937898609
0.017688570075394927
0.016344604802682405
9.99999999995449e-06
0.043207300369012436
0.004073289806481917
0.001073213760658609
0.002843385342361615
0.004737773246574095
0.002843385342361615
9.99999999995449e-06
0.0105192094711236

In [16]:
for gene in genes_lists['R_4'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        if maf < 0.01:
            anti_sum.append(1-maf)
            
    print(1 - np.prod(anti_sum))

0.015244462789543722
0.0
0.01707341807967999
0.0096654603230647
9.99999999995449e-06
0.0
0.00739959207684171
0.0017640339886612066
0.029172189486141864
0.0017650692789692446
0.02797128515799885
0.04236515933785567
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.99999999995449e-06
9.9999999

In [17]:
for gene in genes_lists['U_4'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

for gene in genes_lists['U_5'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

for gene in genes_lists['U_6'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

for gene in genes_lists['U_7'].collect():
    anti_sum = []
    for maf in final_mafs[gene]:
        anti_sum.append(1-maf)

    print(1 - np.prod(anti_sum))

0.07900376349569216
9.99999999995449e-06
0.003675065416164358
0.164964010327179
0.2805628354870734
0.15384557658004672
0.08389165070053883
0.09934617990853156
0.14885420707255492
0.07596675970566824
0.10937174523250537
0.17527852095149776
0.292060015481852
0.06972005345313437
0.06806508643731812
0.07679556287144484
0.27912457319413875
0.13202013045250505
0.1845295074236517
0.03433021145143145
0.2602923176870958
0.08847814936803378
0.47387408362363403
0.13917169848087796
0.04702631274621949
9.99999999995449e-06
0.11082184541570073
0.2335659129356633
0.10204695164963151
0.09596114686257817
0.13188207523351636
0.43373274304379417
0.043884139080668016
0.04835234583786907
0.16134136424540846
0.1554575123216343
0.06716968280259294
0.025328314233247884
9.99999999995449e-06
0.07340865574358257
0.05014102807149956
0.08677379606276137
9.99999999995449e-06
0.057940792712101286
0.09100762868310663
0.10357141348834764
0.29896259004775394
0.10918672641314009
0.2960642535785759
0.07198483600671457
0.

In [66]:
ur = genes_found.filter(genes_found.gnomad_v3_nfe_af < 0.001)
ur = ur.annotate(within_gene = ur.within_gene.split(","))
ur = ur.explode(ur.within_gene)

ur = ur.key_by(ur.family_non_ref, ur.within_gene)
ur = ur.distinct()

In [67]:
fams_ur = ur.group_by(ur.within_gene).aggregate(
    fams_non_ref = hl.agg.collect(ur.family_non_ref))

In [68]:
fams_dict = {
    "A":0.0625,
    "B":0.0156,
    "C":0.0078,
    "D":0.0156,
    "E":0.0625,
    "F":0.0078,
    "G":0.25,
    "H":0.125,
    "I":0.0626,
    "J":0.25,
    "R":0.125,
    "S":0.125,
    "T":0.5,
    "U":0.0625,
    "W":0.000976,
    "X":0.125,
    "Y":0.03125,
}

In [69]:
fams_ur = fams_ur.annotate(
    p_seg = hl.product(hl.map(lambda x: hl.dict(fams_dict).get(x), fams_ur.fams_non_ref))
)

In [70]:
fams_ur.filter(hl.len(fams_ur.fams_non_ref) == 2).show(n_rows=150)

2023-03-01 22:12:12.638 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-01 22:12:12.860 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""AC007402.1""","[""E"",""J""]",0.0156
"""AC007656.2""","[""U"",""X""]",0.00781
"""AC008691.1""","[""T"",""U""]",0.0313
"""AC046130.1""","[""G"",""J""]",0.0625
"""AC090912.1""","[""E"",""J""]",0.0156
"""AC090912.2""","[""E"",""J""]",0.0156
"""AC092844.1""","[""S"",""X""]",0.0156
"""AC092957.1""","[""H"",""I""]",0.00783
"""ADAMTSL1""","[""A"",""Y""]",0.00195
"""ADGRL3""","[""H"",""J""]",0.0313


In [71]:
fams_ur.filter(hl.len(fams_ur.fams_non_ref) == 3).show(n_rows=150)

2023-03-01 22:13:17.321 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-01 22:13:17.527 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""ANKS1B""","[""T"",""U"",""X""]",0.00391
"""EBF1""","[""J"",""U"",""X""]",0.00195
"""ESRRG""","[""G"",""H"",""X""]",0.00391
"""OPCML""","[""H"",""S"",""T""]",0.00781
"""PKP4""","[""F"",""J"",""T""]",0.000975
"""PRKG1""","[""E"",""H"",""I""]",0.000489
"""PTPRD""","[""G"",""T"",""X""]",0.0156
"""RASAL2""","[""H"",""J"",""X""]",0.00391
"""TPRG1""","[""D"",""H"",""J""]",0.000488
"""WWOX""","[""I"",""J"",""X""]",0.00196


In [72]:
fams_ur.filter(hl.len(fams_ur.fams_non_ref) == 4).show(n_rows=150)

2023-03-01 22:13:58.333 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-01 22:13:58.541 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""MACROD2""","[""D"",""G"",""I"",""T""]",0.000122
"""ROBO2""","[""A"",""H"",""R"",""X""]",0.000122


In [73]:
r = genes_found.filter(genes_found.gnomad_v3_nfe_af < 0.01)
r = r.annotate(within_gene = r.within_gene.split(","))
r = r.explode(r.within_gene)

r = r.key_by(r.family_non_ref, r.within_gene)
r = r.distinct()

In [74]:
fams_r = r.group_by(r.within_gene).aggregate(
    fams_non_ref = hl.agg.collect(r.family_non_ref))

In [75]:
fams_r = fams_r.annotate(
    p_seg = hl.product(hl.map(lambda x: hl.dict(fams_dict).get(x), fams_r.fams_non_ref))
)

In [76]:
fams_r.filter(hl.len(fams_r.fams_non_ref) == 3).show(n_rows=150)

2023-03-01 22:15:10.884 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-01 22:15:11.108 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""AC007656.2""","[""G"",""U"",""X""]",0.00195
"""AC011246.1""","[""H"",""J"",""W""]",3.05e-05
"""AC058822.1""","[""G"",""J"",""S""]",0.00781
"""AC079298.3""","[""H"",""J"",""X""]",0.00391
"""ADAMTSL1""","[""A"",""G"",""Y""]",0.000488
"""ADGRL2""","[""A"",""C"",""I""]",3.05e-05
"""ADK""","[""D"",""J"",""X""]",0.000488
"""AL356534.1""","[""G"",""H"",""J""]",0.00781
"""ALK""","[""J"",""R"",""U""]",0.00195
"""ANKS1B""","[""T"",""U"",""X""]",0.00391


In [77]:
fams_r.filter(hl.len(fams_r.fams_non_ref) == 4).show(n_rows=150)

2023-03-01 22:17:04.716 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-01 22:17:04.938 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""AL157944.1""","[""A"",""G"",""I"",""Y""]",3.06e-05
"""CDH23""","[""D"",""E"",""I"",""J""]",1.53e-05
"""EBF1""","[""J"",""T"",""U"",""X""]",0.000977
"""GNAS""","[""J"",""U"",""W"",""X""]",1.91e-06
"""KAT6B""","[""H"",""I"",""J"",""X""]",0.000245
"""LRMDA""","[""H"",""I"",""J"",""X""]",0.000245
"""MACROD2""","[""D"",""G"",""I"",""T""]",0.000122
"""RARB""","[""D"",""H"",""J"",""T""]",0.000244
"""ROBO2""","[""A"",""H"",""R"",""X""]",0.000122
"""SEMA3A""","[""E"",""H"",""J"",""R""]",0.000244


In [78]:
u = genes_found.filter(genes_found.gnomad_v3_nfe_af < 0.05)
u = u.annotate(within_gene = u.within_gene.split(","))
u = u.explode(u.within_gene)

u = u.key_by(u.family_non_ref, u.within_gene)
u = u.distinct()

In [79]:
fams_u = u.group_by(u.within_gene).aggregate(
    fams_non_ref = hl.agg.collect(u.family_non_ref))

In [80]:
fams_u = fams_u.annotate(
    p_seg = hl.product(hl.map(lambda x: hl.dict(fams_dict).get(x), fams_u.fams_non_ref))
)

In [81]:
fams_u.filter(hl.len(fams_u.fams_non_ref) == 4).show(n_rows=150)

2023-03-01 22:28:19.616 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-01 22:28:19.891 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""AC007402.1""","[""E"",""F"",""J"",""X""]",1.52e-05
"""AC079298.3""","[""H"",""J"",""R"",""X""]",0.000488
"""AC087379.1""","[""G"",""H"",""J"",""X""]",0.000977
"""AC093865.1""","[""E"",""G"",""S"",""X""]",0.000244
"""ADAMTSL1""","[""A"",""G"",""T"",""Y""]",0.000244
"""ADGRL2""","[""A"",""C"",""I"",""Y""]",9.54e-07
"""ADGRL3""","[""G"",""H"",""J"",""S""]",0.000977
"""ADK""","[""D"",""E"",""J"",""X""]",3.05e-05
"""AL022068.1""","[""E"",""G"",""T"",""X""]",0.000977
"""AL035078.4""","[""A"",""H"",""J"",""X""]",0.000244


In [82]:
fams_u.filter(hl.len(fams_u.fams_non_ref) == 5).show(n_rows=150)

2023-03-01 22:31:28.319 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-01 22:31:28.631 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""AC109466.1""","[""F"",""J"",""T"",""U"",""X""]",7.62e-06
"""AC126121.3""","[""A"",""D"",""G"",""H"",""J""]",7.62e-06
"""AL137230.2""","[""E"",""G"",""H"",""T"",""U""]",6.1e-05
"""AL162726.3""","[""F"",""G"",""J"",""S"",""X""]",7.62e-06
"""AL589740.1""","[""D"",""J"",""T"",""U"",""Y""]",3.81e-06
"""BTBD9""","[""D"",""G"",""H"",""T"",""X""]",3.05e-05
"""CACNA1C""","[""G"",""H"",""S"",""T"",""X""]",0.000244
"""CACNA2D3""","[""A"",""D"",""I"",""J"",""T""]",7.63e-06
"""CDH23""","[""D"",""E"",""H"",""I"",""J""]",1.91e-06
"""COL25A1""","[""G"",""H"",""J"",""X"",""Y""]",3.05e-05


In [83]:
fams_u.filter(hl.len(fams_u.fams_non_ref) == 6).show(n_rows=150)

2023-03-01 22:33:52.279 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-01 22:33:52.592 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""NTM""","[""A"",""E"",""S"",""T"",""U"",""X""]",1.91e-06


In [84]:
fams_u.filter(hl.len(fams_u.fams_non_ref) == 7).show(n_rows=150)

2023-03-01 22:34:48.821 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-01 22:34:49.123 Hail: INFO: Ordering unsorted dataset with network shuffle


within_gene,fams_non_ref,p_seg
str,array<str>,float64
"""MACROD2""","[""D"",""G"",""I"",""J"",""T"",""W"",""X""]",3.72e-09
