In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# When enabled, the commented-out call below hides code inputs and prompts by default
# in an HTML export of the notebook: it toggles ".input_area" and ".prompt" whenever
# the page body is not the live "notebook_app".
#di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# Inject CSS so that the notebook container spans the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import hail as hl
# Start Hail with an explicit directory for temporary files.
# NOTE(review): tmp_dir is a hard-coded, user-specific absolute path.
hl.init(tmp_dir='/net/scratch/people/plggosborcz')

from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot

# Set up notebook plot output through Hail's plotting module.
hl.plot.output_notebook()


import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain

# NOTE: this import rebinds `show`, which was first imported from hail.plot above;
# from here on `show` refers to the bokeh.plotting version.
from bokeh.plotting import output_notebook, show, figure
from bokeh.palettes import viridis

# Second output_notebook() call, this time through the name imported from bokeh.plotting.
# NOTE(review): probably redundant with hl.plot.output_notebook() above — confirm.
output_notebook()

In [None]:
# This table has already been filtered for coverage and with RepeatMasker
# (done in a previous analysis on io; available in the GitHub repo).

gnomad = hl.read_table('/net/archive/groups/plggneuromol/GTS-analysis/gnomad/gnomad_selected_filtered.ht')

### Annotate with overlapping genes and their HPO terms

In [None]:
# Gene models, restricted to contigs that exist in GRCh38.
# (Field names suggest a UCSC knownGene / kgXref export — TODO confirm.)
genes = hl.read_table('/net/archive/groups/plggneuromol/GTS-analysis/vcf_preprocessing/genecode_v32.ht')
genes = genes.filter(hl.is_valid_contig(genes['hg38.knownGene.chrom'], reference_genome='GRCh38'))

# Header-less TSV, so the columns are auto-named f0, f1, ...
# f0 is matched against gene symbols below and f1 is kept as the annotation
# (presumably the HPO term — verify against the file).
hpo = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/vcf_preprocessing/hpo.tsv', no_header=True)

start = genes['hg38.knownGene.txStart']
stop =  genes['hg38.knownGene.txEnd']

# Replace the chrom / txStart / txEnd fields with a single locus interval.
# NOTE(review): both endpoints are excluded (includes_start=False here, includes_end
# defaults to False). If txStart/txEnd follow the UCSC 0-based, half-open convention,
# the base at txEnd is left out of the interval — confirm whether that is intended.
genes = genes.transmute(interval = 
                        hl.locus_interval(genes['hg38.knownGene.chrom'], 
                                          start,
                                          stop,
                                          reference_genome='GRCh38', includes_start=False))

genes = genes.key_by(genes['hg38.kgXref.geneSymbol'])

hpo = hpo.key_by(hpo.f0)

# For every gene, collect the f1 value of each hpo row whose f0 equals the gene symbol.
genes = genes.annotate(hpo = hpo.index(genes['hg38.kgXref.geneSymbol'], all_matches = True)['f1'])

# Re-key by interval so that loci can be looked up against gene spans.
genes = genes.key_by(genes.interval)

# Build the locus-in-interval join once and reuse it for both annotations; the
# original performed the same join in two separate annotate() calls.
overlapping_genes = genes.index(gnomad.locus, all_matches=True)

# within_gene: distinct symbols of all genes whose interval contains the locus.
# hpo: distinct per-gene arrays of f1 values (one inner array per gene, not flattened).
gnomad = gnomad.annotate(
    within_gene = hl.array(hl.set(overlapping_genes['hg38.kgXref.geneSymbol'])),
    hpo = hl.array(hl.set(overlapping_genes['hpo'])))

In [None]:
# Persist the gene/HPO-annotated table to disk.
gnomad.write('/net/archive/groups/plggneuromol/GTS-analysis/gnomad/gnomad_genes_hpo.ht')

### Annotate with CADD

In [None]:
# Reload the gene/HPO-annotated table from the path it was written to.
gnomad = hl.read_table('/net/archive/groups/plggneuromol/GTS-analysis/gnomad/gnomad_genes_hpo.ht')

In [None]:
# List the per-chromosome CADD directory so every table in it can be read and combined.
# This uses an IPython shell escape: `cds` receives the lines printed by `ls`.
cds = !ls /net/archive/groups/plggneuromol/GTS-analysis/data/cadd-per-chr

In [None]:
# Open every entry of the CADD directory listing as a Hail table.
cd_list = [
    hl.read_table('/net/archive/groups/plggneuromol/GTS-analysis/data/cadd-per-chr/' + cd)
    for cd in cds
]

In [None]:
# Union the rows of all per-chromosome tables into a single CADD table.
cadd = hl.Table.union(*cd_list)

In [None]:
# Look up each gnomAD row's key in the CADD table and attach the matching row as `cadd`.
gnomad = gnomad.annotate(cadd = cadd[gnomad.key])

In [None]:
# Keep only cadd_score from the joined struct, stored under the same field name `cadd`.
gnomad = gnomad.transmute(cadd = gnomad.cadd.cadd_score)

In [None]:
# Persist the CADD-annotated table to disk.
gnomad.write('/net/archive/groups/plggneuromol/GTS-analysis/gnomad/gnomad-ready-to-draw.ht')

## Draw gnomAD controls

In [None]:
# Reload the CADD-annotated table from the path it was written to.
gnomad = hl.read_table('/net/archive/groups/plggneuromol/GTS-analysis/gnomad/gnomad-ready-to-draw.ht')

In [None]:
# Load the GTS matrix table.
# NOTE(review): `mt` is not referenced again in the cells visible here — confirm it is used later.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/GTS-pheno-cadd.mt')

In [None]:
# Turn the gnomAD table into a matrix table whose rows are the table's rows.
gd = hl.MatrixTable.from_rows_table(gnomad)

In [None]:
# Add a column field `col_idx` of type int32 (all missing).
# Presumably this is so the column schema matches the range matrix table built below — verify.
gd = gd.annotate_cols(col_idx = hl.null(hl.tint32))

In [None]:
# Key the columns by `col_idx`.
gd = gd.key_cols_by(gd.col_idx)

In [None]:
# Sample labels '1_gnomad' ... '185_gnomad' for the simulated controls.
gnomad_samples = [f'{sample_number}_gnomad' for sample_number in range(1, 186)]

In [None]:
# Matrix table with no rows and 185 columns, one per simulated control
# (185 is also the number of labels in `gnomad_samples`).
gnomad_columns = hl.utils.range_matrix_table(n_rows=0, n_cols=185)

In [None]:
# Add missing-valued `locus` (GRCh38) and `alleles` (array of str) row fields.
# Presumably this is so the row key schema can match `gd` for the column union below — verify.
gnomad_columns = gnomad_columns.annotate_rows(locus = hl.locus(hl.null(hl.tstr), hl.null(hl.tint32), reference_genome='GRCh38'),
                                              alleles = hl.null(hl.tarray(hl.tstr)))

In [None]:
# Key the rows by (locus, alleles).
gnomad_columns = gnomad_columns.key_rows_by(gnomad_columns.locus, gnomad_columns.alleles)

In [None]:
# Append the 185 columns to `gd`. The outer row join is needed because
# `gnomad_columns` has no rows of its own and all rows come from `gd`.
gd = gd.union_cols(gnomad_columns, row_join_type='outer')

In [None]:
# Per-row probability used for drawing each allele.
# (Presumably the gnomAD v3 non-Finnish-European alternate allele frequency — confirm.)
alt_af = gd.v3_nfe['AF']

# Fill in the entry fields of the simulated controls. GT and PGT are random diploid
# calls in which each allele is 1 with probability `alt_af` and 0 otherwise; every
# other field is set to missing. (The field set presumably mirrors the entry schema of
# the matrix table these controls are joined with later — verify.)
#
# Each rand_bool call site gets its own explicit seed: seeding makes the draw
# reproducible across runs, and using distinct seeds keeps the two alleles of a call
# (and GT vs PGT) from sharing the same random stream.
gd = gd.annotate_entries(AD = hl.null(hl.tarray(hl.tint32)),
                         DP = hl.null(hl.tint32),
                         GQ = hl.null(hl.tint32),
                         GT = hl.call(hl.int32(hl.rand_bool(alt_af, seed=1)), hl.int32(hl.rand_bool(alt_af, seed=2))),
                         MIN_DP = hl.null(hl.tint32),
                         PGT = hl.call(hl.int32(hl.rand_bool(alt_af, seed=3)), hl.int32(hl.rand_bool(alt_af, seed=4))),
                         PID = hl.null(hl.tstr),
                         PL = hl.null(hl.tarray(hl.tint32)),
                         PS = hl.null(hl.tint32),
                         RGQ = hl.null(hl.tint32),
                         SB = hl.null(hl.tarray(hl.tint32)))

In [None]:
# Persist the simulated-control matrix table.
# NOTE(review): `gd` is a MatrixTable although the path ends in '.ht'; it will
# presumably need hl.read_matrix_table (not hl.read_table) to load — confirm downstream.
gd.write('/net/archive/groups/plggneuromol/GTS-analysis/gnomad/gnomad-ready-to-join.ht')