In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# This line will hide code by default when the notebook is exported as HTML
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# Inject a CSS override so the notebook container spans the full browser width.
display(
    HTML("<style>.container { width:100% !important; }</style>")
)

# Bring in Hail and start a session with the default configuration.
import hail as hl
hl.init()

Running on Apache Spark version 2.4.1
SparkUI available at http://633fc3f91d0f:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.29-cf730c8fc8f6
LOGGING: writing to /hail/large_vcf_filtering_annotation/hail-20191231-0753-0.2.29-cf730c8fc8f6.log


In [2]:
rpmk_files = !ls ./repeatmasker_split

for file in rpmk_files:
    hl.import_bed('./repeatmasker_split/'+file, reference_genome='GRCh38', skip_invalid_intervals=True).write(file+'.ht')

# Load the gnomAD v3.0 sites table.
gnmd = hl.read_table('/hail/gnomad/gnomad.genomes.r3.0.sites.ht')

# Remove every variant whose locus falls inside any RepeatMasker interval,
# applying one interval table (one BED chunk) per iteration.
for ht_name in rpmk_files:
    repeat_intervals = hl.read_table(ht_name + '.ht')
    inside_repeat = hl.is_defined(repeat_intervals[gnmd.locus])
    gnmd = gnmd.filter(inside_repeat, keep=False)
    # Un-key and immediately re-key on (locus, alleles) after each filter.
    # NOTE(review): presumably a workaround for a keying/ordering issue in this
    # Hail version -- confirm before removing.
    gnmd = gnmd.key_by().key_by('locus', 'alleles')

# VEP sub-fields to lift to the top level, in the order they should appear.
vep_fields = [
    'intergenic_consequences',
    'most_severe_consequence',
    'motif_feature_consequences',
    'regulatory_feature_consequences',
    'transcript_consequences',
    'variant_class',
]

# Keep only the frequency entry at index 2 (stored as `v3_nfe`), the VEP
# sub-fields listed above and the rsid; all other row fields are dropped.
# NOTE(review): index 2 of `freq` is presumably the NFE population -- confirm
# against the table's frequency metadata.
gnmd = gnmd.select(
    v3_nfe=gnmd.freq[2],
    **{name: gnmd.vep[name] for name in vep_fields},
    rsid=gnmd.rsid,
)

# Load the gnomAD v3.0 coverage table and keep only its `over_1` field.
cov = hl.read_table('/hail/gnomad/gnomad.genomes.r3.0.coverage.ht')
cov = cov.select('over_1')

# Attach the coverage record for each variant's locus, then keep variants
# whose `over_1` value exceeds 0.9.
gnmd = gnmd.annotate(cov_v3=cov[gnmd.locus])
well_covered = gnmd.cov_v3.over_1 > 0.9
gnmd = gnmd.filter(well_covered)

# Write the filtered table to disk and continue from the on-disk copy.
gnmd = gnmd.checkpoint('/hail/gnomad/gnomad_selected_filtered.ht')

# Column types for the header-less CADD TSV; Hail names the columns f0..f5.
cadd_column_types = {'f0':'tstr', 'f1':'tint', 'f2':'tstr', 'f3':'tstr', 'f4':'tfloat', 'f5':'tfloat'}

# Import the block-gzipped CADD SNV file (lines starting with '#' are treated
# as comments) and store it as a Hail table.
cadd_tsv = hl.import_table('./cadd/whole_genome_SNVs.tsv.gz',
                           force_bgz=True,
                           delimiter='\t',
                           comment='#',
                           types=cadd_column_types,
                           no_header=True)
cadd_tsv.write('./cadd/cadd.ht')

# Re-open the CADD table written from the raw TSV (columns f0..f5).
cadd = hl.read_table('./cadd/cadd.ht')

# Build a "chr<f0>:<f1>:<f2>:<f3>" variant string from the first four columns
# and parse it against GRCh38; keep column f5 as the CADD score.
# (Fix: a stray `cxfd` token after the parse_variant argument made this cell a
# SyntaxError; it has been removed.)
cadd = cadd.select(
    variants = hl.parse_variant(
        'chr'+hl.str(cadd.f0)+':'+hl.str(cadd.f1)+':'+hl.str(cadd.f2)+':'+hl.str(cadd.f3),
        reference_genome='GRCh38'),
    cadd_score = cadd.f5)

# Key by the parsed locus and alleles so the table can be joined on a
# (locus, alleles) key.
cadd = cadd.key_by(cadd.variants.locus,cadd.variants.alleles)

# Look up each variant in the CADD table by its (locus, alleles) key and store
# only the score as the row field `cadd`.
cadd_hit = cadd[gnmd.key]
gnmd = gnmd.annotate(cadd=cadd_hit.cadd_score)

# Persist the annotated table; the returned handle is intentionally unused
# because the next cell re-reads the table from this path.
gnmd.checkpoint('/hail/gnomad/gnomad_selected_filtered_cadd.ht')

In [15]:
# Reload the CADD-annotated table and discard variants whose stored allele
# frequency is exactly zero.
gnmd = hl.read_table('/hail/gnomad/gnomad_selected_filtered_cadd.ht')
has_nonzero_af = gnmd.v3_nfe.AF != 0
gnmd = gnmd.filter(has_nonzero_af)

# Promote the table to a MatrixTable (rows only), number its rows and key the
# rows by that running index so they can be joined by position.
gnmd = hl.MatrixTable.from_rows_table(gnmd).add_row_index()
gnmd = gnmd.key_rows_by('row_idx')

# Names for the 151 simulated control samples: "1gnmd" ... "151gnmd",
# wrapped as a Hail literal so it can be indexed inside expressions.
imaginary_controls = hl.literal([f'{i}gnmd' for i in range(1, 152)])

In [16]:
# Create an empty matrix with one row per remaining gnomAD variant and 151
# columns (the same number as the simulated sample names).
n_variants = gnmd.count_rows()
genotypes = hl.utils.range_matrix_table(n_rows=n_variants, n_cols=151)

In [17]:
# Add a fresh row index and key the rows by it; it is used below to look up
# the matching gnomAD row.
# NOTE(review): presumably added instead of reusing the built-in `row_idx`
# so that the key type matches gnmd's `row_idx` -- confirm.
genotypes = genotypes.add_row_index('added_row_index').key_rows_by('added_row_index')

# Name each column with the corresponding simulated sample ID and key by it.
genotypes = genotypes.annotate_cols(s=imaginary_controls[genotypes.col_idx])
genotypes = genotypes.key_cols_by('s')

# Pull the gnomAD row fields for each row into a struct field `g`.
gnomad_row = gnmd.index_rows(genotypes.added_row_index)
genotypes = genotypes.annotate_rows(g=gnomad_row)

In [18]:
def _draw_allele():
    """Return a string expression '1' with probability AF, otherwise '0'."""
    return hl.str(hl.cond(hl.rand_bool(genotypes.g.v3_nfe.AF), 1, 0))


# Fill every entry with placeholder values and a simulated diploid genotype
# made of two independent allele draws weighted by the variant's AF.
# NOTE(review): no seed is passed to rand_bool, so a fresh run presumably
# yields different genotypes -- confirm whether reproducibility is needed.
genotypes = genotypes.annotate_entries(
    AD=hl.array([0,0,0]),
    DP=0,
    GQ=0,
    GT=hl.parse_call(_draw_allele()+'/'+_draw_allele()),
    PGT=hl.parse_call('0/0'),
    PID=hl.str('none'),
    PL=hl.array([0,0,0]),
)

In [19]:
# Keep only variants for which at least one simulated sample carries a
# non-reference genotype.
any_non_ref = hl.agg.any(genotypes.GT.is_non_ref())
genotypes = genotypes.filter_rows(any_non_ref)

In [20]:
# The numeric column index is no longer needed once columns are keyed by `s`.
genotypes = genotypes.drop(genotypes.col_idx)

In [21]:
# Short handles for the joined gnomAD struct and its frequency sub-struct.
gnomad_row = genotypes.g
nfe_freq = gnomad_row.v3_nfe

# Rebuild the row schema from the joined gnomAD struct: AC and AF become
# two-element arrays holding the same value twice, `a_index`, `was_split` and
# `gwas_p` are filled with constant placeholders, and the remaining gnomAD
# fields are grouped under `gnomad_v3`.
# NOTE(review): the layout presumably mirrors the dataset this table is later
# merged with -- confirm against that schema.
genotypes = genotypes.select_rows(
    locus=gnomad_row.locus,
    alleles=gnomad_row.alleles,
    rsid=gnomad_row.rsid,
    AC=hl.array([nfe_freq.AC, nfe_freq.AC]),
    AF=hl.array([nfe_freq.AF, nfe_freq.AF]),
    AN=nfe_freq.AN,
    a_index=0,
    was_split=False,
    cov_v3=gnomad_row.cov_v3,
    gnomad_v3=gnomad_row.drop('locus','alleles','cov_v3', 'cadd'),
    cadd=gnomad_row.cadd,
    gwas_p=hl.struct(SNP=hl.str('none'), P=hl.float64(0)),
)

In [22]:
# Switch the row key from the temporary index to the (locus, alleles) pair.
genotypes = genotypes.key_rows_by(genotypes.locus, genotypes.alleles)

In [23]:
# The temporary join index is no longer needed after re-keying.
genotypes = genotypes.drop(genotypes.added_row_index)

In [24]:
# Evaluate the pipeline built so far and record the matrix shape as a
# (number of rows, number of columns) tuple.
count = genotypes.count()

2020-01-03 17:48:22 Hail: INFO: Coerced sorted dataset
2020-01-03 17:50:51 Hail: INFO: Coerced sorted dataset


In [25]:
# Display the (rows, columns) shape computed in the previous cell.
count

(9159120, 151)

In [None]:
# Persist the simulated-genotype matrix to disk.
# NOTE(review): `genotypes` is a MatrixTable, yet the path ends in '.ht';
# downstream readers presumably need hl.read_matrix_table -- confirm.
genotypes.write('/hail/gnomad/gnomad_with_drawn_genotypes_ready_to_merge.ht')

2020-01-03 20:50:53 Hail: INFO: Coerced sorted dataset
2020-01-03 20:54:59 Hail: INFO: Coerced sorted dataset
