In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di

display(HTML("<style>.container { width:100% !important; }</style>"))

import hail as hl

# Start the Hail session: temporary files go to the scratch directory below,
# the Spark driver and executors are each given 30G of memory, and GRCh38 is
# set as the default reference genome.
hl.init(
    tmp_dir='/net/ascratch/people/plggosborcz/gosborcz-hail',
    spark_conf={'spark.driver.memory': '30G', 'spark.executor.memory': '30G'},
    default_reference='GRCh38') 



2022-12-30 12:41:22.141 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.1.3
SparkUI available at http://ac0049:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.105-acd89e80c345
LOGGING: writing to /net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/analysis/burden-and-family/hail-20221230-1240-0.2.105-acd89e80c345.log


In [2]:
# Plotting and data-handling imports for the rest of the notebook.
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
# Enable notebook output through Hail's plotting module.
hl.plot.output_notebook()


import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain

# NOTE: `show` is imported again here, so from this point on the name refers to
# `bokeh.plotting.show` rather than the `hail.plot.show` imported above.
from bokeh.plotting import output_notebook, show, figure
from bokeh.palettes import viridis

# Enable notebook output through bokeh directly as well.
output_notebook() 

## 1. Annotate with phenotypes

In [None]:
# Load the `fams-anno.mt` matrix table that sections 1 and 2 work from.
mt = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-anno.mt')

In [None]:
# Phenotype sheet: comma-delimited, double-quote quoted, column types imputed,
# keyed by the `ID` column.
pheno = hl.import_table(
    '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/pheno/GTS-coded-corrected-june-2021.csv',
    impute=True,
    key='ID',
    delimiter=',',
    quote='"',
)

In [None]:
# Attach the phenotype row whose `ID` key equals the sample id `s` to every
# column, under the `phenotypes` field.
mt = mt.annotate_cols(phenotypes = pheno[mt.s])

## 2. For each of the families get all variants with MAF in gnomAD < 0.001 and create a separate mt

In [None]:
# Collect the `family` value of every sample and de-duplicate it into a list
# of family identifiers.
fams = list(set(mt.phenotypes.family.collect()))

In [None]:
# Sort in place so the families are processed in a stable order.
fams.sort()

In [None]:
# Annotation tables joined onto each family's rows in the loop below.
cadd = hl.read_table('/net/pr2/projects/plgrid/plggneuromol/resources/cadd.ht')
vep = hl.read_table('/net/pr2/projects/plgrid/plggneuromol/resources/vep38/grch38_context_vep_annotated.ht')

In [None]:
# For every family: subset the samples, keep rare sites carried by at least
# one family member, add CADD and VEP annotations, and write the result.
for f in fams:
    fam = mt.filter_cols(mt.phenotypes.family == f)

    # Keep only sites where some sample of this family has a non-reference call.
    fam = fam.filter_rows(hl.agg.any(fam.GT.is_non_ref())).naive_coalesce(500)

    # Element 2 of the gnomAD v3.1 AF array (exported later in this notebook
    # as `gnomad_v3_nfe_af`). Sites without a value there are kept.
    gnomad_af = fam.gnomad_v_3_1.freq.AF[2]
    is_rare = hl.if_else(hl.is_defined(gnomad_af), gnomad_af < 0.001, True)
    fam = fam.filter_rows(is_rare)

    fam = fam.annotate_rows(cadd=cadd[fam.row_key])
    fam = fam.annotate_rows(vep=vep[fam.row_key])

    fam.write(
        '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-noncoding' + f + '.mt'
    )

## 3. Exclude all intragenic variants (keep only variants that are not within any gene):

In [None]:
# Per family: reload the rare-variant table and keep only rows whose
# `within_gene` array is empty, i.e. variants not located within any gene.
# NOTE(review): the output is named `fam-intragenic...` although it holds the
# rows that are NOT within a gene; no later cell of this notebook reads it back.
for f in fams:
    fam = hl.read_matrix_table(
        '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-noncoding' + f + '.mt'
    )

    no_genes = hl.empty_array(hl.tstr)
    fam = fam.filter_rows(fam.within_gene == no_genes)

    fam.write(
        '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-intragenic' + f + '.mt'
    )

## 4. For each of the larger families find pathogenic variants present in all cases (only GTS considered cases)

In [3]:
# Hard-coded list of family identifiers processed in section 4. This replaces
# the `fams` list that section 2 derived from the phenotype table.
fams = ['A','B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'R', 'S', 'T', 'U', 'W', 'X', 'Y']

In [None]:
# Prepare per-cohort matrix tables (allele frequency, genotype counts and
# carrier lists) that are later joined onto the family variants.
mt = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-sportwgs/data/joint/dense-all.mt')

# Sample ids listed in the sportsmen/control phenotype sheet.
samples_to_keep = hl.import_table('/net/pr2/projects/plgrid/plggneuromol/matzieb/projects/imdik-zekanowski-sportwgs/data/prs-data/sportsmen-control-pheno.tsv')
samples_to_keep = samples_to_keep.key_by(samples_to_keep['sample'])
sample_filter = hl.literal(samples_to_keep['sample'].collect()).contains(mt.s)

# Samples whose id contains 'B', 'HG' or 'NA' are kept only when they appear in
# the sheet above; every other sample is kept unconditionally.
needs_listing = mt.s.contains('B') | mt.s.contains('HG') | mt.s.contains('NA')
mt = mt.filter_cols(hl.if_else(needs_listing, sample_filter, True))

# Cohort label per sample. The 'B' test comes first, exactly as in the original
# nested conditionals; ids matching none of the patterns are labelled 'GTS'.
mt = mt.annotate_cols(
    group=hl.case()
    .when(mt.s.contains('B'), 'sport')
    .when(mt.s.contains('HG') | mt.s.contains('NA'), '1kg')
    .default('GTS')
)

# Drop sites where no remaining sample carries a non-reference allele.
mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

# Per-cohort QC aggregates; each field is a dict keyed by the cohort label.
mt = mt.annotate_rows(dp_qc = hl.agg.group_by(mt.group, hl.agg.stats(mt.DP)),
                      gq_qc = hl.agg.group_by(mt.group, hl.agg.stats(mt.GQ)),
                      hwe = hl.agg.group_by(mt.group, hl.agg.hardy_weinberg_test(mt.GT)))

mt = mt.annotate_rows(n_below_dp_3 = hl.agg.group_by(mt.group, hl.agg.count_where(mt.DP < 3)),
                      n_below_gq_30 = hl.agg.group_by(mt.group, hl.agg.count_where(mt.GQ <30)))

# Site-level QC. The `.get` defaults decide what happens when a cohort has no
# entry in the dict: the DP/GQ defaults (mean 0) fail the check, while the HWE
# default (p_value 0.5) and the count defaults (0) pass it.
mt = mt.filter_rows((mt.dp_qc.get('GTS', hl.struct(mean=0, stdev=0.0, min=0.0, max=0.0, n=0, sum=0.0)).mean > 5) &
                    (mt.dp_qc.get('sport', hl.struct(mean=0, stdev=0.0, min=0.0, max=0.0, n=0, sum=0.0)).mean > 5) &
                    (mt.gq_qc.get('GTS', hl.struct(mean=0.0, stdev=0.0, min=0.0, max=0.0, n=0, sum=0.0)).mean > 50) &
                    (mt.gq_qc.get('sport', hl.struct(mean=0.0, stdev=0.0, min=0.0, max=0.0, n=0, sum=0.0)).mean > 50) &
                    (mt.hwe.get('GTS', hl.struct(het_freq_hwe=0.0, p_value=0.5)).p_value > 0.05) &
                    (mt.hwe.get('sport', hl.struct(het_freq_hwe=0.0, p_value=0.5)).p_value > 0.05) &
                    (mt.hwe.get('1kg', hl.struct(het_freq_hwe=0.0, p_value=0.5)).p_value > 0.05) &
                    (mt.n_below_dp_3.get('sport', 0) < 3) &
                    (mt.n_below_gq_30.get('sport', 0) < 30) &
                    (mt.n_below_dp_3.get('GTS', 0) < 3) &
                    (mt.n_below_gq_30.get('GTS', 0) <30))

mt = mt.distinct_by_row()
mt = mt.key_rows_by(mt.locus, mt.alleles)
mt = hl.split_multi_hts(mt)

# (file label, value of `mt.group`) pairs. No sample is ever assigned the group
# 'control', so filtering on that label selected zero samples; the table written
# under the 'control' label is the one read back as `gts` later in this
# notebook, so it has to be built from the 'GTS' cohort.
cohort_outputs = [('control', 'GTS'), ('1kg', '1kg'), ('sport', 'sport')]

for subgroup, group in cohort_outputs:
    temp = mt.filter_cols(mt.group == group)
    temp = hl.variant_qc(temp)

    temp = temp.transmute_rows(
        AF = temp.variant_qc.AF,
        het = temp.variant_qc.n_het,
        hom = temp.variant_qc.homozygote_count,
        n_called = temp.variant_qc.n_called,
        # Previously this duplicated `n_called`.
        n_not_called = temp.variant_qc.n_not_called,
        samples_het = hl.agg.filter(temp.GT.is_het(), hl.agg.collect(temp.s)),
        samples_hom = hl.agg.filter(temp.GT.is_hom_var(), hl.agg.collect(temp.s))
    )
    temp = temp.naive_coalesce(500)
    # NOTE(review): this writes `...-temp100.mt`, while the cell that consumes
    # these tables reads `...-temp5.mt` - confirm which version is intended.
    temp.write('/net/ascratch/people/plggosborcz/gosborcz-hail/for-af-'+subgroup+'-temp100.mt')

In [None]:
# Per family: keep SNP sites that were not split from a multi-allelic record
# and where every sample with phenotype 'GTS' carries a non-reference
# genotype, flatten the annotations into export-friendly row fields, and
# write the result to scratch.
# NOTE(review): this reads the `fam-noncoding` tables, not the output of the
# section 3 filter - confirm that is intended given the "intergenic" result names.
for f in fams:
    fam = hl.read_matrix_table(
        '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-noncoding' + f + '.mt'
    )

    is_case = fam.phenotypes.phenotype == 'GTS'
    # NOTE(review): if a family has no 'GTS' samples, n_cases is 0 and the
    # equality filter below keeps every row with zero non-ref case calls.
    n_cases = fam.aggregate_cols(hl.agg.count_where(is_case))

    non_ref_cases = hl.agg.filter(is_case, hl.agg.count_where(fam.GT.is_non_ref()))
    fam = fam.annotate_rows(cases_non_ref=non_ref_cases)

    fam = fam.filter_rows(fam.cases_non_ref == n_cases)
    fam = fam.filter_rows(hl.is_snp(fam.alleles[0], fam.alleles[1]))
    fam = fam.filter_rows(fam.was_split, keep=False)

    fam = fam.drop('a_index', 'was_split', 'within_gene', 'hpo', 'cases_non_ref')

    gnomad_freq = fam.gnomad_v_3_1.freq
    vep_struct = fam.vep.vep
    fam = fam.transmute_rows(
        family=hl.str(f),
        gts_DP_mean=fam.dp_qc.mean,
        gts_GQ_mean=fam.gq_qc.mean,
        gts_hwe_p_value=fam.hwe.p_value,
        gnomad_v3_nfe_af=gnomad_freq.AF[2],
        gnomad_v3_nfe_homozygote_count=gnomad_freq.homozygote_count[2],
        cadd=fam.cadd.score_phred,
        most_severe_consequence=vep_struct.most_severe_consequence,
        intergenic_consequences=vep_struct.intergenic_consequences,
        motif_feature_consequences=vep_struct.motif_feature_consequences,
        regulatory_feature_consequences=vep_struct.regulatory_feature_consequences,
        # Ids of the family members that are heterozygous / homozygous-variant.
        het_in_fam=hl.agg.filter(fam.GT.is_het(), hl.agg.collect(fam.s)),
        hom_in_fam=hl.agg.filter(fam.GT.is_hom_var(), hl.agg.collect(fam.s)),
    )

    fam.write('/net/ascratch/people/plggosborcz/gosborcz-hail/' + f + '-temp2.mt')

In [6]:
# Annotate each family's variants with frequencies and carrier lists from the
# Polish genomes, GTS, sport and 1kg tables, then store the rows as a table.
poles = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/resources/polish-genomes/polish-genomes.mt')
# NOTE(review): these read `...-temp5.mt`; the preparation cell in this
# notebook writes `...-temp100.mt` - confirm which version is intended.
gts = hl.read_matrix_table('/net/ascratch/people/plggosborcz/gosborcz-hail/for-af-control-temp5.mt')
sport = hl.read_matrix_table('/net/ascratch/people/plggosborcz/gosborcz-hail/for-af-sport-temp5.mt')
kg = hl.read_matrix_table('/net/ascratch/people/plggosborcz/gosborcz-hail/for-af-1kg-temp5.mt')
fams_ts = []

# The row tables do not depend on the family, so build them once.
poles_rows = poles.rows()
gts_rows = gts.rows()
sport_rows = sport.rows()
kg_rows = kg.rows()

for f in fams:
    fam = hl.read_matrix_table('/net/ascratch/people/plggosborcz/gosborcz-hail/'+f+'-temp2.mt')

    # One keyed lookup per source table instead of one per annotated field.
    site = fam.row_key
    in_poles = poles_rows[site]
    in_gts = gts_rows[site]
    in_sport = sport_rows[site]
    in_kg = kg_rows[site]

    fam = fam.annotate_rows(
        polish_af = in_poles['info']['AF'],
        gts_af = in_gts['AF'],
        gts_het = in_gts['het'],
        gts_hom = in_gts['hom'],
        gts_n_called = in_gts['n_called'],
        gts_n_not_called = in_gts['n_not_called'],
        gts_samples_het = in_gts['samples_het'],
        gts_samples_hom = in_gts['samples_hom'],
        sport_af = in_sport['AF'],
        sport_het = in_sport['het'],
        sport_hom = in_sport['hom'],
        sport_n_called = in_sport['n_called'],
        sport_n_not_called = in_sport['n_not_called'],
        sport_samples_het = in_sport['samples_het'],
        sport_samples_hom = in_sport['samples_hom'],
        eur_1kg_af = in_kg['AF']
    )

    # Only the row annotations are needed from here on; add `family` to the key
    # so rows from different families stay distinct after the later union.
    fam = fam.rows()
    fam = fam.key_by(fam.locus, fam.alleles, fam.family)

    # Written and read back so that later steps start from the stored table.
    # (The path ends in `.mt` although a Table is written.)
    fam.write('/net/ascratch/people/plggosborcz/gosborcz-hail/'+f+'-temp4.mt')
    fam = hl.read_table('/net/ascratch/people/plggosborcz/gosborcz-hail/'+f+'-temp4.mt')
    fams_ts.append(fam)

2022-11-10 13:59:52.858 Hail: INFO: Coerced sorted dataset=====>(499 + 1) / 500]
2022-11-10 14:02:21.294 Hail: INFO: wrote table with 2480 rows in 280 partitions to /net/ascratch/people/plggosborcz/gosborcz-hail/A-temp4.mt
2022-11-10 14:04:49.593 Hail: INFO: Coerced sorted dataset=====>(499 + 1) / 500]
2022-11-10 14:06:32.456 Hail: INFO: wrote table with 1300 rows in 169 partitions to /net/ascratch/people/plggosborcz/gosborcz-hail/B-temp4.mt
2022-11-10 14:08:26.677 Hail: INFO: Coerced sorted dataset=====>(499 + 1) / 500]
2022-11-10 14:09:38.719 Hail: INFO: wrote table with 218 rows in 141 partitions to /net/ascratch/people/plggosborcz/gosborcz-hail/C-temp4.mt
2022-11-10 14:11:46.488 Hail: INFO: Coerced sorted dataset=====>(499 + 1) / 500]
2022-11-10 14:13:11.755 Hail: INFO: wrote table with 1109 rows in 153 partitions to /net/ascratch/people/plggosborcz/gosborcz-hail/D-temp4.mt
2022-11-10 14:17:49.008 Hail: INFO: Coerced sorted dataset=====>(499 + 1) / 500]
2022-11-10 14:21:38.940 Hail

In [19]:
# Reload each family's table, keep rows that have a `gts_af` value, reduce the
# array-valued frequency/count fields to their element at index 1, and stack
# all families into one table.
fam_ts = []

# Array fields replaced by their element 1 (presumably the alternate allele
# of a bi-allelic site - TODO confirm).
indexed_fields = ('gts_af', 'gts_hom', 'sport_af', 'sport_hom', 'eur_1kg_af')

for f in fams:
    fam = hl.read_table('/net/ascratch/people/plggosborcz/gosborcz-hail/' + f + '-temp4.mt')
    fam = fam.filter(hl.is_defined(fam.gts_af))
    fam = fam.annotate(**{field: fam[field][1] for field in indexed_fields})
    fam = fam.naive_coalesce(50)
    fam_ts.append(fam)

fams_all = fam_ts[0].union(*fam_ts[1:])

In [20]:
# Persist the combined per-family table.
fams_all.write('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/filtered-intergenic.ht')

2022-11-10 21:28:01.182 Hail: INFO: wrote table with 9255 rows in 825 partitions to /net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/filtered-intergenic.ht
    Total size: 907.53 KiB
    * Rows: 907.52 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  190 rows (15.65 KiB)


In [21]:
# Re-read the combined table and rewrite it with 20 partitions
# (the write above produced 825, per its log output).
ht = hl.read_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/filtered-intergenic.ht')
ht.naive_coalesce(
    20
).write('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/to-export-intergenic.ht')

2022-11-10 21:31:18.788 Hail: INFO: wrote table with 9255 rows in 20 partitions to /net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/to-export-intergenic.ht


In [22]:
# Load the 20-partition copy of the combined table.
ht = hl.read_table(
    '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/to-export-intergenic.ht'
)

In [25]:
# Now export all the selected variants into one file.
# Note: the file is tab-delimited even though its extension is .csv.
ht.export(
    '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/selected-intergenic.csv', delimiter='\t'
)

2022-11-10 21:37:39.702 Hail: INFO: merging 21 files totalling 3.2M...
2022-11-10 21:37:39.742 Hail: INFO: while writing:
    /net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/selected-intergenic.csv
  merge time: 39.642ms


In [None]:
# Now tidy the table and export only the intergenic variants whose locus occurs in two or more families

In [52]:
# Start the duplicate search from the stored 20-partition table.
ht = hl.read_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/to-export-intergenic.ht')

In [53]:
# Key by locus only, so that key-based de-duplication in the next cell works per locus.
ht = ht.key_by(ht.locus)

In [62]:
# `ht` is keyed by locus when this cell runs (see the previous cell), so
# `distinct()` keeps a single row per locus.
results = ht.distinct()
# Re-key both tables by (locus, family) so they can be compared row by row.
results = results.key_by(results.locus, results.family)
ht = ht.key_by(ht.locus, ht.family)

In [63]:
# Number of rows before de-duplication.
ht.count()

9255

In [64]:
# Number of rows left after keeping one row per locus.
results.count()

9144

In [67]:
# Rows of `ht` whose (locus, family) key is absent from `results`, i.e. the
# rows that the per-locus `distinct()` discarded.
dups = ht.anti_join(results)

In [68]:
# Number of discarded rows.
dups.count()

2022-11-10 22:06:51.607 Hail: INFO: Coerced sorted dataset
2022-11-10 22:06:51.832 Hail: INFO: Coerced sorted dataset


111

In [69]:
# Key both tables by locus again, then keep every row of `ht` whose locus
# appears in `dups` - this brings back the row that `distinct()` retained
# together with the ones it discarded.
dups = dups.key_by(dups.locus)
ht = ht.key_by(ht.locus)

ht_dups = ht.semi_join(dups)

In [70]:
# Number of rows at loci that occur more than once.
ht_dups.count()

2022-11-10 22:07:53.075 Hail: INFO: Coerced sorted dataset
2022-11-10 22:07:53.268 Hail: INFO: Coerced sorted dataset


217

In [72]:
# Export the rows at repeated loci (tab-delimited, despite the .csv extension).
ht_dups.export(
    '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/duplicated-intergenic.csv', delimiter='\t'
)

2022-11-10 22:10:58.083 Hail: INFO: Coerced sorted dataset
2022-11-10 22:10:58.312 Hail: INFO: Coerced sorted dataset
2022-11-10 22:10:58.713 Hail: INFO: merging 21 files totalling 100.1K...
2022-11-10 22:10:58.734 Hail: INFO: while writing:
    /net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/duplicated-intergenic.csv
  merge time: 20.823ms
