In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# This line will hide code by default when the notebook is exported as HTML
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# Inject a CSS rule that stretches the notebook container to full width.
full_width_css = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_css)

In [1]:
import hail as hl
# Start Hail with a scratch temp directory and 15G of memory for both the
# Spark driver and the Spark executors.
spark_memory_conf = {
    'spark.driver.memory': '15G',
    'spark.executor.memory': '15G',
}
hl.init(tmp_dir='/net/scratch/people/plggosborcz', spark_conf=spark_memory_conf)

Running on Apache Spark version 2.4.5
SparkUI available at http://p0654.prometheus:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.62-84fa81b9ea3d
LOGGING: writing to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/analysis/burden-and-family/hail-20211026-2043-0.2.62-84fa81b9ea3d.log


In [3]:
# Plotting / utility imports for the rest of the notebook.
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
hl.plot.output_notebook()

import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain
import statistics as stat
from collections import Counter
    
import bokeh.palettes

# NOTE(review): `show` is imported a second time here, rebinding the name that was
# imported from hail.plot above, and output_notebook() is called once via hl.plot
# and once via bokeh.plotting — confirm both are needed.
from bokeh.plotting import figure, show, output_notebook
output_notebook()

In [4]:
# Load the starting matrix table.
# This table has outdated phenotypes - new phenotypes are added at the end of this analysis.
source_mt_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-gnomad-sex.mt'
mt = hl.read_matrix_table(source_mt_path)

In [5]:
# Matrix dimensions as (rows, columns).
# This matrix contains simulated controls; these are not for this part of the
# project and are thus removed.
mt.count()

(11825581, 370)

In [6]:
# Print the schema of the matrix table (global, column and row fields).
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'phenotypes': struct {
        family: str, 
        sex: str, 
        kinship: str, 
        disease: str, 
        phenotype: str, 
        add_pheno: str, 
        heavy_tics: str
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'AC': array<int32>
    'AF': array<float64>
    'AN': int32
    'a_index': int32
    'was_split': bool
    'gnomad_v3': struct {
        v3_nfe: struct {
            AC: int32, 
            AF: float64, 
            AN: int32, 
            homozygote_count: int32
        }, 
        intergenic_consequences: array<struct {
            allele_num: int32, 
            consequence_terms: array<str>, 
            impact: str, 
            minimised: int32, 
            variant_allele: str
        }>, 
        most_severe_consequence: str, 
     

In [8]:
# Remove every column whose sample ID contains the substring 'gnomad'.
is_gnomad_sample = mt.s.contains('gnomad')
mt = mt.filter_cols(is_gnomad_sample, keep=False)

In [9]:
# Write the column-filtered matrix table to disk and rebind `mt` to the copy that
# checkpoint() reads back. Without the assignment `mt` keeps pointing at the lazy
# pipeline, so every later step would recompute the read and filter above.
mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-1.mt')

2021-09-14 13:19:15 Hail: INFO: wrote matrix table with 11825581 rows and 185 columns in 6622 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-1.mt
    Total size: 10.97 GiB
    * Rows/entries: 10.97 GiB
    * Columns: 2.37 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  8022 rows (8.75 MiB)


<hail.matrixtable.MatrixTable at 0x2b8394560a20>

In [10]:
# Keep only variants for which at least one remaining sample has a non-reference call.
has_non_ref_call = hl.agg.any(mt.GT.is_non_ref())
mt = mt.filter_rows(has_non_ref_call)

In [11]:
# Persist the variants that still carry a non-ref call and continue from the
# on-disk copy returned by checkpoint(), so downstream steps do not re-run the
# upstream pipeline.
mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-2.mt')

2021-09-14 13:23:38 Hail: INFO: wrote matrix table with 8104588 rows and 185 columns in 6622 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-2.mt
    Total size: 10.82 GiB
    * Rows/entries: 10.82 GiB
    * Columns: 2.37 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  7047 rows (8.73 MiB)


<hail.matrixtable.MatrixTable at 0x2b8394751320>

## 1. Perform variant QC and filter out low-quality variants (by mean DP and mean GQ)

- This table has already been filtered using the RepeatMasker track and gnomAD coverage (90% of samples with DP > 1).

In [12]:
# Compute per-variant QC statistics and store them in the `variant_qc` row field.
mt = hl.variant_qc(mt, name='variant_qc')

In [13]:
# Print the schema again after the variant_qc annotation.
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'phenotypes': struct {
        family: str, 
        sex: str, 
        kinship: str, 
        disease: str, 
        phenotype: str, 
        add_pheno: str, 
        heavy_tics: str
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'AC': array<int32>
    'AF': array<float64>
    'AN': int32
    'a_index': int32
    'was_split': bool
    'gnomad_v3': struct {
        v3_nfe: struct {
            AC: int32, 
            AF: float64, 
            AN: int32, 
            homozygote_count: int32
        }, 
        intergenic_consequences: array<struct {
            allele_num: int32, 
            consequence_terms: array<str>, 
            impact: str, 
            minimised: int32, 
            variant_allele: str
        }>, 
        most_severe_consequence: str, 
     

In [14]:
# Keep variants whose mean DP is above 10 and whose mean GQ is above 60.
mean_depth = mt.variant_qc.dp_stats.mean
mean_genotype_quality = mt.variant_qc.gq_stats.mean
mt = mt.filter_rows((mean_depth > 10) & (mean_genotype_quality > 60))

In [15]:
# Persist the QC-filtered matrix table and rebind `mt` to the on-disk copy returned
# by checkpoint(), so variant_qc and the filters above are not recomputed by every
# later action.
mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-3.mt')

2021-09-14 14:06:08 Hail: INFO: wrote matrix table with 6163334 rows and 185 columns in 6622 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-3.mt
    Total size: 8.82 GiB
    * Rows/entries: 8.82 GiB
    * Columns: 2.37 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  5367 rows (7.01 MiB)


<hail.matrixtable.MatrixTable at 0x2b8394866198>

In [16]:
# Matrix dimensions as (rows, columns) after the QC filter.
mt.count()

(6163334, 185)

## 2. Keep only variants that are within protein-coding genes

In [4]:
# Resume from the QC-filtered matrix table written in section 1.
qc_filtered_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-3.mt'
mt = hl.read_matrix_table(qc_filtered_path)

In [5]:
# Remove variants whose `within_gene` array is empty.
no_gene_overlap = mt.within_gene == hl.empty_array(hl.tstr)
mt = mt.filter_rows(no_gene_overlap, keep=False)

In [6]:
# Persist the within-gene variants and continue from the on-disk copy returned by
# checkpoint() instead of the lazy pipeline.
mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-4.mt')

2021-09-14 15:09:55 Hail: INFO: wrote matrix table with 4034934 rows and 185 columns in 6622 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-4.mt
    Total size: 5.93 GiB
    * Rows/entries: 5.93 GiB
    * Columns: 2.37 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  2504 rows (4.88 MiB)


<hail.matrixtable.MatrixTable at 0x2aed099118d0>

In [7]:
# Matrix dimensions as (rows, columns) after the within-gene filter.
mt.count()

(4034934, 185)

## 3. Keep only variants with allele frequency < 0.05 in gnomAD non-Finnish Europeans

In [8]:
# Keep variants whose gnomAD v3 NFE allele frequency is below 0.05.
# NOTE(review): for rows where this AF is missing the predicate is missing, and
# filter_rows removes such rows — confirm that dropping variants without a
# gnomAD NFE frequency is intended.
nfe_allele_freq = mt.gnomad_v3.v3_nfe.AF
mt = mt.filter_rows(nfe_allele_freq < 0.05)

In [9]:
# Persist the frequency-filtered variants and rebind `mt` to the on-disk copy
# returned by checkpoint(), so later steps start from the written file.
mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-5.mt')

2021-09-14 15:17:10 Hail: INFO: wrote matrix table with 1752213 rows and 185 columns in 6622 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-5.mt
    Total size: 2.33 GiB
    * Rows/entries: 2.33 GiB
    * Columns: 2.37 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  1336 rows (2.44 MiB)


<hail.matrixtable.MatrixTable at 0x2aed09a46550>

In [10]:
# Matrix dimensions as (rows, columns) after the allele-frequency filter.
mt.count()

(1752213, 185)

## 4. Filter according to consequences

In [12]:
# Tally how many variants fall under each most-severe consequence term.
most_severe = mt.gnomad_v3.most_severe_consequence
consequence = mt.aggregate_rows(hl.agg.counter(most_severe))

In [13]:
# Display the consequence -> variant count mapping computed above.
consequence

{'protein_altering_variant': 20,
 'coding_sequence_variant': 5,
 'stop_lost': 66,
 'missense_variant': 36038,
 'start_lost': 125,
 'splice_donor_variant': 634,
 'splice_region_variant': 7058,
 'non_coding_transcript_exon_variant': 89568,
 'intron_variant': 1434332,
 'inframe_deletion': 707,
 'incomplete_terminal_codon_variant': 3,
 'frameshift_variant': 1046,
 'TF_binding_site_variant': 581,
 'mature_miRNA_variant': 102,
 'synonymous_variant': 29628,
 'stop_retained_variant': 31,
 'splice_acceptor_variant': 408,
 'intergenic_variant': 45173,
 'upstream_gene_variant': 8988,
 'stop_gained': 670,
 '3_prime_UTR_variant': 59851,
 'downstream_gene_variant': 7084,
 'inframe_insertion': 293,
 'regulatory_region_variant': 10750,
 '5_prime_UTR_variant': 19052}

In [14]:
# Most-severe consequence terms that are retained by the consequence filter.
consequences_to_keep = [
    'protein_altering_variant',
    'coding_sequence_variant',
    'stop_lost',
    'missense_variant',
    'start_lost',
    'splice_donor_variant',
    'splice_region_variant',
    'inframe_deletion',
    'incomplete_terminal_codon_variant',
    'frameshift_variant',
    'synonymous_variant',
    'stop_retained_variant',
    'splice_acceptor_variant',
    'stop_gained',
    '3_prime_UTR_variant',
    'inframe_insertion',
    'regulatory_region_variant',
    '5_prime_UTR_variant',
]

In [17]:
# Keep variants whose most-severe consequence is one of `consequences_to_keep`.
kept_consequences = hl.literal(consequences_to_keep)
mt = mt.filter_rows(kept_consequences.contains(mt.gnomad_v3.most_severe_consequence))

In [18]:
# Persist the consequence-filtered variants and continue from the on-disk copy
# returned by checkpoint() rather than the lazy pipeline.
mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-6.mt')

2021-09-14 15:27:48 Hail: INFO: wrote matrix table with 166385 rows and 185 columns in 6622 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-6.mt
    Total size: 262.85 MiB
    * Rows/entries: 262.84 MiB
    * Columns: 2.37 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  485 rows (796.99 KiB)


<hail.matrixtable.MatrixTable at 0x2aecfd7d5898>

In [19]:
# Matrix dimensions as (rows, columns) after the consequence filter.
mt.count()

(166385, 185)

## 5. Keep only variants within ~1000 preselected genes

In [20]:
# Load the preselected gene list (fields: gene_symbol, list_source).
gene_list_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/gts-gene-lists/gene-lists-filtering.csv'
genes = hl.import_table(gene_list_path)

2021-09-14 15:29:00 Hail: INFO: Reading table without type imputation
  Loading field 'gene_symbol' as type str (not specified)
  Loading field 'list_source' as type str (not specified)


In [22]:
# Preview the first rows of the gene list table.
genes.show()

gene_symbol,list_source
str,str
"""AADAT""","""KF"""
"""AANAT""","""KF"""
"""ABAT""","""KF"""
"""ACAT1""","""KF"""
"""ACHE""","""KF"""
"""ACMSD""","""KF"""
"""ACTA1""","""KF"""
"""ACTN2""","""KF"""
"""ADCY1""","""KF"""
"""ADCY2""","""KF"""


In [23]:
# Reduce the gene list table to its gene_symbol field.
genes = genes.select(genes.gene_symbol)

In [24]:
# Pull the gene symbols into a local Python list (`genes` is no longer a Hail Table).
genes = genes.gene_symbol.collect()

In [25]:
# De-duplicate the gene symbols.
genes_scores = list({gene_symbol for gene_symbol in genes})

In [26]:
# Keep variants whose `within_gene` array contains at least one preselected gene.
# The gene list is broadcast as a typed set, so each membership test is a set
# lookup instead of a scan over the whole gene array, and the explicit type keeps
# the literal valid even if the list is empty.
preselected_genes = hl.literal(set(genes_scores), 'set<str>')
mt = mt.filter_rows(hl.any(lambda gene: preselected_genes.contains(gene), mt.within_gene))

In [27]:
# Persist the gene-list-filtered variants and rebind `mt` to the on-disk copy
# returned by checkpoint(), so later steps do not recompute the gene filter.
mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-7.mt')

2021-09-14 15:33:49 Hail: INFO: wrote matrix table with 10315 rows and 185 columns in 6622 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-for-filtering-7.mt
    Total size: 19.15 MiB
    * Rows/entries: 19.14 MiB
    * Columns: 2.37 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  53 rows (206.52 KiB)


<hail.matrixtable.MatrixTable at 0x2aed09ca1a20>

In [28]:
# Matrix dimensions as (rows, columns) after the gene-list filter.
mt.count()

(10315, 185)

## 6. Keep only variants with CADD > 25

In [32]:
# Preview how many variants would survive a CADD > 25 cut, without changing `mt`.
high_cadd_preview = mt.filter_rows(mt.cadd > 25)
high_cadd_preview.count()

(344, 185)

In [33]:
# Apply the CADD > 25 cut.
exceeds_cadd_cutoff = mt.cadd > 25
mt = mt.filter_rows(exceeds_cadd_cutoff)

In [51]:
# Reuse the CADD-filtered matrix table when it has already been written in full
# (Hail leaves a _SUCCESS marker inside the .mt directory); otherwise write it now.
# This keeps the cell runnable from a fresh kernel, instead of relying on a
# checkpoint call that was commented out after its first run.
cadd_filtered_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/mts/GTS-filtered-cadd-25.mt'
if hl.hadoop_exists(cadd_filtered_path + '/_SUCCESS'):
    mt = hl.read_matrix_table(cadd_filtered_path)
else:
    mt = mt.checkpoint(cadd_filtered_path)

In [36]:
# Show a random ~10% sample of the remaining variants; the seed is fixed so the
# preview is the same on every run.
mt.sample_rows(0.1, seed=0).show()

locus,alleles
locus<GRCh38>,array<str>
chr1:2029156,"[""A"",""C""]"
chr1:21828852,"[""G"",""A""]"
chr1:160031510,"[""G"",""A""]"
chr2:232540061,"[""G"",""A""]"
chr3:38009996,"[""G"",""A""]"
chr3:42125503,"[""G"",""A""]"
chr3:52488281,"[""G"",""A""]"
chr6:24520491,"[""G"",""A""]"
chr6:33686055,"[""C"",""G""]"
chr6:146433954,"[""G"",""A""]"


In [5]:
# Histogram of DP values over the range 0-50, in 30 bins.
dp_hist_options = dict(range=(0, 50), bins=30, title='DP Histogram', legend='DP')
p = hl.plot.histogram(mt.DP, **dp_hist_options)
show(p)

In [6]:
# Histogram of GQ values over the range 0-100, in 15 bins.
gq_hist_options = dict(range=(0, 100), bins=15, title='GQ Histogram', legend='GQ')
p2 = hl.plot.histogram(mt.GQ, **gq_hist_options)
show(p2)

## 7. Add information about non-ref samples:

In [52]:
#add updated phenotypes:
pheno = hl.import_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/pheno/GTS-coded-corrected-june-2021.csv', delimiter=',', impute=True,  quote='\"')
pheno = pheno.key_by(pheno.ID)
mt = mt.annotate_cols(phenotypes = pheno[mt.s])

2021-09-15 13:44:14 Hail: INFO: Reading table to impute column types
2021-09-15 13:44:14 Hail: INFO: Finished type imputation
  Loading field 'ID' as type str (imputed)
  Loading field 'family' as type str (imputed)
  Loading field 'sex' as type str (imputed)
  Loading field 'kinship' as type str (imputed)
  Loading field 'disease' as type str (imputed)
  Loading field 'phenotype' as type str (imputed)
  Loading field 'add_pheno' as type str (imputed)
  Loading field 'heavy_tics' as type str (imputed)
  Loading field 'heavy_tics_familial' as type str (imputed)
  Loading field 'GTS_ASD_group' as type str (imputed)
  Loading field 'nonCTD' as type str (imputed)


In [53]:
# Preview the first rows of the updated phenotype table.
pheno.show()

ID,family,sex,kinship,disease,phenotype,add_pheno,heavy_tics,heavy_tics_familial,GTS_ASD_group,nonCTD
str,str,str,str,str,str,str,str,str,str,str
"""460""","""U""","""M""","""P""","""YES""","""GTS""",""".""",""".""",""".""",""".""","""."""
"""461""","""U""","""M""","""father""","""YES""","""tics""",""".""",""".""",""".""",""".""","""YES"""
"""462""","""U""","""F""","""mother""","""NO""",""".""",""".""",""".""",""".""",""".""","""."""
"""463""","""U""","""M""","""father_brother_son""","""YES""","""GTS""",""".""",""".""",""".""",""".""","""."""
"""464""","""U""","""M""","""father_brother""",""".""",""".""",""".""",""".""",""".""",""".""","""."""
"""465""","""U""","""M""","""father_father""","""YES""","""tics""",""".""",""".""",""".""",""".""","""YES"""
"""466""","""W""","""M""","""P""","""YES""","""GTS""",""".""",""".""",""".""",""".""","""."""
"""467""","""W""","""F""","""sister""","""YES""","""tics""",""".""",""".""",""".""",""".""","""YES"""
"""468""","""W""","""F""","""mother""","""NO""",""".""",""".""",""".""",""".""",""".""","""."""
"""469""","""W""","""M""","""father""","""YES""","""tics""",""".""",""".""",""".""",""".""","""YES"""


#### Here we create the following 4 groups:
1. healthy controls (disease = NO)
2. tics (phenotype = tics)
3. GTS (phenotype = GTS, and neither heavy_tics nor heavy_tics_familial = YES)
4. heavy GTS (heavy_tics = YES or heavy_tics_familial = YES)

In [54]:
# Assign each sample to one of four groups, checked in this order:
# healthy -> tics -> heavy_GTS -> GTS.
# NOTE(review): any sample that is neither disease == 'NO' nor phenotype == 'tics'
# lands in heavy_GTS/GTS; there is no explicit phenotype == 'GTS' check — confirm
# this is intended for samples with an unknown ('.') phenotype.
is_healthy = mt.phenotypes.disease == 'NO'
has_tics = mt.phenotypes.phenotype == 'tics'
has_heavy_tics = (mt.phenotypes.heavy_tics == 'YES') | (mt.phenotypes.heavy_tics_familial == 'YES')

gts_severity = hl.if_else(has_heavy_tics, 'heavy_GTS', 'GTS')
affected_group = hl.if_else(has_tics, 'tics', gts_severity)
mt = mt.annotate_cols(group=hl.if_else(is_healthy, 'healthy', affected_group))

In [55]:
# Number of samples in each group.
group_label = mt.group
mt.aggregate_cols(hl.agg.counter(group_label))

{'GTS': 46, 'healthy': 56, 'heavy_GTS': 41, 'tics': 42}

In [56]:
# Per variant: IDs of samples with a non-ref call, IDs of samples with a hom-var
# call, and the number of non-ref calls within each group.
carries_alt = mt.GT.is_non_ref()
is_hom_alt = mt.GT.is_hom_var()
mt = mt.annotate_rows(
    samples_non_ref=hl.agg.filter(carries_alt, hl.agg.collect(mt.s)),
    samples_hom_non_ref=hl.agg.filter(is_hom_alt, hl.agg.collect(mt.s)),
    grouped_variants=hl.agg.group_by(mt.group, hl.agg.count_where(carries_alt)),
)

In [57]:
# Preview the per-group non-ref counts for the first variants.
mt.grouped_variants.show()

locus,alleles,grouped_variants
locus<GRCh38>,array<str>,"dict<str, int64>"
chr1:2029156,"[""A"",""C""]","{""GTS"":1,""healthy"":0,""heavy_GTS"":0,""tics"":0}"
chr1:8343777,"[""C"",""T""]","{""GTS"":0,""healthy"":1,""heavy_GTS"":0,""tics"":0}"
chr1:8358481,"[""C"",""T""]","{""GTS"":1,""healthy"":2,""heavy_GTS"":0,""tics"":4}"
chr1:15876348,"[""A"",""G""]","{""GTS"":0,""healthy"":1,""heavy_GTS"":1,""tics"":0}"
chr1:15928357,"[""G"",""A""]","{""GTS"":0,""healthy"":1,""heavy_GTS"":1,""tics"":0}"
chr1:15934591,"[""G"",""A""]","{""GTS"":0,""healthy"":2,""heavy_GTS"":0,""tics"":0}"
chr1:21824322,"[""C"",""T""]","{""GTS"":0,""healthy"":0,""heavy_GTS"":1,""tics"":0}"
chr1:21828852,"[""G"",""A""]","{""GTS"":2,""healthy"":0,""heavy_GTS"":0,""tics"":1}"
chr1:21841268,"[""C"",""T""]","{""GTS"":0,""healthy"":1,""heavy_GTS"":0,""tics"":0}"
chr1:21842362,"[""G"",""A""]","{""GTS"":2,""healthy"":0,""heavy_GTS"":1,""tics"":3}"


In [58]:
# Fraction of samples in each group that carry a non-ref call at each variant.
# Group sizes are taken from the data rather than hard-coded, so the fractions
# stay correct if the phenotype file or the sample set changes.
group_sizes = mt.aggregate_cols(hl.agg.counter(mt.group))
mt = mt.annotate_rows(
    GTS_non_refs_fraction=mt.grouped_variants['GTS'] / group_sizes['GTS'],
    tics_non_refs_fraction=mt.grouped_variants['tics'] / group_sizes['tics'],
    heavy_GTS_non_refs_fraction=mt.grouped_variants['heavy_GTS'] / group_sizes['heavy_GTS'],
    healthy_non_refs_fraction=mt.grouped_variants['healthy'] / group_sizes['healthy'],
)

In [59]:
# Keep only the row (per-variant) fields; from here on `mt` holds the rows table
# rather than the full matrix table.
mt = mt.rows()

In [60]:
# Print the schema of the rows table.
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'rsid': str 
    'AC': array<int32> 
    'AF': array<float64> 
    'AN': int32 
    'a_index': int32 
    'was_split': bool 
    'gnomad_v3': struct {
        v3_nfe: struct {
            AC: int32, 
            AF: float64, 
            AN: int32, 
            homozygote_count: int32
        }, 
        intergenic_consequences: array<struct {
            allele_num: int32, 
            consequence_terms: array<str>, 
            impact: str, 
            minimised: int32, 
            variant_allele: str
        }>, 
        most_severe_consequence: str, 
        motif_feature_consequences: array<struct {
            allele_num: int32, 
            consequence_terms: array<str>, 
            high_inf_pos: str, 
            impact: str, 
            minimised: int32, 
            motif_feature_id: str, 
        

In [61]:
# Convert the rows table to pandas; the `mt` name is reused for the result, and
# nested struct fields appear as dotted column names (e.g. 'locus.contig').
mt = mt.to_pandas()

In [62]:
# List the flattened column names available after the pandas conversion.
mt.columns

Index(['locus.contig', 'locus.position', 'alleles', 'rsid', 'AC', 'AF', 'AN',
       'a_index', 'was_split', 'gnomad_v3.v3_nfe.AC', 'gnomad_v3.v3_nfe.AF',
       'gnomad_v3.v3_nfe.AN', 'gnomad_v3.v3_nfe.homozygote_count',
       'gnomad_v3.intergenic_consequences',
       'gnomad_v3.most_severe_consequence',
       'gnomad_v3.motif_feature_consequences',
       'gnomad_v3.regulatory_feature_consequences',
       'gnomad_v3.transcript_consequences', 'gnomad_v3.variant_class',
       'gnomad_v3.rsid', 'gnomad_v3.cov_v3.over_1', 'within_gene', 'hpo',
       'cadd', 'nearest_genes_20kb', 'variant_qc.dp_stats.mean',
       'variant_qc.dp_stats.stdev', 'variant_qc.dp_stats.min',
       'variant_qc.dp_stats.max', 'variant_qc.gq_stats.mean',
       'variant_qc.gq_stats.stdev', 'variant_qc.gq_stats.min',
       'variant_qc.gq_stats.max', 'variant_qc.AC', 'variant_qc.AF',
       'variant_qc.AN', 'variant_qc.homozygote_count', 'variant_qc.call_rate',
       'variant_qc.n_called', 'variant_qc.n_no

In [63]:
# Remove columns that are not needed in the exported results.
columns_to_drop = [
    'a_index',
    'was_split',
    'rsid',
    'grouped_variants',
    'gnomad_v3.motif_feature_consequences',
    'gnomad_v3.regulatory_feature_consequences',
    'gnomad_v3.variant_class',
    'variant_qc.dp_stats.stdev',
    'variant_qc.gq_stats.stdev',
    'variant_qc.AC',
    'variant_qc.AF',
    'variant_qc.AN',
    'variant_qc.homozygote_count',
    'variant_qc.call_rate',
    'variant_qc.n_called',
    'variant_qc.n_filtered',
    'variant_qc.n_het',
    'variant_qc.n_non_ref',
    'variant_qc.het_freq_hwe',
]
mt = mt.drop(columns=columns_to_drop)

In [64]:
# Inspect the per-variant lists of sample IDs with a non-ref call.
mt['samples_non_ref']

0                                                  [489]
1                                               [S_7307]
2      [488, 491, S_170c, S_7255, S_7269, S_7270, S_7...
3                                        [494, WGS_188a]
4                                     [WGS_183, WGS_37b]
                             ...                        
339                                             [WGS_87]
340    [S_7227, S_7234, S_7236, S_7245, S_7247, WGS_1...
341    [S_7214, S_7269, S_7270, WGS_108, WGS_180b, WG...
342                                   [WGS_85a, WGS_85b]
343                             [S_7275, S_7276, S_7279]
Name: samples_non_ref, Length: 344, dtype: object

In [65]:
# Write the filtered variant table to the results CSV.
results_csv_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/results/GTS-filtering-results.csv'
mt.to_csv(results_csv_path)