# Table of contents:

## [1. Import mt and filter it to only contain unrelated patients](#1)


## [2. Filter for a list of related genes and compare with Fisher exact with gnomAD (simple)](#2)

In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# This line will hide code by default when the notebook is exported as HTML
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

display(HTML("<style>.container { width:100% !important; }</style>"))


In [4]:
import hail as hl
hl.init() 

Running on Apache Spark version 2.4.1
SparkUI available at http://8c03623a3bed:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.28-61941242c15d
LOGGING: writing to /hail/hail-20191213-1908-0.2.28-61941242c15d.log


In [5]:
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
hl.plot.output_notebook()

import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain

<section id='1'> </section>

# 1. Import mt and filter it to only contain unrelated patients

In [6]:
mt = hl.read_matrix_table('data/clean.mt')

In [92]:
mt.count()

(47373, 149)

In [152]:
mtp = mt.filter_cols(mt.phenotypes.disease == 'YES')

In [153]:
pc_rel = hl.pc_relate(mtp.GT, 0.001, k=2, statistics='kin')
pairs = pc_rel.filter(pc_rel['kin'] > 0.08)
related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, keep=False)

pairs = pairs.annotate(j_fam = mtp.cols()[pairs.j].phenotypes.family,
                      i_fam = mtp.cols()[pairs.i].phenotypes.family)


2019-12-12 19:09:21 Hail: INFO: hwe_normalized_pca: running PCA using 37093 variants.
2019-12-12 19:09:22 Hail: INFO: pca: running PCA with 2 components...
2019-12-12 19:09:45 Hail: INFO: Wrote all 12 blocks of 47373 x 102 matrix with block size 4096.
2019-12-12 19:09:46 Hail: INFO: wrote matrix with 102 rows and 102 columns as 1 block of size 4096 to file:/tmp/hail.aiJ5KIRBBjfZ/CKPeZnLaR9.bm
2019-12-12 19:09:46 Hail: INFO: Ordering unsorted dataset with network shuffle
2019-12-12 19:09:46 Hail: INFO: wrote table with 52 rows in 1 partition to file:/tmp/hail.aiJ5KIRBBjfZ/38AYA4fa8y


In [177]:
related_samples_to_remove.count()

2019-12-12 19:17:15 Hail: INFO: Ordering unsorted dataset with network shuffle


29

In [154]:
mtp = mtp.filter_cols(hl.is_defined(related_samples_to_remove[mtp.col_key]), keep=False)

In [155]:
mtp = mtp.key_cols_by()

In [156]:
mtp = mtp.key_cols_by(mtp.s)

In [113]:
mtp.count() # this matrix is ready to compare variant frequencies with gnomAD and other dbs

2019-12-12 18:57:53 Hail: INFO: Coerced sorted dataset
2019-12-12 18:57:53 Hail: INFO: Ordering unsorted dataset with network shuffle


(47373, 73)

<section id='2'> </section>

# 2. Filter for a list of related genes and compare with Fisher exact with gnomAD (simple)

In [114]:
import pickle 
# get the list many_genes from GTS_main_and_patient_oriented.ipynb
    
with open('./py_objects/many_genes', 'rb') as l:
    many_genes = pickle.load(l)

In [157]:
mtp = mtp.filter_rows(hl.any(lambda x: hl.literal(many_genes).contains(x), mtp.info.ISEQ_GENES_NAMES))
mtp = hl.variant_qc(mtp)

In [158]:
v3_AN = mtp.info.ISEQ_GNOMAD_GENOMES_V3_AN
v3_AC = mtp.info.ISEQ_GNOMAD_GENOMES_V3_AC[0]

p_AN = mtp.variant_qc.AN
p_AC = mtp.variant_qc.AC[1]

In [159]:
mtp = mtp.annotate_rows(fish_exact = hl.fisher_exact_test(p_AC, p_AN, v3_AC, v3_AN))

In [103]:
mtp.count()

2019-12-12 18:56:35 Hail: INFO: Coerced sorted dataset
2019-12-12 18:56:35 Hail: INFO: Ordering unsorted dataset with network shuffle


(6951, 73)

In [160]:
p = 1e-8 

In [161]:
mtp = mtp.filter_rows(mtp.fish_exact.p_value < p)

In [162]:
mtp = mtp.filter_rows(mtp.info.ISEQ_HIGHEST_IMPACT.contains('HIGH'))

In [163]:
mtp = mtp.select_cols(mtp.phenotypes.family, mtp.phenotypes.disease, mtp.phenotypes.sex, mtp.phenotypes.kinship, mtp.phenotypes.add_pheno,
                        mtp.phenotypes.heavy_tics)

In [164]:
mtp = mtp.select_entries(mtp.GT, mtp.GQ)

In [166]:
mtp = mtp.annotate_rows(AN_GTS =  mtp.variant_qc.AN, AC_GTS = mtp.variant_qc.AC)

In [168]:
mtp = mtp.select_rows(mtp.fish_exact.p_value, mtp.info.AC, mtp.info.AN, mtp.AN_GTS, mtp.AC_GTS,
                      mtp.rsid, mtp.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe, mtp.info.ISEQ_GENES_NAMES, 
                      mtp.info.ISEQ_CLINVAR_ALLELE_ID, mtp.info.ISEQ_CLINVAR_DISEASES, mtp.info.ISEQ_HPO_INHERITANCE, 
                      mtp.info.ISEQ_HPO_PHENOTYPES, mtp.info.ISEQ_HPO_DISEASES, 
                      mtp.info.ISEQ_AGGREGATED_CLINVAR_SIGNIFICANCE, mtp.info.ANN)

In [169]:
mtp = mtp.filter_cols(hl.agg.any(mtp.GT.is_non_ref()))

In [170]:
tb = mtp.make_table()

2019-12-12 19:12:39 Hail: INFO: Coerced sorted dataset
2019-12-12 19:12:39 Hail: INFO: Ordering unsorted dataset with network shuffle
2019-12-12 19:12:45 Hail: INFO: Coerced sorted dataset
2019-12-12 19:12:45 Hail: INFO: Ordering unsorted dataset with network shuffle


In [171]:
df = tb.to_pandas()

2019-12-12 19:12:49 Hail: INFO: Coerced sorted dataset
2019-12-12 19:12:49 Hail: INFO: Ordering unsorted dataset with network shuffle
2019-12-12 19:12:55 Hail: INFO: Coerced sorted dataset
2019-12-12 19:12:55 Hail: INFO: Ordering unsorted dataset with network shuffle


In [190]:
mtc = mt.filter_cols(mt.phenotypes.disease == 'NO')

In [191]:
mtc = mtc.semi_join_rows(mtp.rows())

In [192]:
mtc = hl.variant_qc(mtc)

In [193]:
mtc = mtc.select_rows(mtc.variant_qc.AC)

In [194]:
mtc = mtc.make_table()

In [195]:
tb2 = mtc.to_pandas()

2019-12-12 19:31:55 Hail: INFO: Coerced sorted dataset
2019-12-12 19:31:55 Hail: INFO: Ordering unsorted dataset with network shuffle


In [196]:
tb2 = tb2['AC']

In [211]:
df.iloc[:,0:18]

Unnamed: 0,locus.contig,locus.position,alleles,p_value,AC,AN,AN_GTS,AC_GTS,rsid,ISEQ_GNOMAD_GENOMES_V3_AF_nfe,ISEQ_GENES_NAMES,ISEQ_CLINVAR_ALLELE_ID,ISEQ_CLINVAR_DISEASES,ISEQ_HPO_INHERITANCE,ISEQ_HPO_PHENOTYPES,ISEQ_HPO_DISEASES,ISEQ_AGGREGATED_CLINVAR_SIGNIFICANCE,ANN
0,chr1,45515758,"[G, A]",4.398181e-09,[3],302,146,"[143, 3]",rs112565051,[0.0],[PRDX1],,[Disorder_of_organic_acid_metabolism^Disorders...,[Autosomal_recessive_inheritance],[Abnormality_of_extrapyramidal_motor_function^...,[mahcc^methylmalonic_aciduria_and_homocystinur...,,[A|protein_protein_contact|HIGH|PRDX1|ENSG0000...
1,chr1,153886522,"[CT, C]",2.058061e-14,[35],302,146,"[131, 15]",,[0.00585229],[GATAD2B],,[Mental_retardation_autosomal_dominant_18],[Autosomal_dominant_inheritance],[Abnormality_of_the_cerebral_white_matter^Abse...,[mental_retardation_autosomal_dominant_18^mrd1...,,[C|splice_acceptor_variant&intron_variant|HIGH...
2,chr7,107563982,"[CAG, C]",2.43706e-12,[8],302,146,"[141, 5]",,[9.3301e-05],[COG5],,[Congenital_disorder_of_glycosylation^Congenit...,[Autosomal_recessive_inheritance],[Atrophy/Degeneration_affecting_the_brainstem^...,[#613612_congenital_disorder_of_glycosylation_...,,[C|frameshift_variant|HIGH|COG5|ENSG0000016459...
3,chr8,116874117,"[A, C]",2.277255e-15,[20],300,144,"[138, 6]",,[0.0],[RAD21],,[Cornelia_de_Lange_syndrome_4^De_Lange_syndrom...,[Autosomal_dominant_inheritance^Autosomal_rece...,[Abnormality_of_the_autonomic_nervous_system^A...,[#614701_cornelia_de_lange_syndrome_4^cdls4^co...,,[C|splice_donor_variant&intron_variant|HIGH|RA...
4,chr11,74129643,"[CAG, C]",5.725268e-09,[12],302,146,"[141, 5]",,[0.000232883],[C2CD3],,[Orofaciodigital_syndrome_xiv],[Autosomal_recessive_inheritance],[Abnormal_facial_shape^Absent_speech^Aplasia_o...,[ofd14^orofaciodigital_syndrome_xiv],,[C|frameshift_variant|HIGH|C2CD3|ENSG000001680...
5,chr16,54933218,"[GC, G]",4.559169e-09,[5],302,146,"[143, 3]",,[0.0],[IRX5],,[Hamamy_syndrome],[Autosomal_recessive_inheritance],[Anteverted_nares^Atrial_septal_defect^Brachyc...,[hamamy_syndrome^hmms],,[G|frameshift_variant|HIGH|IRX5|ENSG0000017684...
6,chr19,1393296,"[C, T]",4.05814e-09,[14],302,146,"[141, 5]",,[0.000543006],[NDUFS7],,[MITOCHONDRIAL_COMPLEX_I_DEFICIENCY_NUCLEAR_TY...,[Autosomal_recessive_inheritance],[Abnormal_mitochondria_in_muscle_tissue^Abnorm...,[isolated_complex_i_deficiency^leigh_syndrome_...,,[T|stop_gained|HIGH|NDUFS7|ENSG00000115286|tra...


In [214]:
df2 = pd.concat([df.iloc[:,0:3],tb2,df.iloc[:,4:18]], axis=1)

In [217]:
df2.columns = ['locus.contig', 'locus.position', 'alleles', 'counts_alts_controls', 'alts_all', 'counts_all', 'out',
       'counts_alts_gts_not_related', 'rsid', 'ISEQ_GNOMAD_GENOMES_V3_AF_nfe', 'ISEQ_GENES_NAMES',
       'ISEQ_CLINVAR_ALLELE_ID', 'ISEQ_CLINVAR_DISEASES',
       'ISEQ_HPO_INHERITANCE', 'ISEQ_HPO_PHENOTYPES', 'ISEQ_HPO_DISEASES',
       'ISEQ_AGGREGATED_CLINVAR_SIGNIFICANCE', 'ANN']

In [223]:
df2 = df2.drop(['out'], axis = 1)

In [225]:
df2

Unnamed: 0,locus.contig,locus.position,alleles,counts_alts_controls,alts_all,counts_all,counts_alts_gts_not_related,rsid,ISEQ_GNOMAD_GENOMES_V3_AF_nfe,ISEQ_GENES_NAMES,ISEQ_CLINVAR_ALLELE_ID,ISEQ_CLINVAR_DISEASES,ISEQ_HPO_INHERITANCE,ISEQ_HPO_PHENOTYPES,ISEQ_HPO_DISEASES,ISEQ_AGGREGATED_CLINVAR_SIGNIFICANCE,ANN
0,chr1,45515758,"[G, A]","[86, 0]",[3],302,"[143, 3]",rs112565051,[0.0],[PRDX1],,[Disorder_of_organic_acid_metabolism^Disorders...,[Autosomal_recessive_inheritance],[Abnormality_of_extrapyramidal_motor_function^...,[mahcc^methylmalonic_aciduria_and_homocystinur...,,[A|protein_protein_contact|HIGH|PRDX1|ENSG0000...
1,chr1,153886522,"[CT, C]","[73, 13]",[35],302,"[131, 15]",,[0.00585229],[GATAD2B],,[Mental_retardation_autosomal_dominant_18],[Autosomal_dominant_inheritance],[Abnormality_of_the_cerebral_white_matter^Abse...,[mental_retardation_autosomal_dominant_18^mrd1...,,[C|splice_acceptor_variant&intron_variant|HIGH...
2,chr7,107563982,"[CAG, C]","[85, 1]",[8],302,"[141, 5]",,[9.3301e-05],[COG5],,[Congenital_disorder_of_glycosylation^Congenit...,[Autosomal_recessive_inheritance],[Atrophy/Degeneration_affecting_the_brainstem^...,[#613612_congenital_disorder_of_glycosylation_...,,[C|frameshift_variant|HIGH|COG5|ENSG0000016459...
3,chr8,116874117,"[A, C]","[78, 8]",[20],300,"[138, 6]",,[0.0],[RAD21],,[Cornelia_de_Lange_syndrome_4^De_Lange_syndrom...,[Autosomal_dominant_inheritance^Autosomal_rece...,[Abnormality_of_the_autonomic_nervous_system^A...,[#614701_cornelia_de_lange_syndrome_4^cdls4^co...,,[C|splice_donor_variant&intron_variant|HIGH|RA...
4,chr11,74129643,"[CAG, C]","[82, 4]",[12],302,"[141, 5]",,[0.000232883],[C2CD3],,[Orofaciodigital_syndrome_xiv],[Autosomal_recessive_inheritance],[Abnormal_facial_shape^Absent_speech^Aplasia_o...,[ofd14^orofaciodigital_syndrome_xiv],,[C|frameshift_variant|HIGH|C2CD3|ENSG000001680...
5,chr16,54933218,"[GC, G]","[85, 1]",[5],302,"[143, 3]",,[0.0],[IRX5],,[Hamamy_syndrome],[Autosomal_recessive_inheritance],[Anteverted_nares^Atrial_septal_defect^Brachyc...,[hamamy_syndrome^hmms],,[G|frameshift_variant|HIGH|IRX5|ENSG0000017684...
6,chr19,1393296,"[C, T]","[82, 4]",[14],302,"[141, 5]",,[0.000543006],[NDUFS7],,[MITOCHONDRIAL_COMPLEX_I_DEFICIENCY_NUCLEAR_TY...,[Autosomal_recessive_inheritance],[Abnormal_mitochondria_in_muscle_tissue^Abnorm...,[isolated_complex_i_deficiency^leigh_syndrome_...,,[T|stop_gained|HIGH|NDUFS7|ENSG00000115286|tra...


In [224]:
df2.to_csv('fisher_exact_againts_gnov3.csv')

In [9]:
voi = mt.filter_rows(mt.locus == hl.parse_locus('chr7:107563982', reference_genome='GRCh38'))

In [11]:
voi = voi.filter_cols(hl.agg.any(voi.GT.is_non_ref()))

In [12]:
voi.GT.show()

locus,alleles,S_170d.GT,S_7231.GT,S_7241.GT,S_7271.GT,S_7276.GT,S_7278.GT,S_7284.GT,S_7305.GT
locus<GRCh38>,array<str>,call,call,call,call,call,call,call,call
chr7:107563982,"[""CAG"",""C""]",0/1,0/1,0/1,0/1,0/1,0/1,0/1,0/1


In [12]:
voi.GT.show()

locus,alleles,S_170d.GT,S_7231.GT,S_7241.GT,S_7271.GT,S_7276.GT,S_7278.GT,S_7284.GT,S_7305.GT
locus<GRCh38>,array<str>,call,call,call,call,call,call,call,call
chr7:107563982,"[""CAG"",""C""]",0/1,0/1,0/1,0/1,0/1,0/1,0/1,0/1


In [13]:
voi.phenotypes.disease.show()

s,Unnamed: 1_level_0
str,str
"""S_170d""","""YES"""
"""S_7231""","""YES"""
"""S_7241""","""NO"""
"""S_7271""","""YES"""
"""S_7276""","""YES"""
"""S_7278""","""YES"""
"""S_7284""","""YES"""
"""S_7305""","""YES"""


In [14]:
voi.phenotypes.family.show()

s,Unnamed: 1_level_0
str,str
"""S_170d""","""B"""
"""S_7231""","""E"""
"""S_7241""","""C"""
"""S_7271""","""G"""
"""S_7276""","""H"""
"""S_7278""","""H"""
"""S_7284""","""F"""
"""S_7305""","""J"""


In [16]:
cog5 = mt.filter_rows(mt.info.ISEQ_GENES_NAMES == ["COG5"])

In [18]:
cog5 = cog5.filter_cols(hl.agg.any(cog5.GT.is_non_ref()))

In [21]:
cog5.aggregate_cols(hl.agg.counter(cog5.phenotypes.disease))

{'NO': 6, 'YES': 16, 'n/a': 2}

In [36]:
cog5.aggregate_entries(hl.agg.counter(cog5.GT.is_non_ref()))

{False: 81, True: 39}

In [41]:
cog5.filter_rows(cog5.phenotypes.disease == 'YES').aggregate_entries(hl.agg.counter(cog5.GT.is_non_ref()))

2019-12-13 19:38:53 Hail: ERROR: scope violation: 'MatrixTable.filter_rows' expects an expression indexed by ['row']
    Found indices ['column'], with unexpected indices ['column']. Invalid fields:
        'phenotypes' (indices ['column'])
    'MatrixTable.filter_rows' supports aggregation over axes ['column'], so these fields may appear inside an aggregator function.


ExpressionException: scope violation: 'MatrixTable.filter_rows' expects an expression indexed by ['row']
    Found indices ['column'], with unexpected indices ['column']. Invalid fields:
        'phenotypes' (indices ['column'])
    'MatrixTable.filter_rows' supports aggregation over axes ['column'], so these fields may appear inside an aggregator function.

In [25]:
cog5.GT.show(n_cols = 24)

locus,alleles,S_170d.GT,S_6982.GT,S_7212.GT,S_7227.GT,S_7231.GT,S_7237.GT,S_7241.GT,S_7249.GT,S_7254.GT,S_7260.GT,S_7271.GT,S_7275.GT,S_7276.GT,S_7278.GT,S_7279.GT,S_7284.GT,S_7290.GT,S_7293.GT,S_7301.GT,S_7304.GT,S_7305.GT,S_7306.GT,S_7308.GT,WGS_6958.GT
locus<GRCh38>,array<str>,call,call,call,call,call,call,call,call,call,call,call,call,call,call,call,call,call,call,call,call,call,call,call,call
chr7:107256749,"[""T"",""C""]",0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
chr7:107298208,"[""T"",""C""]",0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/1,0/1,0/1,0/1
chr7:107563971,"[""G"",""GGCC""]",0/0,0/1,0/1,0/0,0/1,0/1,0/1,0/1,0/1,0/1,0/1,0/0,0/1,0/1,0/1,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0
chr7:107563979,"[""A"",""C""]",0/1,0/0,0/0,0/1,0/1,0/0,0/1,0/0,0/0,0/0,0/1,0/1,0/1,0/1,0/0,0/1,0/1,0/1,0/1,0/0,0/1,0/0,0/0,0/0
chr7:107563982,"[""CAG"",""C""]",0/1,0/0,0/0,0/0,0/1,0/0,0/1,0/0,0/0,0/0,0/1,0/0,0/1,0/1,0/0,0/1,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0


In [31]:
cog5 = cog5.annotate_cols(alt_per_person = hl.agg.counter(cog5.GT.n_alt_alleles()))

In [32]:
cog5.alt_per_person.show()

s,alt_per_person
str,"dict<int32, int64>"
"""S_170d""","{0:3,1:2}"
"""S_6982""","{0:4,1:1}"
"""S_7212""","{0:4,1:1}"
"""S_7227""","{0:4,1:1}"
"""S_7231""","{0:2,1:3}"
"""S_7237""","{0:4,1:1}"
"""S_7241""","{0:2,1:3}"
"""S_7249""","{0:4,1:1}"
"""S_7254""","{0:4,1:1}"
"""S_7260""","{0:4,1:1}"


In [32]:
cog5.alt_per_person.show()

s,alt_per_person
str,"dict<int32, int64>"
"""S_170d""","{0:3,1:2}"
"""S_6982""","{0:4,1:1}"
"""S_7212""","{0:4,1:1}"
"""S_7227""","{0:4,1:1}"
"""S_7231""","{0:2,1:3}"
"""S_7237""","{0:4,1:1}"
"""S_7241""","{0:2,1:3}"
"""S_7249""","{0:4,1:1}"
"""S_7254""","{0:4,1:1}"
"""S_7260""","{0:4,1:1}"
