# table of contents:

## [1. Import hail, other libraries and data](#1)

[1.1 import phenotype and pedigree data](#1.1)

[1.2 Annotate mt with phenotype and pedigree info](#1.2)

## [2. Explore MatrixTable, collect field descriptions](#2)

   [2.1 Removing the star alleles](#2.1)
   
   [2.1.5 Filter out outliers based on PCA](#2.1.5)
   
   [2.2 Creating a mt_p with patients only and mt_c with non-patients](#2.2)


## [3. Explore Clinvar pathogenic variants](#3)

[3.1 Filter out pathogenic variants with Gnomad AF > 0.001 and those that occur in controls](#3.1)

## [4. Filter for variants in genes associated with GTS ](#4)

[4.1 List of 260 genes enriched in basal ganglia](#4.1)

[4.2 Lists of genes associated with HPO phenotypes](#4.2)


## [5. Find compound homozygotes / double hets](#5)

## [6. Investigation of the X chromosome genotypes](#6)

[6.1 Filter for X associated rare variants](#6)

<a id='1'></a> 
## 1. Import hail, other libraries and data

always run this code to widen notebook:

In [1749]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# This line will hide code by default when the notebook is exported as HTML
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# Inject CSS that stretches the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [1]:
import hail as hl
# Initialise the Hail session used by the cells below.
hl.init() 

Running on Apache Spark version 2.4.1
SparkUI available at http://349d1de1bab4:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.26-2dcc3d963867
LOGGING: writing to /hail/hail-20191114-0831-0.2.26-2dcc3d963867.log


In [1750]:
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
hl.plot.output_notebook()

import glob
from functools import reduce

import numpy as np
import pandas as pd

In [10]:
# One-off conversion: parse the annotated VCF against GRCh38 and store it as a Hail
# MatrixTable at data/sample.mt (any existing copy is overwritten).
annotated_vcf = hl.import_vcf('data/annotated.test.vcf', reference_genome='GRCh38')
annotated_vcf.write('data/sample.mt', overwrite=True)

2019-11-12 15:44:48 Hail: INFO: Ordering unsorted dataset with network shuffle
2019-11-12 15:45:18 Hail: INFO: wrote matrix table with 726380 rows and 151 columns in 116 partitions to data/sample.mt


In [1425]:
# Load the MatrixTable written by the VCF import cell; `mt` stands for MatrixTable.
mt = hl.read_matrix_table('data/sample.mt') # mt stands for MatrixTable

<a id='1.1'></a>
## 1.1 import phenotype and pedigree data

In [1426]:
# Load the phenotype/pedigree sheet as a Hail Table keyed by sample 'ID'.
# impute=True asks Hail to infer column types; per the log every column is read as str.
pheno = hl.import_table('GTS-coded.csv', delimiter = ',', impute = True, key = 'ID')

2019-12-02 15:41:42 Hail: INFO: Reading table to impute column types
2019-12-02 15:41:42 Hail: INFO: Finished type imputation
  Loading column 'ID' as type 'str' (imputed)
  Loading column 'family' as type 'str' (imputed)
  Loading column 'sex' as type 'str' (imputed)
  Loading column 'kinship' as type 'str' (imputed)
  Loading column 'disease' as type 'str' (imputed)
  Loading column 'phenotype' as type 'str' (imputed)
  Loading column 'add_pheno' as type 'str' (imputed)
  Loading column 'heavy_tics' as type 'str' (imputed)


<a id='1.2'></a>

## 1.2 Annotate mt with phenotype and pedigree info

In [1427]:
# Attach the phenotype row matching each sample id (mt.s) as the column field `phenotypes`.
# NOTE(review): samples absent from GTS-coded.csv presumably get a missing `phenotypes`
# struct and then drop out of both the patient and control subsets — confirm coverage.
mt = mt.annotate_cols(phenotypes = pheno[mt.s])

<a id='2'></a> 
## 2. Explore MatrixTable, collect field descriptions

<a id='2.1'></a>

## 2.1 Removing the star alleles

### These are orphaned stars and shouldn't be here



In [1434]:
# Drop every variant row whose allele list contains the '*' allele (keep=False removes matches).
mt = mt.filter_rows(mt.alleles.contains('*'), keep = False)

<a id='2.1.5'></a>

### 2.1.5 Filter out outliers based on PCA

In [1435]:
# Samples excluded as outliers; listed in one place so the set is easy to extend.
pca_outliers = hl.literal({'WGS_139', 'WGS_D6816'})
mt = mt.filter_cols(pca_outliers.contains(mt.s), keep=False)

In [1436]:
# PCA on HWE-normalised genotypes of the samples kept above; the third return value
# (loadings) is not needed and is discarded. The log shows 10 components being computed.
eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)

2019-12-02 15:42:48 Hail: INFO: hwe_normalized_pca: running PCA using 42468 variants.
2019-12-02 15:42:50 Hail: INFO: pca: running PCA with 10 components...


In [1439]:
# Store each sample's PCA scores on the columns so they can be plotted and filtered on.
mt = mt.annotate_cols(scores = pcs[mt.s].scores)

In [1713]:
# Scatter of the first two principal components, coloured by family.
pc1, pc2 = mt.scores[0], mt.scores[1]
p = hl.plot.scatter(pc1, pc2, label=mt.phenotypes.family,
                    title='PCA', xlabel='PC1', ylabel='PC2')
show(p)

<a id='2.2'></a>

## 2.2 Creating a mt_p with patients only and mt_c with non-patients

In [1763]:
# Split the cohort on the `disease` phenotype: mt_p = patients ('YES'), mt_c = everyone else.
# NOTE(review): samples whose `disease` value is missing are dropped from BOTH subsets,
# because filter_cols removes columns where the predicate evaluates to missing.
mt_p = mt.filter_cols(mt.phenotypes.disease == 'YES', keep = True)
# Keep only variants carried (non-reference genotype) by at least one patient.
mt_p = mt_p.filter_rows(hl.agg.any(mt_p.GT.is_non_ref()))

mt_c = mt.filter_cols(mt.phenotypes.disease != 'YES', keep = True)
# Keep only variants carried by at least one control. Applying this filter once is
# enough: repeating it on the already-filtered table removes nothing further.
mt_c = mt_c.filter_rows(hl.agg.any(mt_c.GT.is_non_ref()))

<a id='3'></a> 

## 3. Explore Clinvar pathogenic variants

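Select ClinVar pathogenic / likely pathogenic variants, separately for patient samples (`patho_p`) and control samples (`patho_c`).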

In [1764]:
# ClinVar significance labels treated as (likely) pathogenic.
CLINVAR_PATHOGENIC_LABELS = ('pathogenic', 'pathogenic/likely_pathogenic', 'likely_pathogenic')


def _is_clinvar_pathogenic(dataset):
    """Return a row-level boolean expression: True when the aggregated ClinVar
    significance of a variant contains any label in CLINVAR_PATHOGENIC_LABELS.

    `dataset` is a MatrixTable with an `info.ISEQ_AGGREGATED_CLINVAR_SIGNIFICANCE`
    row field; the returned expression is bound to that same MatrixTable.

    NOTE(review): whether `.contains` is an exact element match or a substring match
    depends on whether the field is an array or a string — confirm the field type.
    """
    significance = dataset.info.ISEQ_AGGREGATED_CLINVAR_SIGNIFICANCE
    label_hits = (significance.contains(label) for label in CLINVAR_PATHOGENIC_LABELS)
    return reduce(lambda combined, hit: combined | hit, label_hits)


# The same predicate is applied to both cohorts so patients and controls are
# selected by identical criteria.
patho_p = mt_p.filter_rows(_is_clinvar_pathogenic(mt_p))
patho_c = mt_c.filter_rows(_is_clinvar_pathogenic(mt_c))

<a id='3.1'></a> 


### 3.1 Filter out pathogenic variants with gnomAD AF ≥ 0.001 and those that occur in controls


In [1765]:
# gnomAD v3 non-Finnish-European allele frequency, converted from the array-valued
# INFO field to a float via its comma-joined string form.
# NOTE(review): this conversion presumably assumes one value per row, and rows with a
# missing AF are removed by filter_rows — confirm that variants absent from gnomAD
# are not being lost here.
AF_nfe = hl.float64(hl.delimit(patho_p.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe))

# Rare (AF < 0.001) pathogenic patient variants that are not pathogenic hits in controls.
control_pathogenic_rows = patho_c.rows()
patho_p_rare = hl.variant_qc(
    patho_p.filter_rows(AF_nfe < 0.001).anti_join_rows(control_pathogenic_rows)
)

# Keep only the patients that carry at least one of the remaining variants.
carries_variant = hl.agg.any(patho_p_rare.GT.is_non_ref())
p = patho_p_rare.filter_cols(carries_variant)


# Entry-level fields exported for every non-reference genotype, in CSV column order.
fields = [p.s, p.locus, p.GT, p.rsid, p.alleles, p.phenotypes.family, p.phenotypes.sex, p.phenotypes.kinship,
          p.phenotypes.add_pheno, p.phenotypes.heavy_tics, p.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe,
          p.info.ISEQ_GENES_NAMES, p.info.ISEQ_CLINVAR_ALLELE_ID, p.info.ISEQ_CLINVAR_DISEASES,
          p.info.ISEQ_HPO_INHERITANCE, p.info.ISEQ_HPO_PHENOTYPES, p.info.ISEQ_HPO_DISEASES,
          p.info.ISEQ_AGGREGATED_CLINVAR_SIGNIFICANCE, p.info.ANN]
field_names = ['sample', 'locus', 'genotype',
               'rsid', 'alleles', 'family', 'sex', 'kinship', 'additional_pheno', 'heavy_tics',
               'GNOMAD_V3_AF_non_finn_eur', 'Gene', 'CLINVAR_ALLELE_ID', 'CLINVAR_DISEASES', 'HPO_INHERITANCE',
               'HPO_PHENOTYPES', 'HPO_DISEASES',
               'AGGREGATED_CLINVAR_SIGNIFICANCE', 'SnpEff']

# Collect all fields together as ONE struct per non-reference entry. Hail does not
# guarantee the element order of hl.agg.collect, so collecting each field in its own
# aggregation could pair values from different entries in the same DataFrame row.
# A single struct keeps every row intact and needs one pass instead of one per column.
entry_struct = hl.struct(**dict(zip(field_names, fields)))
non_ref_entries = p.aggregate_entries(
    hl.agg.filter(p.GT.is_non_ref(), hl.agg.collect(entry_struct)))

# Column name -> list of values, one list element per collected entry.
summary = {name: [entry[name] for entry in non_ref_entries] for name in field_names}

patho_vars_df = pd.DataFrame(summary)
patho_vars_df.to_csv('pathogenic_variants_patients_summary.csv')

<a id='4'></a> 

## 4. Filter for a list of genes related to GTS

In [1766]:
# Hand-made list of genes associated with Tourette syndrome (GTS).
# Each gene is listed once. Three entries are commonly used aliases rather than
# official gene symbols; the official symbol is listed next to each alias so the
# gene matches whichever naming the annotation uses.
# NOTE(review): confirm which naming ISEQ_GENES_NAMES uses and trim accordingly.
gene_list = [
    'PANK2', 'COL27A1', 'PDGFB', 'CELSR3', 'OPA1', 'FBN2', 'WWC1', 'NIPBL', 'FN1',
    'SLITRK1', 'SLITRK2', 'SLITRK3', 'SLITRK4', 'SLITRK5', 'SLITRK6',
    'HDC', 'OPRK1', 'PCDH10', 'NTSR2', 'CHD8', 'SCUBE1', 'PNKD', 'CNTNAP2', 'MOG',
    'DRD2', 'DRD3', 'DRD4', 'DRD5', 'DBH', 'HTR2A', 'TPH2',
    'DAT1', 'SLC6A3',     # dopamine transporter: alias, official symbol
    'EAAT1', 'SLC1A3',    # glutamate transporter: alias, official symbol
    'SAPAP3', 'DLGAP3',   # alias, official symbol
]

In [1767]:
# Patient variants annotated with at least one gene from the hand-made Tourette list.
tourette_genes = hl.literal(gene_list)
annotated_genes = mt_p.info.ISEQ_GENES_NAMES
in_tourette_gene = hl.any(lambda name: tourette_genes.contains(name), annotated_genes)
mt_f = mt_p.filter_rows(in_tourette_gene)

filter out common variants:

In [1768]:
# Keep variants whose gnomAD v3 non-Finnish-European allele frequency is below 0.001.
gnomad_af_field = mt_f.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe
AF_nfe = hl.float64(hl.delimit(gnomad_af_field))
is_rare = AF_nfe < 0.001
mt_f = mt_f.filter_rows(is_rare)

filter out variants that occur in controls:

In [1769]:
# Build the matching control set: variants in the same gene list ...
tourette_genes = hl.literal(gene_list)
control_in_gene_list = hl.any(lambda name: tourette_genes.contains(name), mt_c.info.ISEQ_GENES_NAMES)
mt_f_contr = mt_c.filter_rows(control_in_gene_list)

# ... restricted to the same allele-frequency cut-off as the patient set.
AF_nfe = hl.float64(hl.delimit(mt_f_contr.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe))
mt_f_contr = mt_f_contr.filter_rows(AF_nfe < 0.001)

# Drop from the patient set every variant that is also present in the control set.
control_variant_rows = mt_f_contr.rows()
mt_f = mt_f.anti_join_rows(control_variant_rows)

In [1770]:
# Number of remaining variants per annotated gene (a variant counts once for each gene it lists).
genes_per_variant = mt_f.info.ISEQ_GENES_NAMES
mt_f.aggregate_rows(hl.agg.explode(lambda gene: hl.agg.counter(gene), genes_per_variant))

{'SLITRK6': 1,
 'FBN2': 2,
 'SLITRK3': 2,
 'CHD8': 3,
 'NTSR2': 1,
 'MOG': 1,
 'SCUBE1': 3,
 'NIPBL': 3,
 'PCDH10': 1,
 'DRD3': 1,
 'COL27A1': 2,
 'FN1': 6,
 'DRD4': 1,
 'OPA1': 1}

In [1771]:
# Keep only the patients carrying at least one of the filtered variants.
p = mt_f.filter_cols(hl.agg.any(mt_f.GT.is_non_ref()))

# Entry-level fields exported for every non-reference genotype, in CSV column order.
fields = [p.s, p.locus, p.GT, p.rsid, p.alleles, p.phenotypes.family, p.phenotypes.sex, p.phenotypes.kinship,
          p.phenotypes.add_pheno, p.phenotypes.heavy_tics, p.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe,
          p.info.ISEQ_GENES_NAMES, p.info.ISEQ_CLINVAR_ALLELE_ID, p.info.ISEQ_CLINVAR_DISEASES,
          p.info.ISEQ_HPO_INHERITANCE, p.info.ISEQ_HPO_PHENOTYPES, p.info.ISEQ_HPO_DISEASES,
          p.info.ISEQ_AGGREGATED_CLINVAR_SIGNIFICANCE, p.info.ANN]
field_names = ['sample', 'locus', 'genotype',
               'rsid', 'alleles', 'family', 'sex', 'kinship', 'additional_pheno', 'heavy_tics',
               'GNOMAD_V3_AF_non_finn_eur', 'Gene', 'CLINVAR_ALLELE_ID', 'CLINVAR_DISEASES', 'HPO_INHERITANCE',
               'HPO_PHENOTYPES', 'HPO_DISEASES',
               'AGGREGATED_CLINVAR_SIGNIFICANCE', 'SnpEff']

# Collect all fields together as ONE struct per non-reference entry. Hail does not
# guarantee the element order of hl.agg.collect, so collecting each field in its own
# aggregation could pair values from different entries in the same DataFrame row.
# A single struct keeps every row intact and needs one pass instead of one per column.
entry_struct = hl.struct(**dict(zip(field_names, fields)))
non_ref_entries = p.aggregate_entries(
    hl.agg.filter(p.GT.is_non_ref(), hl.agg.collect(entry_struct)))

# Column name -> list of values, one list element per collected entry.
summary = {name: [entry[name] for entry in non_ref_entries] for name in field_names}

vars_df = pd.DataFrame(summary)

vars_df.to_csv('tourette_gene_list_variants_patients_summary.csv')

<a id='4.1'></a> 

### 4.1 List of 260 genes enriched in basal ganglia

https://www.proteinatlas.org/search/brain_category_rna%3Abasal+ganglia%3BRegion+enriched%2CGroup+enriched%2CRegion+enhanced+AND+sort_by%3Atissue+specific+score

In [1773]:
# Read the basal-ganglia gene list, one entry per line; the context manager closes
# the file, and blank lines are skipped so no empty gene name enters the list.
# NOTE(review): each whole line is used as a gene name — presumably the .tsv holds a
# single column without a header; confirm.
with open('brain_category_rna_basal.tsv') as gene_file:
    bg_genes = [line.rstrip() for line in gene_file if line.strip()]

# Patient variants annotated with at least one basal-ganglia gene.
# (Evaluate `mt_f.count()` as the last expression of a cell to display its size;
# a count in the middle of a cell is computed but never shown.)
basal_ganglia_genes = hl.literal(bg_genes)
mt_f = mt_p.filter_rows(hl.any(lambda x: basal_ganglia_genes.contains(x), mt_p.info.ISEQ_GENES_NAMES))

filter out common variants:

In [1774]:
# Stricter cut-off for the larger basal-ganglia list: keep gnomAD NFE AF below 0.0001.
gnomad_af_field = mt_f.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe
AF_nfe = hl.float64(hl.delimit(gnomad_af_field))
is_very_rare = AF_nfe < 0.0001
mt_f = mt_f.filter_rows(is_very_rare)

filter out variants that occur in controls:

In [1775]:
# Build the matching control set: variants in basal-ganglia genes ...
basal_ganglia_genes = hl.literal(bg_genes)
control_in_bg_gene = hl.any(lambda name: basal_ganglia_genes.contains(name), mt_c.info.ISEQ_GENES_NAMES)
mt_f_contr = mt_c.filter_rows(control_in_bg_gene)

# ... restricted to the same allele-frequency cut-off as the patient set.
AF_nfe = hl.float64(hl.delimit(mt_f_contr.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe))
mt_f_contr = mt_f_contr.filter_rows(AF_nfe < 0.0001)

# Drop from the patient set every variant that is also present in the control set.
control_variant_rows = mt_f_contr.rows()
mt_f = mt_f.anti_join_rows(control_variant_rows)

In [1776]:
# Keep only the patients carrying at least one of the filtered variants.
p = mt_f.filter_cols(hl.agg.any(mt_f.GT.is_non_ref()))

# Entry-level fields exported for every non-reference genotype, in CSV column order.
fields = [p.s, p.locus, p.GT, p.rsid, p.alleles, p.phenotypes.family, p.phenotypes.sex, p.phenotypes.kinship,
          p.phenotypes.add_pheno, p.phenotypes.heavy_tics, p.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe,
          p.info.ISEQ_GENES_NAMES, p.info.ISEQ_CLINVAR_ALLELE_ID, p.info.ISEQ_CLINVAR_DISEASES,
          p.info.ISEQ_HPO_INHERITANCE, p.info.ISEQ_HPO_PHENOTYPES, p.info.ISEQ_HPO_DISEASES,
          p.info.ISEQ_AGGREGATED_CLINVAR_SIGNIFICANCE, p.info.ANN]
field_names = ['sample', 'locus', 'genotype',
               'rsid', 'alleles', 'family', 'sex', 'kinship', 'additional_pheno', 'heavy_tics',
               'GNOMAD_V3_AF_non_finn_eur', 'Gene', 'CLINVAR_ALLELE_ID', 'CLINVAR_DISEASES', 'HPO_INHERITANCE',
               'HPO_PHENOTYPES', 'HPO_DISEASES',
               'AGGREGATED_CLINVAR_SIGNIFICANCE', 'SnpEff']

# Collect all fields together as ONE struct per non-reference entry. Hail does not
# guarantee the element order of hl.agg.collect, so collecting each field in its own
# aggregation could pair values from different entries in the same DataFrame row.
# A single struct keeps every row intact and needs one pass instead of one per column.
entry_struct = hl.struct(**dict(zip(field_names, fields)))
non_ref_entries = p.aggregate_entries(
    hl.agg.filter(p.GT.is_non_ref(), hl.agg.collect(entry_struct)))

# Column name -> list of values, one list element per collected entry.
summary = {name: [entry[name] for entry in non_ref_entries] for name in field_names}

vars_df = pd.DataFrame(summary)
# A bare expression in the middle of a cell is evaluated but not shown, so the
# descriptive statistics are displayed explicitly.
display(vars_df.describe())
vars_df.to_csv('basal_ganglia_genelist.csv')

<a id='4.2'></a> 

## 4.2 Lists of genes associated with HPO phenotypes

In [1751]:
# HPO gene-list CSVs, in sorted order so the list is deterministic across runs.
# Pure Python replaces the `!ls` shell magic: the cell stays valid Python, works on
# any platform, and a missing directory raises instead of yielding the shell's
# error message as if it were a file name.
file_list = sorted(glob.glob('./gts_gene_lists/*csv'))
if not file_list:
    raise FileNotFoundError("no gene-list CSV files found in ./gts_gene_lists/")

In [1728]:
# Show the first two gene-list files.
file_list[0:2]

['./gts_gene_lists/adhd.csv', './gts_gene_lists/agg_beh.csv']

In [1761]:
# Show the remaining gene-list files (everything after the first two).
file_list[2:]

['./gts_gene_lists/echolalia.csv',
 './gts_gene_lists/inv_mov.csv',
 './gts_gene_lists/motor_tics.csv',
 './gts_gene_lists/neuro_dev.csv',
 './gts_gene_lists/ocd_beh.csv',
 './gts_gene_lists/phonic_tics.csv',
 './gts_gene_lists/self_mut.csv',
 './gts_gene_lists/tics.csv']

In [1762]:
# Variants seen in patients but in no control. This is the same for every gene list,
# so it is built once, outside the loop.
mt_p_only = mt_p.anti_join_rows(mt_c.rows())
AF_nfe = hl.float64(hl.delimit(mt_p_only.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe))

# Every gene list is processed, so a fresh "Restart & Run All" writes one summary
# per file (iterating `file_list[2:]` would silently skip the first two lists).
for a_file in file_list:

    df = pd.read_csv(a_file)
    # Missing symbols are dropped so the list is purely strings.
    genes = df['GENE_SYMBOL'].dropna().astype(str).tolist()

    # Long gene lists get a stricter allele-frequency cut-off than short ones.
    max_af = 0.0001 if len(genes) > 150 else 0.001
    mt_rare = mt_p_only.filter_rows(AF_nfe < max_af)

    # An explicit element type lets an empty gene list still build a valid literal.
    genes_literal = hl.literal(genes, hl.tarray(hl.tstr))
    mtx = mt_rare.filter_rows(hl.any(lambda x: genes_literal.contains(x), mt_rare.info.ISEQ_GENES_NAMES))
    # Keep only the patients carrying at least one of the selected variants.
    mtx = mtx.filter_cols(hl.agg.any(mtx.GT.is_non_ref()))

    fields = [mtx.s, mtx.locus, mtx.GT, mtx.GQ, mtx.rsid, mtx.alleles, mtx.phenotypes.family, mtx.phenotypes.sex, mtx.phenotypes.kinship,
              mtx.phenotypes.add_pheno, mtx.phenotypes.heavy_tics, mtx.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe,
              mtx.info.ISEQ_GENES_NAMES, mtx.info.ISEQ_CLINVAR_ALLELE_ID, mtx.info.ISEQ_CLINVAR_DISEASES,
              mtx.info.ISEQ_HPO_INHERITANCE, mtx.info.ISEQ_HPO_PHENOTYPES, mtx.info.ISEQ_HPO_DISEASES,
              mtx.info.ISEQ_AGGREGATED_CLINVAR_SIGNIFICANCE, mtx.info.ANN]
    field_names = ['sample', 'locus', 'genotype', 'GQ',
                   'rsid', 'alleles', 'family', 'sex', 'kinship', 'additional_pheno', 'heavy_tics',
                   'GNOMAD_V3_AF_non_finn_eur', 'Gene', 'CLINVAR_ALLELE_ID', 'CLINVAR_DISEASES', 'HPO_INHERITANCE',
                   'HPO_PHENOTYPES', 'HPO_DISEASES',
                   'AGGREGATED_CLINVAR_SIGNIFICANCE', 'SnpEff']

    # Collect all fields together as ONE struct per non-reference entry. Hail does not
    # guarantee the element order of hl.agg.collect, so collecting each field in its own
    # aggregation could pair values from different entries in the same DataFrame row.
    entry_struct = hl.struct(**dict(zip(field_names, fields)))
    non_ref_entries = mtx.aggregate_entries(
        hl.agg.filter(mtx.GT.is_non_ref(), hl.agg.collect(entry_struct)))

    # Column name -> list of values, one list element per collected entry.
    summary = {name: [entry[name] for entry in non_ref_entries] for name in field_names}

    vars_df = pd.DataFrame(summary)
    # Output is named after the input file's base name, whatever the directory depth.
    vars_df.to_csv('sum_' + a_file.rsplit('/', 1)[-1])

<a id='5'></a>

## 5. Find compound homozygotes / double hets


In [None]:
#test_code
# Read the basal-ganglia gene list, one entry per line; the context manager closes the file.
with open('brain_category_rna_basal.tsv') as gene_file:  # this has changed location!
    bg_genes = [line.rstrip() for line in gene_file if line.strip()]

#hand-made gene list for Tourette associated genes (each gene listed once)
gene_list = ['PANK2', 'COL27A1', 'PDGFB', 'CELSR3', 'OPA1', 'FBN2', 'WWC1', 'NIPBL', 'FN1',
             'SLITRK1', 'SLITRK2', 'SLITRK3', 'SLITRK4', 'SLITRK5', 'SLITRK6', 'HDC', 'OPRK1',
             'PCDH10', 'NTSR2', 'CHD8', 'SCUBE1', 'PNKD', 'CNTNAP2', 'MOG', 'DRD2', 'DRD3',
             'DRD4', 'DRD5', 'DAT1', 'DBH', 'HTR2A', 'TPH2', 'EAAT1', 'SAPAP3']

# Genes hit by the rare pathogenic patient variants. The summary frame is named
# `patho_vars_df` and stores gene names in its 'Gene' column, one list per row,
# so the lists are flattened here.
# NOTE(review): patho_list is built but not yet added to many_genes — confirm intent.
patho_list = [gene for gene_names in patho_vars_df['Gene'].dropna() for gene in gene_names]

# Concatenate the gene symbols of every HPO gene-list file.
genes = []

for a_file in file_list:

    df = pd.read_csv(a_file)
    genes.extend(df['GENE_SYMBOL'])

many_genes = genes + gene_list + bg_genes

<a id='6'></a>

## 6. Investigation of the X chromosome genotypes

+ X chromosome + susceptibility to brain ischemia: TNFα, IL-6, IL-1β, IL-4 and IL-10
+ https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1003864

In [1574]:
# Size of the working MatrixTable as (variant rows, sample columns).
mt.count()

(47373, 149)

In [1576]:
# Restrict to variants located on contig chrX.
mt_X = mt.filter_rows(mt.locus.contig == "chrX") 

In [1578]:
# Split the chrX table by recorded sex: "F" = females, "M" = males.
recorded_sex = mt_X.phenotypes.sex
mt_X_f = mt_X.filter_cols(recorded_sex == "F")
mt_X_m = mt_X.filter_cols(recorded_sex == "M")

In [1580]:
# Partition the male chrX variants by whether ANY male has a heterozygous call:
#   mt_X_m_hets - at least one male is het at the site
#   mt_X_m_homs - no male is het at the site (the complement, not "some male is hom-alt")
# NOTE(review): male het calls are expected only in the pseudo-autosomal regions of
# chrX; elsewhere they presumably indicate genotyping artefacts — confirm handling.
any_male_het = hl.agg.any(mt_X_m.GT.is_het())
mt_X_m_hets = mt_X_m.filter_rows(any_male_het)
mt_X_m_homs = mt_X_m.filter_rows(any_male_het, keep=False)

In [1585]:
# (rows, columns) of the chrX sites where at least one male is heterozygous.
mt_X_m_hets.count()

2019-12-03 09:38:29 Hail: INFO: reading 4 of 116 data partitions


(255, 95)

In [1586]:
# (rows, columns) of the chrX sites where no male is heterozygous.
mt_X_m_homs.count()

2019-12-03 09:38:39 Hail: INFO: reading 4 of 116 data partitions


(761, 95)

In [1584]:
# Depth (DP) distributions on chrX for the four sample/site subsets, drawn as a 2-column grid.
dp_panels = [
    (mt_X_f.DP, 'DP Histogram for X females'),
    (mt_X_m.DP, 'DP Histogram for X males'),
    (mt_X_m_hets.DP, 'DP Histogram for X males het'),
    (mt_X_m_homs.DP, 'DP Histogram for X males homs'),
]
p1, p2, p3, p4 = [
    hl.plot.histogram(depth, range=(0,50), bins=20, title=panel_title, legend='DP')
    for depth, panel_title in dp_panels
]

show(gridplot([p1, p2, p3, p4], ncols=2, plot_width=500, plot_height=500))

2019-12-03 09:27:43 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 09:27:44 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 09:27:44 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 09:27:45 Hail: INFO: reading 4 of 116 data partitions


In [1600]:
# Genotype-quality (GQ) distributions for the chrX subsets plus the current `mt_f`.
# NOTE(review): `mt_f` is whatever the last gene-list filter cell left behind — confirm
# which filtered set this panel is meant to show.
gq_panels = [
    (mt_X_f.GQ, 'GQ Histogram for X females'),
    (mt_X_m.GQ, 'GQ Histogram for X males'),
    (mt_X_m_hets.GQ, 'GQ Histogram for X males het'),
    (mt_X_m_homs.GQ, 'GQ Histogram for X males homs'),
    (mt_f.GQ, 'GQ Histogram for filtered variants'),
]
p1, p2, p3, p4, p5 = [
    hl.plot.histogram(quality, range=(0,99), bins=15, title=panel_title, legend='GQ')
    for quality, panel_title in gq_panels
]

show(gridplot([p1, p2, p3, p4, p5], ncols=2, plot_width=500, plot_height=500))

2019-12-03 11:00:49 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:00:50 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:00:51 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:00:51 Hail: INFO: reading 4 of 116 data partitions


In [1528]:
# chrX variants of the patient and control subsets, selected with the same contig test.
mt_p_X, mt_c_X = (
    cohort.filter_rows(cohort.locus.contig == "chrX")
    for cohort in (mt_p, mt_c)
)

In [1529]:
# Keep patient chrX variants with gnomAD NFE allele frequency below 0.0001.
patient_af_field = mt_p_X.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe
AF_nfe = hl.float64(hl.delimit(patient_af_field))
is_very_rare = AF_nfe < 0.0001
mt_p_X = mt_p_X.filter_rows(is_very_rare)

In [1530]:
# Apply the same 0.0001 allele-frequency cut-off to the control chrX variants.
control_af_field = mt_c_X.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe
AF_nfe = hl.float64(hl.delimit(control_af_field))
is_very_rare = AF_nfe < 0.0001
mt_c_X = mt_c_X.filter_rows(is_very_rare)

In [1531]:
# (rows, columns) of the rare chrX variants found in controls.
mt_c_X.count()

2019-12-02 16:03:56 Hail: INFO: reading 4 of 116 data partitions


(97, 47)

filter out patients without any variants:

In [1534]:
# Drop patients that carry none of the rare chrX variants.
carries_rare_x_variant = hl.agg.any(mt_p_X.GT.is_non_ref())
mt_p_X = mt_p_X.filter_cols(carries_rare_x_variant)

In [1535]:
# (rows, columns) after removing patients without a rare chrX variant.
mt_p_X.count() #all have at least one

2019-12-02 16:04:40 Hail: INFO: reading 4 of 116 data partitions
2019-12-02 16:04:41 Hail: INFO: reading 4 of 116 data partitions


(130, 84)

filter out variants that occur in controls:

In [1536]:
# Remove every patient chrX variant that also appears among the rare control variants.
control_x_rows = mt_c_X.rows()
mt_p_X = mt_p_X.anti_join_rows(control_x_rows)

In [1539]:
# Number of patient-only rare chrX variants per annotated gene.
x_genes_per_variant = mt_p_X.info.ISEQ_GENES_NAMES
mt_p_X.aggregate_rows(hl.agg.explode(lambda gene: hl.agg.counter(gene), x_genes_per_variant))

2019-12-02 16:07:13 Hail: INFO: reading 4 of 116 data partitions
2019-12-02 16:07:13 Hail: INFO: reading 4 of 116 data partitions


{'KLHL34': 1,
 'VCX2': 7,
 'ARL13A': 1,
 'SHOX': 1,
 'CT55': 1,
 'ZNF185': 2,
 'GABRA3': 1,
 'TIMP1': 1,
 'PHF8': 1,
 'CAPN6': 1,
 'MED12': 1,
 'TENM1': 1,
 'KIAA1210': 1,
 'CXorf23': 1,
 'GAGE2A': 1,
 'PFKFB1': 1,
 'HCFC1': 1,
 'IL13RA2': 1,
 'PNCK': 2,
 'BGN': 1,
 'G6PD': 1,
 'FOXO4': 1,
 'SHROOM2': 2,
 'MAGIX': 1,
 'IQSEC2': 1,
 'PPP2R3B': 1,
 'TLR8': 2,
 'H2BFM': 1,
 'TSPYL2': 1,
 'GDI1': 1,
 'RP11-402P6.15': 4,
 'ZMAT1': 1,
 'ZXDA': 1,
 'SSX1': 1,
 'SLC25A5': 1,
 'MAGEC1': 1,
 'SLC25A53': 1,
 'FAM9A': 1,
 'ZIC3': 1,
 'RGN': 1,
 'GYG2': 1,
 'AMER1': 1,
 'GRIA3': 1,
 'MTMR1': 1,
 'ARHGAP6': 1,
 'ZNF41': 2,
 'ZXDB': 1,
 'RBMXL3': 3,
 'NXT2': 1,
 'BCOR': 1,
 'MAGEA8': 1,
 'RP3-358H7.1': 1,
 'USP26': 1,
 'FOXP3': 1,
 'ZMYM3': 1,
 'WASH6P': 1,
 'GABRE': 1}

In [1613]:
# Patients that still carry a variant after the control variants were removed.
p = mt_p_X.filter_cols(hl.agg.any(mt_p_X.GT.is_non_ref()))
summary = {}

# (CSV column name, Hail expression) pairs, kept side by side so the two lists
# derived from them cannot drift out of step.
export_columns = [
    ('sample', p.s),
    ('locus', p.locus),
    ('genotype', p.GT),
    ('GQ', p.GQ),
    ('DP', p.DP),
    ('rsid', p.rsid),
    ('alleles', p.alleles),
    ('family', p.phenotypes.family),
    ('sex', p.phenotypes.sex),
    ('kinship', p.phenotypes.kinship),
    ('additional_pheno', p.phenotypes.add_pheno),
    ('heavy_tics', p.phenotypes.heavy_tics),
    ('GNOMAD_V3_AF_non_finn_eur', p.info.ISEQ_GNOMAD_GENOMES_V3_AF_nfe),
    ('Gene', p.info.ISEQ_GENES_NAMES),
    ('CLINVAR_ALLELE_ID', p.info.ISEQ_CLINVAR_ALLELE_ID),
    ('CLINVAR_DISEASES', p.info.ISEQ_CLINVAR_DISEASES),
    ('HPO_INHERITANCE', p.info.ISEQ_HPO_INHERITANCE),
    ('HPO_PHENOTYPES', p.info.ISEQ_HPO_PHENOTYPES),
    ('HPO_DISEASES', p.info.ISEQ_HPO_DISEASES),
    ('AGGREGATED_CLINVAR_SIGNIFICANCE', p.info.ISEQ_AGGREGATED_CLINVAR_SIGNIFICANCE),
    ('SnpEff', p.info.ANN),
]
fields = [expression for _, expression in export_columns]
field_names = [column_name for column_name, _ in export_columns]

In [1614]:
# Collect all fields together as ONE struct per non-reference entry. Hail does not
# guarantee the element order of hl.agg.collect, so collecting each field in its own
# aggregation could pair values from different entries in the same DataFrame row.
# A single struct keeps every row intact and needs one pass instead of one per column.
entry_struct = hl.struct(**dict(zip(field_names, fields)))
non_ref_entries = p.aggregate_entries(
    hl.agg.filter(p.GT.is_non_ref(), hl.agg.collect(entry_struct)))

# Fill `summary` with column name -> list of values, one element per collected entry.
for each_name in field_names:
    summary[each_name] = [entry[each_name] for entry in non_ref_entries]

2019-12-03 11:36:21 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:21 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:22 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:22 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:22 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:23 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:24 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:24 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:24 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:25 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:26 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:27 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:27 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:27 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:27 Hail: INFO: reading 4 of 116 data partitions
2019-12-03 11:36:29 Hail:

In [1615]:
# Turn the column-name -> values dict into a DataFrame (one row per non-reference genotype).
vars_df = pd.DataFrame(summary)

In [1616]:
# Display the rare chrX variant table.
# NOTE(review): the rendered output contains sample identifiers — confirm it may be shared.
vars_df

Unnamed: 0,sample,locus,genotype,GQ,DP,rsid,alleles,family,sex,kinship,...,heavy_tics,GNOMAD_V3_AF_non_finn_eur,Gene,CLINVAR_ALLELE_ID,CLINVAR_DISEASES,HPO_INHERITANCE,HPO_PHENOTYPES,HPO_DISEASES,AGGREGATED_CLINVAR_SIGNIFICANCE,SnpEff
0,WGS_7168,chrX:347315,0/1,99,50,,"[G, A]",.,M,.,...,YES,[0.0],[PPP2R3B],,,,,,,[A|structural_interaction_variant|HIGH|PPP2R3B...
1,S_7288,chrX:630950,0/1,99,33,,"[A, T]",C,F,father_sister,...,,[3.09866e-05],[SHOX],,[Langer_mesomelic_dysplasia_syndrome^Leri_Weil...,[Autosomal_dominant_inheritance^Autosomal_rece...,[Abnormal_metatarsal_morphology^Abnormality_of...,[#249700_langer_mesomelic_dysplasia^dyschondro...,,[T|missense_variant|MODERATE|SHOX|ENSG00000185...
2,WGS_85a,chrX:2843294,0/1,99,23,,"[G, C]",L,F,P,...,NO,[3.97212e-05],[GYG2],,,,,,,[C|missense_variant|MODERATE|GYG2|ENSG00000056...
3,WGS_85c,chrX:2843294,1/1,57,19,,"[G, C]",L,M,father,...,NO,[3.97212e-05],[GYG2],,,,,,,[C|missense_variant|MODERATE|GYG2|ENSG00000056...
4,S_7288,chrX:8170141,0/1,99,12,rs41305169,"[A, AGTGGTTCCTCCACCTGGCTCTCCTGACTCG]",C,F,father_sister,...,,[0.0],[VCX2],,,,,,,[AGTGGTTCCTCCACCTGGCTCTCCTGACTCG|conservative_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,WGS_122,chrX:153672013,0/1,99,46,rs375555808,"[G, A]",.,F,.,...,YES,[9.92911e-05],[PNCK],,,,,,,[A|missense_variant|MODERATE|PNCK|ENSG00000130...
97,WGS_147c,chrX:153954374,1/1,60,20,,"[G, A]",F,M,father,...,NO,[3.99944e-05],[HCFC1],,[Disorders_of_Intracellular_Cobalamin_Metaboli...,[X-linked_recessive_inheritance],[2-3_toe_syndactyly^Absence_seizure^Athetosis^...,[methylmalonic_acidemia_and_homocysteinemia_cb...,,[A|missense_variant|MODERATE|HCFC1|ENSG0000017...
98,WGS_37a,chrX:154441678,0/1,99,7,,"[G, A]",K,M,P,...,NO,[0.0],[GDI1],,[X-Linked_Mental_Retardation_41],[X-linked_dominant_inheritance^X-linked_inheri...,[2-3_toe_syndactyly^Absence_seizure^Attention_...,[mental_retardation_x-linked_41^x-linked_non-s...,,[A|missense_variant|MODERATE|GDI1|ENSG00000203...
99,S_7146,chrX:154534367,0/1,99,53,,"[C, T]",E,M,P,...,,[0.0],[G6PD],,[Anemia_nonspherocytic_hemolytic_due_to_G6PD_d...,[X-linked_dominant_inheritance^X-linked_recess...,[Abdominal_pain^Anisocytosis^Fava_bean-induced...,[anemia_nonspherocytic_hemolytic_due_to_g6pd_d...,,[T|structural_interaction_variant|HIGH|G6PD|EN...


In [1617]:
# Summary statistics of the numeric columns (GQ and DP in the output shown).
vars_df.describe()

Unnamed: 0,GQ,DP
count,101.0,101.0
mean,71.465347,21.712871
std,29.491546,27.592512
min,6.0,2.0
25%,45.0,9.0
50%,78.0,14.0
75%,99.0,22.0
max,99.0,154.0


In [1619]:
# Save the patient-only rare chrX variant table.
vars_df.to_csv('rare_X_patients_only.csv')