In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# This line will hide code by default when the notebook is exported as HTML
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

display(HTML("<style>.container { width:100% !important; }</style>"))

import hail as hl
hl.init()

Running on Apache Spark version 2.4.1
SparkUI available at http://633fc3f91d0f:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.29-cf730c8fc8f6
LOGGING: writing to /hail/large_vcf_analysis/hail-20200315-1435-0.2.29-cf730c8fc8f6.log


In [2]:
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
hl.plot.output_notebook()

import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain

In [61]:
def remove_sex_chrom(mtx):
    mtx = mtx.filter_rows((mtx.locus.contig != "chrX") & (mtx.locus.contig != "chrY"))
    return(mtx)


def prep_to_compare(mtx, group1, group2):
    groups = ['heavy_tics', 'gts_or_tics', 'controls', 'gnomads']
    assert (group1 in groups) & (group2 in groups) , 'groups must be: heavy_tics, gts_or_tics, controls or gnomads'
    my_groups = [group1, group2]
    if 'gnomads' in my_groups :
        if 'heavy_tics' in my_groups :
            mtx = mtx.filter_cols((mtx.phenotypes.heavy_tics == "YES") | (mtx.is_gnomad == True))
            mtx = mtx.annotate_cols(comparison = hl.cond(mtx.s.contains('gnmd'), False, mtx.phenotypes.heavy_tics == 'YES'))
        elif 'gts_or_tics' in my_groups:
            mtx = mtx.filter_cols((mtx.phenotypes.disease == "YES") | (mtx.is_gnomad == True))
            mtx = mtx.annotate_cols(comparison = hl.cond(mtx.s.contains('gnmd'), False, mtx.phenotypes.disease == "YES"))
        elif 'controls' in my_groups :
            mtx = mtx.filter_cols((mtx.phenotypes.disease == "NO") | (mtx.is_gnomad == True))
            mtx = mtx.annotate_cols(comparison = hl.cond(mtx.s.contains('gnmd'), False, mtx.phenotypes.disease == "NO"))
        else:
            print('this comparison is not supported')
        
    if ('controls' in my_groups) & ('gnomads' not in my_groups):
        if 'heavy_tics' in my_groups :
            mtx = mtx.filter_cols((mtx.phenotypes.heavy_tics == "YES") | (mtx.phenotypes.disease == "NO") | (mtx.is_gnomad == True))
            mtx = mtx.annotate_cols(comparison = hl.cond(mtx.s.contains('gnmd'), False, mtx.phenotypes.heavy_tics == "YES"))
        if 'gts_or_tics' in my_groups :
            mtx = mtx.filter_cols((mtx.phenotypes.disease == "YES") | (mtx.phenotypes.disease == "NO") | (mtx.is_gnomad == True))
            mtx = mtx.annotate_cols(comparison = hl.cond(mtx.s.contains('gnmd'), False, mtx.phenotypes.disease == "YES"))
        else:
            print('this comparison is not supported')
        
    return(mtx)                                  


def remove_related(mtx, mtx_subset):
    pc_rel = hl.pc_relate(mtx_subset.GT, 0.001, k=2, statistics='kin')
    pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
    related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, keep=False)
    
    related_samples_to_remove = related_samples_to_remove.annotate(s = related_samples_to_remove.node.s)
    related_samples_to_remove = related_samples_to_remove.key_by('s')
    
    mtx = mtx.key_cols_by()
    mtx = mtx.filter_cols(hl.is_defined(related_samples_to_remove[mtx.s]), keep=False)
    mtx = mtx.key_cols_by(mtx.s)
    
    mtx_subset = mtx_subset.filter_cols(hl.is_defined(related_samples_to_remove[mtx_subset.s]), keep=False)
    
    return(mtx, mtx_subset)


def run_pca(mtx, mtx_subset):
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mtx_subset.GT)
    mtx = mtx.annotate_cols(scores = pcs[mtx.s].scores)
    p = hl.plot.scatter(mtx.scores[0],
                    mtx.scores[1],
                    label=mtx.phenotypes.family,
                    title='PCA', xlabel='PC1', ylabel='PC2')

    p2 = hl.plot.scatter(mtx.scores[2],
                        mtx.scores[3],
                        label=mtx.phenotypes.family,
                        title='PCA', xlabel='PC3', ylabel='PC4')

    p3 = hl.plot.scatter(mtx.scores[4],
                        mtx.scores[5],
                        label=mtx.phenotypes.family,
                        title='PCA', xlabel='PC5', ylabel='PC6')
    show(p)
    show(p2)
    show(p3)
    
    return(mtx)



def run_skat_lin(mtx, gene_list, pcs): #does not have a family covariate
    
    mtx = mtx.filter_rows(hl.any(lambda x: hl.literal(gene_list).contains(x), mtx.nearest_genes_20kb))
    mtx = mtx.filter_rows(hl.agg.any(mtx.GT.is_non_ref()))
    mtx = mtx.explode_rows(mtx.nearest_genes_20kb)
    mtx = mtx.filter_rows(hl.literal(gene_list).contains(mtx.nearest_genes_20kb))
    
    scores = [mtx.scores[x] for x in list(range(pcs))]
                          
    
    skat_table = hl.skat(key_expr=mtx.nearest_genes_20kb,
                         weight_expr=mtx.cadd,
                         y=mtx.category,
                         x=mtx.GT.n_alt_alleles(),
                         covariates=[1] + scores,
                         max_size = 2200)
    
    genes_result = skat_table.filter(skat_table.p_value < 0.05/len(gene_list)).id.collect() 

    skat_table.filter(skat_table.p_value < 0.002).show(20)

    skat_table = skat_table.annotate(label = hl.literal(genes).contains(skat_table.id))

    qq_plot = hl.plot.qq(skat_table.p_value,
                                         label = skat_table.label,
                                         n_divisions = len(gene_list))
    show(qq_plot)
    
    return(skat_table, genes_result, qq_plot)


def run_skat(mtx, gene_list, pcs): #does not have a family covariate
    
    mtx = mtx.filter_rows(hl.any(lambda x: hl.literal(gene_list).contains(x), mtx.nearest_genes_20kb))
    mtx = mtx.filter_rows(hl.agg.any(mtx.GT.is_non_ref()))
    mtx = mtx.explode_rows(mtx.nearest_genes_20kb)
    mtx = mtx.filter_rows(hl.literal(gene_list).contains(mtx.nearest_genes_20kb))
    
    scores = [mtx.scores[x] for x in list(range(pcs))]
                          
    
    skat_table = hl.skat(key_expr=mtx.nearest_genes_20kb,
                         weight_expr=mtx.cadd,
                         y=mtx.category,
                         x=mtx.GT.n_alt_alleles(),
                         covariates=[1] + scores,
                         max_size = 2200,
                        logistic = True)
    
    genes_result = skat_table.filter(skat_table.p_value < 0.05/len(gene_list)).id.collect() 

    skat_table.filter(skat_table.p_value < 0.002).show(20)

    skat_table = skat_table.annotate(label = hl.literal(genes).contains(skat_table.id))

    qq_plot = hl.plot.qq(skat_table.p_value,
                                         label = skat_table.label,
                                         n_divisions = len(gene_list))
    show(qq_plot)
    
    return(skat_table, genes_result, qq_plot)



def full_skat_lin(mtx, mtx_subset, gene_list, pcs):
    
    mtx = remove_sex_chrom(mtx)
    mtx_subset = remove_sex_chrom(mtx_subset)
    
    mtx, mtx_subset = remove_related(mtx, mtx_subset)
    
    mtx = run_pca(mtx, mtx_subset) #this matrix will be returned, so I can do SKAT with other list and parameters
    
    skat_table, genes_result, qq_plot = run_skat_lin(mtx, gene_list, pcs)
    
    return(mtx, skat_table, genes_result, qq_plot)


def full_skat_log(mtx, mtx_subset, gene_list, pcs):
    
    mtx = remove_sex_chrom(mtx)
    mtx_subset = remove_sex_chrom(mtx_subset)
    
    mtx, mtx_subset = remove_related(mtx, mtx_subset)
    
    mtx = run_pca(mtx, mtx_subset) #this matrix will be returned, so I can do SKAT with other list and parameters
    
    skat_table, genes_result, qq_plot = run_skat(mtx, gene_list, pcs)
    
    return(mtx, skat_table, genes_result, qq_plot)

In [28]:
mt.count()

2020-03-15 17:11:41 Hail: INFO: Coerced sorted dataset
2020-03-15 17:11:41 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-15 17:11:41 Hail: INFO: Coerced sorted dataset


(6000013, 98)

# import large and smaller matrix table

In [45]:
mt = hl.read_matrix_table('/hail/large_vcf_filtering_annotation/step6_output.mt')
mt = mt.filter_rows(mt.cadd > 0) # this is so I don't do too many tests

mt_subset = hl.read_matrix_table('mt_subset_for_tests.mt')
mt_subset = mt_subset.filter_rows(mt_subset.cadd > 0)

In [46]:
mt = mt.filter_cols(mt.s.contains('gnmd'), keep = False)

In [47]:
mt_subset = mt_subset.filter_cols(mt_subset.s.contains('gnmd'), keep = False)

In [48]:
mt.count()

(6148861, 151)

In [49]:
mt_subset.count()

(6129, 151)

## prepare the gene lists

### get a list of genes enriched in the brain (genes_neuro), enriched in T-cell and B-cells (genes_immuno) and random non-neuro and non-immuno (random)

In [9]:
genes_neuro = list(set([line.rstrip('\n') for line in open('brain_enriched.txt')]))
genes_ecm = list(set([line.rstrip('\n') for line in open('extracellular-matrix.txt')]))
genes_neurodev = list(set([line.rstrip('\n') for line in open('hpo-neuro-dev.txt')]))


allgenes = hl.import_table('human-genes-with-GO-and-symbols') # these are downloaded from biomart. Human genes with some GO terms and defined symbols (to avoid trash:)
allgenes = allgenes.select('UniProtKB Gene Name symbol')

allgenes = allgenes.filter(allgenes['UniProtKB Gene Name symbol'] != "")
genes_random = allgenes.sample(0.05)['UniProtKB Gene Name symbol'].collect()

genes_random = [x for x in genes_random if x not in genes_ecm]
genes_random = [x for x in genes_random if x not in genes_neuro]
genes_random = [x for x in genes_random if x not in genes_neurodev]


2020-03-15 14:49:57 Hail: INFO: Reading table with no type imputation
  Loading column 'Gene stable ID' as type 'str' (type not specified)
  Loading column 'UniProtKB Gene Name symbol' as type 'str' (type not specified)



In [10]:
genes_all = allgenes['UniProtKB Gene Name symbol'].collect()

A list of genes associated with many psychiatric disorders (from https://www.sciencedirect.com/science/article/pii/S0092867419312760?dgcid=author) and GTS association > 0.85

In [11]:
genes = ['DCC', 'RBFOX', 'SLC30A9', 'DCAF4L1', 'SORCS3', 'KCNQ5', 'KCNQ-IT1', 'APOPT1', 'C14orf2', 'NAA11', 'NEGR1',
         'CHADL', 'L3MBTL2', 'SOX5']

In [12]:
genes_scores = genes_all + genes

In [14]:
len(genes_scores)

22859

# give scores

In [50]:
pheno = hl.import_table('/hail/GTS-coded.csv', delimiter = ',', impute = True, key = 'ID')

mt = mt.annotate_cols(phenotypes = pheno[mt.s])
mt_subset = mt_subset.annotate_cols(phenotypes = pheno[mt_subset.s])

2020-03-15 17:19:08 Hail: INFO: Reading table to impute column types
2020-03-15 17:19:08 Hail: INFO: Finished type imputation
  Loading column 'ID' as type 'str' (imputed)
  Loading column 'family' as type 'str' (imputed)
  Loading column 'sex' as type 'str' (imputed)
  Loading column 'kinship' as type 'str' (imputed)
  Loading column 'disease' as type 'str' (imputed)
  Loading column 'phenotype' as type 'str' (imputed)
  Loading column 'add_pheno' as type 'str' (imputed)
  Loading column 'heavy_tics' as type 'str' (imputed)


In [51]:
mt = mt.annotate_cols(category = hl.cond(mt.s.contains('gnmd'), hl.float(0), hl.cond(mt.phenotypes.disease == 'NO', hl.float(5), 
                                                                                     hl.cond(mt.phenotypes.phenotype == 'tics', hl.float(25), hl.cond(
                                                                                         mt.phenotypes.heavy_tics == 'YES', hl.float(45), hl.cond(
                                                                                             mt.phenotypes.phenotype == 'GTS', hl.float(50), hl.float(1000)))))))

In [52]:
mt_subset = mt_subset.annotate_cols(category = hl.cond(mt_subset.s.contains('gnmd'), hl.float(0), hl.cond(mt_subset.phenotypes.disease == 'NO', hl.float(5), 
                                                                                     hl.cond(mt_subset.phenotypes.phenotype == 'tics', hl.float(25), hl.cond(
                                                                                         mt_subset.phenotypes.heavy_tics == 'YES', hl.float(45), hl.cond(
                                                                                             mt_subset.phenotypes.phenotype == 'GTS', hl.float(50), hl.float(1000)))))))

In [53]:
mt = mt.filter_cols((hl.is_defined(mt.category)) & (mt.category < 100))

In [54]:
groups = mt.aggregate_cols(hl.agg.counter(mt.category))

In [55]:
groups

{5.0: 43, 25.0: 30, 45.0: 40, 50.0: 35}

In [56]:
mt_subset = mt_subset.filter_cols((hl.is_defined(mt_subset.category)) & (mt_subset.category < 100))

In [57]:
mt_subset.count()

(6129, 148)

In [58]:
mt.count()

(6148861, 148)

## Firstly SKAT for all genes, no gnomads, related individuals removed

In [27]:
mt, skat_table_random, genes_result_random, qq_plot_random = full_skat_lin(mt, mt_subset, genes_scores, 2)

2020-03-15 14:55:31 Hail: INFO: hwe_normalized_pca: running PCA using 5876 variants.
2020-03-15 14:55:33 Hail: INFO: pca: running PCA with 2 components...
2020-03-15 14:57:10 Hail: INFO: Wrote all 2 blocks of 5997 x 148 matrix with block size 4096.
2020-03-15 14:57:11 Hail: INFO: wrote matrix with 148 rows and 148 columns as 1 block of size 4096 to file:/tmp/hail.ae6kphWwkraB/uPlNI0ys2s.bm
2020-03-15 14:57:11 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-15 14:57:11 Hail: INFO: wrote table with 119 rows in 1 partition to file:/tmp/hail.ae6kphWwkraB/1qb35DWhhz
2020-03-15 14:57:11 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2020-03-15 14:57:12 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-15 14:57:13 Hail: INFO: hwe_normalized_pca: running PCA using 5730 variants.
2020-03-15 14:57:13 Hail: INFO: Ordering unsorted dataset with network shuffle

2020-03-15 14:57:22 Hail: INFO: Coerced sorted dataset
2020-03-15 14:57:22 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-15 14:57:22 Hail: INFO: Coerced sorted dataset
2020-03-15 15:02:41 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-15 15:06:43 Hail: INFO: Coerced sorted dataset
2020-03-15 15:06:43 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-15 15:06:43 Hail: INFO: Coerced sorted dataset
2020-03-15 15:12:03 Hail: INFO: Ordering unsorted dataset with network shuffle


id,size,q_stat,p_value,fault
str,int32,float64,float64,int32
"""ATP5ME""",145,6710.0,0.000995,0
"""BPIFB6""",136,25300.0,0.00154,0
"""CDKN1B""",92,30600.0,0.000912,0
"""CNR1""",193,30500.0,0.000348,0
"""GLCE""",206,52500.0,0.000814,0
"""GPR180""",120,30700.0,0.000355,0
"""KCTD14""",104,26600.0,0.0011,0
"""KIAA1143""",76,18300.0,0.000353,0
"""KRT18""",97,20300.0,0.00195,0
"""LRP6""",319,40100.0,0.00145,0


2020-03-15 15:16:06 Hail: INFO: Coerced sorted dataset
2020-03-15 15:16:06 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-15 15:16:06 Hail: INFO: Coerced sorted dataset
2020-03-15 15:21:23 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-15 15:25:23 Hail: INFO: Ordering unsorted dataset with network shuffle


In [59]:
mt2, skat_table_random2, genes_result_random2, qq_plot_random2 = full_skat_lin(mt, mt_subset, genes_scores, 2) #without removal of related individuals

2020-03-15 17:19:26 Hail: INFO: hwe_normalized_pca: running PCA using 5876 variants.
2020-03-15 17:19:28 Hail: INFO: pca: running PCA with 2 components...
2020-03-15 17:20:50 Hail: INFO: Wrote all 2 blocks of 5997 x 148 matrix with block size 4096.
2020-03-15 17:20:51 Hail: INFO: wrote matrix with 148 rows and 148 columns as 1 block of size 4096 to file:/tmp/hail.ae6kphWwkraB/OBEaGiPAoG.bm
2020-03-15 17:20:51 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-15 17:20:51 Hail: INFO: wrote table with 0 rows in 1 partition to file:/tmp/hail.ae6kphWwkraB/Ou8xYkYawh
2020-03-15 17:20:53 Hail: INFO: hwe_normalized_pca: running PCA using 5876 variants.
2020-03-15 17:20:55 Hail: INFO: pca: running PCA with 10 components...
2020-03-15 17:21:32 Hail: INFO: Coerced sorted dataset
2020-03-15 17:21:32 Hail: INFO: Coerced sorted dataset
2020-03-15 17:21:33 Hail: INFO: Coerced sorted dataset


2020-03-15 17:21:34 Hail: INFO: Coerced sorted dataset
2020-03-15 17:28:09 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-15 17:33:15 Hail: INFO: Coerced sorted dataset
2020-03-15 17:39:48 Hail: INFO: Ordering unsorted dataset with network shuffle


id,size,q_stat,p_value,fault
str,int32,float64,float64,int32
"""ATP5ME""",151,11200.0,0.00036,0
"""KRT18""",97,35800.0,0.000369,0
"""MICU2""",288,122000.0,0.000404,0
"""MYL5""",162,12100.0,0.00128,0
"""PDE6B""",282,16500.0,0.00183,0
"""TRBV5-5""",881,31500.0,0.00129,0
"""TRBV5-6""",942,32600.0,0.000458,0
"""TRBV6-8""",1093,31500.0,0.00113,0
"""TRBV7-6""",977,36700.0,0.000295,0
"""TRBV7-7""",1041,31000.0,0.00096,0


2020-03-15 17:44:54 Hail: INFO: Coerced sorted dataset
2020-03-15 17:51:26 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-15 17:56:32 Hail: INFO: Ordering unsorted dataset with network shuffle


## SKAT (with gnomads included) for brain-enriched genes

In [68]:
genes_neuro = list(set([line.rstrip('\n') for line in open('brain_enriched.txt')]))

In [62]:
mt = hl.read_matrix_table('/hail/large_vcf_filtering_annotation/step6_output.mt')
mt = mt.filter_rows(mt.cadd > 0) # this is so I don't do too many tests

mt_subset = hl.read_matrix_table('mt_subset_for_tests.mt')
mt_subset = mt_subset.filter_rows(mt_subset.cadd > 0)

In [63]:
mt = mt.annotate_cols(category = hl.cond(mt.s.contains('gnmd'), False, (mt.phenotypes.disease == 'YES'))) #logistic

samples = mt.s.collect()

mt = mt.filter_cols(hl.array(samples[0:-111]).contains(mt.s)) # filter out excessive gnomads

mt_subset = mt_subset.filter_cols(hl.array(samples[0:-111]).contains(mt_subset.s))

mt = mt.filter_cols((mt.s.contains('gnmd')) | ((mt.phenotypes.family =='.') & (mt.phenotypes.heavy_tics =='YES')))

mt_subset = mt_subset.filter_cols((mt_subset.s.contains('gnmd')) | ((mt_subset.phenotypes.family =='.') & (mt_subset.phenotypes.heavy_tics =='YES')))

In [64]:
groups = mt.aggregate_cols(hl.agg.counter(mt.category))

groups

{False: 40, True: 38}

In [66]:
mt.count()

(6148861, 78)

In [67]:
mt_subset.count()

(6129, 78)

In [74]:
#reimport the matrixtable again

mt_test = hl.read_matrix_table('/hail/large_vcf_filtering_annotation/step6_output.mt')
mt_test = mt_test.filter_rows(mt_test.cadd > 0) # this is so I don't do too many tests

mt_test = mt_test.annotate_cols(category = hl.cond(mt_test.s.contains('gnmd'), False, (mt_test.phenotypes.disease == 'YES'))) #logistic

mt_test = mt_test.filter_cols(mt_test.phenotypes.family == '.', keep = False)

mt_test = mt_test.filter_cols((mt_test.phenotypes.disease == 'YES') | (mt_test.phenotypes.disease == 'NO'))

In [75]:
mt_test.count()

(6148861, 107)

In [69]:
mt, skat_table_log_neuro, genes_result_log_neuro, qq_plot_log_neuro = full_skat_log(mt, mt_subset, genes_neuro, 2)

2020-03-16 08:42:27 Hail: INFO: hwe_normalized_pca: running PCA using 4995 variants.
2020-03-16 08:42:47 Hail: INFO: pca: running PCA with 2 components...
2020-03-16 08:43:43 Hail: INFO: Wrote all 2 blocks of 5997 x 78 matrix with block size 4096.
2020-03-16 08:43:44 Hail: INFO: wrote matrix with 78 rows and 78 columns as 1 block of size 4096 to file:/tmp/hail.ae6kphWwkraB/V9TCF8qaA1.bm
2020-03-16 08:43:44 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-16 08:43:44 Hail: INFO: wrote table with 1 row in 1 partition to file:/tmp/hail.ae6kphWwkraB/vcKY3UaIGn
2020-03-16 08:43:45 Hail: INFO: Coerced sorted dataset
2020-03-16 08:43:51 Hail: INFO: hwe_normalized_pca: running PCA using 4961 variants.
2020-03-16 08:43:51 Hail: INFO: Coerced sorted dataset
2020-03-16 08:44:09 Hail: INFO: pca: running PCA with 10 components...
2020-03-16 08:44:43 Hail: INFO: Coerced sorted dataset
2020-03-16 08:44:43 Hail: INFO: Coerced sorted dataset
2020-03-16 08:44:43 Hail: INFO: Coerced sor

2020-03-16 08:44:52 Hail: INFO: Coerced sorted dataset
2020-03-16 08:44:52 Hail: INFO: Coerced sorted dataset
2020-03-16 08:44:52 Hail: INFO: Coerced sorted dataset
2020-03-16 08:48:38 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-16 08:48:41 Hail: INFO: Coerced sorted dataset
2020-03-16 08:48:41 Hail: INFO: Coerced sorted dataset
2020-03-16 08:48:41 Hail: INFO: Coerced sorted dataset
2020-03-16 08:52:18 Hail: INFO: Ordering unsorted dataset with network shuffle


id,size,q_stat,p_value,fault
str,int32,float64,float64,int32
"""FAM131B""",113,1960.0,0.000552,0
"""FSTL5""",2140,18600.0,0.000948,0
"""HTR6""",108,3550.0,0.000533,0
"""KCNK4""",78,11300.0,0.0,0
"""KLC1""",335,7070.0,0.000492,0


2020-03-16 08:52:20 Hail: INFO: Coerced sorted dataset
2020-03-16 08:52:20 Hail: INFO: Coerced sorted dataset
2020-03-16 08:52:20 Hail: INFO: Coerced sorted dataset
2020-03-16 08:56:00 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-16 08:56:01 Hail: INFO: Coerced sorted dataset
2020-03-16 08:56:01 Hail: INFO: Coerced dataset with out-of-order partitions.


In [70]:
skat_table_log_neuro.write('skat_table_log_neuro.ht')

2020-03-16 08:57:53 Hail: INFO: Coerced sorted dataset
2020-03-16 08:57:53 Hail: INFO: Coerced sorted dataset
2020-03-16 08:57:53 Hail: INFO: Coerced sorted dataset
2020-03-16 09:01:30 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-03-16 09:01:33 Hail: INFO: wrote table with 465 rows in 465 partitions to skat_table_log_neuro.ht


In [71]:
skat_log_neuro = hl.read_table('skat_table_log_neuro.ht')

In [72]:
skat_log_neuro.order_by('p_value').show(20) # based on the plot I will try to go with 1-5 genes

id,size,q_stat,p_value,fault,label
str,int32,float64,float64,int32,bool
"""KCNK4""",78,11300.0,0.0,0,False
"""KLC1""",335,7070.0,0.000492,0,False
"""HTR6""",108,3550.0,0.000533,0,False
"""FAM131B""",113,1960.0,0.000552,0,False
"""FSTL5""",2140,18600.0,0.000948,0,False
"""SOX2""",90,1780.0,0.0025,0,False
"""DISP2""",139,2700.0,0.00295,0,False
"""KIAA0319""",303,4700.0,0.00304,0,False
"""CHRM5""",273,3220.0,0.00318,0,False
"""MTURN""",192,2900.0,0.00381,0,False


In [73]:
len(genes_neuro)

488

In [78]:
controls = np.zeros((len([1,2,3,4,5]), len([3,6,8,9,10,11,12,13,14,15])))
gts = np.zeros((len([1,2,3,4,5]), len([3,6,8,9,10,11,12,13,14,15])))

variants_controls = np.zeros((len([1,2,3,4,5]), len([3,6,8,9,10,11,12,13,14,15])))
variants_gts =  np.zeros((len([1,2,3,4,5]), len([3,6,8,9,10,11,12,13,14,15])))

controls_test = np.zeros((len([1,2,3,4,5]), len([3,6,8,9,10,11,12,13,14,15])))
gts_test = np.zeros((len([1,2,3,4,5]), len([3,6,8,9,10,11,12,13,14,15])))


for rows, n in enumerate([1,2,3,4,5]):
    top_log = hl.array(skat_log_neuro.order_by('p_value').id.take(n))
    mt_skat_log_40 = mt.filter_rows(hl.any(lambda x: hl.literal(top_log).contains(x), mt.nearest_genes_20kb))
    mt_test_skat_log_40 = mt_test.filter_rows(hl.any(lambda x: hl.literal(top_log).contains(x), mt_test.nearest_genes_20kb))
    
    for cols, c in enumerate([3,6,8,9,10,11,12,13,14,15]):

        mt_skat_log_40 = mt_skat_log_40.filter_rows(mt_skat_log_40.cadd > c)
        skat_log_40_res = mt_skat_log_40.annotate_cols(non_refs = hl.agg.count_where(mt_skat_log_40.GT.is_non_ref())) #count variants per sample
        skat_log_40_sum = skat_log_40_res.cols()

        skat_log_40_sum = (skat_log_40_sum.group_by(skat_log_40_sum.category).aggregate(mean_non_refs = hl.agg.mean(skat_log_40_sum.non_refs)))

        results = skat_log_40_sum.mean_non_refs.collect()
        ctrl = results[0]

        variants_controls[rows, cols] = results[0]
        variants_gts[rows, cols] = results[1]


        test = skat_log_40_res.cols()
        test = test.annotate(test_result = (((hl.float(ctrl) - test.non_refs) < 0)))
        test = test.annotate(success = (test.test_result == test.category))
        test2 = test.filter(test.s.contains('gnmd'))
        test1 = test.filter(test.phenotypes.disease == 'YES')

        try:
            controls[rows, cols] = test2.aggregate(hl.agg.counter(test2.success))[1]
        except KeyError:
            controls[rows, cols] = 0

        try:
            gts[rows, cols] = test1.aggregate(hl.agg.counter(test1.success))[1]
        except KeyError:
            gts[rows, cols] = 0
                
        mt_test_skat_log_40 = mt_test_skat_log_40.filter_rows(mt_test_skat_log_40.cadd > c)
        skat_log_40_res = mt_test_skat_log_40.annotate_cols(non_refs = hl.agg.count_where(mt_test_skat_log_40.GT.is_non_ref())) #count variants per sample
        
        test3 = skat_log_40_res.cols()
        test3 = test3.annotate(test_result = (((hl.float(ctrl) - test3.non_refs) < 0)))
        test3 = test3.annotate(success = (test3.test_result == test3.category))
        test4 = test3.filter(test3.phenotypes.disease == 'NO')
        test5 = test3.filter(test3.phenotypes.disease == 'YES')
            
        try:
            controls_test[rows, cols] = test4.aggregate(hl.agg.counter(test4.success))[1]
        except KeyError:
            controls_test[rows, cols] = 0
            
        try:
            gts_test[rows, cols] = test5.aggregate(hl.agg.counter(test5.success))[1]
        except KeyError:
            gts_test[rows, cols] = 0

2020-03-16 09:21:03 Hail: INFO: Coerced sorted dataset
2020-03-16 09:21:03 Hail: INFO: Coerced sorted dataset
2020-03-16 09:21:03 Hail: INFO: Coerced sorted dataset
2020-03-16 09:22:32 Hail: INFO: Coerced sorted dataset
2020-03-16 09:22:32 Hail: INFO: Coerced sorted dataset
2020-03-16 09:22:32 Hail: INFO: Coerced sorted dataset
2020-03-16 09:22:32 Hail: INFO: Coerced sorted dataset
2020-03-16 09:22:33 Hail: INFO: Coerced sorted dataset
2020-03-16 09:22:33 Hail: INFO: Coerced sorted dataset
2020-03-16 09:22:33 Hail: INFO: Coerced sorted dataset
2020-03-16 09:24:00 Hail: INFO: Coerced sorted dataset
2020-03-16 09:24:00 Hail: INFO: Coerced sorted dataset
2020-03-16 09:24:00 Hail: INFO: Coerced sorted dataset
2020-03-16 09:24:01 Hail: INFO: Coerced sorted dataset
2020-03-16 09:24:01 Hail: INFO: Coerced sorted dataset
2020-03-16 09:24:01 Hail: INFO: Coerced sorted dataset
2020-03-16 09:25:27 Hail: INFO: Coerced sorted dataset
2020-03-16 09:25:27 Hail: INFO: Coerced sorted dataset
2020-03-16

2020-03-16 10:16:17 Hail: INFO: Coerced sorted dataset
2020-03-16 10:16:17 Hail: INFO: Coerced sorted dataset
2020-03-16 10:16:17 Hail: INFO: Coerced sorted dataset
2020-03-16 10:19:17 Hail: INFO: Coerced sorted dataset
2020-03-16 10:19:17 Hail: INFO: Coerced sorted dataset
2020-03-16 10:19:17 Hail: INFO: Coerced sorted dataset
2020-03-16 10:20:51 Hail: INFO: Coerced sorted dataset
2020-03-16 10:20:51 Hail: INFO: Coerced sorted dataset
2020-03-16 10:20:51 Hail: INFO: Coerced sorted dataset
2020-03-16 10:20:51 Hail: INFO: Coerced sorted dataset
2020-03-16 10:20:51 Hail: INFO: Coerced sorted dataset
2020-03-16 10:20:51 Hail: INFO: Coerced sorted dataset
2020-03-16 10:20:51 Hail: INFO: Coerced sorted dataset
2020-03-16 10:22:25 Hail: INFO: Coerced sorted dataset
2020-03-16 10:22:25 Hail: INFO: Coerced sorted dataset
2020-03-16 10:22:25 Hail: INFO: Coerced sorted dataset
2020-03-16 10:22:25 Hail: INFO: Coerced sorted dataset
2020-03-16 10:22:25 Hail: INFO: Coerced sorted dataset
2020-03-16

2020-03-16 11:13:19 Hail: INFO: Coerced sorted dataset
2020-03-16 11:13:19 Hail: INFO: Coerced sorted dataset
2020-03-16 11:13:19 Hail: INFO: Coerced sorted dataset
2020-03-16 11:14:46 Hail: INFO: Coerced sorted dataset
2020-03-16 11:14:46 Hail: INFO: Coerced sorted dataset
2020-03-16 11:14:46 Hail: INFO: Coerced sorted dataset
2020-03-16 11:17:35 Hail: INFO: Coerced sorted dataset
2020-03-16 11:17:35 Hail: INFO: Coerced sorted dataset
2020-03-16 11:17:35 Hail: INFO: Coerced sorted dataset
2020-03-16 11:19:00 Hail: INFO: Coerced sorted dataset
2020-03-16 11:19:00 Hail: INFO: Coerced sorted dataset
2020-03-16 11:19:00 Hail: INFO: Coerced sorted dataset
2020-03-16 11:19:01 Hail: INFO: Coerced sorted dataset
2020-03-16 11:19:01 Hail: INFO: Coerced sorted dataset
2020-03-16 11:19:01 Hail: INFO: Coerced sorted dataset
2020-03-16 11:19:01 Hail: INFO: Coerced sorted dataset
2020-03-16 11:20:25 Hail: INFO: Coerced sorted dataset
2020-03-16 11:20:25 Hail: INFO: Coerced sorted dataset
2020-03-16

KeyboardInterrupt: 

In [87]:
test1.count()

2020-03-16 11:56:30 Hail: INFO: Coerced sorted dataset
2020-03-16 11:56:30 Hail: INFO: Coerced sorted dataset
2020-03-16 11:56:30 Hail: INFO: Coerced sorted dataset


37

In [88]:
test2.count()

2020-03-16 11:56:38 Hail: INFO: Coerced sorted dataset
2020-03-16 11:56:38 Hail: INFO: Coerced sorted dataset
2020-03-16 11:56:38 Hail: INFO: Coerced sorted dataset


40

In [79]:
mt_test.filter_cols(mt_test.phenotypes.disease == 'YES').count()

2020-03-16 11:54:43 Hail: INFO: Coerced sorted dataset


(6148861, 64)

In [80]:
mt_test.filter_cols(mt_test.phenotypes.disease == 'NO').count()

2020-03-16 11:54:43 Hail: INFO: Coerced sorted dataset
2020-03-16 11:54:43 Hail: INFO: Coerced sorted dataset
2020-03-16 11:54:43 Hail: INFO: Coerced sorted dataset


(6148861, 43)

In [81]:
a = list((controls_test*100/43).flatten())
b = list((gts_test*100/64).flatten())

In [82]:
np.where(((controls_test*100/43) > 50) & ((gts_test*100/62) > 40))

(array([], dtype=int64), array([], dtype=int64))

In [85]:
(controls_test*100/43)

array([[16.27906977, 23.25581395, 20.93023256, 20.93023256, 23.25581395,
        23.25581395, 23.25581395, 23.25581395, 23.25581395, 23.25581395],
       [27.90697674, 13.95348837, 20.93023256, 23.25581395, 16.27906977,
        16.27906977, 16.27906977, 25.58139535, 23.25581395, 23.25581395],
       [30.23255814,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])

In [89]:
(controls*100/37)

array([[56.75675676, 62.16216216, 64.86486486, 51.35135135, 64.86486486,
        70.27027027, 70.27027027, 70.27027027, 70.27027027, 70.27027027],
       [54.05405405, 62.16216216, 67.56756757, 67.56756757, 45.94594595,
        45.94594595, 45.94594595, 75.67567568, 67.56756757, 67.56756757],
       [56.75675676,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])

In [86]:
(gts_test*100/64)

array([[76.5625, 64.0625, 64.0625, 75.    , 65.625 , 70.3125, 70.3125,
        70.3125, 70.3125, 70.3125],
       [67.1875, 79.6875, 79.6875, 73.4375, 79.6875, 70.3125, 70.3125,
        68.75  , 70.3125, 70.3125],
       [70.3125,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
         0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
         0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
         0.    ,  0.    ,  0.    ]])

In [90]:
(gts*100/40)

array([[80. , 80. , 77.5, 82.5, 80. , 80. , 80. , 80. , 80. , 80. ],
       [80. , 85. , 85. , 82.5, 87.5, 82.5, 82.5, 80. , 80. , 80. ],
       [75. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ]])