In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

#This line will hide code by default when the notebook is exported as HTML
#di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import hail as hl
hl.init(tmp_dir='/net/scratch/people/plggosborcz')

Running on Apache Spark version 2.4.3
SparkUI available at http://p0341.prometheus:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.30-2ae07d872f43
LOGGING: writing to /net/archive/groups/plggneuromol/GTS-analysis/large_vcf_analysis/hail-20200225-1526-0.2.30-2ae07d872f43.log


In [3]:
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
hl.plot.output_notebook()


import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain


In [4]:
## kinship cut-off 0.125 - fist compare gnomads with heavy tics, then test it on families.

def remove_sex_chrom(mtx):
    mtx = mtx.filter_rows((mtx.locus.contig != "chrX") & (mtx.locus.contig != "chrY"))
    return(mtx)


def remove_related(mtx, mtx_subset):
    pc_rel = hl.pc_relate(mtx_subset.GT, 0.001, k=2, statistics='kin')
    pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
    related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, keep=False)
    
    related_samples_to_remove = related_samples_to_remove.annotate(s = related_samples_to_remove.node.s)
    related_samples_to_remove = related_samples_to_remove.key_by('s')
    
    mtx = mtx.key_cols_by()
    mtx = mtx.filter_cols(hl.is_defined(related_samples_to_remove[mtx.s]), keep=False)
    mtx = mtx.key_cols_by(mtx.s)
    
    mtx_subset = mtx_subset.filter_cols(hl.is_defined(related_samples_to_remove[mtx_subset.s]), keep=False)
    
    return(mtx, mtx_subset)


def run_pca(mtx, mtx_subset):
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mtx_subset.GT)
    mtx = mtx.annotate_cols(scores = pcs[mtx.s].scores)
    p = hl.plot.scatter(mtx.scores[0],
                    mtx.scores[1],
                    label=mtx.phenotypes.family,
                    title='PCA', xlabel='PC1', ylabel='PC2')

    p2 = hl.plot.scatter(mtx.scores[2],
                        mtx.scores[3],
                        label=mtx.phenotypes.family,
                        title='PCA', xlabel='PC3', ylabel='PC4')

    p3 = hl.plot.scatter(mtx.scores[4],
                        mtx.scores[5],
                        label=mtx.phenotypes.family,
                        title='PCA', xlabel='PC5', ylabel='PC6')
    show(p)
    show(p2)
    show(p3)
    
    return(mtx)



def run_skat_lin(mtx, gene_list, pcs): #does not have a family covariate
    
    mtx = mtx.filter_rows(hl.any(lambda x: hl.literal(gene_list).contains(x), mtx.nearest_genes_20kb))
    mtx = mtx.filter_rows(hl.agg.any(mtx.GT.is_non_ref()))
    mtx = mtx.explode_rows(mtx.nearest_genes_20kb)
    mtx = mtx.filter_rows(hl.literal(gene_list).contains(mtx.nearest_genes_20kb))
    
    scores = [mtx.scores[x] for x in list(range(pcs))]
                          
    
    skat_table = hl.skat(key_expr=mtx.nearest_genes_20kb,
                         weight_expr=mtx.cadd,
                         y=mtx.category,
                         x=mtx.GT.n_alt_alleles(),
                         covariates=[1] + scores,
                         max_size = 4000)
    
    genes_result = skat_table.filter(skat_table.p_value < 0.05/len(gene_list)).id.collect() 

    skat_table.filter(skat_table.p_value < 0.002).show(20)

    skat_table = skat_table.annotate(label = hl.literal(genes).contains(skat_table.id))

    qq_plot = hl.plot.qq(skat_table.p_value,
                                         label = skat_table.label,
                                         n_divisions = len(gene_list))
    show(qq_plot)
    
    return(skat_table, genes_result, qq_plot)


def run_skat_log(mtx, gene_list, pcs): #does not have a family covariate
    
    mtx = mtx.filter_rows(hl.any(lambda x: hl.literal(gene_list).contains(x), mtx.nearest_genes_20kb))
    mtx = mtx.filter_rows(hl.agg.any(mtx.GT.is_non_ref()))
    mtx = mtx.explode_rows(mtx.nearest_genes_20kb)
    mtx = mtx.filter_rows(hl.literal(gene_list).contains(mtx.nearest_genes_20kb))
    
    scores = [mtx.scores[x] for x in list(range(pcs))]
                          
    
    skat_table = hl.skat(key_expr=mtx.nearest_genes_20kb,
                         weight_expr=mtx.cadd,
                         y=mtx.category,
                         x=mtx.GT.n_alt_alleles(),
                         covariates=[1] + scores,
                         max_size = 2200,
                        logistic = True)
    
    genes_result = skat_table.filter(skat_table.p_value < 0.05/len(gene_list)).id.collect() 

    skat_table.filter(skat_table.p_value < 0.002).show(20)

    skat_table = skat_table.annotate(label = hl.literal(genes).contains(skat_table.id))

    qq_plot = hl.plot.qq(skat_table.p_value,
                                         label = skat_table.label,
                                         n_divisions = len(gene_list))
    show(qq_plot)
    
    return(skat_table, genes_result, qq_plot)



def full_skat_lin(mtx, mtx_subset, gene_list, pcs):
    
    mtx = remove_sex_chrom(mtx)
    mtx_subset = remove_sex_chrom(mtx_subset)
    
    mtx, mtx_subset = remove_related(mtx, mtx_subset)
    
    mtx = run_pca(mtx, mtx_subset) #this matrix will be returned, so I can do SKAT with other list and parameters
    
    skat_table, genes_result, qq_plot = run_skat_lin(mtx, gene_list, pcs)
    
    return(mtx, skat_table, genes_result, qq_plot)


def full_skat_log(mtx, mtx_subset, gene_list, pcs):
    
    mtx = remove_sex_chrom(mtx)
    mtx_subset = remove_sex_chrom(mtx_subset)
    
    mtx, mtx_subset = remove_related(mtx, mtx_subset)
    
    mtx = run_pca(mtx, mtx_subset) #this matrix will be returned, so I can do SKAT with other list and parameters
    
    skat_table, genes_result, qq_plot = run_skat_log(mtx, gene_list, pcs)
    
    return(mtx, skat_table, genes_result, qq_plot)

# import large and smaller matrix table

In [5]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/large_vcf_filtering_annotation/step6_output.mt')
mt = mt.filter_rows(mt.cadd > 0) # this is so I don't do too many tests

In [6]:
mt_subset = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/large_vcf_analysis/mt_subset_for_tests.mt')
mt_subset = mt_subset.filter_rows(mt_subset.cadd > 0) # this is so I don't do too many tests

## prepare the gene lists

### get various HPO gene lists from related phenotypes

In [7]:
genes_ecm = list(set([line.rstrip('\n') for line in open('/net/archive/groups/plggneuromol/GTS-analysis/gts_gene_lists/extracellular-matrix.txt')]))
genes_neurodev = list(set([line.rstrip('\n') for line in open('/net/archive/groups/plggneuromol/GTS-analysis/gts_gene_lists/hpo-neuro-dev.txt')]))

bg =  list(set([line.rstrip('\n') for line in open('/net/archive/groups/plggneuromol/GTS-analysis/gts_gene_lists/brain_category_rna_basal.tsv')]))

tics = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/gts_gene_lists/tics.csv', delimiter = ',', quote="\"")
genes_tics = tics.GENE_SYMBOL.collect()

self_mut = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/gts_gene_lists/self_mut.csv', delimiter = ',', quote="\"")
genes_self_mut = self_mut.GENE_SYMBOL.collect()

phonic = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/gts_gene_lists/phonic_tics.csv', delimiter = ',', quote="\"")
genes_phonic = phonic.GENE_SYMBOL.collect()

ocd = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/gts_gene_lists/ocd_beh.csv', delimiter = ',', quote="\"")
genes_ocd = ocd.GENE_SYMBOL.collect()

echo = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/gts_gene_lists/echolalia.csv', delimiter = ',', quote="\"")
genes_echo = echo.GENE_SYMBOL.collect()

inv_mov = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/gts_gene_lists/inv_mov.csv', delimiter = ',', quote="\"")
genes_inv = inv_mov.GENE_SYMBOL.collect()

adhd = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/gts_gene_lists/adhd.csv', delimiter = ',', quote="\"")
genes_adhd = adhd.GENE_SYMBOL.collect()

agg_beh = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/gts_gene_lists/agg_beh.csv', delimiter = ',', quote="\"")
genes_agg_beh = agg_beh.GENE_SYMBOL.collect()

motor_tics = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/gts_gene_lists/motor_tics.csv', delimiter = ',', quote="\"")
genes_motor_tics = motor_tics.GENE_SYMBOL.collect()

2020-02-25 15:26:06 Hail: INFO: Reading table with no type imputation
  Loading column 'GENE_ENTREZ_ID' as type 'str' (type not specified)
  Loading column 'GENE_SYMBOL' as type 'str' (type not specified)
  Loading column 'DISEASE_IDS' as type 'str' (type not specified)

2020-02-25 15:26:08 Hail: INFO: Reading table with no type imputation
  Loading column 'GENE_ENTREZ_ID' as type 'str' (type not specified)
  Loading column 'GENE_SYMBOL' as type 'str' (type not specified)
  Loading column 'DISEASE_IDS' as type 'str' (type not specified)

2020-02-25 15:26:09 Hail: INFO: Reading table with no type imputation
  Loading column 'GENE_ENTREZ_ID' as type 'str' (type not specified)
  Loading column 'GENE_SYMBOL' as type 'str' (type not specified)
  Loading column 'DISEASE_IDS' as type 'str' (type not specified)

2020-02-25 15:26:09 Hail: INFO: Reading table with no type imputation
  Loading column 'GENE_ENTREZ_ID' as type 'str' (type not specified)
  Loading column 'GENE_SYMBOL' as type 'str' 

A list of genes associated with many psychiatric disorders (from https://www.sciencedirect.com/science/article/pii/S0092867419312760?dgcid=author) and GTS association > 0.85

In [8]:
genes = ['DCC', 'RBFOX', 'SLC30A9', 'DCAF4L1', 'SORCS3', 'KCNQ5', 'KCNQ-IT1', 'APOPT1', 'C14orf2', 'NAA11', 'NEGR1',
         'CHADL', 'L3MBTL2', 'SOX5']

In [9]:
GTS_genes = ['PANK2', 'COL27A1', 'PDGFB', 'CELSR3', 'OPA1', 'FBN2', 'WWC1', 'NIPBL', 
             'FN1', 'FBN2', 'SLITRK1', 'SLITRK2', 'SLITRK3', 'SLITRK4', 'SLITRK5', 'SLITRK6', 
             'HDC', 'OPRK1', 'PCDH10', 'NTSR2', 'OPRK1', 'CHD8', 'SCUBE1', 'PNKD', 'CNTNAP2', 'MOG', 
             'DRD2', 'DRD3', 'DRD4', 'DRD5', 'DAT1', 'DBH', 'HTR2A', 'TPH2', 'EAAT1', 'SAPAP3',
            'CTNNA3', 'NLGN4', 'FSCB', 'IMMP2L', 'NRXN1', 'AADAC', 'DBH', 'MAOA', 'HTR1A', 'HTR2C', 'SLC6A4',
             'TPH2', 'COL27A1', '5-HTTLPR', 'EAAT1', 'COL8A1', 'KCNE1', 'KCNE2']
# a list manually curated from literature

In [10]:
genes_scores = genes + GTS_genes

In [11]:
len(genes_scores)

68

# give scores

In [12]:
mt = mt.annotate_cols(category = hl.cond(mt.s.contains('gnmd'), hl.float(0), hl.cond(mt.phenotypes.disease == 'NO', hl.float(5), 
                                                                                     hl.cond(mt.phenotypes.phenotype == 'tics', hl.float(25), hl.cond(
                                                                                         mt.phenotypes.heavy_tics == 'YES', hl.float(45), hl.cond(
                                                                                             mt.phenotypes.phenotype == 'GTS', hl.float(50), hl.float(1000)))))))
#linear

In [13]:
mt = mt.filter_cols((hl.is_defined(mt.category)) & (mt.category < 100))

In [14]:
mt = mt.annotate_cols(category = hl.cond(mt.s.contains('gnmd'), False, (mt.phenotypes.disease == 'YES')))

#logistic

In [15]:
samples = mt.s.collect()

In [16]:
mt = mt.filter_cols(hl.array(samples[0:-111]).contains(mt.s)) # filter out excessive gnomads

In [17]:
mt_subset = mt_subset.filter_cols(hl.array(samples[0:-111]).contains(mt_subset.s))

In [18]:
mt = mt.filter_cols((mt.s.contains('gnmd')) | (mt.phenotypes.heavy_tics =='YES'))

In [19]:
mt_subset = mt_subset.filter_cols((mt_subset.s.contains('gnmd')) | (mt_subset.phenotypes.heavy_tics =='YES'))

In [20]:
groups = mt.aggregate_cols(hl.agg.counter(mt.category))

In [21]:
groups

{False: 40, True: 40}

In [22]:
mt.count()

(6148861, 80)

In [23]:
mt_subset.count()

(6129, 80)

In [24]:
mt, skat_table_log, genes_result_log, qq_plot_log = full_skat_log(mt, mt_subset, genes_scores, 2)

2020-02-25 09:21:41 Hail: INFO: hwe_normalized_pca: running PCA using 5032 variants.
2020-02-25 09:21:42 Hail: INFO: pca: running PCA with 2 components...
2020-02-25 09:21:49 Hail: INFO: Wrote all 2 blocks of 5997 x 80 matrix with block size 4096.
2020-02-25 09:21:50 Hail: INFO: wrote matrix with 80 rows and 80 columns as 1 block of size 4096 to file:/net/scratch/people/plggosborcz/hail.nK7QI98AHJOq/FI0VkQNxVK.bm
2020-02-25 09:21:51 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-02-25 09:21:51 Hail: INFO: wrote table with 1 row in 1 partition to file:/net/scratch/people/plggosborcz/hail.nK7QI98AHJOq/WV5WJlHCO8
2020-02-25 09:21:51 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2020-02-25 09:21:52 Hail: INFO: Coerced sorted dataset
2020-02-25 09:21:53 Hail: INFO: hwe_normalized_pca: running PCA using 4998 variants.
2020-02-25 09:21:53 Hail: INFO: Coerced sorted dataset

2020-02-25 09:21:59 Hail: INFO: Coerced sorted dataset
2020-02-25 09:21:59 Hail: INFO: Coerced sorted dataset
2020-02-25 09:21:59 Hail: INFO: Coerced sorted dataset
2020-02-25 09:22:15 Hail: INFO: Coerced sorted dataset
2020-02-25 09:22:15 Hail: INFO: Coerced dataset with out-of-order partitions.
2020-02-25 09:22:15 Hail: INFO: Coerced sorted dataset
2020-02-25 09:22:15 Hail: INFO: Coerced sorted dataset
2020-02-25 09:22:15 Hail: INFO: Coerced sorted dataset
2020-02-25 09:22:28 Hail: INFO: Coerced sorted dataset
2020-02-25 09:22:28 Hail: INFO: Coerced dataset with out-of-order partitions.


id,size,q_stat,p_value,fault
str,int32,float64,float64,int32


2020-02-25 09:22:28 Hail: INFO: Coerced sorted dataset
2020-02-25 09:22:28 Hail: INFO: Coerced sorted dataset
2020-02-25 09:22:28 Hail: INFO: Coerced sorted dataset
2020-02-25 09:22:41 Hail: INFO: Coerced sorted dataset
2020-02-25 09:22:41 Hail: INFO: Coerced dataset with out-of-order partitions.
2020-02-25 09:22:41 Hail: INFO: Coerced sorted dataset
2020-02-25 09:22:41 Hail: INFO: Coerced dataset with out-of-order partitions.


In [73]:
skat_table_log.write('/net/archive/groups/plggneuromol/GTS-analysis/large_vcf_analysis/skat_for_scores_log_40.ht')

2020-02-24 08:35:21 Hail: INFO: Coerced sorted dataset
2020-02-24 08:35:21 Hail: INFO: Coerced sorted dataset
2020-02-24 08:35:21 Hail: INFO: Coerced sorted dataset
2020-02-24 08:35:34 Hail: INFO: Coerced sorted dataset
2020-02-24 08:35:34 Hail: INFO: Coerced dataset with out-of-order partitions.
2020-02-24 08:35:35 Hail: INFO: wrote table with 49 rows in 49 partitions to /net/archive/groups/plggneuromol/GTS-analysis/large_vcf_analysis/skat_for_scores_log_40.ht


## create a scoring pattern for any model - based on CADD and number of genes



In [16]:
from bokeh.plotting import output_notebook, show, figure
from bokeh.palettes import viridis

output_notebook() 

In [15]:
genes_scores = hl.array(genes_scores)

In [17]:
skat_log_40 = hl.read_table('/net/archive/groups/plggneuromol/GTS-analysis/large_vcf_analysis/skat_for_scores_log_40.ht')

In [18]:
skat_log_40.order_by('p_value').show(20)

id,size,q_stat,p_value,fault,label
str,int32,float64,float64,int32,bool
"""L3MBTL2""",97,3940.0,0.00523,0,True
"""CHADL""",99,2760.0,0.0145,0,True
"""PCDH10""",232,2560.0,0.0238,0,False
"""KCNQ5""",1141,9640.0,0.0268,0,True
"""DRD2""",222,2890.0,0.0268,0,False
"""IMMP2L""",1283,8860.0,0.0272,0,False
"""SORCS3""",1339,16500.0,0.0359,0,True
"""KCNE2""",56,833.0,0.0482,0,False
"""DRD3""",196,1290.0,0.057,0,False
"""AADAC""",134,1830.0,0.0733,0,False


### use the mean number of variants detected above to check how stuff works in families

In [19]:
#reimport the matrixtable again
mt_test = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/large_vcf_filtering_annotation/step6_output.mt')
mt_test = mt_test.filter_rows(mt_test.cadd > 0) # this is so I don't do too many tests

In [20]:
mt_test = mt_test.annotate_cols(category = hl.cond(mt_test.s.contains('gnmd'), hl.float(0), hl.cond(mt_test.phenotypes.disease == 'NO', hl.float(5), 
                                                                                     hl.cond(mt_test.phenotypes.phenotype == 'tics', hl.float(25), hl.cond(
                                                                                         mt_test.phenotypes.heavy_tics == 'YES', hl.float(45), hl.cond(
                                                                                             mt_test.phenotypes.phenotype == 'GTS', hl.float(50), hl.float(1000)))))))
#linear

In [21]:
mt_test = mt_test.filter_cols((hl.is_defined(mt_test.category)) & (mt_test.category < 100))

In [22]:
mt_test = mt_test.annotate_cols(category = hl.cond(mt_test.s.contains('gnmd'), False, (mt_test.phenotypes.disease == 'YES')))

#logistic

In [23]:
mt_test = mt_test.filter_cols(mt_test.phenotypes.heavy_tics == 'YES', keep = False)

In [24]:
mt_test = mt_test.filter_cols(mt_test.s.contains('gnmd'), keep = False)

In [25]:
mt_test.count()

(6148861, 105)

In [158]:
controls = np.zeros((len([3,4,5]), len([10,11,12,13,14,15])))
gts = np.zeros((len([3,4,5]), len([10,11,12,13,14,15])))

variants_controls = np.zeros((len([3,4,5]), len([10,11,12,13,14,15])))
variants_gts =  np.zeros((len([3,4,5]), len([10,11,12,13,14,15])))

controls_test = np.zeros((len([3,4,5]), len([10,11,12,13,14,15])))
gts_test = np.zeros((len([3,4,5]), len([10,11,12,13,14,15])))


for rows, n in enumerate([3,4,5]):
    top_log = hl.array(skat_log_40.order_by('p_value').id.take(n))
    mt_skat_log_40 = mt.filter_rows(hl.any(lambda x: hl.literal(top_log).contains(x), mt.nearest_genes_20kb))
    mt_test_skat_log_40 = mt_test.filter_rows(hl.any(lambda x: hl.literal(top_log).contains(x), mt_test.nearest_genes_20kb))
    
    for cols, c in enumerate([10,11,12,13,14,15]):

        mt_skat_log_40 = mt_skat_log_40.filter_rows(mt_skat_log_40.cadd > c)
        skat_log_40_res = mt_skat_log_40.annotate_cols(non_refs = hl.agg.count_where(mt_skat_log_40.GT.is_non_ref())) #count variants per sample
        skat_log_40_sum = skat_log_40_res.cols()

        skat_log_40_sum = (skat_log_40_sum.group_by(skat_log_40_sum.category).aggregate(mean_non_refs = hl.agg.mean(skat_log_40_sum.non_refs)))

        results = skat_log_40_sum.mean_non_refs.collect()
        ctrl = results[0]

        variants_controls[rows, cols] = results[0]
        variants_gts[rows, cols] = results[1]


        test = skat_log_40_res.cols()
        test = test.annotate(test_result = (((hl.float(ctrl) - test.non_refs) / hl.float(ctrl)) < -0.1))
        test = test.annotate(success = (test.test_result == test.category))
        test2 = test.filter(test.s.contains('gnmd'))
        test1 = test.filter(test.phenotypes.disease == 'YES')

        try:
            controls[rows, cols] = test2.aggregate(hl.agg.counter(test2.success))[1]
        except KeyError:
            controls[rows, cols] = 0

        try:
            gts[rows, cols] = test1.aggregate(hl.agg.counter(test1.success))[1]
        except KeyError:
            gts[rows, cols] = 0
                
        mt_test_skat_log_40 = mt_test_skat_log_40.filter_rows(mt_test_skat_log_40.cadd > c)
        skat_log_40_res = mt_test_skat_log_40.annotate_cols(non_refs = hl.agg.count_where(mt_test_skat_log_40.GT.is_non_ref())) #count variants per sample
        
        test3 = skat_log_40_res.cols()
        test3 = test3.annotate(test_result = (((hl.float(ctrl) - test3.non_refs) / hl.float(ctrl)) < -0.1))
        test3 = test3.annotate(success = (test3.test_result == test3.category))
        test4 = test3.filter(test3.phenotypes.disease == 'NO')
        test5 = test3.filter(test3.phenotypes.disease == 'YES')
            
        try:
            controls_test[rows, cols] = test4.aggregate(hl.agg.counter(test4.success))[1]
        except KeyError:
            controls_test[rows, cols] = 0
            
        try:
            gts_test[rows, cols] = test5.aggregate(hl.agg.counter(test5.success))[1]
        except KeyError:
            gts_test[rows, cols] = 0
                

2020-02-24 10:26:48 Hail: INFO: Coerced sorted dataset
2020-02-24 10:26:48 Hail: INFO: Coerced sorted dataset
2020-02-24 10:26:48 Hail: INFO: Coerced sorted dataset
2020-02-24 10:26:59 Hail: INFO: Coerced sorted dataset
2020-02-24 10:26:59 Hail: INFO: Coerced sorted dataset
2020-02-24 10:26:59 Hail: INFO: Coerced sorted dataset
2020-02-24 10:27:00 Hail: INFO: Coerced sorted dataset
2020-02-24 10:27:00 Hail: INFO: Coerced sorted dataset
2020-02-24 10:27:00 Hail: INFO: Coerced sorted dataset
2020-02-24 10:27:00 Hail: INFO: Coerced sorted dataset
2020-02-24 10:27:10 Hail: INFO: Coerced sorted dataset
2020-02-24 10:27:10 Hail: INFO: Coerced sorted dataset
2020-02-24 10:27:10 Hail: INFO: Coerced sorted dataset
2020-02-24 10:27:11 Hail: INFO: Coerced sorted dataset
2020-02-24 10:27:11 Hail: INFO: Coerced sorted dataset
2020-02-24 10:27:11 Hail: INFO: Coerced sorted dataset
2020-02-24 10:27:20 Hail: INFO: Coerced sorted dataset
2020-02-24 10:27:21 Hail: INFO: Coerced sorted dataset
2020-02-24

2020-02-24 10:33:26 Hail: INFO: Coerced sorted dataset
2020-02-24 10:33:26 Hail: INFO: Coerced sorted dataset
2020-02-24 10:33:26 Hail: INFO: Coerced sorted dataset
2020-02-24 10:33:45 Hail: INFO: Coerced sorted dataset
2020-02-24 10:33:46 Hail: INFO: Coerced sorted dataset
2020-02-24 10:33:46 Hail: INFO: Coerced sorted dataset
2020-02-24 10:33:55 Hail: INFO: Coerced sorted dataset
2020-02-24 10:33:56 Hail: INFO: Coerced sorted dataset
2020-02-24 10:33:56 Hail: INFO: Coerced sorted dataset
2020-02-24 10:33:56 Hail: INFO: Coerced sorted dataset
2020-02-24 10:33:56 Hail: INFO: Coerced sorted dataset
2020-02-24 10:33:56 Hail: INFO: Coerced sorted dataset
2020-02-24 10:33:56 Hail: INFO: Coerced sorted dataset
2020-02-24 10:34:06 Hail: INFO: Coerced sorted dataset
2020-02-24 10:34:06 Hail: INFO: Coerced sorted dataset
2020-02-24 10:34:06 Hail: INFO: Coerced sorted dataset
2020-02-24 10:34:06 Hail: INFO: Coerced sorted dataset
2020-02-24 10:34:06 Hail: INFO: Coerced sorted dataset
2020-02-24

2020-02-24 10:40:05 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:05 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:05 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:15 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:15 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:15 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:35 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:35 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:35 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:45 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:45 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:45 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:45 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:45 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:45 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:45 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:55 Hail: INFO: Coerced sorted dataset
2020-02-24 10:40:55 Hail: INFO: Coerced sorted dataset
2020-02-24

In [146]:
np.save('controls_40', controls)
np.save('gts_40', gts)
np.save('variants_controls_40', variants_controls)
np.save('variants_gts_40', variants_gts)
np.save('controls_test_40', controls_test)
np.save('gts_test_40', gts_test)

In [39]:
mt_test.filter_cols(mt_test.phenotypes.disease == 'YES').count()

(6148861, 62)

In [40]:
mt_test.filter_cols(mt_test.phenotypes.disease == 'NO').count()

(6148861, 43)

In [161]:
a = list((controls_test*100/43).flatten())
b = list((gts_test*100/62).flatten())

In [173]:
np.where(((controls_test*100/43) > 50) & ((gts_test*100/62) > 40)) 

(array([0]), array([1]))

In [174]:
controls_test[0,1]

31.0

In [175]:
gts_test[0,1]

26.0

In [162]:
from bokeh.plotting import figure, output_notebook, show

output_notebook

p2 = figure(plot_width=600, plot_height=600)

# add a circle renderer with a size, color, and alpha
p2.circle(a, b, size=10, alpha=0.5)

# show the results
show(p2)

#this graph shows % of well asigned controls (x asis) vs well asigned GTS (y axis)

## best predictor based on heavy_tics vs gnomads tested on families on the short gene list = 3 genes, 11 cadd



## Draw a ROC curve 

for this we need false positived and true positives as a function of treshold

In [188]:
false_pos = []
true_pos = []

for x in np.linspace(-1,1,40):

    top_log = hl.array(skat_log_40.order_by('p_value').id.take(3))
    mt_test_skat_log = mt_test.filter_rows(hl.any(lambda x: hl.literal(top_log).contains(x), mt_test.nearest_genes_20kb))


    mt_test_skat_log = mt_test_skat_log.filter_rows(mt_test_skat_log.cadd > 11)
    skat_log_res = mt_test_skat_log.annotate_cols(non_refs = hl.agg.count_where(mt_test_skat_log.GT.is_non_ref())) #count variants per sample

    test = skat_log_res.cols()
    test = test.annotate(test_result = (((hl.float(variants_controls[0,1]) - test.non_refs) / hl.float(variants_controls[0,1])) < x))
    test = test.annotate(success = (test.test_result == test.category))
    test2 = test.filter(test.phenotypes.disease == 'NO')
    test1 = test.filter(test.phenotypes.disease == 'YES')
    
    try:
        false_pos.append(test2.aggregate(hl.agg.counter(test2.success))[0]/43)
        
    except(KeyError, TypeError):
        false_pos.append(0)
                            
    try:
        true_pos.append(test1.aggregate(hl.agg.counter(test1.success))[1]/62)
        
    except(KeyError, TypeError):
        true_pos.append(0)

In [189]:
y = np.linspace(0,1,10)

In [190]:
x = np.linspace(0,1,10)

In [275]:
np.save('false_pos', false_pos)
np.save('true_pos', true_pos)

In [191]:
from bokeh.plotting import figure, output_notebook, show

output_notebook

p4 = figure(plot_width=800, plot_height=800)



p4.line(x, y, line_width=4, line_color='lightgrey')

p4.line(false_pos, true_pos, line_width=4, alpha=0.5)

p4.xaxis.axis_label = 'false positives'
p4.yaxis.axis_label = 'true positives'


# show the results
show(p4)

In [192]:
false_poss = np.array(false_pos)

In [193]:
true_poss = np.array(true_pos)

In [26]:
def auc(fake, real): #simple integral
    area = 0
    for i in range(1,len(fake)):
        rectangle = (fake[i] - fake[i-1])*real[i]
        triangle = ((fake[i] - fake[i-1])/2)*((real[i] - real[i-1]))
      
        area = area + rectangle + triangle
            
    return(area)

In [195]:
auc(false_poss, true_poss)

0.6866091522880721

In [196]:
def auc2(fake, real): #simple integral
    area = 0
    for i in range(1,len(fake)):
        rectangle = (fake[i] - fake[i-1])*real[i]
      
        area = area + rectangle
            
    return(area)

In [197]:
auc2(false_poss, true_poss)

0.6376594148537134

## Get the false positive rate

discover false positive rate of this roc

In [27]:
skat_log_40.count()

49

In [28]:
top_log = hl.array(skat_log_40.order_by('p_value').id.take(3))

In [29]:
hl.eval(top_log)

['L3MBTL2', 'CHADL', 'PCDH10']

In [30]:
genes_background = skat_log_40.id.collect()

In [31]:
genes_background = [i for i in genes_background if i not in hl.eval(top_log)]

In [32]:
len(genes_background)

46

In [33]:
variants_controls = np.load('/net/archive/groups/plggneuromol/GTS-analysis/large_vcf_analysis/variants_controls_40.npy')

In [34]:
variants_controls

array([[ 43.375,  38.75 ,  28.25 ,  12.1  ,  10.35 ,   8.9  ],
       [144.8  , 119.   ,  91.05 ,  43.775,  34.9  ,  29.   ],
       [167.75 , 137.675, 107.8  ,  50.125,  40.3  ,  32.8  ],
       [268.075, 224.2  , 178.95 ,  88.825,  74.925,  61.475]])

In [35]:
controls_test = np.load('/net/archive/groups/plggneuromol/GTS-analysis/large_vcf_analysis/controls_test_40.npy')

In [36]:
controls_test[0,5]

31.0

In [37]:
gts_test = np.load('/net/archive/groups/plggneuromol/GTS-analysis/large_vcf_analysis/gts_test_40.npy')

In [None]:
gts_test[0,5]

In [None]:
false_pos_test = np.zeros((len(range(0,50)), len(np.linspace(-3,3,50))))
true_pos_test = np.zeros((len(range(0,50)), len(np.linspace(-3,3,50))))

for gene, s in enumerate(range(0,50)):
    
    
    randoms = []
    for i in range(0,3):
        i = hl.int(hl.rand_unif(0, 46))
        randoms.append(i)
    top_log = [j for i, j in enumerate(genes_background) if i in hl.eval(randoms)]


    for step, z in enumerate(np.linspace(-3,3,50)):

        mt_test_skat_log = mt_test.filter_rows(hl.any(lambda x: hl.literal(top_log).contains(x), mt_test.nearest_genes_20kb))


        mt_test_skat_log = mt_test_skat_log.filter_rows(mt_test_skat_log.cadd > 11)
        skat_log_res = mt_test_skat_log.annotate_cols(non_refs = hl.agg.count_where(mt_test_skat_log.GT.is_non_ref())) #count variants per sample

        test = skat_log_res.cols()
        test = test.annotate(test_result = (((hl.float(variants_controls[0,5]) - test.non_refs) / hl.float(variants_controls[0,5])) < z))
        test = test.annotate(success = (test.test_result == test.category))
        test2 = test.filter(test.phenotypes.disease == 'NO')
        test1 = test.filter(test.phenotypes.disease == 'YES')

        try:
            false_pos_test[gene, step] = test2.aggregate(hl.agg.counter(test2.success))[0]/43

        except(KeyError, TypeError):
            
            false_pos_test[gene, step] = 0

        try:
            true_pos_test[gene, step] = test1.aggregate(hl.agg.counter(test1.success))[1]/62

        except(KeyError, TypeError):
            true_pos_test[gene, step] = 0

In [71]:
aucs = []
for i in range(0,50):
    aucs.append(auc(false_pos_test[i,], true_pos_test[i,]))

In [72]:
np.percentile(aucs, 95)

0.8211365341335334

In [73]:
false_pos = np.load('false_pos.npy')
true_pos = np.load('true_pos.npy')

In [74]:
auc_test = auc(false_pos, true_pos)

In [75]:
auc_test

0.6866091522880721

In [76]:
y = np.linspace(0,1,10)

In [77]:
x = np.linspace(0,1,10)

In [78]:
from bokeh.plotting import figure, output_notebook, show

output_notebook


p4 = figure(plot_width=800, plot_height=800)



p4.line(x, y, line_width=4, line_color='lightgrey')

p4.line(false_pos, true_pos, line_width=4, line_color='orange')

for i in range(0,20):
    p4.line(true_pos_test[i,], false_pos_test[i,], line_width=4, alpha=0.5)

p4.xaxis.axis_label = 'false positives'
p4.yaxis.axis_label = 'true positives'


# show the results
show(p4)