In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# This line will hide code by default when the notebook is exported as HTML
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# Inject a CSS rule so the notebook's `.container` element spans the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import hail as hl
# Initialise Hail; `tmp_dir` is a hard-coded scratch directory on the cluster,
# so adjust it when running under a different account.
hl.init(tmp_dir='/net/scratch/people/plggosborcz')

In [None]:
# Same full-width CSS rule as in the first cell (redundant if that cell has run);
# `display` and `HTML` come from the imports in the first cell.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# NOTE: this `show` is rebound by the later `from bokeh.plotting import figure, show, ...`
# cell, so after that cell runs `show` refers to Bokeh's function.
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
# Set up plot output for the notebook before any figure is shown.
hl.plot.output_notebook()

import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain
import statistics as stat

import bokeh.palettes

In [None]:
# Bokeh plotting primitives used by the ROC plots below; this import rebinds
# `show` (previously imported from hail.plot) to Bokeh's `show`.
from bokeh.plotting import figure, show, output_notebook
# Direct Bokeh output to the notebook.
output_notebook()

## Load SKAT functions

In [None]:
def run_skat_log(mtx, gene_list, pcs):
    """Run a per-gene logistic SKAT test, show the top hits and a QQ plot.

    Rows of ``mtx`` are first restricted to variants whose
    ``nearest_genes_20kb`` annotation contains at least one gene from
    ``gene_list`` and that carry at least one non-reference genotype. The gene
    annotation is then exploded (one row per variant/gene pair) and filtered
    again, so every remaining row is keyed by exactly one requested gene.

    Parameters
    ----------
    mtx : hail.MatrixTable
        Must provide the row fields ``nearest_genes_20kb`` and ``cadd`` (used
        as the SKAT weight), the column fields ``scores`` and ``category``
        (used as the response), and the entry field ``GT``.
    gene_list : sequence of str
        Gene names to test; must not be empty. Its length is the Bonferroni
        denominator for the 0.05 significance threshold.
    pcs : int
        Number of leading entries of ``mtx.scores`` used as covariates
        (an intercept term is always included).

    Returns
    -------
    tuple
        ``(skat_table, genes_result, qq_plot)``: the Hail table returned by
        ``hl.skat``, the list of ids with ``p_value < 0.05 / len(gene_list)``,
        and the QQ-plot figure (which is also displayed).

    Raises
    ------
    ValueError
        If ``gene_list`` is empty.
    """
    if len(gene_list) == 0:
        # Fail before building any Hail pipeline; the original code would only
        # fail later with a ZeroDivisionError in the Bonferroni threshold.
        raise ValueError('gene_list must contain at least one gene')

    # Build the literal once, as a set, so membership tests are hashed lookups
    # instead of a linear scan through a (potentially very long) array.
    genes = hl.literal(set(gene_list))

    mtx = mtx.filter_rows(hl.any(lambda gene: genes.contains(gene), mtx.nearest_genes_20kb))
    mtx = mtx.filter_rows(hl.agg.any(mtx.GT.is_non_ref()))
    # After exploding, nearest_genes_20kb holds a single gene per row; drop the
    # rows that belong to genes outside the requested list.
    mtx = mtx.explode_rows(mtx.nearest_genes_20kb)
    mtx = mtx.filter_rows(genes.contains(mtx.nearest_genes_20kb))

    scores = [mtx.scores[i] for i in range(pcs)]

    skat_table = hl.skat(
        key_expr=mtx.nearest_genes_20kb,
        weight_expr=mtx.cadd,
        y=mtx.category,
        x=mtx.GT.n_alt_alleles(),
        covariates=[1] + scores,
        max_size=2500,
        logistic=True,
    )

    bonferroni_threshold = 0.05 / len(gene_list)
    genes_result = skat_table.filter(skat_table.p_value < bonferroni_threshold).id.collect()

    skat_table.order_by('p_value').show(20)

    qq_plot = hl.plot.qq(skat_table.p_value)
    for axis in (qq_plot.xaxis, qq_plot.yaxis):
        axis.axis_label_text_font_size = "15pt"
        axis.major_label_text_font_size = "15pt"

    show(qq_plot)

    return skat_table, genes_result, qq_plot

In [None]:
def _non_ref_counts(mt, gene):
    """Count, for every sample in ``mt``, the non-reference genotypes near ``gene``.

    Keeps the rows whose ``nearest_genes_20kb`` contains ``gene`` and returns a
    NumPy array with one count per column, in column order.
    """
    mt_gene = mt.filter_rows(mt.nearest_genes_20kb.contains(gene))
    mt_gene = mt_gene.annotate_cols(non_refs=hl.agg.count_where(mt_gene.GT.is_non_ref()))
    return np.array(mt_gene.non_refs.collect())


def _roc_auc(false_pos, true_pos):
    """Area under a ROC curve sampled at increasing thresholds.

    Both rates fall from 1 towards 0 as the threshold grows, so the points are
    reversed before integrating; integrating over a decreasing x axis would
    return the area with a negative sign.
    """
    return np.trapz(np.asarray(true_pos)[::-1], np.asarray(false_pos)[::-1])


def test_model(geneset_name):
    """Score the test samples on the current ``top_genes`` and plot ROC curves.

    Relies on notebook globals: ``cadds``, ``sets``, ``top_genes``,
    ``categories`` (labels of the training matrix columns) and
    ``categories_test`` (labels of the test matrix columns).

    For every CADD threshold ``c`` the matrices ``mt_for_skat_<c>.mt`` and
    ``mt_test_<c>.mt`` are read. For each gene, a test sample's score is its
    number of non-reference genotypes near the gene minus the mean of that
    count over the training controls. For every ``s`` in ``sets`` the scores of
    the last ``s`` genes are summed per sample, false/true positive rates are
    computed over a fixed threshold grid, the AUC is printed and both rate
    arrays are saved under ``data/numpy/`` with ``geneset_name`` in the file
    name.

    The saved curves are then reloaded; those running from (1, 1) to (0, 0)
    are kept, their AUCs printed, and they are drawn together with one
    previously stored reference curve (a file without the gene-set suffix).

    Parameters
    ----------
    geneset_name : str
        Tag appended to the saved file names and used as the plot title.
    """
    data_dir = '/net/archive/groups/plggneuromol/GTS-analysis/data/'
    curve_dir = data_dir + 'numpy/'

    # Coerce the collected labels to boolean masks so that `~` is a logical
    # negation even if the labels were collected as 0/1 integers.
    is_case = np.asarray(categories, dtype=bool)
    is_case_test = np.asarray(categories_test, dtype=bool)
    n_test_cases = int(is_case_test.sum())
    n_test_controls = int(is_case_test.size - n_test_cases)

    n_genes = len(top_genes)
    thresholds = np.linspace(-100, 100, 10000)

    for c in cadds:
        # Per the original notes: the training matrix holds the gnomAD samples
        # and the heavy-tics samples, the test matrix holds the families.
        mt_skat_log = hl.read_matrix_table(data_dir + 'mt_for_skat_' + str(c) + '.mt')
        mt_test_skat_log = hl.read_matrix_table(data_dir + 'mt_test_' + str(c) + '.mt')

        # One row per gene, one column per test sample.
        # Assumes the column order of these matrices matches the order in which
        # `categories` / `categories_test` were collected - TODO confirm.
        test_assignment = np.zeros((n_genes, is_case_test.size))
        for row, gene in enumerate(top_genes):
            non_refs = _non_ref_counts(mt_skat_log, gene)
            non_refs_test = _non_ref_counts(mt_test_skat_log, gene)
            variants_controls = np.mean(non_refs[~is_case])
            test_assignment[row] = non_refs_test - variants_controls

        for s in sets:
            # Sum the scores of the last `s` genes of `top_genes`.
            # NOTE(review): if top_genes is ordered best-first this selects the
            # s lowest-ranked genes - confirm that this is intended.
            subset_score = test_assignment[max(n_genes - s, 0):].sum(axis=0)

            # above[k, j] is True when sample j scores above thresholds[k].
            above = subset_score[np.newaxis, :] > thresholds[:, np.newaxis]
            false_pos = above[:, ~is_case_test].sum(axis=1) / n_test_controls
            true_pos = above[:, is_case_test].sum(axis=1) / n_test_cases

            print(_roc_auc(false_pos, true_pos))

            suffix = str(s) + 'cadd' + str(c) + 'gene' + geneset_name
            np.save(curve_dir + 'may-false_pos' + suffix, false_pos)
            np.save(curve_dir + 'may-true_pos' + suffix, true_pos)

    # Reload the curves in (sets, cadds) order and keep only those that span
    # the full range from (1, 1) down to (0, 0).
    false_pos = []
    true_pos = []
    for s in sets:
        for c in cadds:
            suffix = str(s) + 'cadd' + str(c) + 'gene' + geneset_name + '.npy'
            a = np.load(curve_dir + 'may-false_pos' + suffix)
            b = np.load(curve_dir + 'may-true_pos' + suffix)

            if a[0] == 1 and a[-1] == 0 and b[0] == 1 and b[-1] == 0:
                false_pos.append(a)
                true_pos.append(b)

    aucs = np.array([_roc_auc(fpr, tpr) for fpr, tpr in zip(false_pos, true_pos)])
    print(aucs)

    # Reference curve: entry 13 of the (sets, cadds) nested-loop order, stored
    # earlier without the gene-set suffix.
    reference_index = 13
    ref_s = sets[reference_index // len(cadds)]
    ref_c = cadds[reference_index % len(cadds)]
    ref_suffix = str(ref_s) + 'cadd' + str(ref_c) + '.npy'
    false_pos_ref = np.load(curve_dir + 'may-false_pos' + ref_suffix)
    true_pos_ref = np.load(curve_dir + 'may-true_pos' + ref_suffix)

    diagonal = np.linspace(0, 1, 10)

    p4 = figure(plot_width=800, plot_height=800)
    p4.line(diagonal, diagonal, line_width=4, line_color='lightgrey')

    for fpr, tpr in zip(false_pos, true_pos):
        p4.line(fpr, tpr, line_width=1, alpha=0.25)

    p4.line(false_pos_ref, true_pos_ref, line_width=4, line_color='orange')

    p4.xaxis.axis_label = 'false positives'
    p4.yaxis.axis_label = 'true positives'

    p4.xaxis.axis_label_text_font_size = "15pt"
    p4.xaxis.major_label_text_font_size = "15pt"
    p4.yaxis.axis_label_text_font_size = "15pt"
    p4.yaxis.major_label_text_font_size = "15pt"
    p4.title.text = geneset_name
    p4.title.text_font_size = "20px"

    # show the results
    show(p4)

In [None]:
# Globals read by test_model.
# sets: how many rows of the per-gene score matrix are summed into one score.
sets = [2,3,4,5]
# cadds: suffixes of the per-threshold matrices (mt_for_skat_<cadd>.mt / mt_test_<cadd>.mt).
cadds = [5,10,15,20]

## Run SKAT on all genes

In [None]:
# Background gene list stored as a NumPy array; passed to run_skat_log below.
allgenes= np.load('/net/archive/groups/plggneuromol/GTS-analysis/analysis/numpy/genes_background.npy')

In [None]:
# Sanity check: dimensions of the loaded background gene array.
allgenes.shape

In [None]:
# Matrix table that is passed to run_skat_log for every gene set tested in this notebook.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-for-model-expanded-may.mt')

In [None]:
# Training and test matrices; only their per-sample `category` labels are used here.
mt_for_skat = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-for-skat-may.mt')
mt_test = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-test-may.mt')
# Collected label lists, read as globals by test_model: `categories` selects the
# training controls, `categories_test` splits the test samples for the ROC rates.
categories = mt_for_skat.category.collect()
categories_test = mt_test.category.collect()

In [None]:
# Logistic SKAT over the background genes, with the first 9 entries of
# mt.scores as covariates; also shows the 20 best rows and a QQ plot.
skat_table, genes_result, qq_plot = run_skat_log(mt, list(allgenes), 9)

In [None]:
# checkpoint() writes the table to disk and returns a new Table that reads from
# the written files. Rebind the name so later uses of skat_table read the stored
# result instead of re-running the whole SKAT pipeline.
skat_table = skat_table.checkpoint('/net/archive/groups/plggneuromol/GTS-analysis/data/skat-all.mt')

In [None]:
# Top genes assigned manually; test_model reads `top_genes` as a global and
# builds one score row per gene in this order.
top_genes = ['PCNT', 'CNN2', 'TMEM259', 'FRG2', 'DUX4']

In [None]:
# Evaluate the manually chosen top genes; the ROC arrays are written under
# data/numpy/ with 'allgenes' in the file name, and the ROC plot is shown.
test_model('allgenes')

In [None]:
# Tag used in the file names of the ROC arrays loaded in the next cell and as the plot title.
geneset_name='allgenes'

In [None]:
# Reload the ROC arrays saved by test_model for every (sets, cadds) combination
# and keep only the curves that run from (1, 1) down to (0, 0).
curve_dir = '/net/archive/groups/plggneuromol/GTS-analysis/data/numpy/'

false_pos = []
true_pos = []

for s in sets:
    for c in cadds:
        suffix = str(s) + 'cadd' + str(c) + 'gene' + geneset_name + '.npy'
        a = np.load(curve_dir + 'may-false_pos' + suffix)
        b = np.load(curve_dir + 'may-true_pos' + suffix)

        # Index -1 is the last threshold regardless of how many thresholds the
        # stored curve was sampled at.
        if a[0] == 1 and a[-1] == 0 and b[0] == 1 and b[-1] == 0:
            false_pos.append(a)
            true_pos.append(b)

In [None]:
# Number of ROC curves that passed the end-point filter in the previous cell.
len(true_pos)

In [None]:
# AUC of every curve kept by the filter. The rates were sampled at increasing
# thresholds, so they run from 1 down to 0; reverse them so the integration
# runs over increasing false-positive rate and the area comes out positive.
aucs = np.array([
    np.trapz(tpr[::-1], fpr[::-1])
    for fpr, tpr in zip(false_pos, true_pos)
])
print(aucs)

test1 = np.array(([0,1,2,4,5], [0,2,2,3,5]))
test2 = np.array(([1,1,1,1,1], [2,2,2,2,2]))

# Points of the diagonal (chance-level) line.
y = np.linspace(0,1,10)
x = np.linspace(0,1,10)

# Previously stored curves without the gene-set suffix, in (sets, cadds) order.
false_pos_ori = []
true_pos_ori = []

for s in sets:
    for c in cadds:
        false_pos_ori.append(np.load('/net/archive/groups/plggneuromol/GTS-analysis/data/numpy/may-false_pos'+str(s)+'cadd'+str(c)+'.npy'))
        true_pos_ori.append(np.load('/net/archive/groups/plggneuromol/GTS-analysis/data/numpy/may-true_pos'+str(s)+'cadd'+str(c)+'.npy'))

p4 = figure(plot_width=800, plot_height=800)
p4.line(x, y, line_width=4, line_color='lightgrey')

# Draw every curve that passed the filter, however many there are.
for fpr, tpr in zip(false_pos, true_pos):
    p4.line(fpr, tpr, line_width=1, alpha=0.25)

# Entry 13 of the loading order is the reference curve highlighted in orange.
p4.line(false_pos_ori[13], true_pos_ori[13], line_width=4, line_color='orange')

p4.xaxis.axis_label = 'false positives'
p4.yaxis.axis_label = 'true positives'


p4.xaxis.axis_label_text_font_size = "15pt"
p4.xaxis.major_label_text_font_size = "15pt"
p4.yaxis.axis_label_text_font_size = "15pt"
p4.yaxis.major_label_text_font_size = "15pt"
p4.title.text = geneset_name
p4.title.text_font_size = "20px"

# show the results
show(p4)

## Run model on other gene lists

- brain enriched
- other gene lists

In [None]:
# Read the brain-enriched gene list (one gene name per line). The `with` block
# closes the file deterministically; blank lines are skipped so that no empty
# "gene" ends up in the list, and the set removes duplicates.
with open('/net/archive/groups/plggneuromol/GTS-analysis/analysis/gts_gene_lists/brain_enriched.txt') as gene_file:
    genes_neuro = list({line.rstrip('\n') for line in gene_file if line.strip()})

In [None]:
# Number of distinct genes in the brain-enriched list.
len(genes_neuro)

In [None]:
# Logistic SKAT restricted to the brain-enriched genes (9 score covariates).
# Note: this overwrites `genes_result` and `qq_plot` from the all-genes run.
skat, genes_result, qq_plot = run_skat_log(mt, genes_neuro, 9)

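### Top genes

| id | size | q_stat | p_value | fault |
|---|---|---|---|---|
| "KCNK4" | 81 | 1.37e+03 | 2.54e-05 | 0 |
| "S100B" | 103 | 8.07e+02 | 6.18e-05 | 0 |
| "GABRA2" | 375 | 2.13e+03 | 1.19e-04 | 0 |
| "GRM1" | 575 | 2.35e+03 | 5.58e-04 | 0 |
| "ENO2" | 73 | 1.17e+03 | 5.81e-04 | 0 |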

In [None]:
# Use the five ids with the smallest SKAT p-values as `top_genes` (read by
# test_model as a global), then evaluate them; outputs are tagged 'neuro'.
top_genes = skat.order_by('p_value').id.take(5)
test_model('neuro')