In [8]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# This line will hide code by default when the notebook is exported as HTML
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

display(HTML("<style>.container { width:100% !important; }</style>"))

In [9]:
import hail as hl
hl.init(tmp_dir='/net/scratch/people/plggosborcz')

Running on Apache Spark version 2.4.3
SparkUI available at http://p1078.prometheus:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.30-2ae07d872f43
LOGGING: writing to /net/archive/groups/plggneuromol/GTS-analysis/imdik-zekanowski-gts/hail-20210215-0956-0.2.30-2ae07d872f43.log


In [10]:
display(HTML("<style>.container { width:100% !important; }</style>"))

In [11]:
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
hl.plot.output_notebook()

import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain
import statistics as stat

import bokeh.palettes

In [12]:
from bokeh.plotting import figure, show, output_notebook
output_notebook()

## Load SKAT functions

In [6]:
def remove_sex_chrom(mtx):
    mtx = mtx.filter_rows(mtx.locus.contig != "chrY")
    return(mtx)


def run_pca(mtx, mtx_subset):
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mtx_subset.GT)
    mtx = mtx.annotate_cols(scores = pcs[mtx.s].scores)

    return(mtx)

def run_skat_log(mtx, gene_list, pcs):
    
    mtx = mtx.filter_rows(hl.any(lambda x: hl.literal(gene_list).contains(x), mtx.nearest_genes_20kb))
    mtx = mtx.filter_rows(hl.agg.any(mtx.GT.is_non_ref()))
    mtx = mtx.explode_rows(mtx.nearest_genes_20kb)
    mtx = mtx.filter_rows(hl.literal(gene_list).contains(mtx.nearest_genes_20kb))
    
    
    scores = [mtx.scores[x] for x in list(range(pcs))]
                          
    
    skat_table = hl.skat(
                         key_expr=mtx.nearest_genes_20kb,
                         weight_expr=mtx.cadd,
                         y=mtx.category,
                         x=mtx.GT.n_alt_alleles(),
                         covariates=[1] + scores,
                         max_size = 2500,
                         logistic = True)
    
    genes_result = skat_table.filter(skat_table.p_value < 0.05/len(gene_list)).id.collect() 

    skat_table.filter(skat_table.p_value < 0.002).show(20)

    skat_table = skat_table.annotate(label = hl.literal(genes).contains(skat_table.id))

    qq_plot = hl.plot.qq(skat_table.p_value,
                                         label = skat_table.label,
                                         n_divisions = len(gene_list))
    show(qq_plot)
    
    return(skat_table, genes_result, qq_plot)


def full_skat_log(mtx, mtx_subset, gene_list, pcs):
    
    mtx = remove_sex_chrom(mtx)
    mtx_subset = remove_sex_chrom(mtx_subset)
  
    mtx = run_pca(mtx, mtx_subset) #this matrix will be returned, so I can do SKAT with other list and parameters
    skat_table, genes_result, qq_plot = run_skat_log(mtx, gene_list, pcs)
    
    return(mtx, skat_table, genes_result, qq_plot)

In [7]:
def test_model(genes, sets, cadds, geneset_name):
    
    for s in sets:   
        for c in cadds:
        
            variants_controls = np.zeros((len(top_genes)))
            variants_gts = np.zeros((len(top_genes)))
    
            variants_controls_test = np.zeros((len(top_genes)))
            variants_gts_test = np.zeros((len(top_genes)))

            model_asignment = np.zeros((len(top_genes), 78))
            test_asignment = np.zeros((len(top_genes), 144)) 
    
            for rows, n in enumerate(top_genes):
            
                mt_skat_log = mt_for_skat.filter_rows(mt_for_skat.nearest_genes_20kb.contains(n)) # to ma gnomadów i heavy tics
            mt_test_skat_log = mt_test.filter_rows(mt_test.nearest_genes_20kb.contains(n)) # to ma rodziny

            mt_skat_log = mt_skat_log.filter_rows(mt_skat_log.cadd > c)
            mt_skat_log = mt_skat_log.annotate_cols(non_refs = hl.agg.count_where(mt_skat_log.GT.is_non_ref())) #count variants per sample of gnomads and heavy tics  
            non_refs = mt_skat_log.non_refs.collect()

            mt_test_skat_log = mt_test_skat_log.filter_rows(mt_test_skat_log.cadd > c)
            mt_test_skat_log = mt_test_skat_log.annotate_cols(non_refs = hl.agg.count_where(mt_test_skat_log.GT.is_non_ref())) #count variants per sample, prepare also test matrix
            non_refs_test = mt_test_skat_log.non_refs.collect()

            variants_gts[rows] = np.mean(np.array(non_refs)[categories])
            variants_controls[rows] = np.mean(np.array(non_refs)[np.invert(categories)])

            variants_gts_test[rows] = np.mean(np.array(non_refs_test)[categories_test])
            variants_controls_test[rows] = np.mean(np.array(non_refs_test)[np.invert(categories_test)])

            results = (non_refs - variants_controls[rows]) 
            results_test = (non_refs_test - variants_controls[rows]) 

            model_asignment[rows] = (results)
            test_asignment[rows] = (results_test)

        model_asignment = np.sum(model_asignment, axis = 0)
        test_asignment = np.sum(test_asignment, axis = 0)
            
        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/variants_gts'+str(s)+'cadd'+str(c), variants_gts)
        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/variants_controls'+str(s)+'cadd'+str(c), variants_controls)

        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/variants_gts_test'+str(s)+'cadd'+str(c), variants_gts_test)
        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/variants_controls_test'+str(s)+'cadd'+str(c), variants_controls_test)

        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/model_asignment'+str(s)+'cadd'+str(c), model_asignment)
        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/test_asignment'+str(s)+'cadd'+str(c), test_asignment)

        false_pos = []
        true_pos = []
            
        for x in np.linspace(-40,40,10000):
            false_pos.append(np.sum((test_asignment > x)[np.invert(categories_test)])/53)
            true_pos.append(np.sum((test_asignment > x)[categories_test])/91)
                
            np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/false_pos'+str(s)+'cadd'+str(c), false_pos)
            np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/true_pos'+str(s)+'cadd'+str(c), true_pos)
            
        print(str(s)+'cadd'+str(c))
        print(variants_gts)
        print(variants_controls)
        
    return(false_pos, true_pos, p, auc)

In [7]:
def full_model(gene_list):
    
    mt_for_model, skat_table, genes_result, qq_plot = full_skat_log(mt_for_skat, mt_subset, gene_list, 7)
    skat_table, genes_result, qq_plot = run_skat_log(mt_for_model, gene_list, 7)
    top_genes = skat_table.order_by('p_value').id.take(4)
    false_pos, true_pos, p, auc = test_model(top_genes)
    return(skat_table, qq_plot, top_genes, false_pos, true_pos, p, auc)

In [13]:
mt_for_skat = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-for-skat.mt')
mt_test = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-test.mt')
mt_subset = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-subset.mt')

### SKAT overrepresentation analysis:

In [9]:
genes = ['DCC', 'RBFOX', 'SLC30A9', 'DCAF4L1', 'SORCS3', 'KCNQ5', 'KCNQ-IT1', 'APOPT1', 'C14orf2', 'NAA11', 'NEGR1',
        'CHADL', 'SOX5', 'PANK2', 'COL27A1', 'PDGFB', 'CELSR3', 'OPA1', 'FBN2', 'WWC1', 'NIPBL', 
             'FN1', 'FBN2', 'SLITRK1', 'SLITRK2', 'SLITRK3', 'SLITRK4', 'SLITRK5', 'SLITRK6', 
             'HDC', 'OPRK1', 'PCDH10', 'NTSR2', 'OPRK1', 'CHD8', 'SCUBE1', 'PNKD', 'CNTNAP2', 'MOG', 
             'DRD2', 'DRD3', 'DRD4', 'DRD5', 'DAT1', 'DBH', 'HTR2A', 'TPH2', 'EAAT1', 'SAPAP3',
            'CTNNA3', 'NLGN4', 'FSCB', 'IMMP2L', 'NRXN1', 'AADAC', 'DBH', 'MAOA', 'HTR1A', 'HTR2C', 'SLC6A4',
             'TPH2', 'COL27A1', '5-HTTLPR', 'EAAT1', 'COL8A1', 'KCNE1', 'KCNE2',
         'RICTOR', 'WWC1', 'CELSR3, NIPBL', 'FN1', 'PNKD', 'CDH26', 'CADM2', 'OPCML', 'CDH9',
         'NCAM2', 'CD47', 'CDH5', 'CADM4', 'C1QBP', 'CTTN', 'LSAMP',
         'PKP4', 'PCDH1', 'CNTNAP2', 'MBP', 'GABBR2', 'GABBR2', 'GRIK4', 'NCR1', 'FLT3', 'IL12A', 'HDAC9',
         'CD180', 'CDH26', 'NCAM2', 'NTM', 'ROBO2'] # the other gene next to chadl - 'L3MBTL2' was deleted not to confuse the analysis


allgenes = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/analysis/gts_gene_lists/human-genes-with-GO-and-symbols') 
allgenes = allgenes.select('UniProtKB Gene Name symbol')

allgenes = allgenes.filter(allgenes['UniProtKB Gene Name symbol'] != "")
allgenes = allgenes['UniProtKB Gene Name symbol'].collect()

genes_scores = list(set(genes))

2021-02-11 22:09:18 Hail: INFO: Reading table with no type imputation
  Loading column 'Gene stable ID' as type 'str' (type not specified)
  Loading column 'UniProtKB Gene Name symbol' as type 'str' (type not specified)



KeyboardInterrupt: 

In [9]:
len(genes_scores)

86

In [15]:
mt_for_model, skat_table_log, genes_result_log, qq_plot_log = full_skat_log(mt_for_skat, mt_subset, genes_scores, 7)

2021-02-09 19:01:28 Hail: INFO: hwe_normalized_pca: running PCA using 10045 variants.
2021-02-09 19:01:42 Hail: INFO: pca: running PCA with 10 components...
2021-02-09 19:03:36 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-02-09 19:04:42 Hail: INFO: Ordering unsorted dataset with network shuffle


id,size,q_stat,p_value,fault
str,int32,float64,float64,int32
"""HDC""",118,546.0,0.00179,0


2021-02-09 19:05:52 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-02-09 19:06:09 Hail: INFO: Coerced sorted dataset
2021-02-09 19:06:09 Hail: INFO: Coerced dataset with out-of-order partitions.


In [16]:
plot = hl.plot.qq(skat_table_log.p_value)

2021-02-09 19:07:01 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-02-09 19:07:18 Hail: INFO: Coerced sorted dataset
2021-02-09 19:07:18 Hail: INFO: Coerced dataset with out-of-order partitions.


In [17]:
show(plot)

In [18]:
skat_table_log.order_by('p_value').show(20)

2021-02-09 19:08:07 Hail: INFO: Ordering unsorted dataset with network shuffle


id,size,q_stat,p_value,fault,label
str,int32,float64,float64,int32,bool
"""HDC""",118,546.0,0.00179,0,True
"""CHADL""",98,835.0,0.00829,0,True
"""MAOA""",95,696.0,0.0135,0,True
"""NAA11""",173,689.0,0.0241,0,True
"""CD47""",180,606.0,0.032,0,True
"""DRD3""",194,551.0,0.0348,0,True
"""PKP4""",699,3120.0,0.0625,0,True
"""DRD2""",224,719.0,0.0659,0,True
"""GRIK4""",1055,3240.0,0.0682,0,True
"""SLITRK2""",81,297.0,0.0962,0,True


#### Prepare matrix table

In [19]:
skat_table_log.write('/net/archive/groups/plggneuromol/GTS-analysis/data/skat-expanded.ht') # 7 PCs, zero samples deleted from the analysis

#skat_table_log.write('/net/archive/groups/plggneuromol/GTS-analysis/data/skat.ht')

2021-02-09 19:09:13 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-02-09 19:09:37 Hail: INFO: wrote table with 75 rows in 75 partitions to /net/archive/groups/plggneuromol/GTS-analysis/data/skat-expanded.ht


In [20]:
mt_for_model.write('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-for-model-expanded.mt')

#mt_for_model.write('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-for-model.mt')

2021-02-09 19:10:47 Hail: INFO: wrote matrix table with 6432039 rows and 78 columns in 6622 partitions to /net/archive/groups/plggneuromol/GTS-analysis/data/mt-for-model-expanded.mt


### Collect the number of variants in control individuals for the classifier

A few iterations over various numbers of genes and CADD score cutt-offs were run

In [None]:
skat = hl.read_table('/net/archive/groups/plggneuromol/GTS-analysis/data/skat-expanded.ht')

In [28]:
skat.order_by('p_value').show(20)

id,size,q_stat,p_value,fault,label
str,int32,float64,float64,int32,bool
"""HDC""",118,546.0,0.00179,0,True
"""CHADL""",98,835.0,0.00829,0,True
"""MAOA""",95,696.0,0.0135,0,True
"""NAA11""",173,689.0,0.0241,0,True
"""CD47""",180,606.0,0.032,0,True
"""DRD3""",194,551.0,0.0348,0,True
"""PKP4""",699,3120.0,0.0625,0,True
"""DRD2""",224,719.0,0.0659,0,True
"""GRIK4""",1055,3240.0,0.0682,0,True
"""SLITRK2""",81,297.0,0.0962,0,True


### reimport the matrixtable again and prepare the test dataset


In [10]:
mt_for_skat = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-for-skat.mt')

In [11]:
categories = mt_for_skat.category.collect()
categories_test = mt_test.category.collect()

NameError: name 'mt_test' is not defined

### test the test dataset 

#### how to calculate confusion matrix values

sum(asignment[np.invert(categories)]) #number of false positives

sum(np.invert(asignment)[categories]) # number of false negatives
        
sum(np.invert(asignment)[np.invert(categories)]) #number of true negatives

sum(asignment[categories]) #number of true positives 

In [9]:
sets = [2,3,4,5]
cadds = [5,10,15,20]

In [29]:
for s in sets:
    
    top_genes = skat.order_by('p_value').id.take(s)   
   
    for c in cadds:
        
        variants_controls = np.zeros((len(top_genes)))
        variants_gts = np.zeros((len(top_genes)))
    
        variants_controls_test = np.zeros((len(top_genes)))
        variants_gts_test = np.zeros((len(top_genes)))

        model_asignment = np.zeros((len(top_genes), 78))
        test_asignment = np.zeros((len(top_genes), 144)) 
    
        for rows, n in enumerate(top_genes):
            
            mt_skat_log = mt_for_skat.filter_rows(mt_for_skat.nearest_genes_20kb.contains(n)) # to ma gnomadów i heavy tics
            mt_test_skat_log = mt_test.filter_rows(mt_test.nearest_genes_20kb.contains(n)) # to ma rodziny

            mt_skat_log = mt_skat_log.filter_rows(mt_skat_log.cadd > c)
            mt_skat_log = mt_skat_log.annotate_cols(non_refs = hl.agg.count_where(mt_skat_log.GT.is_non_ref())) #count variants per sample of gnomads and heavy tics  
            non_refs = mt_skat_log.non_refs.collect()

            mt_test_skat_log = mt_test_skat_log.filter_rows(mt_test_skat_log.cadd > c)
            mt_test_skat_log = mt_test_skat_log.annotate_cols(non_refs = hl.agg.count_where(mt_test_skat_log.GT.is_non_ref())) #count variants per sample, prepare also test matrix
            non_refs_test = mt_test_skat_log.non_refs.collect()

            variants_gts[rows] = np.mean(np.array(non_refs)[categories])
            variants_controls[rows] = np.mean(np.array(non_refs)[np.invert(categories)])

            variants_gts_test[rows] = np.mean(np.array(non_refs_test)[categories_test])
            variants_controls_test[rows] = np.mean(np.array(non_refs_test)[np.invert(categories_test)])

            results = (non_refs - variants_controls[rows]) 
            results_test = (non_refs_test - variants_controls[rows]) 

            model_asignment[rows] = (results)
            test_asignment[rows] = (results_test)

        model_asignment = np.sum(model_asignment, axis = 0)
        test_asignment = np.sum(test_asignment, axis = 0)
            
        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/variants_gts'+str(s)+'cadd'+str(c), variants_gts)
        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/variants_controls'+str(s)+'cadd'+str(c), variants_controls)

        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/variants_gts_test'+str(s)+'cadd'+str(c), variants_gts_test)
        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/variants_controls_test'+str(s)+'cadd'+str(c), variants_controls_test)

        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/model_asignment'+str(s)+'cadd'+str(c), model_asignment)
        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/test_asignment'+str(s)+'cadd'+str(c), test_asignment)

        false_pos = []
        true_pos = []
            
        for x in np.linspace(-40,40,10000):
            false_pos.append(np.sum((test_asignment > x)[np.invert(categories_test)])/53)
            true_pos.append(np.sum((test_asignment > x)[categories_test])/91)
                
        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/false_pos'+str(s)+'cadd'+str(c), false_pos)
        np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/true_pos'+str(s)+'cadd'+str(c), true_pos)
            
        print(str(s)+'cadd'+str(c))
        print(variants_gts)
        print(variants_controls)

2cadd5
[ 5.58974359 10.79487179]
[7.33333333 8.82051282]
2cadd10
[1.8974359  3.82051282]
[2.28205128 3.35897436]
2cadd15
[1.17948718 0.02564103]
[1.51282051 0.        ]
2cadd20
[0.         0.02564103]
[0. 0.]
3cadd5
[ 5.58974359 10.79487179  4.30769231]
[7.33333333 8.82051282 3.41025641]
3cadd10
[1.8974359  3.82051282 0.28205128]
[2.28205128 3.35897436 0.15384615]
3cadd15
[1.17948718 0.02564103 0.28205128]
[1.51282051 0.         0.15384615]
3cadd20
[0.         0.02564103 0.        ]
[0. 0. 0.]
4cadd5
[ 5.58974359 10.79487179  4.30769231 20.92307692]
[ 7.33333333  8.82051282  3.41025641 23.51282051]
4cadd10
[1.8974359  3.82051282 0.28205128 5.61538462]
[2.28205128 3.35897436 0.15384615 6.61538462]
4cadd15
[1.17948718 0.02564103 0.28205128 1.94871795]
[1.51282051 0.         0.15384615 2.48717949]
4cadd20
[0.         0.02564103 0.         0.05128205]
[0.         0.         0.         0.15384615]
5cadd5
[ 5.58974359 10.79487179  4.30769231 20.92307692  9.41025641]
[ 7.33333333  8.82051282 

In [12]:
false_pos = []
true_pos = []

for s in sets:
        for c in cadds:
            false_pos.append(np.load('/net/archive/groups/plggneuromol/GTS-analysis/data/false_pos'+str(s)+'cadd'+str(c)+'.npy'))
            true_pos.append(np.load('/net/archive/groups/plggneuromol/GTS-analysis/data/true_pos'+str(s)+'cadd'+str(c)+'.npy'))

### ROC

In [13]:
colors = bokeh.palettes.Category20[16]

y = np.linspace(0,1,10)
x = np.linspace(0,1,10)

from bokeh.plotting import figure, output_notebook, show

output_notebook

p = figure(plot_width=800, plot_height=800)
p.line(x, y, line_width=4, line_color='lightgrey')

for i in range(16):
    p.line(false_pos[i], true_pos[i], line_width=4, alpha=0.5, line_color=colors[i])
    

p.line(false_pos[9], true_pos[9], line_width=4, alpha=1, line_color=colors[9])



p.xaxis.axis_label = 'false positives'
p.yaxis.axis_label = 'true positives'

p.yaxis.axis_label_text_font_size = "25px"
p.xaxis.axis_label_text_font_size = "25px"

show(p)

In [14]:
auc = []

for i in range(16):
    auc.append(-np.trapz(true_pos[i], false_pos[i]))

In [15]:
max(auc)

0.602633215840763

Get the false positive rate of models:

In [None]:
genes_background = mt_test.nearest_genes_20kb.collect()

In [None]:
genes_background_1 =  set([val for sublist in genes_background for val in sublist])

In [None]:
len(genes_background_1)

In [None]:
len(allgenes)

In [None]:
genes_background = [x for x in genes_background_1 if x in allgenes]

len(genes_background)

In [None]:
genes_background = np.array(genes_background)

In [None]:
#np.save('genes_background', genes_background)

In [14]:
genes_background = np.load('/net/archive/groups/plggneuromol/GTS-analysis/analysis/numpy/genes_background.npy')

In [27]:
sets = [2,3,4,5]
cadds = [5,10,15,20]

In [16]:
categories = mt_for_skat.category.collect()
categories_test = mt_test.category.collect()

In [None]:
for gene in range(0,1000):

    randoms = np.random.randint(len(genes_background), size=5)
    geneset = [j for i, j in enumerate(genes_background) if i in randoms]

    print(geneset)
    
    for s_idx, s in enumerate(sets):
        
        top_genes = geneset[0:s]   
        
        for c_idx, c in enumerate(cadds):

            test_asignment = np.zeros((len(top_genes), 144)) 
            
            print(str(c))
            print(str(s))
            print(top_genes)
                 
            for rows, n in enumerate(top_genes):

                mt_skat_log = mt_for_skat.filter_rows(mt_for_skat.nearest_genes_20kb.contains(n)) # to ma gnomadów i heavy tics
                mt_test_skat_log = mt_test.filter_rows(mt_test.nearest_genes_20kb.contains(n)) # to ma rodziny

                mt_skat_log = mt_skat_log.filter_rows(mt_skat_log.cadd > c)
                mt_skat_log = mt_skat_log.annotate_cols(non_refs = hl.agg.count_where(mt_skat_log.GT.is_non_ref())) #count variants per sample of gnomads and heavy tics  
                non_refs = mt_skat_log.non_refs.collect()
                
                print(non_refs)

                mt_test_skat_log = mt_test_skat_log.filter_rows(mt_test_skat_log.cadd > c)
                mt_test_skat_log = mt_test_skat_log.annotate_cols(non_refs = hl.agg.count_where(mt_test_skat_log.GT.is_non_ref())) #count variants per sample, prepare also test matrix
                non_refs_test = mt_test_skat_log.non_refs.collect()

                variants_controls = np.mean(np.array(non_refs)[np.invert(categories)])
                
                test_asignment[rows] = (non_refs_test - variants_controls)

        
            test_asignment = np.sum(test_asignment, axis = 0)
        
        
            false_pos = []
            true_pos = []
            
            for x in np.linspace(-40,40,10000):
            
                false_pos.append(np.sum((test_asignment > x)[np.invert(categories_test)])/53)
                true_pos.append(np.sum((test_asignment > x)[categories_test])/91)
            
        
            np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/auc-test/false_pos'+str(s)+'cadd'+str(c)+'gene'+str(gene), false_pos)
            np.save('/net/archive/groups/plggneuromol/GTS-analysis/data/auc-test/true_pos'+str(s)+'cadd'+str(c)+'gene'+str(gene), true_pos)
        
     
  
        print('I have completed iteration number: ' + str(gene))

    

['LRRC71', 'FYB1', 'DMTN', 'SRSF5', 'GINM1']
5
2
['LRRC71', 'FYB1']
[5, 7, 4, 17, 6, 20, 14, 27, 6, 17, 5, 19, 6, 4, 6, 7, 7, 7, 3, 6, 21, 15, 15, 6, 18, 20, 6, 16, 6, 19, 6, 7, 5, 14, 5, 7, 7, 19, 3, 12, 12, 11, 12, 13, 9, 14, 11, 8, 12, 13, 15, 11, 14, 8, 17, 14, 8, 12, 7, 13, 11, 10, 8, 8, 10, 13, 10, 15, 12, 13, 13, 11, 11, 8, 12, 13, 10, 9]
[31, 35, 29, 26, 42, 34, 43, 28, 42, 38, 30, 41, 40, 35, 34, 31, 38, 30, 27, 32, 0, 42, 33, 34, 29, 30, 34, 42, 24, 31, 36, 37, 29, 29, 45, 26, 35, 28, 37, 38, 32, 26, 37, 35, 37, 36, 30, 33, 38, 37, 36, 33, 31, 38, 36, 36, 35, 32, 38, 35, 44, 35, 34, 33, 35, 35, 31, 31, 34, 33, 34, 34, 37, 36, 32, 34, 30, 33]
10
2
['LRRC71', 'FYB1']
[2, 3, 1, 7, 3, 8, 5, 9, 2, 7, 2, 8, 2, 2, 2, 2, 2, 3, 1, 3, 7, 6, 5, 3, 7, 8, 2, 5, 3, 8, 2, 2, 2, 5, 2, 2, 3, 8, 1, 3, 6, 3, 4, 6, 2, 7, 2, 2, 5, 5, 7, 2, 5, 3, 6, 4, 4, 5, 3, 4, 5, 3, 3, 3, 4, 5, 4, 9, 5, 6, 5, 5, 5, 2, 4, 6, 5, 3]
[3, 4, 4, 2, 3, 4, 3, 3, 5, 5, 4, 5, 4, 2, 4, 2, 2, 3, 1, 4, 0, 7, 3, 4, 5, 2, 4,

## AUC

In [26]:
false_pos = []
true_pos = []

for s in sets:
        for c in cadds:
            for gene in range(0,4):
                false_pos.append(np.load('/net/archive/groups/plggneuromol/GTS-analysis/data/auc-test/false_pos'+str(s)+'cadd'+str(c)+'gene'+str(gene)+'.npy'))
                true_pos.append(np.load('/net/archive/groups/plggneuromol/GTS-analysis/data/auc-test/true_pos'+str(s)+'cadd'+str(c)+'gene'+str(gene)+'.npy'))

FileNotFoundError: [Errno 2] No such file or directory: '/net/archive/groups/plggneuromol/GTS-analysis/data/auc-test/false_pos2cadd5gene0.npy'

In [25]:
ls /net/archive/groups/plggneuromol/GTS-analysis/data/auc-test/ 

false_pos2cadd20gene0.npy  false_pos4cadd20gene3.npy  true_pos3cadd20gene1.npy
false_pos2cadd20gene1.npy  false_pos4cadd20gene4.npy  true_pos3cadd20gene2.npy
false_pos2cadd20gene2.npy  false_pos5cadd20gene0.npy  true_pos3cadd20gene3.npy
false_pos2cadd20gene3.npy  false_pos5cadd20gene1.npy  true_pos3cadd20gene4.npy
false_pos2cadd20gene4.npy  false_pos5cadd20gene2.npy  true_pos4cadd20gene0.npy
false_pos2cadd20gene5.npy  false_pos5cadd20gene3.npy  true_pos4cadd20gene1.npy
false_pos3cadd20gene0.npy  false_pos5cadd20gene4.npy  true_pos4cadd20gene2.npy
false_pos3cadd20gene1.npy  true_pos2cadd20gene0.npy   true_pos4cadd20gene3.npy
false_pos3cadd20gene2.npy  true_pos2cadd20gene1.npy   true_pos4cadd20gene4.npy
false_pos3cadd20gene3.npy  true_pos2cadd20gene2.npy   true_pos5cadd20gene0.npy
false_pos3cadd20gene4.npy  true_pos2cadd20gene3.npy   true_pos5cadd20gene1.npy
false_pos4cadd20gene0.npy  true_pos2cadd20gene4.npy   true_pos5cadd20gene2.npy
false_pos4cadd20gene1.npy  true_pos2cadd

In [41]:
aucs = []
for i in range(0,100):
    aucs.append(np.trapz(true_pos_test[i,], false_pos_test[i,]))

In [42]:
aucs = np.array(aucs)

In [44]:
np.percentile(-aucs, 95)

0.5940182459050383

In [45]:
test1 = np.array(([0,1,2,4,5], [0,2,2,3,5]))
test2 = np.array(([1,1,1,1,1], [2,2,2,2,2]))

In [46]:
y = np.linspace(0,1,10)
x = np.linspace(0,1,10)

In [89]:
p4 = figure(plot_width=800, plot_height=800)
p4.line(x, y, line_width=4, line_color='lightgrey')


for i in range(0,100):
    p4.line(false_pos_test[i,], true_pos_test[i,], line_width=1, alpha=0.25)
    
p4.line(false_pos, true_pos, line_width=4, line_color='orange')

p4.xaxis.axis_label = 'false positives'
p4.yaxis.axis_label = 'true positives'

p4.yaxis.axis_label_text_font_size = "25px"
p4.xaxis.axis_label_text_font_size = "25px"

# show the results
show(p4)

NameError: name 'x' is not defined

## investigate variants that went into the model

In [None]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/GTS-gnomad-sex.mt')

In [None]:
top_genes = ['HDC', 'CHADL', 'MAOA', 'NAA11']

In [95]:
mt = mt.filter_rows(hl.any(lambda x: hl.literal(top_genes).contains(x), mt.nearest_genes_20kb))

In [96]:
mt.count()

(1230, 370)

In [97]:
mt = mt.filter_rows(mt.cadd > 10)

In [98]:
mt.count()

(53, 370)

In [100]:
#mt.write('/net/archive/groups/plggneuromol/GTS-analysis/data/top-variants.mt')

2020-11-09 17:54:13 Hail: INFO: wrote matrix table with 53 rows and 370 columns in 6622 partitions to /net/archive/groups/plggneuromol/GTS-analysis/data/top-variants.mt


In [188]:
top = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/top-variants.mt')

In [177]:
top_genes

['HDC', 'CHADL', 'MAOA', 'NAA11']

In [68]:
top = top.annotate_rows(
                      all_gnomads_non_ref = hl.agg.filter((top.phenotypes.phenotype == 'gnomad'), hl.agg.count_where(top.GT.is_non_ref()))/185,
                      all_gnomads_hom_var = hl.agg.filter((top.phenotypes.phenotype == 'gnomad'), hl.agg.count_where(top.GT.is_hom_var()))/185,
                      controls_non_ref = hl.agg.filter((top.phenotypes.disease == 'NO'), hl.agg.count_where(top.GT.is_non_ref()))/53,
                      controls_hom_var = hl.agg.filter((top.phenotypes.disease == 'NO'), hl.agg.count_where(top.GT.is_hom_var()))/53,
                      gts_all_non_ref = hl.agg.filter((top.phenotypes.disease == 'YES'), hl.agg.count_where(top.GT.is_non_ref()))/130,
                      gts_all_hom_var = hl.agg.filter((top.phenotypes.disease == 'YES'), hl.agg.count_where(top.GT.is_hom_var()))/130)

# controls = 53 gts = 130 gnomad 185

In [73]:
top = top.rows()

In [75]:
top = top.to_pandas()

In [78]:
top.to_csv('/net/archive/groups/plggneuromol/GTS-analysis/data/top-variants.csv')

## Run SKAT on all genes

In [11]:
#reimport the genes table again

allgenes = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/analysis/gts_gene_lists/human-genes-with-GO-and-symbols') 
allgenes = allgenes.select('UniProtKB Gene Name symbol')

allgenes = allgenes['UniProtKB Gene Name symbol'].collect()

2020-11-13 10:02:18 Hail: INFO: Reading table with no type imputation
  Loading column 'Gene stable ID' as type 'str' (type not specified)
  Loading column 'UniProtKB Gene Name symbol' as type 'str' (type not specified)



In [45]:
skat_table_all, qq_plot_all, top_genes_all, false_pos_all, true_pos_all, p_all, auc_all = full_model(allgenes)

2020-11-12 12:50:48 Hail: INFO: hwe_normalized_pca: running PCA using 10045 variants.
2020-11-12 12:50:59 Hail: INFO: pca: running PCA with 10 components...
2020-11-12 12:54:24 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-11-12 12:57:01 Hail: INFO: Ordering unsorted dataset with network shuffle


id,size,q_stat,p_value,fault
str,int32,float64,float64,int32
"""AACS""",246,751.0,0.000981,0
"""ABCA7""",233,3730.0,3.36e-08,0
"""ADAMTSL4""",87,794.0,0.000969,0
"""ADCY5""",483,3210.0,3.11e-05,0
"""ADGRL4""",1088,5470.0,0.000834,0
"""AFTPH""",152,1510.0,0.00097,0
"""AGK""",153,590.0,0.000817,0
"""AIFM1""",68,2790.0,1.47e-05,0
"""AMIGO3""",78,684.0,0.000571,0
"""ANKRA2""",90,1070.0,0.00038,0


2020-11-12 12:59:37 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-11-12 12:59:53 Hail: INFO: Ordering unsorted dataset with network shuffle


2020-11-12 13:03:10 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-11-12 13:05:54 Hail: INFO: Ordering unsorted dataset with network shuffle


id,size,q_stat,p_value,fault
str,int32,float64,float64,int32
"""AACS""",246,751.0,0.000981,0
"""ABCA7""",233,3730.0,3.36e-08,0
"""ADAMTSL4""",87,794.0,0.000969,0
"""ADCY5""",483,3210.0,3.11e-05,0
"""ADGRL4""",1088,5470.0,0.000834,0
"""AFTPH""",152,1510.0,0.00097,0
"""AGK""",153,590.0,0.000817,0
"""AIFM1""",68,2790.0,1.47e-05,0
"""AMIGO3""",78,684.0,0.000571,0
"""ANKRA2""",90,1070.0,0.00038,0


2020-11-12 13:08:32 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-11-12 13:08:49 Hail: INFO: Ordering unsorted dataset with network shuffle


2020-11-12 13:12:02 Hail: INFO: Ordering unsorted dataset with network shuffle


In [47]:
auc_all

0.45998341281360156

## run model on other gene lists

- brain enriched
- other gene lists

In [48]:
genes_neuro = list(set([line.rstrip('\n') for line in open('/net/archive/groups/plggneuromol/GTS-analysis/analysis/gts_gene_lists/brain_enriched.txt')]))

In [49]:
len(genes_neuro)

488

In [50]:
skat_table_neuro, qq_plot_neuro, top_genes_neuro, false_pos_neuro, true_pos_neuro, p_neuro, auc_neuro = full_model(genes_neuro)

2020-11-12 14:18:07 Hail: INFO: hwe_normalized_pca: running PCA using 10045 variants.
2020-11-12 14:18:18 Hail: INFO: pca: running PCA with 10 components...
2020-11-12 14:19:50 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-11-12 14:20:48 Hail: INFO: Ordering unsorted dataset with network shuffle


id,size,q_stat,p_value,fault
str,int32,float64,float64,int32
"""CHRM5""",282,955.0,0.00153,0
"""GRM3""",399,2530.0,0.000965,0
"""KCNK4""",81,1320.0,0.000101,0
"""MTURN""",198,1140.0,0.00119,0
"""NEUROD1""",75,682.0,0.00146,0
"""PDZD4""",95,1200.0,0.000155,0
"""S100B""",103,654.0,7.3e-05,0
"""SCG3""",152,985.0,0.00105,0


2020-11-12 14:21:45 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-11-12 14:22:00 Hail: INFO: Coerced sorted dataset
2020-11-12 14:22:00 Hail: INFO: Coerced dataset with out-of-order partitions.


2020-11-12 14:22:47 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-11-12 14:23:39 Hail: INFO: Ordering unsorted dataset with network shuffle


id,size,q_stat,p_value,fault
str,int32,float64,float64,int32
"""CHRM5""",282,955.0,0.00153,0
"""GRM3""",399,2530.0,0.000965,0
"""KCNK4""",81,1320.0,0.000101,0
"""MTURN""",198,1140.0,0.00119,0
"""NEUROD1""",75,682.0,0.00146,0
"""PDZD4""",95,1200.0,0.000155,0
"""S100B""",103,654.0,7.3e-05,0
"""SCG3""",152,985.0,0.00105,0


2020-11-12 14:24:32 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-11-12 14:24:42 Hail: INFO: Coerced sorted dataset
2020-11-12 14:24:42 Hail: INFO: Coerced dataset with out-of-order partitions.


2020-11-12 14:25:25 Hail: INFO: Ordering unsorted dataset with network shuffle


In [51]:
auc_neuro

0.46174580136844284

In [138]:
new_gene_lists = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/analysis/gts_gene_lists/custom_lists_gts.csv')

2020-11-17 12:45:43 Hail: INFO: Reading table with no type imputation
  Loading column 'neurotranmitters' as type 'str' (type not specified)
  Loading column 'glutamate' as type 'str' (type not specified)
  Loading column 'serotonine' as type 'str' (type not specified)
  Loading column 'dop_ach' as type 'str' (type not specified)
  Loading column 'GTS_genes' as type 'str' (type not specified)
  Loading column 'synaptic_genes' as type 'str' (type not specified)
  Loading column 'tryptofane' as type 'str' (type not specified)
  Loading column 'receptors' as type 'str' (type not specified)
  Loading column 'calcium' as type 'str' (type not specified)
  Loading column 'androgenic_receptor' as type 'str' (type not specified)
  Loading column 'addictions' as type 'str' (type not specified)



In [139]:
tra = new_gene_lists['neurotranmitters'].collect()
glut = new_gene_lists['glutamate'].collect()
ser = new_gene_lists['serotonine'].collect()
dop = new_gene_lists['dop_ach'].collect()
gts = new_gene_lists['GTS_genes'].collect()
syn = new_gene_lists['synaptic_genes'].collect()
tryp = new_gene_lists['tryptofane'].collect()
rec = new_gene_lists['receptors'].collect()
ca = new_gene_lists['calcium'].collect()
andr = new_gene_lists['androgenic_receptor'].collect()
add = new_gene_lists['addictions'].collect()

In [52]:
gene_lists = [tra, glut, ser, dop, gts, syn, tryp, rec, ca, andr, add]
list_aucs = []

In [None]:
for gene in gene_lists:
    s, qq, top, fp, tp, p, auc = full_model(gene)
    list_aucs.append(auc)

In [17]:
#np.save('numpy/list_aucs', list_aucs)

In [55]:
list_aucs = np.load('numpy/list_aucs.npy')

In [62]:
list_aucs

array([0.49295045, 0.48227244, 0.50839726, 0.585113  , 0.52114866,
       0.50829359, 0.41986316, 0.51969728, 0.52840556])

In [180]:
s, qq, top, fp, tp, p, auc = full_model(dop)

2020-11-17 14:17:01 Hail: INFO: hwe_normalized_pca: running PCA using 10045 variants.
2020-11-17 14:17:05 Hail: INFO: pca: running PCA with 10 components...
2020-11-17 14:18:55 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-11-17 14:20:19 Hail: INFO: Ordering unsorted dataset with network shuffle


id,size,q_stat,p_value,fault
str,int32,float64,float64,int32
"""CHRM2""",422,1830.0,0.00128,0
"""CHRM5""",282,955.0,0.00153,0
"""HDC""",118,546.0,0.00179,0


2020-11-17 14:21:40 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-11-17 14:22:11 Hail: INFO: Coerced sorted dataset
2020-11-17 14:22:11 Hail: INFO: Coerced dataset with out-of-order partitions.


2020-11-17 14:23:05 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-11-17 14:24:55 Hail: INFO: Ordering unsorted dataset with network shuffle


id,size,q_stat,p_value,fault
str,int32,float64,float64,int32
"""CHRM2""",422,1830.0,0.00128,0
"""CHRM5""",282,955.0,0.00153,0
"""HDC""",118,546.0,0.00179,0


2020-11-17 14:26:51 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-11-17 14:27:38 Hail: INFO: Coerced sorted dataset
2020-11-17 14:27:38 Hail: INFO: Coerced dataset with out-of-order partitions.


2020-11-17 14:28:47 Hail: INFO: Ordering unsorted dataset with network shuffle


In [63]:
list_aucs = np.concatenate((list_aucs, list_aucs_2))

In [64]:
list_aucs

array([0.49295045, 0.48227244, 0.50839726, 0.585113  , 0.52114866,
       0.50829359, 0.41986316, 0.51969728, 0.52840556, 0.50642753,
       0.41011818])

In [185]:
s.order_by(s.p_value).show(10)

2020-11-17 14:53:00 Hail: INFO: Ordering unsorted dataset with network shuffle


id,size,q_stat,p_value,fault,label
str,int32,float64,float64,int32,bool
"""CHRM2""",422,1830.0,0.00128,0,False
"""CHRM5""",282,955.0,0.00153,0,False
"""HDC""",118,546.0,0.00179,0,False
"""SCAMP2""",92,858.0,0.00208,0,False
"""GNG2""",504,2050.0,0.00239,0,False
"""GAD1""",210,1380.0,0.00353,0,False
"""CHRM1""",56,305.0,0.00748,0,False
"""HRH2""",124,182.0,0.0119,0,False
"""MAOA""",95,696.0,0.0135,0,False
"""CHRNB3""",130,212.0,0.0178,0,False
