In [1]:
from src import pysetperm as psp
import numpy as np
import pandas as pd

In [14]:
# --- run settings ---
n_perms, cores = 30000, 6
# Gene definitions are widened by +-2kb through range_modification.
gene_def_plus = 2000
# Minimum size a candidate gene set must have.
min_size = 10

# --- shared inputs built from the settings above ---
annotations = psp.AnnotationSet(
    annotation_file='data/genes.txt',
    range_modification=gene_def_plus,
)
function_sets = psp.FunctionSets(
    function_set_file='data/vip.txt',
    min_set_size=min_size,
    annotation_obj=annotations,
)

In [3]:
# Population-specific inputs: candidate and background variant files.
e_candidates = psp.Variants(variant_file='data/eastern_candidates.txt')
e_background = psp.Variants(variant_file='data/eastern_background.txt.gz')

# Central has no background of its own here; it can use the eastern one.
c_candidates = psp.Variants(variant_file='data/central_candidates.txt')

i_candidates = psp.Variants(variant_file='data/internal_candidates.txt')
i_background = psp.Variants(variant_file='data/internal_background.txt.gz')

# annotate_variants is called for its effect on each object (its return
# value is not used), always against the same shared annotation set.
for _variants in (e_candidates, e_background, c_candidates, i_candidates, i_background):
    _variants.annotate_variants(annotation_obj=annotations)
del _variants  # keep the loop variable out of the notebook namespace

In [15]:
# Test object: eastern candidates against the eastern background.
e_test_obj = psp.TestObject(
    e_candidates, e_background, function_sets, n_cores=cores
)

In [16]:
# Test object: central candidates, reusing the eastern background.
c_test_obj = psp.TestObject(
    c_candidates, e_background, function_sets, n_cores=cores
)

In [17]:
# Test object: internal candidates against the internal background.
i_test_obj = psp.TestObject(
    i_candidates, i_background, function_sets, n_cores=cores
)

In [7]:
# One Permutation object per test object, built in the order e, c, i with the
# same n_perms / cores settings.
e_permutations, c_permutations, i_permutations = (
    psp.Permutation(population_test_obj, n_perms, cores)
    for population_test_obj in (e_test_obj, c_test_obj, i_test_obj)
)

In [19]:
# Per-set distributions across permutations: each population pairs its own
# permutations with its own test object; the function sets are shared.
e_per_set, c_per_set, i_per_set = (
    psp.SetPerPerm(population_perms, function_sets, population_test_obj, cores)
    for population_perms, population_test_obj in (
        (e_permutations, e_test_obj),
        (c_permutations, c_test_obj),
        (i_permutations, i_test_obj),
    )
)

In [9]:
# results tables
def make_results_table(test_obj, function_set_obj, set_perm_obj):
    """Build the per-function-set results table, sorted by enrichment p-value.

    Parameters
    ----------
    test_obj
        Object exposing ``n_candidate_per_function``.
    function_set_obj
        Object exposing ``function_sets`` (a DataFrame with ``Id`` and
        ``FunctionName`` columns) and ``function_array2d_ids`` (the ids
        whose rows are kept).
    set_perm_obj
        Object exposing ``mean_per_set``, ``p_enrichment``, ``p_depletion``
        and ``set_n_per_perm``.

    Returns
    -------
    pandas.DataFrame
        One row per retained ``Id`` with the columns ``Id``, ``FunctionName``,
        ``n_candidates``, ``mean_n_resample``, ``emp_p_e``, ``emp_p_d``,
        ``fdr_e``, ``fdr_d``, ``BH_fdr_e`` and ``BH_fdr_d``, sorted ascending
        by ``emp_p_e``. The row index from the grouped table is preserved.

    Notes
    -----
    The per-set values are attached by plain column assignment. This assumes
    they are ordered like the rows that survive the ``Id`` filter (groupby
    orders rows by ``Id``) -- TODO confirm against the psp objects.
    """
    out = function_set_obj.function_sets.groupby('Id', as_index=False).agg({'FunctionName': pd.Series.unique})
    # Take an explicit copy: the boolean filter returns a derived frame, and
    # adding columns to it otherwise triggers pandas' SettingWithCopyWarning.
    out = out[out['Id'].isin(function_set_obj.function_array2d_ids)].copy()
    out['n_candidates'] = test_obj.n_candidate_per_function
    out['mean_n_resample'] = set_perm_obj.mean_per_set
    out['emp_p_e'] = set_perm_obj.p_enrichment
    out['emp_p_d'] = set_perm_obj.p_depletion
    # Two FDR estimates per direction: one from the permutation matrix, and a
    # Benjamini-Hochberg style adjustment of the empirical p-values.
    out['fdr_e'] = psp.fdr_from_p_matrix(set_perm_obj.set_n_per_perm, out['emp_p_e'], method='enrichment')
    out['fdr_d'] = psp.fdr_from_p_matrix(set_perm_obj.set_n_per_perm, out['emp_p_d'], method='depletion')
    out['BH_fdr_e'] = psp.p_adjust_bh(out['emp_p_e'])
    out['BH_fdr_d'] = psp.p_adjust_bh(out['emp_p_d'])
    out = out.sort_values('emp_p_e')
    return out


In [23]:
# Results table per population (e, c, i), all against the shared function sets.
e_results, c_results, i_results = (
    make_results_table(population_test_obj, function_sets, population_per_set)
    for population_test_obj, population_per_set in (
        (e_test_obj, e_per_set),
        (c_test_obj, c_per_set),
        (i_test_obj, i_per_set),
    )
)

In [24]:
# Show the eastern table ordered by fdr_e (last expression -> cell output).
e_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,57,38.215933,0.001133,0.9994,0.0112,1.0,0.018133,0.9994
11,vip:0002012,IAV,51,40.711567,0.049498,0.965234,0.261533,1.0,0.278391,0.9994
1,vip:0002002,DENV,7,3.4562,0.052198,0.980834,0.261533,1.0,0.278391,0.9994
9,vip:0002010,HSV,16,11.067967,0.079297,0.953902,0.261533,1.0,0.317189,0.9994
6,vip:0002007,HCV,23,17.8635,0.117829,0.923203,0.272853,1.0,0.330732,0.9994
10,vip:0002011,HTLV,8,5.190033,0.130962,0.937535,0.277161,1.0,0.330732,0.9994
5,vip:0002006,HCMV,4,2.058867,0.144695,0.949935,0.277161,1.0,0.330732,0.9994
3,vip:0002004,EBV,42,38.0451,0.258725,0.792607,0.402371,1.0,0.517449,0.9994
0,vip:0002001,ADV,11,9.172033,0.305923,0.800173,0.429293,1.0,0.543863,0.9994
14,vip:0002015,VACV,4,2.992733,0.349555,0.828639,0.498083,1.0,0.559288,0.9994


In [25]:
# Show the central table ordered by fdr_e (last expression -> cell output).
c_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
1,vip:0002002,DENV,6,3.459933,0.125229,0.948868,1.0,1.0,0.985434,0.948868
4,vip:0002005,HBV,11,8.266667,0.19436,0.88827,1.0,1.0,0.985434,0.947488
0,vip:0002001,ADV,11,9.206467,0.309423,0.79564,1.0,1.0,0.985434,0.909303
6,vip:0002007,HCV,20,17.906233,0.334389,0.750575,1.0,1.0,0.985434,0.909303
7,vip:0002008,HIV,41,38.243133,0.342422,0.718809,1.0,1.0,0.985434,0.909303
10,vip:0002011,HTLV,6,5.178767,0.415386,0.750508,1.0,1.0,0.985434,0.909303
2,vip:0002003,EBOV,2,2.0505,0.618179,0.661445,1.0,1.0,0.985434,0.909303
13,vip:0002014,SV40,7,7.503433,0.640345,0.519416,1.0,1.0,0.985434,0.909303
9,vip:0002010,HSV,10,11.126267,0.689177,0.438019,1.0,1.0,0.985434,0.881637
8,vip:0002009,HPV,28,30.323567,0.70601,0.366688,1.0,1.0,0.985434,0.881637


In [26]:
# Show the internal table ordered by fdr_e (last expression -> cell output).
i_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
11,vip:0002012,IAV,66,39.339267,6.7e-05,1.0,0.000367,1.0,0.001067,1.0
7,vip:0002008,HIV,56,36.8718,0.0005,0.9998,0.002167,1.0,0.004,1.0
5,vip:0002006,HCMV,6,1.900667,0.008466,0.9984,0.0308,1.0,0.045154,1.0
1,vip:0002002,DENV,8,3.192133,0.011766,0.9966,0.034733,1.0,0.046718,1.0
6,vip:0002007,HCV,26,16.837367,0.0146,0.9919,0.034733,1.0,0.046718,1.0
2,vip:0002003,EBOV,5,1.8881,0.039065,0.990234,0.077322,1.0,0.091121,1.0
9,vip:0002010,HSV,17,10.945367,0.039865,0.980001,0.077322,1.0,0.091121,1.0
4,vip:0002005,HBV,12,7.879133,0.085297,0.956168,0.120308,1.0,0.164202,1.0
15,vip:0002016,Coronaviruses,4,1.7422,0.092364,0.974634,0.120308,1.0,0.164202,1.0
12,vip:0002013,KSHV,31,25.835233,0.153628,0.889137,0.19321,1.0,0.237325,1.0


In [27]:
# Joined objects: for every population combination, combine the test objects
# and the matching n-per-permutation objects.

# central + eastern
ce_test_obj = psp.TestObject.add_objects(c_test_obj, e_test_obj)
ce_per_set = psp.SetPerPerm.join_objects(c_per_set, e_per_set)

# central + internal
ci_test_obj = psp.TestObject.add_objects(c_test_obj, i_test_obj)
ci_per_set = psp.SetPerPerm.join_objects(c_per_set, i_per_set)

# eastern + internal
ei_test_obj = psp.TestObject.add_objects(e_test_obj, i_test_obj)
ei_per_set = psp.SetPerPerm.join_objects(e_per_set, i_per_set)

# all three: the central + eastern join extended with internal
cei_test_obj = psp.TestObject.add_objects(ce_test_obj, i_test_obj)
cei_per_set = psp.SetPerPerm.join_objects(ce_per_set, i_per_set)

In [28]:
# Results tables for the joined populations (ce, ci, ei, cei).
ce_results, ci_results, ei_results, cei_results = (
    make_results_table(joined_test_obj, function_sets, joined_per_set)
    for joined_test_obj, joined_per_set in (
        (ce_test_obj, ce_per_set),
        (ci_test_obj, ci_per_set),
        (ei_test_obj, ei_per_set),
        (cei_test_obj, cei_per_set),
    )
)

In [29]:
# Show the central + eastern table ordered by fdr_e.
ce_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,98,76.459067,0.004733,0.996667,0.051633,1.0,0.075731,0.996667
1,vip:0002002,DENV,13,6.916133,0.019899,0.991467,0.125067,1.0,0.159195,0.996667
6,vip:0002007,HCV,43,35.769733,0.113063,0.916736,0.4882,1.0,0.571714,0.996667
10,vip:0002011,HTLV,14,10.3688,0.142929,0.914503,0.4882,1.0,0.571714,0.996667
0,vip:0002001,ADV,22,18.3785,0.213926,0.850105,0.551993,1.0,0.585492,0.996667
9,vip:0002010,HSV,26,22.194233,0.219559,0.838739,0.551993,1.0,0.585492,0.996667
11,vip:0002012,IAV,87,81.562433,0.274091,0.764308,0.551993,1.0,0.626493,0.996667
4,vip:0002005,HBV,18,16.5219,0.381487,0.711143,0.650875,1.0,0.704214,0.996667
5,vip:0002006,HCMV,5,4.111833,0.39612,0.774608,0.658322,1.0,0.704214,0.996667
13,vip:0002014,SV40,15,14.983967,0.541782,0.568348,0.762547,1.0,0.795796,0.996667


In [30]:
# Show the central + internal table ordered by fdr_e.
ci_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,97,75.114933,0.004233,0.9971,0.0444,1.0,0.033066,0.997433
11,vip:0002012,IAV,102,80.190133,0.006,0.9958,0.0444,1.0,0.033066,0.997433
1,vip:0002002,DENV,14,6.652067,0.0062,0.997433,0.0444,1.0,0.033066,0.997433
6,vip:0002007,HCV,46,34.7436,0.026232,0.983167,0.078833,1.0,0.10493,0.997433
4,vip:0002005,HBV,23,16.1458,0.046998,0.971168,0.118407,1.0,0.150395,0.997433
5,vip:0002006,HCMV,7,3.953633,0.09433,0.961868,0.199839,1.0,0.224754,0.997433
2,vip:0002003,EBOV,7,3.9386,0.09833,0.958835,0.199839,1.0,0.224754,0.997433
0,vip:0002001,ADV,23,18.220767,0.143629,0.90357,0.236442,1.0,0.271932,0.997433
9,vip:0002010,HSV,27,22.071633,0.152962,0.893504,0.236442,1.0,0.271932,0.997433
15,vip:0002016,Coronaviruses,5,3.592733,0.28999,0.853472,0.401453,1.0,0.463985,0.997433


In [31]:
# Show the eastern + internal table ordered by fdr_e.
ei_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,113,75.087733,3.3e-05,1.0,0.0,1.0,0.000267,1.0
11,vip:0002012,IAV,117,80.050833,3.3e-05,1.0,0.0,1.0,0.000267,1.0
1,vip:0002002,DENV,15,6.648333,0.002233,0.9992,0.009111,1.0,0.011911,1.0
5,vip:0002006,HCMV,10,3.959533,0.004967,0.998767,0.01485,1.0,0.019866,1.0
6,vip:0002007,HCV,49,34.700867,0.007266,0.995733,0.017233,1.0,0.023253,1.0
9,vip:0002010,HSV,33,22.013333,0.009866,0.9946,0.019194,1.0,0.02631,1.0
2,vip:0002003,EBOV,7,3.9489,0.098097,0.958101,0.190514,1.0,0.215605,1.0
14,vip:0002015,VACV,9,5.927933,0.130662,0.934836,0.215383,1.0,0.215605,1.0
0,vip:0002001,ADV,23,18.186333,0.139362,0.907903,0.215383,1.0,0.215605,1.0
10,vip:0002011,HTLV,14,10.349767,0.141195,0.917769,0.215383,1.0,0.215605,1.0


In [32]:
# Show the central + eastern + internal table ordered by fdr_e.
cei_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,154,113.330867,0.0001,0.999933,0.000767,1.0,0.0016,0.999933
1,vip:0002002,DENV,21,10.108267,0.001067,0.999533,0.0069,1.0,0.006755,0.999933
11,vip:0002012,IAV,153,120.9017,0.001267,0.999133,0.0069,1.0,0.006755,0.999933
6,vip:0002007,HCV,69,52.6071,0.0106,0.993034,0.0322,1.0,0.042399,0.999933
5,vip:0002006,HCMV,11,6.0125,0.034799,0.984401,0.092293,1.0,0.111356,0.999933
9,vip:0002010,HSV,43,33.1396,0.043699,0.970634,0.097611,1.0,0.116529,0.999933
0,vip:0002001,ADV,34,27.3928,0.105796,0.924869,0.203,1.0,0.238179,0.999933
4,vip:0002005,HBV,30,24.401033,0.132462,0.907303,0.220096,1.0,0.238179,0.999933
10,vip:0002011,HTLV,20,15.528533,0.137295,0.912236,0.220096,1.0,0.238179,0.999933
2,vip:0002003,EBOV,9,5.9994,0.148862,0.921769,0.220096,1.0,0.238179,0.999933
