In [1]:
from src import pysetperm as psp
import numpy as np
import pandas as pd

In [2]:
# Number of permutations, passed to every psp.Permutation call in this notebook.
n_perms = 30000
# Worker count, passed to psp.TestObject (n_cores), psp.Permutation and psp.SetPerPerm.
cores = 6
# +-2kb of gene definition: range_modification=2000
gene_def_plus=2000
# can set minimum size of the candidate gene set.
# (passed as min_set_size to psp.FunctionSets)
min_size=10
# Gene annotations read from data/genes.txt, with gene_def_plus applied as range_modification.
annotations = psp.AnnotationSet(annotation_file='data/genes.txt', range_modification=gene_def_plus)
# Function sets read from data/vip.txt and tied to the annotation object above.
function_sets = psp.FunctionSets(function_set_file='data/vip.txt', min_set_size=min_size, annotation_obj=annotations)

In [5]:
# specific inputs
# Eastern candidate SNPs and the background SNP set; each is annotated
# against `annotations` right after loading.
e_candidates = psp.Variants(variant_file='data/eastern-1.5e-05-candidate.snps.bed.gz')
e_candidates.annotate_variants(annotation_obj=annotations)
e_background = psp.Variants(variant_file='data/pbsnj-bg.snps.bed.gz')
e_background.annotate_variants(annotation_obj=annotations)

# central can use eastern background.
# (so no separate c_background is loaded; e_background is reused for the central test)
c_candidates = psp.Variants(variant_file='data/central-2e-05-candidate.snps.bed.gz')
c_candidates.annotate_variants(annotation_obj=annotations)



In [7]:
# Ancestral candidate variants and their own background set, both annotated
# against the shared `annotations` object.
i_candidates = psp.Variants(variant_file='data/ancestral-5e-04-candidate.snps.bed.gz')
i_candidates.annotate_variants(annotation_obj=annotations)
i_background = psp.Variants(variant_file='data/ancestral-bg.bed.gz')
i_background.annotate_variants(annotation_obj=annotations)

In [8]:
# Sanity check: display the ancestral background variant table.
i_background.variants

+--------------+-----------+-----------+
| Chromosome   | Start     | End       |
| (category)   | (int32)   | (int32)   |
|--------------+-----------+-----------|
| 1            | 1235069   | 1235070   |
| 1            | 1238849   | 1238850   |
| 1            | 1252699   | 1252700   |
| 1            | 1278319   | 1278320   |
| ...          | ...       | ...       |
| 22           | 48777099  | 48777100  |
| 22           | 48779099  | 48779100  |
| 22           | 48780899  | 48780900  |
| 22           | 48784099  | 48784100  |
+--------------+-----------+-----------+
Unstranded PyRanges object has 295,645 rows and 3 columns from 23 chromosomes.
For printing, the PyRanges was sorted on Chromosome.

In [9]:
# test objects
# Eastern test: eastern candidates against the shared background.
e_test_obj = psp.TestObject(e_candidates,
                            e_background,
                            function_sets,
                            n_cores=cores)

In [10]:
# Central test: central candidates against the eastern background (e_background),
# since no separate central background is loaded.
c_test_obj = psp.TestObject(c_candidates,
                            e_background,
                            function_sets,
                            n_cores=cores)

In [11]:
# Ancestral test: ancestral candidates against the ancestral background.
i_test_obj = psp.TestObject(i_candidates,
                            i_background,
                            function_sets,
                            n_cores=cores)

In [12]:
# Permutations for the eastern test (n_perms permutations, `cores` workers).
# NOTE(review): no random seed is set anywhere in this notebook -- confirm whether
# pysetperm allows one if exact reproducibility of the p-values is needed.
e_permutations = psp.Permutation(e_test_obj, n_perms, cores)

In [13]:
# Permutations for the central test (n_perms permutations, `cores` workers).
c_permutations = psp.Permutation(c_test_obj, n_perms, cores)

In [14]:
# Permutations for the ancestral test (n_perms permutations, `cores` workers).
i_permutations = psp.Permutation(i_test_obj, n_perms, cores)

In [15]:
# Per-function-set distributions across permutations, one object per test.
# The (permutation, test object) pairs are walked eastern -> central ->
# ancestral, so the SetPerPerm calls run in the same order as before.
e_per_set, c_per_set, i_per_set = [
    psp.SetPerPerm(perm_obj, function_sets, test_obj, cores)
    for perm_obj, test_obj in (
        (e_permutations, e_test_obj),
        (c_permutations, c_test_obj),
        (i_permutations, i_test_obj),
    )
]

In [16]:
# results tables
def make_results_table(test_obj, function_set_obj, set_perm_obj):
    """Assemble the per-function-set results table for one test.

    Parameters
    ----------
    test_obj
        Object exposing ``n_candidate_per_function``.
    function_set_obj
        Object exposing ``function_sets`` (grouped here by its ``Id`` column,
        collecting the unique ``FunctionName`` values) and
        ``function_array2d_ids`` (the Ids kept in the table).
    set_perm_obj
        Object exposing ``mean_per_set``, ``p_enrichment``, ``p_depletion``
        and ``set_n_per_perm``.

    Returns
    -------
    pandas.DataFrame
        One row per retained ``Id`` with the columns ``FunctionName``,
        ``n_candidates``, ``mean_n_resample``, ``emp_p_e``, ``emp_p_d``,
        ``fdr_e``, ``fdr_d``, ``BH_fdr_e`` and ``BH_fdr_d``, sorted by
        ``emp_p_e``.

    Notes
    -----
    The per-set vectors are attached as-is, so they are assumed to be in the
    same order as the filtered, Id-grouped rows -- TODO confirm against
    pysetperm.
    """
    names_by_id = function_set_obj.function_sets.groupby('Id', as_index=False).agg({'FunctionName': pd.Series.unique})
    keep = names_by_id['Id'].isin(function_set_obj.function_array2d_ids)
    # Take an explicit copy: adding columns to a boolean-filtered slice is the
    # chained-assignment pattern that raises SettingWithCopyWarning.
    out = names_by_id[keep].copy()
    out['n_candidates'] = test_obj.n_candidate_per_function
    out['mean_n_resample'] = set_perm_obj.mean_per_set
    out['emp_p_e'] = set_perm_obj.p_enrichment
    out['emp_p_d'] = set_perm_obj.p_depletion
    # FDR estimated from the permutation matrix itself ...
    out['fdr_e'] = psp.fdr_from_p_matrix(set_perm_obj.set_n_per_perm, out['emp_p_e'], method='enrichment')
    out['fdr_d'] = psp.fdr_from_p_matrix(set_perm_obj.set_n_per_perm, out['emp_p_d'], method='depletion')
    # ... alongside an adjustment of the same empirical p-values via psp.p_adjust_bh.
    out['BH_fdr_e'] = psp.p_adjust_bh(out['emp_p_e'])
    out['BH_fdr_d'] = psp.p_adjust_bh(out['emp_p_d'])
    return out.sort_values('emp_p_e')


In [17]:
# One results table per single-population test, built in
# eastern -> central -> ancestral order.
e_results, c_results, i_results = [
    make_results_table(test_obj, function_sets, per_set)
    for test_obj, per_set in (
        (e_test_obj, e_per_set),
        (c_test_obj, c_per_set),
        (i_test_obj, i_per_set),
    )
]

In [18]:
# Eastern results, ordered by the fdr_e column.
e_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
6,vip:0002007,HCV,4,2.052,0.148528,0.944769,1.0,1.0,0.821992,0.947302
13,vip:0002014,SV40,2,0.855667,0.210526,0.947302,1.0,1.0,0.821992,0.947302
4,vip:0002005,HBV,2,0.989567,0.258058,0.925336,1.0,1.0,0.821992,0.947302
7,vip:0002008,HIV,6,4.3702,0.274724,0.852305,1.0,1.0,0.821992,0.947302
3,vip:0002004,EBV,6,4.5107,0.297557,0.834339,1.0,1.0,0.821992,0.947302
11,vip:0002012,IAV,6,4.6249,0.318256,0.818039,1.0,1.0,0.821992,0.947302
9,vip:0002010,HSV,2,1.26,0.359621,0.868538,1.0,1.0,0.821992,0.947302
10,vip:0002011,HTLV,1,0.605167,0.459118,0.879137,1.0,1.0,0.918236,0.947302
0,vip:0002001,ADV,1,1.041933,0.652545,0.719076,1.0,1.0,1.0,0.947302
12,vip:0002013,KSHV,2,3.109567,0.824506,0.396153,1.0,1.0,1.0,0.947302


In [19]:
# Central results, ordered by the fdr_e column.
c_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,7,4.4751,0.160461,0.919469,1.0,1.0,1.0,0.941435
0,vip:0002001,ADV,2,1.055667,0.284557,0.911136,1.0,1.0,1.0,0.941435
1,vip:0002002,DENV,1,0.389233,0.323423,0.941435,1.0,1.0,1.0,0.941435
11,vip:0002012,IAV,6,4.683267,0.327022,0.809806,1.0,1.0,1.0,0.941435
10,vip:0002011,HTLV,1,0.613033,0.462018,0.874571,1.0,1.0,1.0,0.941435
8,vip:0002009,HPV,4,3.601867,0.48505,0.708343,1.0,1.0,1.0,0.941435
13,vip:0002014,SV40,1,0.8796,0.59158,0.782707,1.0,1.0,1.0,0.941435
12,vip:0002013,KSHV,3,3.1816,0.626846,0.60258,1.0,1.0,1.0,0.941435
4,vip:0002005,HBV,1,1.002233,0.642745,0.735542,1.0,1.0,1.0,0.941435
3,vip:0002004,EBV,4,4.5863,0.678777,0.51425,1.0,1.0,1.0,0.941435


In [20]:
# Ancestral results, ordered by the fdr_e column.
i_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
11,vip:0002012,IAV,14,4.4427,3.3e-05,1.0,0.0,1.0,0.000533,1.0
2,vip:0002003,EBOV,4,0.2056,6.7e-05,1.0,0.000117,1.0,0.000533,1.0
7,vip:0002008,HIV,10,4.210333,0.0092,0.9971,0.019644,1.0,0.049065,1.0
0,vip:0002001,ADV,4,1.0044,0.016766,0.996667,0.034825,1.0,0.067064,1.0
9,vip:0002010,HSV,4,1.2535,0.034199,0.9926,0.057193,1.0,0.101063,1.0
6,vip:0002007,HCV,5,1.9319,0.042965,0.9876,0.062628,1.0,0.101063,1.0
1,vip:0002002,DENV,2,0.3604,0.049732,0.9952,0.062628,1.0,0.101063,1.0
13,vip:0002014,SV40,3,0.848933,0.050532,0.990634,0.062628,1.0,0.101063,1.0
5,vip:0002006,HCMV,1,0.209167,0.19056,0.982501,0.20767,1.0,0.338774,1.0
14,vip:0002015,VACV,1,0.3289,0.282924,0.958868,0.299063,1.0,0.452678,1.0


In [21]:
# join objects
# test objs
# Pairwise combinations first; the three-way object is built from the
# central+eastern pair, so ce_test_obj must exist before cei_test_obj.
ce_test_obj = psp.TestObject.add_objects(c_test_obj,e_test_obj)
ci_test_obj = psp.TestObject.add_objects(c_test_obj,i_test_obj)
ei_test_obj = psp.TestObject.add_objects(e_test_obj,i_test_obj)
cei_test_obj = psp.TestObject.add_objects(ce_test_obj,i_test_obj)

# n per permutation objs
# Same pairing scheme as above; cei_per_set likewise depends on ce_per_set.
ce_per_set=psp.SetPerPerm.join_objects(c_per_set,e_per_set)
ci_per_set=psp.SetPerPerm.join_objects(c_per_set,i_per_set)
ei_per_set=psp.SetPerPerm.join_objects(e_per_set,i_per_set)
cei_per_set=psp.SetPerPerm.join_objects(ce_per_set,i_per_set)

In [22]:
# Results tables for the combined tests, built in ce -> ci -> ei -> cei order.
ce_results, ci_results, ei_results, cei_results = [
    make_results_table(joint_test_obj, function_sets, joint_per_set)
    for joint_test_obj, joint_per_set in (
        (ce_test_obj, ce_per_set),
        (ci_test_obj, ci_per_set),
        (ei_test_obj, ei_per_set),
        (cei_test_obj, cei_per_set),
    )
]

In [23]:
# Central + eastern joint results, ordered by the fdr_e column.
ce_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,13,8.8453,0.108763,0.938969,1.0,1.0,0.841927,0.938969
11,vip:0002012,IAV,12,9.308167,0.224559,0.857938,1.0,1.0,0.841927,0.938969
13,vip:0002014,SV40,3,1.735267,0.249925,0.90367,1.0,1.0,0.841927,0.938969
4,vip:0002005,HBV,3,1.9918,0.321923,0.863305,1.0,1.0,0.841927,0.938969
10,vip:0002011,HTLV,2,1.2182,0.348455,0.877571,1.0,1.0,0.841927,0.938969
0,vip:0002001,ADV,3,2.0976,0.348722,0.839939,1.0,1.0,0.841927,0.938969
6,vip:0002007,HCV,5,4.135167,0.395587,0.765541,1.0,1.0,0.841927,0.938969
3,vip:0002004,EBV,10,9.097,0.423753,0.69581,1.0,1.0,0.841927,0.938969
9,vip:0002010,HSV,3,2.555833,0.473584,0.745508,1.0,1.0,0.841927,0.938969
1,vip:0002002,DENV,1,0.7756,0.540949,0.817339,1.0,1.0,0.865518,0.938969


In [24]:
# Central + ancestral joint results, ordered by the fdr_e column.
ci_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
11,vip:0002012,IAV,20,9.125967,0.0009,0.999733,0.0088,1.0,0.007466,0.999967
2,vip:0002003,EBOV,4,0.440667,0.000933,0.999967,0.0088,1.0,0.007466,0.999967
7,vip:0002008,HIV,17,8.685433,0.006133,0.997333,0.017244,1.0,0.03271,0.999967
0,vip:0002001,ADV,6,2.060067,0.017233,0.9959,0.038592,1.0,0.068931,0.999967
1,vip:0002002,DENV,3,0.749633,0.040365,0.9941,0.07082,1.0,0.129169,0.999967
13,vip:0002014,SV40,4,1.728533,0.09293,0.971401,0.147828,1.0,0.247814,0.999967
9,vip:0002010,HSV,5,2.549333,0.113063,0.957701,0.164781,1.0,0.258429,0.999967
6,vip:0002007,HCV,6,4.015067,0.214593,0.891437,0.279575,1.0,0.429186,0.999967
10,vip:0002011,HTLV,2,1.214033,0.343989,0.880804,0.434985,1.0,0.574061,0.999967
5,vip:0002006,HCMV,1,0.439567,0.358788,0.929869,0.448743,1.0,0.574061,0.999967


In [25]:
# Eastern + ancestral joint results, ordered by the fdr_e column.
ei_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
11,vip:0002012,IAV,20,9.0676,0.000767,0.999833,0.0065,1.0,0.007466,1.0
2,vip:0002003,EBOV,4,0.4277,0.000933,1.0,0.0065,1.0,0.007466,1.0
7,vip:0002008,HIV,16,8.580533,0.010933,0.994967,0.0392,1.0,0.058309,1.0
6,vip:0002007,HCV,9,3.9839,0.018766,0.993734,0.042117,1.0,0.075064,1.0
13,vip:0002014,SV40,5,1.7046,0.026266,0.992967,0.047333,1.0,0.084051,1.0
9,vip:0002010,HSV,6,2.5135,0.040665,0.988167,0.063594,1.0,0.108441,1.0
0,vip:0002001,ADV,5,2.046333,0.053098,0.983701,0.078962,1.0,0.121367,1.0
1,vip:0002002,DENV,2,0.746767,0.170894,0.961435,0.2318,1.0,0.341789,1.0
3,vip:0002004,EBV,11,8.959,0.286124,0.812506,0.319626,1.0,0.481157,1.0
4,vip:0002005,HBV,3,1.9221,0.300723,0.875304,0.319626,1.0,0.481157,1.0


In [26]:
# Central + eastern + ancestral joint results, ordered by the fdr_e column.
cei_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
11,vip:0002012,IAV,26,13.750867,0.001167,0.9996,0.011267,1.0,0.018666,0.9996
2,vip:0002003,EBOV,4,0.662767,0.0044,0.9994,0.02255,1.0,0.029155,0.9996
7,vip:0002008,HIV,23,13.055633,0.005466,0.9972,0.02255,1.0,0.029155,0.9996
0,vip:0002001,ADV,7,3.102,0.037132,0.987167,0.099658,1.0,0.144315,0.9996
13,vip:0002014,SV40,6,2.5842,0.045098,0.985867,0.105173,1.0,0.144315,0.9996
6,vip:0002007,HCV,10,6.067067,0.08583,0.956868,0.133383,1.0,0.203422,0.9996
9,vip:0002010,HSV,7,3.809333,0.088997,0.962535,0.133383,1.0,0.203422,0.9996
1,vip:0002002,DENV,3,1.136,0.105263,0.972134,0.135983,1.0,0.210526,0.9996
10,vip:0002011,HTLV,3,1.8192,0.275724,0.893637,0.363489,1.0,0.490176,0.9996
4,vip:0002005,HBV,4,2.924333,0.336089,0.832739,0.405833,1.0,0.537742,0.9996
