In [1]:
from src import pysetperm as psp
import numpy as np
import pandas as pd

In [2]:
# --- Run configuration -------------------------------------------------
# Permutation count; handed to psp.Permutation in later cells.
n_perms = 30000
# Core count; passed as n_cores= to psp.TestObject and positionally to
# psp.Permutation / psp.SetPerPerm in later cells.
cores = 6
# Gene definitions are widened by +-2 kb (forwarded as range_modification).
gene_def_plus = 2000
# Minimum size of the candidate gene set (forwarded as min_set_size).
min_size = 10

# --- Shared inputs -----------------------------------------------------
annotations = psp.AnnotationSet(
    annotation_file='data/genes.txt',
    range_modification=gene_def_plus,
)
function_sets = psp.FunctionSets(
    function_set_file='data/vip.txt',
    min_set_size=min_size,
    annotation_obj=annotations,
)

In [3]:
# Eastern inputs: candidate variants and background variants, each
# annotated against the shared gene annotations.
e_candidates = psp.Variants(
    variant_file='data/eastern-3.5e-05-candidate.snps.bed.gz',
)
e_candidates.annotate_variants(annotation_obj=annotations)

e_background = psp.Variants(
    variant_file='data/pbsnj-bg.snps.bed.gz',
)
e_background.annotate_variants(annotation_obj=annotations)

# Central inputs: candidates only -- the eastern background is reused.
c_candidates = psp.Variants(
    variant_file='data/central-4e-05-candidate.snps.bed.gz',
)
c_candidates.annotate_variants(annotation_obj=annotations)



In [24]:
# Ancestral inputs: candidate variants and their own background set,
# each annotated against the shared gene annotations.
i_candidates = psp.Variants(
    variant_file='data/ancestral-0.001-candidate.snps.bed.gz',
)
i_candidates.annotate_variants(annotation_obj=annotations)

i_background = psp.Variants(
    variant_file='data/ancestral-bg.bed.gz',
)
i_background.annotate_variants(annotation_obj=annotations)

In [5]:
# Display the variants held by the ancestral background object
# (loaded from 'data/ancestral-bg.bed.gz').
i_background.variants

+--------------+-----------+-----------+
| Chromosome   | Start     | End       |
| (category)   | (int32)   | (int32)   |
|--------------+-----------+-----------|
| 1            | 1235069   | 1235070   |
| 1            | 1238849   | 1238850   |
| 1            | 1252699   | 1252700   |
| 1            | 1278319   | 1278320   |
| ...          | ...       | ...       |
| 22           | 48777099  | 48777100  |
| 22           | 48779099  | 48779100  |
| 22           | 48780899  | 48780900  |
| 22           | 48784099  | 48784100  |
+--------------+-----------+-----------+
Unstranded PyRanges object has 295,645 rows and 3 columns from 23 chromosomes.
For printing, the PyRanges was sorted on Chromosome.

In [6]:
# Eastern test object: eastern candidates against the eastern background.
e_test_obj = psp.TestObject(
    e_candidates, e_background, function_sets, n_cores=cores
)

In [7]:
# Central test object: central candidates against the eastern background.
c_test_obj = psp.TestObject(
    c_candidates, e_background, function_sets, n_cores=cores
)

In [25]:
# Ancestral test object: ancestral candidates against the ancestral background.
i_test_obj = psp.TestObject(
    i_candidates, i_background, function_sets, n_cores=cores
)

In [9]:
# Eastern permutation object, built from its test object, n_perms and cores.
e_permutations = psp.Permutation(
    e_test_obj,
    n_perms,
    cores,
)

In [10]:
# Central permutation object, built from its test object, n_perms and cores.
c_permutations = psp.Permutation(
    c_test_obj,
    n_perms,
    cores,
)

In [26]:
# Ancestral permutation object, built from its test object, n_perms and cores.
i_permutations = psp.Permutation(
    i_test_obj,
    n_perms,
    cores,
)

In [27]:
# Distributions across permutations: one SetPerPerm object per candidate
# set, each built from its own permutations and test object.
e_per_set = psp.SetPerPerm(e_permutations, function_sets, e_test_obj, cores)
c_per_set = psp.SetPerPerm(c_permutations, function_sets, c_test_obj, cores)
i_per_set = psp.SetPerPerm(i_permutations, function_sets, i_test_obj, cores)

In [13]:
# results tables
def make_results_table(test_obj, function_set_obj, set_perm_obj):
    """Assemble the per-function-set results table for one test.

    Parameters
    ----------
    test_obj
        Object exposing ``n_candidate_per_function``.
    function_set_obj
        Object exposing ``function_sets`` (a table with ``Id`` and
        ``FunctionName`` columns) and ``function_array2d_ids`` (the Ids
        whose rows are kept).
    set_perm_obj
        Object exposing ``mean_per_set``, ``p_enrichment``,
        ``p_depletion`` and ``set_n_per_perm``.

    Returns
    -------
    pandas.DataFrame
        One row per retained ``Id`` with the columns ``FunctionName``,
        ``n_candidates``, ``mean_n_resample``, ``emp_p_e``, ``emp_p_d``,
        ``fdr_e``, ``fdr_d``, ``BH_fdr_e`` and ``BH_fdr_d``, sorted
        ascending by ``emp_p_e``.

    Notes
    -----
    The per-set values are attached to the rows as supplied; this
    presumably relies on them being ordered like the retained Ids --
    TODO confirm against pysetperm.
    """
    # One row per Id carrying its unique FunctionName value(s), then keep
    # only the Ids listed in function_array2d_ids.
    table = function_set_obj.function_sets.groupby('Id', as_index=False).agg(
        {'FunctionName': pd.Series.unique}
    )
    keep = table['Id'].isin(function_set_obj.function_array2d_ids)
    table = table[keep]

    # Observed counts, permutation means and empirical p-values.
    table['n_candidates'] = test_obj.n_candidate_per_function
    table['mean_n_resample'] = set_perm_obj.mean_per_set
    table['emp_p_e'] = set_perm_obj.p_enrichment
    table['emp_p_d'] = set_perm_obj.p_depletion

    # FDR columns derived from the per-permutation matrix, one per direction.
    for fdr_col, p_col, direction in (
        ('fdr_e', 'emp_p_e', 'enrichment'),
        ('fdr_d', 'emp_p_d', 'depletion'),
    ):
        table[fdr_col] = psp.fdr_from_p_matrix(
            set_perm_obj.set_n_per_perm, table[p_col], method=direction
        )

    # BH-adjusted columns computed from the same empirical p-values.
    for bh_col, p_col in (('BH_fdr_e', 'emp_p_e'), ('BH_fdr_d', 'emp_p_d')):
        table[bh_col] = psp.p_adjust_bh(table[p_col])

    return table.sort_values('emp_p_e')


In [28]:
# Build the eastern, central and ancestral results tables, in that order.
e_results, c_results, i_results = [
    make_results_table(test_obj, function_sets, per_set)
    for test_obj, per_set in (
        (e_test_obj, e_per_set),
        (c_test_obj, c_per_set),
        (i_test_obj, i_per_set),
    )
]

In [15]:
# Rank the eastern table by fdr_e. Many rows share the same fdr_e value and
# a single-column sort is not stable by default, so emp_p_e is given as an
# explicit secondary key to make the order among ties deterministic.
e_results.sort_values(by=['fdr_e', 'emp_p_e'])

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
6,vip:0002007,HCV,8,3.947367,0.043932,0.982067,0.437667,1.0,0.577794,0.982067
14,vip:0002015,VACV,2,0.640467,0.133429,0.975401,0.667183,1.0,0.577794,0.982067
7,vip:0002008,HIV,12,8.4111,0.136562,0.920203,0.667183,1.0,0.577794,0.982067
1,vip:0002002,DENV,2,0.750233,0.172061,0.961235,0.667183,1.0,0.577794,0.982067
11,vip:0002012,IAV,12,8.881733,0.180561,0.89057,0.667183,1.0,0.577794,0.982067
4,vip:0002005,HBV,3,1.886367,0.291857,0.886704,0.667183,1.0,0.714843,0.982067
10,vip:0002011,HTLV,2,1.159867,0.324656,0.894504,0.667183,1.0,0.714843,0.982067
2,vip:0002003,EBOV,1,0.439033,0.357421,0.929269,0.667183,1.0,0.714843,0.982067
9,vip:0002010,HSV,3,2.4334,0.441352,0.774108,0.667183,1.0,0.784626,0.982067
13,vip:0002014,SV40,2,1.642833,0.496117,0.777007,0.667183,1.0,0.793787,0.982067


In [16]:
# Rank the central table by fdr_e. Rows can share the same fdr_e value and
# a single-column sort is not stable by default, so emp_p_e is given as an
# explicit secondary key to make the order among ties deterministic.
c_results.sort_values(by=['fdr_e', 'emp_p_e'])

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,12,8.440833,0.135329,0.922269,1.0,1.0,1.0,0.926769
11,vip:0002012,IAV,12,8.9447,0.184661,0.88677,1.0,1.0,1.0,0.926769
10,vip:0002011,HTLV,2,1.157933,0.322056,0.893037,1.0,1.0,1.0,0.926769
2,vip:0002003,EBOV,1,0.4467,0.362221,0.926769,1.0,1.0,1.0,0.926769
14,vip:0002015,VACV,1,0.6546,0.482484,0.860538,1.0,1.0,1.0,0.926769
8,vip:0002009,HPV,7,6.832167,0.530782,0.626112,1.0,1.0,1.0,0.926769
1,vip:0002002,DENV,1,0.7595,0.536615,0.824239,1.0,1.0,1.0,0.926769
12,vip:0002013,KSHV,6,5.980467,0.556948,0.611513,1.0,1.0,1.0,0.926769
0,vip:0002001,ADV,2,2.021767,0.60348,0.667978,1.0,1.0,1.0,0.926769
6,vip:0002007,HCV,3,3.961967,0.764908,0.436552,1.0,1.0,1.0,0.926769


In [29]:
# Rank the ancestral table by fdr_e. Rows can share the same fdr_e value and
# a single-column sort is not stable by default, so emp_p_e is given as an
# explicit secondary key to make the order among ties deterministic.
i_results.sort_values(by=['fdr_e', 'emp_p_e'])

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
11,vip:0002012,IAV,22,8.5927,0.0001,0.999967,0.0003,1.0,0.0016,0.999967
7,vip:0002008,HIV,19,8.153667,0.0004,0.9999,0.00145,1.0,0.002933,0.999967
0,vip:0002001,ADV,8,1.943833,0.000667,0.999967,0.002156,1.0,0.002933,0.999967
2,vip:0002003,EBOV,4,0.4098,0.000733,0.999967,0.002156,1.0,0.002933,0.999967
9,vip:0002010,HSV,7,2.432767,0.009433,0.997867,0.018373,1.0,0.030186,0.999967
13,vip:0002014,SV40,5,1.630733,0.021533,0.994567,0.027422,1.0,0.05742,0.999967
6,vip:0002007,HCV,8,3.717633,0.032132,0.9882,0.045543,1.0,0.073445,0.999967
5,vip:0002006,HCMV,2,0.410633,0.063065,0.9928,0.069567,1.0,0.126129,0.999967
4,vip:0002005,HBV,4,1.792067,0.101697,0.969001,0.114148,1.0,0.180794,0.999967
8,vip:0002009,HPV,10,6.607033,0.123296,0.935635,0.121793,1.0,0.197273,0.999967


In [30]:
# Join objects: pairwise combinations first, then the three-way
# combination built from the central+eastern pair plus ancestral.

# Test objects
ce_test_obj = psp.TestObject.add_objects(c_test_obj, e_test_obj)
ci_test_obj = psp.TestObject.add_objects(c_test_obj, i_test_obj)
ei_test_obj = psp.TestObject.add_objects(e_test_obj, i_test_obj)
cei_test_obj = psp.TestObject.add_objects(ce_test_obj, i_test_obj)

# n-per-permutation objects
ce_per_set = psp.SetPerPerm.join_objects(c_per_set, e_per_set)
ci_per_set = psp.SetPerPerm.join_objects(c_per_set, i_per_set)
ei_per_set = psp.SetPerPerm.join_objects(e_per_set, i_per_set)
cei_per_set = psp.SetPerPerm.join_objects(ce_per_set, i_per_set)

In [33]:
# Joint results: one table per combined test object / per-set object pair.
ce_results = make_results_table(
    ce_test_obj, function_sets, ce_per_set
)
ci_results = make_results_table(
    ci_test_obj, function_sets, ci_per_set
)
ei_results = make_results_table(
    ei_test_obj, function_sets, ei_per_set
)
cei_results = make_results_table(
    cei_test_obj, function_sets, cei_per_set
)

In [34]:
# Rank the central+eastern table by fdr_e. Rows can share the same fdr_e
# value and a single-column sort is not stable by default, so emp_p_e is
# given as an explicit secondary key to make the order among ties deterministic.
ce_results.sort_values(by=['fdr_e', 'emp_p_e'])

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,24,16.851933,0.051398,0.967868,0.5243,1.0,0.503602,0.967868
11,vip:0002012,IAV,24,17.826433,0.087297,0.942335,0.5243,1.0,0.503602,0.967868
14,vip:0002015,VACV,3,1.295067,0.138429,0.959335,0.5243,1.0,0.503602,0.967868
6,vip:0002007,HCV,11,7.909333,0.169894,0.899503,0.5243,1.0,0.503602,0.967868
1,vip:0002002,DENV,3,1.509733,0.192294,0.935369,0.5243,1.0,0.503602,0.967868
10,vip:0002011,HTLV,4,2.3178,0.200993,0.918136,0.5243,1.0,0.503602,0.967868
2,vip:0002003,EBOV,2,0.885733,0.220326,0.941369,0.5243,1.0,0.503602,0.967868
4,vip:0002005,HBV,4,3.7941,0.535582,0.670544,0.807763,1.0,0.941884,0.967868
0,vip:0002001,ADV,4,4.0203,0.572248,0.624279,0.827478,1.0,0.941884,0.967868
8,vip:0002009,HPV,13,13.6095,0.606813,0.502417,0.83003,1.0,0.941884,0.967868


In [35]:
# Rank the central+ancestral table by fdr_e. Rows can share the same fdr_e
# value and a single-column sort is not stable by default, so emp_p_e is
# given as an explicit secondary key to make the order among ties deterministic.
ci_results.sort_values(by=['fdr_e', 'emp_p_e'])

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
11,vip:0002012,IAV,34,17.5374,0.0003,0.999933,0.0027,1.0,0.0048,0.999933
7,vip:0002008,HIV,31,16.5945,0.0008,0.999667,0.003483,1.0,0.0064,0.999933
2,vip:0002003,EBOV,5,0.8565,0.001367,0.9999,0.003911,1.0,0.007289,0.999933
0,vip:0002001,ADV,10,3.9656,0.006733,0.998067,0.0148,1.0,0.026932,0.999933
13,vip:0002014,SV40,6,3.289733,0.11043,0.953702,0.240353,1.0,0.309856,0.999933
9,vip:0002010,HSV,8,4.866867,0.116196,0.942935,0.240353,1.0,0.309856,0.999933
6,vip:0002007,HCV,11,7.6796,0.147762,0.915369,0.240353,1.0,0.332522,0.999933
1,vip:0002002,DENV,3,1.445,0.174594,0.942135,0.240353,1.0,0.332522,0.999933
8,vip:0002009,HPV,17,13.4392,0.189594,0.873238,0.240353,1.0,0.332522,0.999933
5,vip:0002006,HCMV,2,0.8487,0.207826,0.946235,0.254123,1.0,0.332522,0.999933


In [36]:
# Rank the eastern+ancestral table by fdr_e. Rows can share the same fdr_e
# value and a single-column sort is not stable by default, so emp_p_e is
# given as an explicit secondary key to make the order among ties deterministic.
ei_results.sort_values(by=['fdr_e', 'emp_p_e'])

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
11,vip:0002012,IAV,34,17.474433,0.0003,0.999833,0.002267,1.0,0.0048,0.999867
7,vip:0002008,HIV,31,16.564767,0.000767,0.999667,0.003017,1.0,0.006133,0.999867
2,vip:0002003,EBOV,5,0.848833,0.001967,0.999867,0.006644,1.0,0.010489,0.999867
6,vip:0002007,HCV,16,7.665,0.004533,0.998067,0.010692,1.0,0.018133,0.999867
0,vip:0002001,ADV,10,3.942367,0.0061,0.997833,0.011027,1.0,0.019519,0.999867
9,vip:0002010,HSV,10,4.866167,0.023566,0.9908,0.039506,1.0,0.062842,0.999867
13,vip:0002014,SV40,7,3.273567,0.044465,0.984701,0.064552,1.0,0.101635,0.999867
1,vip:0002002,DENV,4,1.435733,0.056165,0.984867,0.089371,1.0,0.11233,0.999867
4,vip:0002005,HBV,7,3.678433,0.071498,0.970501,0.089371,1.0,0.127107,0.999867
14,vip:0002015,VACV,3,1.270933,0.135129,0.960768,0.1412,1.0,0.216206,0.999867


In [37]:
# Rank the three-way joint table by fdr_e. Rows can share the same fdr_e
# value and a single-column sort is not stable by default, so emp_p_e is
# given as an explicit secondary key to make the order among ties deterministic.
cei_results.sort_values(by=['fdr_e', 'emp_p_e'])

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
11,vip:0002012,IAV,46,26.419133,0.0003,0.999833,0.002333,1.0,0.004267,0.999833
7,vip:0002008,HIV,43,25.0056,0.000533,0.9998,0.00235,1.0,0.004267,0.999833
2,vip:0002003,EBOV,6,1.295533,0.0018,0.9997,0.005989,1.0,0.0096,0.999833
0,vip:0002001,ADV,12,5.964133,0.018166,0.992534,0.0471,1.0,0.072664,0.999833
6,vip:0002007,HCV,19,11.626967,0.025566,0.985967,0.05566,1.0,0.081811,0.999833
1,vip:0002002,DENV,5,2.195233,0.070164,0.977034,0.137644,1.0,0.187105,0.999833
9,vip:0002010,HSV,11,7.300267,0.11493,0.938635,0.183767,1.0,0.230096,0.999833
13,vip:0002014,SV40,8,4.932567,0.119929,0.942569,0.183767,1.0,0.230096,0.999833
14,vip:0002015,VACV,4,1.925533,0.129429,0.955035,0.183767,1.0,0.230096,0.999833
4,vip:0002005,HBV,8,5.586167,0.19636,0.89587,0.251053,1.0,0.314176,0.999833
