In [1]:
from src import pysetperm as psp
import numpy as np
import pandas as pd

In [2]:
# Run configuration shared by every test in this notebook.
n_perms = 30000  # handed to psp.Permutation for each test object
cores = 6  # forwarded as n_cores to psp.TestObject and positionally to Permutation/SetPerPerm
# +-2kb of gene definition: range_modification=2000
gene_def_plus = 2000
# Forwarded as min_set_size when the function sets are built.
min_size = 10

annotations = psp.AnnotationSet(
    annotation_file='data/genes.txt',
    range_modification=gene_def_plus,
)
function_sets = psp.FunctionSets(
    function_set_file='data/vip.txt',
    min_set_size=min_size,
    annotation_obj=annotations,
)

In [3]:
# Population-specific inputs: eastern candidates plus the shared background,
# each annotated against `annotations` right after loading.
e_candidates = psp.Variants(
    variant_file='data/eastern-0.000228-candidate.snps.bed.gz'
)
e_candidates.annotate_variants(annotation_obj=annotations)

e_background = psp.Variants(
    variant_file='data/pbsnj-bg.snps.bed.gz'
)
e_background.annotate_variants(annotation_obj=annotations)

# Central has no background of its own here; it is tested against e_background.
c_candidates = psp.Variants(
    variant_file='data/central-0.000192-candidate.snps.bed.gz'
)
c_candidates.annotate_variants(annotation_obj=annotations)



In [17]:
# Ancestral inputs: candidates and a dedicated background, both annotated
# against the same `annotations` object as the other populations.
i_candidates = psp.Variants(
    variant_file='data/ancestral-0.005-candidate.snps.bed.gz'
)
i_candidates.annotate_variants(annotation_obj=annotations)

i_background = psp.Variants(
    variant_file='data/ancestral-bg.bed.gz'
)
i_background.annotate_variants(annotation_obj=annotations)

In [18]:
# Inspect the `variants` attribute of the ancestral background after annotation
# (the saved output renders as a PyRanges table).
i_background.variants

+--------------+-----------+-----------+
| Chromosome   | Start     | End       |
| (category)   | (int32)   | (int32)   |
|--------------+-----------+-----------|
| 1            | 1235069   | 1235070   |
| 1            | 1238849   | 1238850   |
| 1            | 1252699   | 1252700   |
| 1            | 1278319   | 1278320   |
| ...          | ...       | ...       |
| 22           | 48777099  | 48777100  |
| 22           | 48779099  | 48779100  |
| 22           | 48780899  | 48780900  |
| 22           | 48784099  | 48784100  |
+--------------+-----------+-----------+
Unstranded PyRanges object has 295,645 rows and 3 columns from 23 chromosomes.
For printing, the PyRanges was sorted on Chromosome.

In [4]:
# Test objects.
# Eastern: eastern candidates against the eastern background.
e_test_obj = psp.TestObject(
    e_candidates, e_background, function_sets, n_cores=cores
)

In [5]:
# Central: central candidates, deliberately paired with the eastern background.
c_test_obj = psp.TestObject(
    c_candidates, e_background, function_sets, n_cores=cores
)

In [19]:
# Ancestral: ancestral candidates against the ancestral background.
i_test_obj = psp.TestObject(
    i_candidates, i_background, function_sets, n_cores=cores
)

In [7]:
# Build the permutation objects for the eastern and central tests, each with
# n_perms permutations and `cores` passed as the last argument.
# NOTE(review): the saved output of this cell is an AttributeError
# ('TestObject' object has no attribute 'n_candidates'), yet later cells show
# results derived from e_permutations / c_permutations. The saved outputs appear
# to come from different runs - confirm with Restart Kernel -> Run All.
e_permutations = psp.Permutation(e_test_obj, n_perms, cores)
c_permutations = psp.Permutation(c_test_obj, n_perms, cores)


AttributeError: 'TestObject' object has no attribute 'n_candidates'

In [20]:
# Permutation object for the ancestral test, built with the same n_perms and
# cores settings as the eastern and central ones.
i_permutations = psp.Permutation(i_test_obj, n_perms, cores)

In [21]:
# Per-set distributions across permutations, one object per test.
# Kept as three separate statements so that a failure in one leaves the
# earlier assignments in place.
e_per_set = psp.SetPerPerm(e_permutations, function_sets, e_test_obj, cores)
c_per_set = psp.SetPerPerm(c_permutations, function_sets, c_test_obj, cores)
i_per_set = psp.SetPerPerm(i_permutations, function_sets, i_test_obj, cores)

In [22]:
# results tables
def make_results_table(test_obj, function_set_obj, set_perm_obj):
    """Assemble the per-function-set results table for one test.

    Parameters
    ----------
    test_obj
        Object exposing ``n_candidate_per_function``.
    function_set_obj
        Object exposing ``function_sets`` (a DataFrame with ``Id`` and
        ``FunctionName`` columns) and ``function_array2d_ids`` (the Ids to keep).
    set_perm_obj
        Object exposing ``mean_per_set``, ``p_enrichment``, ``p_depletion`` and
        ``set_n_per_perm``.

    Returns
    -------
    pandas.DataFrame
        One row per retained ``Id`` with columns ``Id``, ``FunctionName``,
        ``n_candidates``, ``mean_n_resample``, ``emp_p_e``, ``emp_p_d``,
        ``fdr_e``, ``fdr_d``, ``BH_fdr_e`` and ``BH_fdr_d``, sorted ascending by
        ``emp_p_e``. The index of the grouped frame is preserved.

    Notes
    -----
    The per-set vectors are assigned as whole columns, so they must line up
    with the retained rows. NOTE(review): this assumes they are ordered like
    the Id-sorted groupby output - confirm against the psp objects.
    """
    # One row per Id; groupby sorts the keys, FunctionName collapses to its unique values.
    names_by_id = function_set_obj.function_sets.groupby('Id', as_index=False).agg({'FunctionName': pd.Series.unique})
    keep = names_by_id['Id'].isin(function_set_obj.function_array2d_ids)
    # Take an explicit copy: assigning columns onto a boolean-filtered slice
    # raises SettingWithCopyWarning on pandas without Copy-on-Write.
    out = names_by_id[keep].copy()
    out['n_candidates'] = test_obj.n_candidate_per_function
    out['mean_n_resample'] = set_perm_obj.mean_per_set
    out['emp_p_e'] = set_perm_obj.p_enrichment
    out['emp_p_d'] = set_perm_obj.p_depletion
    # FDR computed by psp from the per-permutation matrix.
    out['fdr_e'] = psp.fdr_from_p_matrix(set_perm_obj.set_n_per_perm, out['emp_p_e'], method='enrichment')
    out['fdr_d'] = psp.fdr_from_p_matrix(set_perm_obj.set_n_per_perm, out['emp_p_d'], method='depletion')
    # Adjustment of the empirical p-values via psp.p_adjust_bh, kept alongside for comparison.
    out['BH_fdr_e'] = psp.p_adjust_bh(out['emp_p_e'])
    out['BH_fdr_d'] = psp.p_adjust_bh(out['emp_p_d'])
    return out.sort_values('emp_p_e')


In [23]:
# One results table per individual test (eastern, central, ancestral).
e_results = make_results_table(
    test_obj=e_test_obj, function_set_obj=function_sets, set_perm_obj=e_per_set
)
c_results = make_results_table(
    test_obj=c_test_obj, function_set_obj=function_sets, set_perm_obj=c_per_set
)
i_results = make_results_table(
    test_obj=i_test_obj, function_set_obj=function_sets, set_perm_obj=i_per_set
)

In [24]:
# Eastern results, ordered by the enrichment FDR column.
e_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,55,35.158733,0.000367,0.999833,0.0032,1.0,0.005866,0.999833
1,vip:0002002,DENV,7,3.183967,0.034866,0.9886,0.185133,1.0,0.176927,0.999833
11,vip:0002012,IAV,48,37.436733,0.039565,0.971901,0.185133,1.0,0.176927,0.999833
9,vip:0002010,HSV,16,10.2015,0.044232,0.976667,0.185133,1.0,0.176927,0.999833
6,vip:0002007,HCV,23,16.464033,0.058831,0.965001,0.185133,1.0,0.18826,0.999833
5,vip:0002006,HCMV,4,1.883,0.111963,0.964901,0.224517,1.0,0.298568,0.999833
10,vip:0002011,HTLV,7,4.7801,0.189394,0.905603,0.31961,1.0,0.4329,0.999833
0,vip:0002001,ADV,11,8.4365,0.221259,0.865938,0.353029,1.0,0.442519,0.999833
3,vip:0002004,EBV,39,35.121033,0.254392,0.79974,0.359889,1.0,0.452252,0.999833
14,vip:0002015,VACV,4,2.747967,0.296723,0.865738,0.390883,1.0,0.474758,0.999833


In [25]:
# Central results, ordered by the enrichment FDR column.
c_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
4,vip:0002005,HBV,11,7.694933,0.137995,0.926902,1.0,0.963062,0.982167,0.926902
1,vip:0002002,DENV,5,3.2184,0.214193,0.905403,1.0,0.963062,0.982167,0.926902
0,vip:0002001,ADV,10,8.571,0.355821,0.768574,1.0,0.963062,0.982167,0.878371
6,vip:0002007,HCV,18,16.643633,0.396653,0.697677,1.0,0.963062,0.982167,0.860197
7,vip:0002008,HIV,37,35.6146,0.429252,0.641812,1.0,0.963062,0.982167,0.860197
10,vip:0002011,HTLV,5,4.843767,0.545715,0.646745,1.0,0.963062,0.982167,0.860197
13,vip:0002014,SV40,7,6.979,0.560481,0.60398,1.0,0.963062,0.982167,0.860197
2,vip:0002003,EBOV,2,1.916733,0.579814,0.69891,1.0,0.963062,0.982167,0.860197
9,vip:0002010,HSV,10,10.3427,0.597413,0.536015,1.0,0.963062,0.982167,0.860197
11,vip:0002012,IAV,34,37.962833,0.782107,0.276757,1.0,0.903956,0.982167,0.860197


In [26]:
# Ancestral results, ordered by the enrichment FDR column.
i_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
11,vip:0002012,IAV,63,36.887033,6.7e-05,0.999967,0.0004,1.0,0.001067,0.999967
7,vip:0002008,HIV,53,34.665,0.000833,0.9995,0.00375,1.0,0.006666,0.999967
5,vip:0002006,HCMV,6,1.776067,0.0056,0.998967,0.020911,1.0,0.029866,0.999967
1,vip:0002002,DENV,8,2.999267,0.007966,0.9979,0.023633,1.0,0.031866,0.999967
6,vip:0002007,HCV,25,15.8268,0.013,0.9933,0.025107,1.0,0.041599,0.999967
9,vip:0002010,HSV,17,10.283867,0.023199,0.988367,0.041656,1.0,0.061865,0.999967
2,vip:0002003,EBOV,5,1.783467,0.028999,0.9933,0.048295,1.0,0.066284,0.999967
4,vip:0002005,HBV,12,7.424567,0.059598,0.970501,0.083379,1.0,0.119196,0.999967
8,vip:0002009,HPV,32,27.507767,0.19716,0.855671,0.254356,1.0,0.285102,0.999967
3,vip:0002004,EBV,40,35.187733,0.20396,0.844205,0.254356,1.0,0.285102,0.999967


In [27]:
# Joint tests: combine the individual objects pairwise, then all three.
# Test objects (the three-way object is built from the central+eastern pair plus ancestral).
ce_test_obj = psp.TestObject.add_objects(c_test_obj, e_test_obj)
ci_test_obj = psp.TestObject.add_objects(c_test_obj, i_test_obj)
ei_test_obj = psp.TestObject.add_objects(e_test_obj, i_test_obj)
cei_test_obj = psp.TestObject.add_objects(ce_test_obj, i_test_obj)

# Per-permutation set counts, joined in the same pairings as the test objects.
ce_per_set = psp.SetPerPerm.join_objects(c_per_set, e_per_set)
ci_per_set = psp.SetPerPerm.join_objects(c_per_set, i_per_set)
ei_per_set = psp.SetPerPerm.join_objects(e_per_set, i_per_set)
cei_per_set = psp.SetPerPerm.join_objects(ce_per_set, i_per_set)

In [28]:
# Results tables for the joint tests, built from the joined objects.
ce_results = make_results_table(
    test_obj=ce_test_obj, function_set_obj=function_sets, set_perm_obj=ce_per_set
)
ci_results = make_results_table(
    test_obj=ci_test_obj, function_set_obj=function_sets, set_perm_obj=ci_per_set
)
ei_results = make_results_table(
    test_obj=ei_test_obj, function_set_obj=function_sets, set_perm_obj=ei_per_set
)
cei_results = make_results_table(
    test_obj=cei_test_obj, function_set_obj=function_sets, set_perm_obj=cei_per_set
)

In [29]:
# Central + eastern joint results, ordered by the enrichment FDR column.
ce_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,92,70.773333,0.004467,0.996833,0.052333,1.0,0.071464,0.996833
1,vip:0002002,DENV,12,6.402367,0.023699,0.990167,0.1492,1.0,0.189594,0.996833
6,vip:0002007,HCV,41,33.107667,0.08753,0.937735,0.382611,1.0,0.466829,0.996833
9,vip:0002010,HSV,26,20.5442,0.121463,0.917203,0.387475,1.0,0.48585,0.996833
0,vip:0002001,ADV,21,17.0075,0.183594,0.873438,0.504267,1.0,0.526516,0.996833
11,vip:0002012,IAV,82,75.399567,0.218926,0.815573,0.504267,1.0,0.526516,0.996833
10,vip:0002011,HTLV,12,9.623867,0.247525,0.842372,0.504267,1.0,0.526516,0.996833
4,vip:0002005,HBV,18,15.304967,0.263258,0.817039,0.504267,1.0,0.526516,0.996833
5,vip:0002006,HCMV,5,3.797467,0.328189,0.825139,0.514985,1.0,0.583447,0.996833
13,vip:0002014,SV40,15,13.879833,0.415486,0.69161,0.57282,1.0,0.664778,0.996833


In [30]:
# Central + ancestral joint results, ordered by the enrichment FDR column.
ci_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
11,vip:0002012,IAV,97,74.849867,0.0037,0.9972,0.045033,1.0,0.04231,0.9972
7,vip:0002008,HIV,90,70.2796,0.007733,0.995067,0.045033,1.0,0.04231,0.9972
1,vip:0002002,DENV,13,6.217667,0.007933,0.9967,0.045033,1.0,0.04231,0.9972
4,vip:0002005,HBV,23,15.1195,0.024033,0.987434,0.071608,1.0,0.09613,0.9972
6,vip:0002007,HCV,43,32.470433,0.033132,0.977967,0.084153,1.0,0.106023,0.9972
5,vip:0002006,HCMV,7,3.690533,0.068198,0.974001,0.138489,1.0,0.170737,0.9972
2,vip:0002003,EBOV,7,3.7002,0.074698,0.969668,0.138489,1.0,0.170737,0.9972
9,vip:0002010,HSV,27,20.626567,0.08653,0.944369,0.138489,1.0,0.173061,0.9972
0,vip:0002001,ADV,21,16.934067,0.175594,0.881204,0.269063,1.0,0.312167,0.9972
13,vip:0002014,SV40,15,13.827233,0.409386,0.69601,0.558493,1.0,0.605135,0.9972


In [31]:
# Eastern + ancestral joint results, ordered by the enrichment FDR column.
ei_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,108,69.823733,3.3e-05,1.0,0.0,1.0,0.000267,1.0
11,vip:0002012,IAV,111,74.323767,3.3e-05,1.0,0.0,1.0,0.000267,1.0
1,vip:0002002,DENV,15,6.183233,0.001267,0.999667,0.004956,1.0,0.006755,1.0
5,vip:0002006,HCMV,10,3.659067,0.002833,0.999267,0.007208,1.0,0.009066,1.0
6,vip:0002007,HCV,48,32.290833,0.002967,0.9984,0.007208,1.0,0.009066,1.0
9,vip:0002010,HSV,33,20.485367,0.0034,0.998333,0.0074,1.0,0.009066,1.0
2,vip:0002003,EBOV,7,3.6784,0.074098,0.971101,0.13211,1.0,0.169366,1.0
0,vip:0002001,ADV,22,16.799567,0.11313,0.924869,0.187183,1.0,0.226259,1.0
3,vip:0002004,EBV,79,70.308767,0.136862,0.89067,0.19443,1.0,0.24331,1.0
4,vip:0002005,HBV,19,15.0346,0.162861,0.89377,0.224697,1.0,0.257446,1.0


In [32]:
# Three-way (central + eastern + ancestral) joint results, ordered by the enrichment FDR column.
cei_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
7,vip:0002008,HIV,145,105.438333,3.3e-05,1.0,0.0,1.0,0.000533,1.0
11,vip:0002012,IAV,145,112.2866,0.0005,0.999733,0.003067,1.0,0.003911,1.0
1,vip:0002002,DENV,20,9.401633,0.000733,0.999667,0.003067,1.0,0.003911,1.0
6,vip:0002007,HCV,66,48.934467,0.006966,0.995233,0.020508,1.0,0.027866,1.0
9,vip:0002010,HSV,43,30.828067,0.015066,0.990267,0.038247,1.0,0.048212,1.0
5,vip:0002006,HCMV,11,5.573533,0.020133,0.9917,0.042211,1.0,0.053687,1.0
4,vip:0002005,HBV,30,22.729533,0.064398,0.957001,0.123148,1.0,0.147195,1.0
0,vip:0002001,ADV,32,25.370567,0.09963,0.928202,0.1631,1.0,0.190453,1.0
2,vip:0002003,EBOV,9,5.595133,0.10713,0.947302,0.166452,1.0,0.190453,1.0
10,vip:0002011,HTLV,17,14.477167,0.275924,0.81004,0.38449,1.0,0.441479,1.0
