In [1]:
from src import pysetperm as psp
import numpy as np
import pandas as pd

In [2]:
# --- run configuration ---
# Permutation count and core count handed to the psp calls further down.
n_perms, cores = 30000, 6
# Passed as range_modification: gene definitions are extended by +/- 2 kb.
gene_def_plus = 2000
# Passed as min_set_size: minimum size of the candidate gene set.
min_size = 10

# --- inputs shared by every test in this notebook ---
annotations = psp.AnnotationSet(
    annotation_file='data/genes.txt',
    range_modification=gene_def_plus,
)
function_sets = psp.FunctionSets(
    function_set_file='data/kegg.txt',
    min_set_size=min_size,
    annotation_obj=annotations,
)

In [3]:
# Eastern inputs: candidate SNPs and the background SNP set, each annotated
# against the shared `annotations` object right after loading.
e_candidates = psp.Variants(
    variant_file='data/eastern-1.5e-05-candidate.snps.bed.gz',
)
e_candidates.annotate_variants(annotation_obj=annotations)

e_background = psp.Variants(
    variant_file='data/pbsnj-bg.snps.bed.gz',
)
e_background.annotate_variants(annotation_obj=annotations)

# Central inputs: only candidates are loaded here, because central can use
# the eastern background.
c_candidates = psp.Variants(
    variant_file='data/central-2e-05-candidate.snps.bed.gz',
)
c_candidates.annotate_variants(annotation_obj=annotations)



In [4]:
# Ancestral inputs: candidates and their own background, both annotated
# against the shared `annotations` object.
i_candidates = psp.Variants(
    variant_file='data/ancestral-5e-04-candidate.snps.bed.gz',
)
i_candidates.annotate_variants(annotation_obj=annotations)

i_background = psp.Variants(
    variant_file='data/ancestral-bg.bed.gz',
)
i_background.annotate_variants(annotation_obj=annotations)

In [5]:
# Display the variant intervals held by the ancestral background object
# (sanity check that the file loaded as expected).
i_background.variants

+--------------+-----------+-----------+
| Chromosome   | Start     | End       |
| (category)   | (int32)   | (int32)   |
|--------------+-----------+-----------|
| 1            | 1235069   | 1235070   |
| 1            | 1238849   | 1238850   |
| 1            | 1252699   | 1252700   |
| 1            | 1278319   | 1278320   |
| ...          | ...       | ...       |
| 22           | 48777099  | 48777100  |
| 22           | 48779099  | 48779100  |
| 22           | 48780899  | 48780900  |
| 22           | 48784099  | 48784100  |
+--------------+-----------+-----------+
Unstranded PyRanges object has 295,645 rows and 3 columns from 23 chromosomes.
For printing, the PyRanges was sorted on Chromosome.

In [6]:
# Test objects
# Eastern: eastern candidates against the eastern background.
e_test_obj = psp.TestObject(
    e_candidates,
    e_background,
    function_sets,
    n_cores=cores,
)

In [7]:
# Central: central candidates paired with the eastern background
# (no separate central background is loaded in this notebook).
c_test_obj = psp.TestObject(
    c_candidates,
    e_background,
    function_sets,
    n_cores=cores,
)

In [8]:
# Ancestral: ancestral candidates against the ancestral background.
i_test_obj = psp.TestObject(
    i_candidates,
    i_background,
    function_sets,
    n_cores=cores,
)

In [9]:
# Build the permutation object for the eastern test object, passing the
# configured permutation count (n_perms) and core count (cores).
e_permutations = psp.Permutation(e_test_obj, n_perms, cores)

In [10]:
# Build the permutation object for the central test object, passing the
# configured permutation count (n_perms) and core count (cores).
c_permutations = psp.Permutation(c_test_obj, n_perms, cores)

In [11]:
# Build the permutation object for the ancestral test object, passing the
# configured permutation count (n_perms) and core count (cores).
i_permutations = psp.Permutation(i_test_obj, n_perms, cores)

In [12]:
# Per-set distributions across permutations, one object per dataset.
# Each call pairs a dataset's permutations with its own test object.

# eastern
e_per_set = psp.SetPerPerm(e_permutations, function_sets, e_test_obj, cores)

# central
c_per_set = psp.SetPerPerm(c_permutations, function_sets, c_test_obj, cores)

# ancestral
i_per_set = psp.SetPerPerm(i_permutations, function_sets, i_test_obj, cores)

In [13]:
# results tables
def make_results_table(test_obj, function_set_obj, set_perm_obj):
    """Assemble a per-function-set results table sorted by enrichment p-value.

    Parameters
    ----------
    test_obj
        Object exposing ``n_candidate_per_function``.
    function_set_obj
        Object exposing ``function_sets`` (a DataFrame with ``Id`` and
        ``FunctionName`` columns) and ``function_array2d_ids``.
    set_perm_obj
        Object exposing ``mean_per_set``, ``p_enrichment``, ``p_depletion``
        and ``set_n_per_perm``.

    Returns
    -------
    pandas.DataFrame
        One row per ``Id`` present in ``function_array2d_ids``, with columns
        ``Id``, ``FunctionName``, ``n_candidates``, ``mean_n_resample``,
        ``emp_p_e``, ``emp_p_d``, ``fdr_e``, ``fdr_d``, ``BH_fdr_e`` and
        ``BH_fdr_d``, sorted ascending by ``emp_p_e``. The index keeps the
        row labels of the grouped (pre-filter) table.
    """
    # One row per function-set Id, carrying its unique name(s).
    names_by_id = function_set_obj.function_sets.groupby('Id', as_index=False).agg({'FunctionName': pd.Series.unique})
    keep = names_by_id['Id'].isin(function_set_obj.function_array2d_ids)
    # Take an explicit copy: the columns added below would otherwise be
    # assigned onto a frame derived from another one, which triggers pandas'
    # SettingWithCopyWarning.
    out = names_by_id.loc[keep].copy()
    # NOTE(review): if the attributes below are plain arrays they are assigned
    # positionally, so they are assumed to be ordered like the filtered Ids
    # (groupby sorts by Id) - confirm against pysetperm.
    out['n_candidates'] = test_obj.n_candidate_per_function
    out['mean_n_resample'] = set_perm_obj.mean_per_set
    out['emp_p_e'] = set_perm_obj.p_enrichment
    out['emp_p_d'] = set_perm_obj.p_depletion
    # FDR computed by psp from the per-permutation matrix.
    out['fdr_e'] = psp.fdr_from_p_matrix(set_perm_obj.set_n_per_perm, out['emp_p_e'], method='enrichment')
    out['fdr_d'] = psp.fdr_from_p_matrix(set_perm_obj.set_n_per_perm, out['emp_p_d'], method='depletion')
    # Adjusted p-values from psp.p_adjust_bh on the same empirical p-values.
    out['BH_fdr_e'] = psp.p_adjust_bh(out['emp_p_e'])
    out['BH_fdr_d'] = psp.p_adjust_bh(out['emp_p_d'])
    return out.sort_values('emp_p_e')


In [14]:
# Results tables for each dataset on its own: e (eastern), c (central),
# i (ancestral). All three share the same function sets.
e_results = make_results_table(
    test_obj=e_test_obj,
    function_set_obj=function_sets,
    set_perm_obj=e_per_set,
)
c_results = make_results_table(
    test_obj=c_test_obj,
    function_set_obj=function_sets,
    set_perm_obj=c_per_set,
)
i_results = make_results_table(
    test_obj=i_test_obj,
    function_set_obj=function_sets,
    set_perm_obj=i_per_set,
)

In [15]:
# Eastern results, ordered by the enrichment FDR column.
e_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
26,hsa00310,Lysine degradation,3,0.368567,0.005366,0.999700,0.7765,1.00000,0.59038,0.9997
320,hsa05144,Malaria,1,0.140533,0.131762,0.991800,0.7765,1.00000,1.00000,0.9997
110,hsa03010,Ribosome,1,0.127933,0.120363,0.992800,0.7765,1.00000,1.00000,0.9997
292,hsa04975,Fat digestion and absorption,1,0.127600,0.119896,0.992534,0.7765,1.00000,1.00000,0.9997
294,hsa04977,Vitamin digestion and absorption,1,0.124133,0.117463,0.993567,0.7765,1.00000,1.00000,0.9997
...,...,...,...,...,...,...,...,...,...,...
61,hsa00590,Arachidonic acid metabolism,0,0.216433,1.000000,0.804140,1.0000,1.00000,1.00000,0.9997
62,hsa00591,Linoleic acid metabolism,0,0.115733,1.000000,0.889870,1.0000,1.00000,1.00000,0.9997
63,hsa00592,alpha-Linolenic acid metabolism,0,0.099300,1.000000,0.904737,1.0000,1.00000,1.00000,0.9997
362,hsa05310,Asthma,0,0.027900,1.000000,0.972434,1.0000,1.00000,1.00000,0.9997


In [16]:
# Central results, ordered by the enrichment FDR column.
c_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
320,hsa05144,Malaria,2,0.141600,0.008000,0.999767,1.0,1.0,1.0,0.999767
26,hsa00310,Lysine degradation,0,0.385500,1.000000,0.675944,1.0,1.0,1.0,0.999767
15,hsa00190,Oxidative phosphorylation,0,0.298400,1.000000,0.740209,1.0,1.0,1.0,0.999767
16,hsa00199,Cytochrome P450,0,0.191800,1.000000,0.826406,1.0,1.0,1.0,0.999767
331,hsa05166,Human T-cell leukemia virus 1 infection,0,1.130133,1.000000,0.316356,1.0,1.0,1.0,0.999767
...,...,...,...,...,...,...,...,...,...,...
158,hsa04068,FoxO signaling pathway,1,0.731367,0.524316,0.836039,1.0,1.0,1.0,0.999767
275,hsa04931,Insulin resistance,1,0.730100,0.523849,0.834972,1.0,1.0,1.0,0.999767
178,hsa04144,Endocytosis,2,1.732300,0.522316,0.752908,1.0,1.0,1.0,0.999767
354,hsa05222,Small cell lung cancer,1,0.718900,0.518649,0.838905,1.0,1.0,1.0,0.999767


In [17]:
# Ancestral results, ordered by the enrichment FDR column.
i_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
117,hsa03019,Messenger RNA biogenesis,7,1.758700,0.002400,0.999500,0.319567,1.000000,0.863971,0.999867
26,hsa00310,Lysine degradation,3,0.360667,0.005200,0.999667,0.371600,1.000000,0.935969,0.999867
124,hsa03036,Chromosome and associated proteins,13,6.796667,0.017199,0.992600,0.819144,0.997501,1.000000,0.999867
6,hsa00053,Ascorbate and aldarate metabolism,1,0.023967,0.023866,0.999867,0.870642,1.000000,1.000000,0.999867
125,hsa03040,Spliceosome,2,0.284667,0.032632,0.997600,0.975833,1.000000,1.000000,0.999867
...,...,...,...,...,...,...,...,...,...,...
183,hsa04151,PI3K-Akt signaling pathway,1,2.836833,0.945235,0.213460,1.000000,0.574670,1.000000,0.999867
91,hsa01002,Peptidases and inhibitors,2,4.064400,0.922869,0.218526,1.000000,0.574670,1.000000,0.999867
181,hsa04147,Exosome,2,3.865700,0.904803,0.250925,1.000000,0.574670,1.000000,0.999867
251,hsa04750,Inflammatory mediator regulation of TRP channels,1,0.990867,0.635312,0.738542,1.000000,0.875260,1.000000,0.999867


In [18]:
# Joined objects: pairwise combinations (ce, ci, ei) and the three-way
# combination (cei), which is built from the ce pair plus i.

# Test objects
ce_test_obj = psp.TestObject.add_objects(c_test_obj, e_test_obj)
ci_test_obj = psp.TestObject.add_objects(c_test_obj, i_test_obj)
ei_test_obj = psp.TestObject.add_objects(e_test_obj, i_test_obj)
cei_test_obj = psp.TestObject.add_objects(ce_test_obj, i_test_obj)

# n-per-permutation objects, joined in the same combinations
ce_per_set = psp.SetPerPerm.join_objects(c_per_set, e_per_set)
ci_per_set = psp.SetPerPerm.join_objects(c_per_set, i_per_set)
ei_per_set = psp.SetPerPerm.join_objects(e_per_set, i_per_set)
cei_per_set = psp.SetPerPerm.join_objects(ce_per_set, i_per_set)

In [19]:
# Results tables for the joined datasets (ce, ci, ei, cei).
ce_results = make_results_table(
    test_obj=ce_test_obj,
    function_set_obj=function_sets,
    set_perm_obj=ce_per_set,
)
ci_results = make_results_table(
    test_obj=ci_test_obj,
    function_set_obj=function_sets,
    set_perm_obj=ci_per_set,
)
ei_results = make_results_table(
    test_obj=ei_test_obj,
    function_set_obj=function_sets,
    set_perm_obj=ei_per_set,
)
cei_results = make_results_table(
    test_obj=cei_test_obj,
    function_set_obj=function_sets,
    set_perm_obj=cei_per_set,
)

In [20]:
# Central + eastern joint results, ordered by the enrichment FDR column.
ce_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
320,hsa05144,Malaria,3,0.282133,0.003067,0.999833,0.500733,1.0,0.357588,0.999833
292,hsa04975,Fat digestion and absorption,2,0.257600,0.028432,0.997767,0.500733,1.0,0.787358,0.999833
140,hsa04010,MAPK signaling pathway,10,4.859067,0.024333,0.990567,0.500733,1.0,0.768974,0.999833
64,hsa00600,Sphingolipid metabolism,3,0.604133,0.022966,0.996533,0.500733,1.0,0.768974,0.999833
334,hsa05169,Epstein-Barr virus infection,5,1.500933,0.017966,0.995167,0.500733,1.0,0.718643,0.999833
...,...,...,...,...,...,...,...,...,...,...
6,hsa00053,Ascorbate and aldarate metabolism,0,0.079767,1.000000,0.923136,1.000000,1.0,1.000000,0.999833
3,hsa00040,Pentose and glucuronate interconversions,0,0.155000,1.000000,0.854905,1.000000,1.0,1.000000,0.999833
2,hsa00030,Pentose phosphate pathway,0,0.175100,1.000000,0.839605,1.000000,1.0,1.000000,0.999833
358,hsa05226,Gastric cancer,0,1.963333,1.000000,0.137529,1.000000,1.0,1.000000,0.999833


In [21]:
# Central + ancestral joint results, ordered by the enrichment FDR column.
ci_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
107,hsa03000,Transcription factors,16,9.125967,0.021899,0.989434,1.0,1.000000,1.0,0.998133
352,hsa05220,Chronic myeloid leukemia,0,0.934333,1.000000,0.390387,1.0,1.000000,1.0,0.998133
362,hsa05310,Asthma,0,0.068133,1.000000,0.934169,1.0,1.000000,1.0,0.998133
361,hsa05235,PD-L1 expression and PD-1 checkpoint pathway i...,0,1.096367,1.000000,0.331922,1.0,1.000000,1.0,0.998133
360,hsa05231,Choline metabolism in cancer,0,1.791067,1.000000,0.163328,1.0,0.980527,1.0,0.998133
...,...,...,...,...,...,...,...,...,...,...
96,hsa01009,Protein phosphatases and associated proteins,9,8.797733,0.525216,0.618013,1.0,1.000000,1.0,0.998133
181,hsa04147,Exosome,8,7.810967,0.523516,0.618913,1.0,1.000000,1.0,0.998133
316,hsa05135,Yersinia infection,2,1.717500,0.516283,0.754308,1.0,1.000000,1.0,0.998133
273,hsa04929,GnRH secretion,2,1.672833,0.503650,0.766408,1.0,1.000000,1.0,0.998133


In [22]:
# Eastern + ancestral joint results, ordered by the enrichment FDR column.
ei_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
26,hsa00310,Lysine degradation,6,0.729233,0.000133,1.000000,0.013900,1.000000,0.047998,1.0
125,hsa03040,Spliceosome,4,0.585333,0.002867,0.999567,0.230350,1.000000,0.515983,1.0
108,hsa03008,Ribosome biogenesis in eukaryotes,3,0.527633,0.015233,0.998000,0.878844,1.000000,1.000000,1.0
109,hsa03009,Ribosome biogenesis,5,1.517700,0.018233,0.996000,0.878844,1.000000,1.000000,1.0
333,hsa05168,Herpes simplex virus 1 infection,6,2.038800,0.019599,0.994667,0.878844,1.000000,1.000000,1.0
...,...,...,...,...,...,...,...,...,...,...
259,hsa04915,Estrogen signaling pathway,2,2.091200,0.623579,0.653812,1.000000,0.925419,1.000000,1.0
190,hsa04217,Necroptosis,1,0.942933,0.612013,0.758341,1.000000,0.987970,1.000000,1.0
156,hsa04064,NF-kappa B signaling pathway,1,0.930233,0.608180,0.758441,1.000000,0.987970,1.000000,1.0
113,hsa03013,RNA transport,1,0.908767,0.596913,0.767674,1.000000,0.987970,1.000000,1.0


In [23]:
# Three-way (central + eastern + ancestral) joint results, ordered by the
# enrichment FDR column.
cei_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
26,hsa00310,Lysine degradation,6,1.114733,0.000767,0.999933,0.123033,1.000000,0.275991,0.999933
140,hsa04010,MAPK signaling pathway,13,7.294667,0.032232,0.985234,0.743667,1.000000,0.892586,0.999933
218,hsa04621,NOD-like receptor signaling pathway,6,2.238600,0.024199,0.993534,0.743667,1.000000,0.725976,0.999933
124,hsa03036,Chromosome and associated proteins,31,21.237633,0.021433,0.986734,0.743667,1.000000,0.701431,0.999933
107,hsa03000,Transcription factors,22,13.759100,0.019633,0.989334,0.743667,1.000000,0.701431,0.999933
...,...,...,...,...,...,...,...,...,...,...
253,hsa04812,Cytoskeleton proteins,3,10.127633,0.997867,0.008000,1.000000,0.162833,1.000000,0.999933
143,hsa04015,Rap1 signaling pathway,1,6.441267,0.998700,0.010166,1.000000,0.162833,1.000000,0.999933
15,hsa00190,Oxidative phosphorylation,0,0.863033,1.000000,0.422586,1.000000,1.000000,1.000000,0.999933
12,hsa00120,Primary bile acid biosynthesis,0,0.246533,1.000000,0.781441,1.000000,1.000000,1.000000,0.999933
