In [1]:
from src import pysetperm as psp
import numpy as np
import pandas as pd

In [2]:
# --- Run configuration ------------------------------------------------------
# Number of permutations handed to psp.Permutation in later cells.
# NOTE(review): no random seed is set in this notebook; whether pysetperm
# exposes one is not visible here -- confirm if reproducible p-values matter.
n_perms = 30000
# Passed to the psp constructors in later cells (as n_cores= or positionally).
cores = 6
# +-2kb of gene definition: forwarded as range_modification.
gene_def_plus = 2000
# Forwarded to FunctionSets as min_set_size.
min_size = 10

# --- Inputs shared by every dataset -----------------------------------------
annotations = psp.AnnotationSet(
    annotation_file='data/genes.txt',
    range_modification=gene_def_plus,
)
function_sets = psp.FunctionSets(
    function_set_file='data/kegg.txt',
    min_set_size=min_size,
    annotation_obj=annotations,
)

In [3]:
# Dataset-specific inputs.
# Eastern: candidate SNPs plus the background SNP set, each annotated against
# the shared gene annotations.
e_candidates = psp.Variants(
    variant_file='data/eastern-3.5e-05-candidate.snps.bed.gz',
)
e_candidates.annotate_variants(annotation_obj=annotations)

e_background = psp.Variants(
    variant_file='data/pbsnj-bg.snps.bed.gz',
)
e_background.annotate_variants(annotation_obj=annotations)

# Central: only candidates are loaded; the eastern background is reused for it.
c_candidates = psp.Variants(
    variant_file='data/central-4e-05-candidate.snps.bed.gz',
)
c_candidates.annotate_variants(annotation_obj=annotations)



In [4]:
# Ancestral (i_*) dataset: candidates and their own background, both annotated
# against the shared gene annotations.
i_candidates = psp.Variants(
    variant_file='data/ancestral-0.001-candidate.snps.bed.gz',
)
i_candidates.annotate_variants(annotation_obj=annotations)

i_background = psp.Variants(
    variant_file='data/ancestral-bg.bed.gz',
)
i_background.annotate_variants(annotation_obj=annotations)

In [5]:
# Display the `variants` attribute of the ancestral background object
# (rendered below as a PyRanges table of Chromosome/Start/End).
i_background.variants

+--------------+-----------+-----------+
| Chromosome   | Start     | End       |
| (category)   | (int32)   | (int32)   |
|--------------+-----------+-----------|
| 1            | 1235069   | 1235070   |
| 1            | 1238849   | 1238850   |
| 1            | 1252699   | 1252700   |
| 1            | 1278319   | 1278320   |
| ...          | ...       | ...       |
| 22           | 48777099  | 48777100  |
| 22           | 48779099  | 48779100  |
| 22           | 48780899  | 48780900  |
| 22           | 48784099  | 48784100  |
+--------------+-----------+-----------+
Unstranded PyRanges object has 295,645 rows and 3 columns from 23 chromosomes.
For printing, the PyRanges was sorted on Chromosome.

In [6]:
# Test objects.
# Eastern candidates against the eastern background.
e_test_obj = psp.TestObject(
    e_candidates, e_background, function_sets, n_cores=cores
)

In [7]:
# Central candidates are tested against the eastern background (no separate
# central background is loaded in this notebook).
c_test_obj = psp.TestObject(
    c_candidates, e_background, function_sets, n_cores=cores
)

In [8]:
# Ancestral candidates against the ancestral background.
i_test_obj = psp.TestObject(
    i_candidates, i_background, function_sets, n_cores=cores
)

In [9]:
# Permutation object for the eastern test object; n_perms (30000) and cores (6)
# are the values set in the configuration cell.
e_permutations = psp.Permutation(e_test_obj, n_perms, cores)

In [10]:
# Permutation object for the central test object, built with the same n_perms
# and cores as the other datasets.
c_permutations = psp.Permutation(c_test_obj, n_perms, cores)

In [11]:
# Permutation object for the ancestral test object, built with the same n_perms
# and cores as the other datasets.
i_permutations = psp.Permutation(i_test_obj, n_perms, cores)

In [12]:
# Distributions across permutations: one SetPerPerm per dataset, built in the
# order eastern, central, ancestral from that dataset's permutation object and
# test object.
e_per_set, c_per_set, i_per_set = (
    psp.SetPerPerm(permutations, function_sets, test_obj, cores)
    for permutations, test_obj in (
        (e_permutations, e_test_obj),
        (c_permutations, c_test_obj),
        (i_permutations, i_test_obj),
    )
)

In [13]:
# results tables
def make_results_table(test_obj, function_set_obj, set_perm_obj):
    """Build the per-function-set results table, ordered by enrichment p-value.

    Parameters
    ----------
    test_obj
        Object whose ``n_candidate_per_function`` fills ``n_candidates``.
    function_set_obj
        Object providing ``function_sets`` (a frame with ``Id`` and
        ``FunctionName`` columns) and ``function_array2d_ids`` (the Ids whose
        rows are kept).
    set_perm_obj
        Object providing ``mean_per_set``, ``p_enrichment``, ``p_depletion``
        and ``set_n_per_perm``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Id``, ``FunctionName``, ``n_candidates``,
        ``mean_n_resample``, ``emp_p_e``, ``emp_p_d``, ``fdr_e``, ``fdr_d``,
        ``BH_fdr_e``, ``BH_fdr_d``; rows sorted ascending by ``emp_p_e``.
    """
    # One row per Id (groupby sorts by Id), carrying that Id's unique name(s).
    names_by_id = function_set_obj.function_sets.groupby('Id', as_index=False).agg(
        {'FunctionName': pd.Series.unique}
    )
    is_tested = names_by_id['Id'].isin(function_set_obj.function_array2d_ids)

    # NOTE(review): the per-set values are attached as whole columns, which
    # presumes they are ordered like the Id-sorted rows kept here -- confirm
    # against pysetperm.
    table = names_by_id.loc[is_tested].assign(
        n_candidates=test_obj.n_candidate_per_function,
        mean_n_resample=set_perm_obj.mean_per_set,
        emp_p_e=set_perm_obj.p_enrichment,
        emp_p_d=set_perm_obj.p_depletion,
    )

    # FDR columns: first from psp.fdr_from_p_matrix (fed the per-permutation
    # matrix), then from psp.p_adjust_bh, each for enrichment and depletion.
    n_per_perm = set_perm_obj.set_n_per_perm
    table = table.assign(
        fdr_e=psp.fdr_from_p_matrix(n_per_perm, table['emp_p_e'], method='enrichment'),
        fdr_d=psp.fdr_from_p_matrix(n_per_perm, table['emp_p_d'], method='depletion'),
        BH_fdr_e=psp.p_adjust_bh(table['emp_p_e']),
        BH_fdr_d=psp.p_adjust_bh(table['emp_p_d']),
    )
    return table.sort_values('emp_p_e')


In [14]:
# Results table per dataset, built in the order eastern, central, ancestral.
e_results, c_results, i_results = (
    make_results_table(test_obj, function_sets, per_set)
    for test_obj, per_set in (
        (e_test_obj, e_per_set),
        (c_test_obj, c_per_set),
        (i_test_obj, i_per_set),
    )
)

In [15]:
# Eastern results, re-ordered by the fdr_e column for display only
# (sort_values returns a new frame; e_results itself keeps its emp_p_e order).
# NOTE(review): fdr_e contains ties (see output), and rows within a tie are not
# ordered by emp_p_e -- consider sort_values(['fdr_e', 'emp_p_e']).
e_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
27,hsa00330,Arginine and proline metabolism,3,0.211300,0.001167,0.999967,0.162767,1.000000,0.419986,0.999967
84,hsa00910,Nitrogen metabolism,2,0.126567,0.007200,0.999700,0.567733,1.000000,1.000000,0.999967
334,hsa05169,Epstein-Barr virus infection,5,1.432433,0.014066,0.997000,0.782478,1.000000,1.000000,0.999967
218,hsa04621,NOD-like receptor signaling pathway,5,1.457333,0.014766,0.996933,0.782478,1.000000,1.000000,0.999967
22,hsa00260,"Glycine, serine and threonine metabolism",2,0.200167,0.017199,0.999000,0.782478,1.000000,1.000000,0.999967
...,...,...,...,...,...,...,...,...,...,...
206,hsa04512,ECM-receptor interaction,2,1.721500,0.520783,0.756541,1.000000,1.000000,1.000000,0.999967
303,hsa05030,Cocaine addiction,1,0.700600,0.512450,0.846905,1.000000,1.000000,1.000000,0.999967
360,hsa05231,Choline metabolism in cancer,2,1.686367,0.510550,0.768241,1.000000,1.000000,1.000000,0.999967
246,hsa04728,Dopaminergic synapse,3,2.805667,0.537782,0.689810,1.000000,0.990757,1.000000,0.999967


In [16]:
# Central results, re-ordered by the fdr_e column for display only
# (sort_values returns a new frame; c_results itself keeps its emp_p_e order).
# NOTE(review): fdr_e contains ties (see output), and rows within a tie are not
# ordered by emp_p_e -- consider sort_values(['fdr_e', 'emp_p_e']).
c_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
218,hsa04621,NOD-like receptor signaling pathway,6,1.461767,0.003400,0.999400,0.544033,1.0,0.542100,0.999633
259,hsa04915,Estrogen signaling pathway,4,2.015200,0.139662,0.952002,0.544033,1.0,0.865802,0.999633
324,hsa05152,Tuberculosis,3,1.273333,0.133762,0.963001,0.544033,1.0,0.865802,0.999633
181,hsa04147,Exosome,11,7.452567,0.127829,0.930802,0.544033,1.0,0.852194,0.999633
84,hsa00910,Nitrogen metabolism,1,0.127000,0.120329,0.993467,0.544033,1.0,0.817331,0.999633
...,...,...,...,...,...,...,...,...,...,...
356,hsa05224,Breast cancer,0,1.638033,1.000000,0.185627,1.000000,1.0,1.000000,0.999633
345,hsa05213,Endometrial cancer,0,0.926967,1.000000,0.379587,1.000000,1.0,1.000000,0.999633
346,hsa05214,Glioma,0,1.295067,1.000000,0.260758,1.000000,1.0,1.000000,0.999633
11,hsa00100,Steroid biosynthesis,0,0.087600,1.000000,0.915903,1.000000,1.0,1.000000,0.999633


In [17]:
# Ancestral results, re-ordered by the fdr_e column for display only
# (sort_values returns a new frame; i_results itself keeps its emp_p_e order).
i_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
117,hsa03019,Messenger RNA biogenesis,11,3.423800,0.000600,0.999867,0.080700,1.000000,0.215993,0.999867
28,hsa00340,Histidine metabolism,2,0.109767,0.005566,0.999800,0.416583,1.000000,0.539982,0.999867
124,hsa03036,Chromosome and associated proteins,23,13.098800,0.005966,0.997200,0.416583,1.000000,0.539982,0.999867
126,hsa03041,Spliceosome,8,2.837500,0.006000,0.998400,0.416583,1.000000,0.539982,0.999867
113,hsa03013,RNA transport,4,0.834400,0.009600,0.998600,0.416583,1.000000,0.563981,0.999867
...,...,...,...,...,...,...,...,...,...,...
158,hsa04068,FoxO signaling pathway,1,1.351667,0.750975,0.602980,1.000000,0.778729,1.000000,0.999867
347,hsa05215,Prostate cancer,1,1.341233,0.748308,0.609713,1.000000,0.778729,1.000000,0.999867
157,hsa04066,HIF-1 signaling pathway,1,1.341700,0.746675,0.609146,1.000000,0.778729,1.000000,0.999867
355,hsa05223,Non-small cell lung cancer,1,1.272067,0.735209,0.634212,1.000000,0.807055,1.000000,0.999867


In [18]:
# Joint objects: combine the per-dataset objects pairwise, then all three.
# Test objects (the three-way object is the central+eastern pair plus ancestral).
ce_test_obj = psp.TestObject.add_objects(c_test_obj, e_test_obj)
ci_test_obj = psp.TestObject.add_objects(c_test_obj, i_test_obj)
ei_test_obj = psp.TestObject.add_objects(e_test_obj, i_test_obj)
cei_test_obj = psp.TestObject.add_objects(ce_test_obj, i_test_obj)

# Per-permutation set objects, joined in the same pairings as above.
ce_per_set = psp.SetPerPerm.join_objects(c_per_set, e_per_set)
ci_per_set = psp.SetPerPerm.join_objects(c_per_set, i_per_set)
ei_per_set = psp.SetPerPerm.join_objects(e_per_set, i_per_set)
cei_per_set = psp.SetPerPerm.join_objects(ce_per_set, i_per_set)

In [19]:
# Joint results: one table per combined dataset, built in the order
# central+eastern, central+ancestral, eastern+ancestral, all three.
ce_results, ci_results, ei_results, cei_results = (
    make_results_table(test_obj, function_sets, per_set)
    for test_obj, per_set in (
        (ce_test_obj, ce_per_set),
        (ci_test_obj, ci_per_set),
        (ei_test_obj, ei_per_set),
        (cei_test_obj, cei_per_set),
    )
)

In [20]:
# Joint central+eastern results, re-ordered by the fdr_e column for display
# only (sort_values returns a new frame; ce_results is left unchanged).
ce_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
218,hsa04621,NOD-like receptor signaling pathway,11,2.919100,0.000200,1.000000,0.027533,1.0,0.071998,1.0
334,hsa05169,Epstein-Barr virus infection,10,2.880267,0.000533,0.999967,0.044300,1.0,0.095997,1.0
317,hsa05140,Leishmaniasis,5,0.870467,0.001500,0.999700,0.087811,1.0,0.161995,1.0
84,hsa00910,Nitrogen metabolism,3,0.253567,0.001800,0.999867,0.087811,1.0,0.161995,1.0
217,hsa04620,Toll-like receptor signaling pathway,5,1.235400,0.007533,0.998667,0.280700,1.0,0.422557,1.0
...,...,...,...,...,...,...,...,...,...,...
88,hsa00982,Drug metabolism - cytochrome P450,0,0.544667,1.000000,0.577914,1.000000,1.0,1.000000,1.0
367,hsa05330,Allograft rejection,0,0.248200,1.000000,0.781241,1.000000,1.0,1.000000,1.0
368,hsa05332,Graft-versus-host disease,0,0.251767,1.000000,0.778274,1.000000,1.0,1.000000,1.0
3,hsa00040,Pentose and glucuronate interconversions,0,0.306600,1.000000,0.734876,1.000000,1.0,1.000000,1.0


In [21]:
# Joint central+ancestral results, re-ordered by the fdr_e column for display
# only (sort_values returns a new frame; ci_results is left unchanged).
# NOTE(review): every displayed fdr_e here is 1.0, so the row order shown is
# arbitrary among ties -- consider sort_values(['fdr_e', 'emp_p_e']).
ci_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
26,hsa00310,Lysine degradation,5,1.404633,0.012166,0.997467,1.0,1.0,0.821973,0.998667
253,hsa04812,Cytoskeleton proteins,10,12.937667,0.842572,0.246925,1.0,1.0,1.000000,0.998667
345,hsa05213,Endometrial cancer,1,1.772800,0.841339,0.460151,1.0,1.0,1.000000,0.998667
174,hsa04137,Mitophagy - animal,1,1.649700,0.836372,0.493750,1.0,1.0,1.000000,0.998667
305,hsa05032,Morphine addiction,3,4.511800,0.835639,0.332456,1.0,1.0,1.000000,0.998667
...,...,...,...,...,...,...,...,...,...,...
301,hsa05017,Spinocerebellar ataxia,6,4.987167,0.382887,0.767974,1.0,1.0,1.000000,0.998667
374,hsa05418,Fluid shear stress and atherosclerosis,3,2.203533,0.380154,0.823873,1.0,1.0,1.000000,0.998667
259,hsa04915,Estrogen signaling pathway,5,4.030767,0.376721,0.785874,1.0,1.0,1.000000,0.998667
272,hsa04928,"Parathyroid hormone synthesis, secretion and a...",5,4.005733,0.374354,0.788040,1.0,1.0,1.000000,0.998667


In [22]:
# Joint eastern+ancestral results, re-ordered by the fdr_e column for display
# only (sort_values returns a new frame; ei_results is left unchanged).
ei_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
27,hsa00330,Arginine and proline metabolism,4,0.422267,0.000900,0.999933,0.145233,1.000000,0.323989,0.999933
26,hsa00310,Lysine degradation,6,1.397700,0.001933,0.999567,0.170033,1.000000,0.347988,0.999933
125,hsa03040,Spliceosome,5,1.119200,0.006066,0.999033,0.369633,1.000000,0.602980,0.999933
234,hsa04672,Intestinal immune network for IgA production,3,0.377700,0.006700,0.999400,0.369633,1.000000,0.602980,0.999933
117,hsa03019,Messenger RNA biogenesis,14,7.048467,0.011600,0.994634,0.441627,1.000000,0.769174,0.999933
...,...,...,...,...,...,...,...,...,...,...
326,hsa05161,Hepatitis B,3,3.096133,0.604780,0.626479,1.000000,0.891160,1.000000,0.999933
331,hsa05166,Human T-cell leukemia virus 1 infection,4,4.154667,0.602513,0.599147,1.000000,0.886893,1.000000,0.999933
112,hsa03012,Translation factors,1,0.901900,0.597313,0.772141,1.000000,0.944114,1.000000,0.999933
261,hsa04917,Prolactin signaling pathway,1,1.505567,0.786440,0.553782,1.000000,0.886043,1.000000,0.999933


In [23]:
# Joint central+eastern+ancestral results, re-ordered by the fdr_e column for
# display only (sort_values returns a new frame; cei_results is left unchanged).
cei_results.sort_values('fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
334,hsa05169,Epstein-Barr virus infection,12,4.212567,0.001067,0.999567,0.1927,1.0,0.143995,0.999767
218,hsa04621,NOD-like receptor signaling pathway,12,4.302333,0.001100,0.999767,0.1927,1.0,0.143995,0.999767
26,hsa00310,Lysine degradation,8,2.116300,0.001200,0.999667,0.1927,1.0,0.143995,0.999767
27,hsa00330,Arginine and proline metabolism,4,0.633033,0.003700,0.999467,0.1927,1.0,0.304790,0.999767
333,hsa05168,Herpes simplex virus 1 infection,14,6.051333,0.004233,0.998167,0.1927,1.0,0.304790,0.999767
...,...,...,...,...,...,...,...,...,...,...
34,hsa00430,Taurine and hypotaurine metabolism,0,0.331700,1.000000,0.713210,1.0000,1.0,1.000000,0.999767
97,hsa01040,Biosynthesis of unsaturated fatty acids,0,0.901200,1.000000,0.401687,1.0000,1.0,1.000000,0.999767
118,hsa03020,RNA polymerase,0,0.432567,1.000000,0.647445,1.0000,1.0,1.000000,0.999767
51,hsa00533,Glycosaminoglycan biosynthesis - keratan sulfate,0,0.452667,1.000000,0.630112,1.0000,1.0,1.000000,0.999767
