In [1]:
from src import pysetperm as psp
import numpy as np
import pandas as pd

In [2]:
# Run-wide settings shared by every test below.
n_perms = 30000
cores = 6

# Gene definitions are extended by +-2kb; passed on as range_modification.
gene_def_plus = 2000
# Minimum size of a candidate gene set; passed on as min_set_size.
min_size = 10

annotations = psp.AnnotationSet(
    annotation_file='data/genes.txt',
    range_modification=gene_def_plus,
)
function_sets = psp.FunctionSets(
    function_set_file='data/kegg.txt',
    min_set_size=min_size,
    annotation_obj=annotations,
)

In [3]:
# specific inputs
def _load_annotated_variants(variant_file):
    """Build a ``psp.Variants`` from ``variant_file`` and annotate it.

    The new object has ``annotate_variants`` called on it with the
    notebook-level ``annotations`` object and is then returned.
    """
    variants = psp.Variants(variant_file=variant_file)
    variants.annotate_variants(annotation_obj=annotations)
    return variants


e_candidates = _load_annotated_variants('data/eastern_candidates.txt')
e_background = _load_annotated_variants('data/eastern_background.txt.gz')

# central can use eastern background.
c_candidates = _load_annotated_variants('data/central_candidates.txt')

i_candidates = _load_annotated_variants('data/internal_candidates.txt')
i_background = _load_annotated_variants('data/internal_background.txt.gz')

In [4]:
# test objects
# Eastern candidates paired with the eastern background.
e_test_obj = psp.TestObject(
    e_candidates, e_background, function_sets, n_cores=cores
)

In [5]:
# Central candidates paired with the eastern background (there is no
# separate central background object in this notebook).
c_test_obj = psp.TestObject(
    c_candidates, e_background, function_sets, n_cores=cores
)

In [6]:
# Internal candidates paired with the internal background.
i_test_obj = psp.TestObject(
    i_candidates, i_background, function_sets, n_cores=cores
)

In [7]:
# One psp.Permutation per population, each built from that population's
# test object. n_perms and cores are passed positionally -- presumably the
# number of permutations and the number of parallel workers; confirm
# against psp.Permutation.
# NOTE(review): no random seed is set anywhere in this notebook, so the
# permutation-based values below may differ between runs -- confirm whether
# psp exposes a seed.
e_permutations = psp.Permutation(e_test_obj, n_perms, cores)
c_permutations = psp.Permutation(c_test_obj, n_perms, cores)
i_permutations = psp.Permutation(i_test_obj, n_perms, cores)

In [8]:
# distributions across permutations
# One SetPerPerm per population, built from its permutations, the shared
# function sets and its test object.
e_per_set = psp.SetPerPerm(e_permutations, function_sets, e_test_obj, cores)
c_per_set = psp.SetPerPerm(c_permutations, function_sets, c_test_obj, cores)
i_per_set = psp.SetPerPerm(i_permutations, function_sets, i_test_obj, cores)

In [9]:
# results tables
def make_results_table(test_obj, function_set_obj, set_perm_obj):
    """Assemble the per-function-set results table for one test.

    Parameters
    ----------
    test_obj
        Object exposing ``n_candidate_per_function``.
    function_set_obj
        Object exposing ``function_sets`` (a frame with ``Id`` and
        ``FunctionName`` columns) and ``function_array2d_ids``.
    set_perm_obj
        Object exposing ``mean_per_set``, ``p_enrichment``,
        ``p_depletion`` and ``set_n_per_perm``.

    Returns
    -------
    pandas.DataFrame
        One row per function-set ``Id`` present in
        ``function_array2d_ids``, sorted ascending by ``emp_p_e``, with
        columns ``Id``, ``FunctionName``, ``n_candidates``,
        ``mean_n_resample``, ``emp_p_e``, ``emp_p_d``, ``fdr_e``,
        ``fdr_d``, ``BH_fdr_e`` and ``BH_fdr_d``.
    """
    names_by_id = function_set_obj.function_sets.groupby('Id', as_index=False).agg(
        {'FunctionName': pd.Series.unique}
    )
    keep = names_by_id['Id'].isin(function_set_obj.function_array2d_ids)
    # Take an explicit copy of the filtered rows: the columns added below
    # would otherwise be written to a boolean-indexed slice, which makes
    # pandas emit SettingWithCopyWarning whenever the filter drops rows.
    out = names_by_id[keep].copy()

    # NOTE(review): the per-set values are attached as whole columns, so
    # they are assumed to be in the same order as the Id-sorted rows kept
    # above -- confirm against how psp builds function_array2d_ids.
    out['n_candidates'] = test_obj.n_candidate_per_function
    out['mean_n_resample'] = set_perm_obj.mean_per_set
    out['emp_p_e'] = set_perm_obj.p_enrichment
    out['emp_p_d'] = set_perm_obj.p_depletion
    out['fdr_e'] = psp.fdr_from_p_matrix(set_perm_obj.set_n_per_perm, out['emp_p_e'], method='enrichment')
    out['fdr_d'] = psp.fdr_from_p_matrix(set_perm_obj.set_n_per_perm, out['emp_p_d'], method='depletion')
    out['BH_fdr_e'] = psp.p_adjust_bh(out['emp_p_e'])
    out['BH_fdr_d'] = psp.p_adjust_bh(out['emp_p_d'])
    return out.sort_values('emp_p_e')


In [10]:
# Per-population results tables; all three share the same function sets and
# differ only in the test object and SetPerPerm object passed in.
e_results = make_results_table(e_test_obj, function_sets, e_per_set)
c_results = make_results_table(c_test_obj, function_sets, c_per_set)
i_results = make_results_table(i_test_obj, function_sets, i_per_set)

In [20]:
# Eastern results, lowest enrichment FDR (fdr_e) first.
e_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
226,hsa04658,Th1 and Th2 cell differentiation,11,3.700333,0.000633,0.999933,0.117167,1.000000,0.135995,0.999933
4,hsa00051,Fructose and mannose metabolism,6,1.442000,0.000867,0.999900,0.117167,1.000000,0.135995,0.999933
47,hsa00520,Amino sugar and nucleotide sugar metabolism,7,1.892600,0.001133,0.999900,0.117167,1.000000,0.135995,0.999933
334,hsa05169,Epstein-Barr virus infection,15,6.616700,0.002033,0.999367,0.117167,1.000000,0.182994,0.999933
109,hsa03009,Ribosome biogenesis,14,6.726633,0.005866,0.997900,0.243267,1.000000,0.422386,0.999933
...,...,...,...,...,...,...,...,...,...,...
336,hsa05200,Pathways in cancer,18,29.603533,0.996233,0.007833,1.000000,0.709267,1.000000,0.999933
203,hsa04390,Hippo signaling pathway,4,10.430333,0.996933,0.011466,1.000000,0.709267,1.000000,0.999933
268,hsa04924,Renin secretion,1,5.967667,0.998900,0.008600,1.000000,0.709267,1.000000,0.999933
247,hsa04730,Long-term depression,4,7.791133,0.977934,0.068364,1.000000,1.000000,1.000000,0.999933


In [21]:
# Central results, lowest enrichment FDR (fdr_e) first.
c_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
153,hsa04060,Cytokine-cytokine receptor interaction,22,6.456367,0.000033,1.000000,0.000000,1.0,0.012000,1.0
369,hsa05340,Primary immunodeficiency,6,0.741167,0.000167,1.000000,0.012100,1.0,0.029999,1.0
317,hsa05140,Leishmaniasis,8,2.011000,0.000500,0.999833,0.029089,1.0,0.059998,1.0
150,hsa04050,Cytokine receptors,10,3.505133,0.001733,0.999467,0.084342,1.0,0.155995,1.0
154,hsa04061,Viral protein interaction with cytokine and cy...,6,1.483533,0.003433,0.999267,0.132627,1.0,0.247192,1.0
...,...,...,...,...,...,...,...,...,...,...
367,hsa05330,Allograft rejection,0,0.585033,1.000000,0.547882,1.000000,1.0,1.000000,1.0
116,hsa03018,RNA degradation,0,2.578133,1.000000,0.066398,1.000000,1.0,1.000000,1.0
6,hsa00053,Ascorbate and aldarate metabolism,0,0.375767,1.000000,0.681644,1.000000,1.0,1.000000,1.0
280,hsa04940,Type I diabetes mellitus,0,1.065033,1.000000,0.329022,1.000000,1.0,1.000000,1.0


In [22]:
# Internal results, lowest enrichment FDR (fdr_e) first.
i_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
124,hsa03036,Chromosome and associated proteins,84,57.947833,0.000167,0.999967,0.023533,1.000000,0.059998,0.999967
119,hsa03021,Transcription machinery,21,10.078033,0.000800,0.999800,0.071267,1.000000,0.139195,0.999967
113,hsa03013,RNA transport,11,3.963933,0.001600,0.999567,0.102133,1.000000,0.139195,0.999967
26,hsa00310,Lysine degradation,9,3.045400,0.001900,0.999600,0.102133,1.000000,0.139195,0.999967
374,hsa05418,Fluid shear stress and atherosclerosis,12,4.980367,0.001933,0.999600,0.102133,1.000000,0.139195,0.999967
...,...,...,...,...,...,...,...,...,...,...
205,hsa04510,Focal adhesion,13,17.341267,0.916103,0.141595,1.000000,0.420551,1.000000,0.863971
307,hsa05034,Alcoholism,4,6.753633,0.927936,0.167194,1.000000,0.486280,1.000000,0.970806
214,hsa04611,Platelet activation,6,9.188367,0.928202,0.149462,1.000000,0.443070,1.000000,0.896770
173,hsa04136,Autophagy - other,1,2.028633,0.902170,0.366821,1.000000,0.828091,1.000000,0.999967


In [14]:
# join objects
# test objs: pairwise combinations, then all three (central+eastern, plus internal)
ce_test_obj = psp.TestObject.add_objects(c_test_obj, e_test_obj)
ci_test_obj = psp.TestObject.add_objects(c_test_obj, i_test_obj)
ei_test_obj = psp.TestObject.add_objects(e_test_obj, i_test_obj)
cei_test_obj = psp.TestObject.add_objects(ce_test_obj, i_test_obj)

# n per permutation objs, combined in the same pairings as the test objs
ce_per_set = psp.SetPerPerm.join_objects(c_per_set, e_per_set)
ci_per_set = psp.SetPerPerm.join_objects(c_per_set, i_per_set)
ei_per_set = psp.SetPerPerm.join_objects(e_per_set, i_per_set)
cei_per_set = psp.SetPerPerm.join_objects(ce_per_set, i_per_set)

In [15]:
# joint results
# Prefixes name the populations that were joined: c = central, e = eastern,
# i = internal (so cei is all three). Each table is built from the joined
# test object and the matching joined SetPerPerm object.
ce_results = make_results_table(ce_test_obj, function_sets, ce_per_set)
ci_results = make_results_table(ci_test_obj, function_sets, ci_per_set)
ei_results = make_results_table(ei_test_obj, function_sets, ei_per_set)
cei_results = make_results_table(cei_test_obj, function_sets, cei_per_set)

In [23]:
# Central + eastern joint results, lowest enrichment FDR (fdr_e) first.
ce_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
153,hsa04060,Cytokine-cytokine receptor interaction,31,12.924867,0.000033,1.000000,0.000000,1.000000,0.0060,1.0
4,hsa00051,Fructose and mannose metabolism,11,2.878100,0.000033,1.000000,0.000000,1.000000,0.0060,1.0
317,hsa05140,Leishmaniasis,14,4.011633,0.000067,1.000000,0.001692,1.000000,0.0060,1.0
226,hsa04658,Th1 and Th2 cell differentiation,20,7.409567,0.000067,0.999967,0.001692,1.000000,0.0060,1.0
369,hsa05340,Primary immunodeficiency,8,1.483100,0.000100,1.000000,0.002773,1.000000,0.0072,1.0
...,...,...,...,...,...,...,...,...,...,...
253,hsa04812,Cytoskeleton proteins,46,57.150667,0.962401,0.052765,1.000000,0.612777,1.0000,1.0
210,hsa04530,Tight junction,16,22.618667,0.962968,0.064931,1.000000,0.676830,1.0000,1.0
352,hsa05220,Chronic myeloid leukemia,4,8.130600,0.968701,0.078497,1.000000,0.699913,1.0000,1.0
330,hsa05165,Human papillomavirus infection,30,37.764433,0.939569,0.088630,1.000000,0.717928,1.0000,1.0


In [24]:
# Central + internal joint results, lowest enrichment FDR (fdr_e) first.
ci_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
153,hsa04060,Cytokine-cytokine receptor interaction,29,13.190467,0.000100,0.999967,0.01380,1.000000,0.017999,1.000000
317,hsa05140,Leishmaniasis,13,4.047433,0.000133,1.000000,0.01380,1.000000,0.017999,1.000000
369,hsa05340,Primary immunodeficiency,8,1.498800,0.000167,0.999900,0.01380,1.000000,0.017999,1.000000
198,hsa04350,TGF-beta signaling pathway,21,9.343267,0.000200,0.999900,0.01380,1.000000,0.017999,1.000000
226,hsa04658,Th1 and Th2 cell differentiation,17,7.318067,0.000800,0.999733,0.03288,1.000000,0.057598,1.000000
...,...,...,...,...,...,...,...,...,...,...
55,hsa00537,Glycosylphosphatidylinositol (GPI)-anchored pr...,5,8.087867,0.963568,0.107130,1.00000,0.650561,1.000000,1.000000
288,hsa04971,Gastric acid secretion,7,11.660367,0.964801,0.075664,1.00000,0.515411,1.000000,0.924369
152,hsa04054,Pattern recognition receptors,1,3.367800,0.968968,0.141929,1.00000,0.771725,1.000000,1.000000
242,hsa04724,Glutamatergic synapse,20,27.596967,0.969834,0.051165,1.00000,0.430718,1.000000,0.796127


In [25]:
# Eastern + internal joint results, lowest enrichment FDR (fdr_e) first.
ei_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
226,hsa04658,Th1 and Th2 cell differentiation,19,7.309167,0.000100,0.999967,0.013900,1.000000,0.023999,0.999967
334,hsa05169,Epstein-Barr virus infection,27,12.827967,0.000133,0.999933,0.013900,1.000000,0.023999,0.999967
26,hsa00310,Lysine degradation,16,6.219000,0.000233,0.999867,0.014211,1.000000,0.027999,0.999967
124,hsa03036,Chromosome and associated proteins,152,118.902800,0.000633,0.999633,0.031008,1.000000,0.056998,0.999967
327,hsa05162,Measles,18,8.240400,0.001100,0.999700,0.045867,1.000000,0.079197,0.999967
...,...,...,...,...,...,...,...,...,...,...
211,hsa04540,Gap junction,13,15.907600,0.845572,0.240559,1.000000,0.763435,1.000000,0.999967
44,hsa00513,Various types of N-glycan biosynthesis,4,5.775900,0.851638,0.292457,1.000000,0.852204,1.000000,0.999967
136,hsa03430,Mismatch repair,1,1.836333,0.856071,0.438619,1.000000,1.000000,1.000000,0.999967
46,hsa00515,Mannose type O-glycan biosynthesis,2,2.913300,0.837405,0.410153,1.000000,0.997228,1.000000,0.999967


In [19]:
# Central + eastern + internal joint results, lowest enrichment FDR (fdr_e) first.
cei_results.sort_values(by='fdr_e')

Unnamed: 0,Id,FunctionName,n_candidates,mean_n_resample,emp_p_e,emp_p_d,fdr_e,fdr_d,BH_fdr_e,BH_fdr_d
226,hsa04658,Th1 and Th2 cell differentiation,28,11.018400,0.000033,1.000000,0.000000,1.000000,0.006,1.000000
317,hsa05140,Leishmaniasis,19,6.048067,0.000033,1.000000,0.000000,1.000000,0.006,1.000000
153,hsa04060,Cytokine-cytokine receptor interaction,38,19.658967,0.000100,0.999967,0.003733,1.000000,0.009,1.000000
334,hsa05169,Epstein-Barr virus infection,39,19.479433,0.000100,0.999967,0.003733,1.000000,0.009,1.000000
369,hsa05340,Primary immunodeficiency,10,2.240733,0.000167,0.999967,0.005920,1.000000,0.012,1.000000
...,...,...,...,...,...,...,...,...,...,...
283,hsa04961,Endocrine and other factor-regulated calcium r...,10,13.740067,0.912003,0.155195,1.000000,0.685574,1.000,0.936362
54,hsa00536,Glycosaminoglycan binding proteins,30,36.558200,0.911636,0.124129,1.000000,0.616496,1.000,0.859356
349,hsa05217,Basal cell carcinoma,5,7.946367,0.908903,0.179294,1.000000,0.738772,1.000,1.000000
43,hsa00512,Mucin type O-glycan biosynthesis,7,10.086600,0.929369,0.150628,1.000000,0.684768,1.000,0.936362
