# ORA
General over-representation analysis (ORA), plus the specific wrappers for KEGG, Reactome and GO.

In [1]:
import pandas as pd
from pypathway import Reactome, GO, KEGG, ORA
from pypathway import ColorectalCancer, IdMapping, GMTUtils
from pypathway import EnrichmentExport
import os
import sys

In [2]:
# Parse the gene sets from a GMT file; they are passed to ORA.run below.
# (Per the file name this is the "h.all" collection keyed by Entrez IDs.)
gmt = GMTUtils.parse_gmt_file("../../pypathway/tests/gmt_file/h.all.v6.0.entrez.gmt")

In [3]:
# Load the bundled colorectal cancer example dataset.
# The cells below read its `deg_list` (study list) and `background` attributes.
c = ColorectalCancer()

In [4]:
# Information about the dataset: (number of entries in the study list
# `deg_list`, number of entries in the `background` list).
len(c.deg_list), len(c.background)

(5320, 17216)

In [5]:
# General ORA: test the study list against the background using the gene sets
# parsed from the GMT file above.
res_h = ORA.run(c.deg_list, c.background, gmt)

In [6]:
# Preview the first rows of the ORA result table (one row per gene set).
res_h.table.head()

Unnamed: 0,name,mapped,number in study,p-value,fdr
0,HALLMARK_G2M_CHECKPOINT,177,50,0.8013938,0.8904376
1,HALLMARK_HYPOXIA,180,98,3.951669e-11,2.82262e-10
2,HALLMARK_PANCREAS_BETA_CELLS,38,14,0.2640805,0.3301007
3,HALLMARK_MTORC1_SIGNALING,176,90,1.551015e-08,5.965443e-08
4,HALLMARK_UV_RESPONSE_DN,125,77,1.237892e-12,1.547365e-11


In [7]:
# Render the ORA result as an inline bar plot.
res_h.plot()

## KEGG

In [8]:
# KEGG enrichment for a specific organism, selected by its organism code
# ('hsa'; the pathway names in the result read "Homo sapiens (human)").
r_kg = KEGG.run(c.deg_list, c.background, 'hsa')

In [9]:
# Preview the first rows of the KEGG result table (one row per pathway).
r_kg.table.head()

Unnamed: 0,ID,Name,mapped,deg,p-value,fdr
0,hsa05217,Basal cell carcinoma - Homo sapiens (human),57,17,0.618982,0.747088
1,hsa04721,Synaptic vesicle cycle - Homo sapiens (human),56,23,0.068614,0.156073
2,hsa04142,Lysosome - Homo sapiens (human),112,46,0.014182,0.053893
3,hsa04640,Hematopoietic cell lineage - Homo sapiens (human),81,24,0.638773,0.764162
4,hsa00970,Aminoacyl-tRNA biosynthesis - Homo sapiens (hu...,39,16,0.117185,0.222652


In [10]:
# Plot the KEGG enrichment result inline.
r_kg.plot()

## Reactome

In [12]:
# Example of using the wrapper for Reactome gene set enrichment analysis

In [11]:
# Convert the Entrez IDs of the study list to gene symbols.
id_to_symbols = IdMapping.convert(
    input_id=c.deg_list,
    species='hsa',
    source='ENTREZID',
    target='SYMBOL',
)
# Keep the first symbol of every entry whose mapping result is non-empty.
sybs = [entry[1][0] for entry in id_to_symbols if entry[1]]

In [12]:
# Show the first ten converted gene symbols.
sybs[:10]

['A2M',
 'MKKS',
 'S100A3',
 'ANKRD29',
 'TMEM250',
 'NAT1',
 'NAT2',
 'SERPINA3',
 'AAMP',
 'AARS']

In [13]:
# Reactome enrichment; the input is a list of gene symbols (only the first ten
# symbols are submitted here).
# NOTE(review): the saved output of this cell is a JSONDecodeError, so the
# table shown by the following cell presumably comes from an earlier run.
# Re-execute on a fresh kernel to confirm the call still succeeds.
r = Reactome.run(sybs[:10], organism='Homo sapiens')

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [15]:
# Preview the first rows of the Reactome result table.
r.table.head()

Unnamed: 0,name,dbId,found,p-value,fdr,species
0,Acetylation,156582,2,0.000557,0.02396,Homo sapiens
1,Defective SLC6A2 causes orthostatic intoleranc...,5619109,1,0.006418,0.131517,Homo sapiens
2,Amino acid transport across the plasma membrane,352230,2,0.009394,0.131517,Homo sapiens
3,Neurotransmitter uptake and metabolism In glia...,112313,1,0.02124,0.162962,Homo sapiens
4,Astrocytic Glutamate-Glutamine Uptake And Meta...,210455,1,0.02124,0.162962,Homo sapiens


## Gene ontology

In [14]:
# Build the association (gene ID -> GO terms) by mapping the background Entrez
# IDs to GO with the id-mapping helper; it is passed to GO.run as `assoc`.
# Detailed usage of the helper is shown in the utils section.
# NOTE(review): this rebinds `r`, which held the Reactome result above.
r = IdMapping.convert_to_dict(input_id=c.background, source='ENTREZID', target="GO", species='hsa')

In [15]:
# Run GO enrichment analysis via goatools.
# The inputs study, pop and assoc are a list, a list and a dict respectively;
# the gene IDs of both lists are converted to str before the call.
# `obo` is the path of the GO obo file itself (not its folder). Per the logged
# output, the file is downloaded when it is not found at that path.
# The path is built with os.path.join so the separator is always correct.
path = os.path.join(os.getcwd(), "go.obo")
study_ids = [str(gene_id) for gene_id in c.deg_list]
population_ids = [str(gene_id) for gene_id in c.background]
rg = GO.run(study_ids, population_ids, r, obo=path)

obo file not found, start to download
load obo file /Users/yangxu/PyPathway/examples/analysis/go.obo
/Users/yangxu/PyPathway/examples/analysis/go.obo: fmt(1.2) rel(2017-12-12) 49,237 GO Terms
fisher module not installed.  Falling back on scipy.stats.fisher_exact


Propagating term counts to parents ..


15,292 out of 17,216 population items found in association
Calculating uncorrected p-values using fisher_scipy_stats
 5,114 out of  5,320 study items found in association
Running multitest correction: local bonferroni
Running multitest correction: local sidak
Running multitest correction: local holm
  15,988 GO terms are associated with 5,114 of 5,320 study items
  21,081 GO terms are associated with 15,292 of 17,216 population items


In [16]:
# Preview the first rows of the GO result table (one row per GO term).
rg.table.head()

Unnamed: 0,GO,NS,enrichment,name,ratio_in_study,ratio_in_pop,p_uncorrected,depth,study_count,p_bonferroni,p_sidak,p_holm,hit
0,GO:0008150,BP,e,biological_process,4755/5320,14017/17216,5.1e-78,n.a.,4755,1.07e-73,1.0500000000000001e-73,1.07e-73,"10, 1000, 10000, 10005, 10006, 10008, 10013, 1..."
1,GO:0009987,BP,e,cellular process,4194/5320,12161/17216,3.65e-58,n.a.,4194,7.690000000000001e-54,7.5e-54,7.690000000000001e-54,"10, 1000, 10000, 10005, 10006, 10008, 10013, 1..."
2,GO:0044281,BP,e,small molecule metabolic process,684/5320,1502/17216,1.13e-35,n.a.,684,2.39e-31,2.3300000000000002e-31,2.39e-31,"10005, 10057, 10090, 10111, 10135, 10165, 1017..."
3,GO:0006082,BP,e,organic acid metabolic process,409/5320,837/17216,1.12e-28,n.a.,409,2.37e-24,2.31e-24,2.36e-24,"10005, 10057, 10090, 10170, 10327, 10352, 1036..."
4,GO:0043436,BP,e,oxoacid metabolic process,400/5320,821/17216,1.18e-27,n.a.,400,2.5e-23,2.43e-23,2.49e-23,"10005, 10057, 10090, 10170, 10327, 10352, 1036..."


In [17]:
# File-based input: study, population and association are given as file paths.
# This function is a wrapper around GOATOOLS.
# GitHub: https://github.com/tanghaibao/goatools
# Cite: Haibao Tang et al. (2015). GOATOOLS: Tools for Gene Ontology. Zenodo. 10.5281/zenodo.31628.
path = "../../pypathway/tests/data/"
# Build the obo location with os.path.join: plain string concatenation dropped
# the path separator and produced "<cwd>go-basic.obo", i.e. a file next to the
# working directory instead of inside it.
rg = GO.run(path + 'study', path + 'population', path + 'association',
            obo=os.path.join(os.getcwd(), 'go-basic.obo'))

obo file not found, start to download
load obo file /Users/yangxu/PyPathway/examples/analysisgo-basic.obo
/Users/yangxu/PyPathway/examples/analysisgo-basic.obo: fmt(1.2) rel(2017-12-12) 49,237 GO Terms
fisher module not installed.  Falling back on scipy.stats.fisher_exact


Propagating term counts to parents ..
goids not found: {'GO:0022625', 'GO:0022627'}


31,855 out of 33,239 population items found in association
Calculating uncorrected p-values using fisher_scipy_stats
   269 out of    276 study items found in association
Running multitest correction: local bonferroni
Running multitest correction: local sidak
Running multitest correction: local holm
  791 GO terms are associated with 269 of 276 study items
  6,130 GO terms are associated with 31,855 of 33,239 population items


In [18]:
# Preview the first rows of the GO result table for the file-based run.
rg.table.head()

Unnamed: 0,GO,NS,enrichment,name,ratio_in_study,ratio_in_pop,p_uncorrected,depth,study_count,p_bonferroni,p_sidak,p_holm,hit
0,GO:0006464,BP,e,cellular protein modification process,33/276,1727/33239,8e-06,n.a.,33,0.0506,0.0493,0.0505,"AT1G13580, AT1G66610, AT1G66860, AT1G66980, AT..."
1,GO:0036211,BP,e,protein modification process,33/276,1727/33239,8e-06,n.a.,33,0.0506,0.0493,0.0505,"AT1G13580, AT1G66610, AT1G66860, AT1G66980, AT..."
2,GO:0006468,BP,e,protein phosphorylation,22/276,922/33239,1.1e-05,n.a.,22,0.0661,0.0644,0.066,"AT1G66980, AT2G29220, AT2G41140, AT2G41970, AT..."
3,GO:0016310,BP,e,phosphorylation,22/276,996/33239,3.5e-05,n.a.,22,0.213,0.207,0.213,"AT1G66980, AT2G29220, AT2G41140, AT2G41970, AT..."
4,GO:0043412,BP,e,macromolecule modification,33/276,1877/33239,5.7e-05,n.a.,33,0.352,0.343,0.351,"AT1G13580, AT1G66610, AT1G66860, AT1G66980, AT..."


In [19]:
# Interactive graph of the GO result; the significance of each result is
# encoded by color.
rg.graph()

In [20]:
# Test the export of several enrichment results at once.
# The return value gets its own name so that `c`, the example dataset loaded
# near the top of the notebook, is not overwritten; earlier cells that read
# `c.deg_list` / `c.background` can then still be re-run.
export_result = EnrichmentExport.export([rg, rg])