Skip to content

Commit

Permalink
Merge pull request #117 from indralab/ctd
Browse files Browse the repository at this point in the history
New knowledge bases
  • Loading branch information
bgyori committed Jun 28, 2020
2 parents bd6bbe1 + 973336a commit 937734d
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 2 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ We include the information from these pre-existing databases:
- [TRRUST](https://omictools.com/trrust-tool)
- [PhosphoSitePlus](https://www.phosphosite.org/)
- [Causal Biological Networks Database](http://www.causalbionet.com/)
- [VirHostNet](http://virhostnet.prabi.fr/)
- [CTD](http://ctdbase.org/)
- [Phospho.ELM](http://phospho.elm.eu.org/)

These databases are retrieved primarily using the tools in `indra.sources`. The
statements extracted from all of these sources are stored and updated in the
Expand Down
64 changes: 62 additions & 2 deletions indra_db/managers/knowledgebase_manager.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
__all__ = ['TasManager', 'CBNManager', 'HPRDManager', 'SignorManager',
'BiogridManager', 'BelLcManager', 'PathwayCommonsManager',
'RlimspManager', 'TrrustManager', 'PhosphositeManager']
'RlimspManager', 'TrrustManager', 'PhosphositeManager',
'CTDManager', 'VirHostNetManager', 'PhosphoElmManager']

import os
import zlib
Expand Down Expand Up @@ -163,7 +164,8 @@ def _get_statements(self):
class PathwayCommonsManager(KnowledgebaseManager):
name = 'pc11'
source = 'biopax'
skips = {'psp', 'hprd', 'biogrid', 'phosphosite', 'phosphositeplus'}
skips = {'psp', 'hprd', 'biogrid', 'phosphosite', 'phosphositeplus',
'ctd'}

def __init__(self, *args, **kwargs):
self.counts = defaultdict(lambda: 0)
Expand All @@ -189,6 +191,64 @@ def _get_statements(self):
return filtered_stmts


class CTDManager(KnowledgebaseManager):
    """Knowledgebase manager for CTD statements stored as pickles on S3.

    One pickle per entry in `subsets` is read from the `bigmech` bucket,
    its statements are passed through `_expanded`, and the pooled result is
    de-duplicated before being returned.
    """
    name = 'ctd'
    source = 'ctd'
    subsets = ['gene_disease', 'chemical_disease',
               'chemical_gene']

    def _get_statements(self):
        """Return the de-duplicated statements pooled from every CTD subset."""
        client = boto3.client('s3')
        collected = []
        for subset_name in self.subsets:
            logger.info('Fetching CTD subset %s from S3...', subset_name)
            s3_key = 'indra-db/ctd_%s.pkl' % subset_name
            response = client.get_object(Bucket='bigmech', Key=s3_key)
            payload = response['Body'].read()
            # NOTE(review): unpickling runs arbitrary code embedded in the
            # payload; this presumes the bucket contents are trusted.
            subset_stmts = pickle.loads(payload)
            collected.extend(_expanded(subset_stmts))
        # Of several statements that are exactly the same in terms of content
        # and evidence, keep exactly one.
        deduped, _ = extract_duplicates(collected,
                                        KeyFunc.mk_and_one_ev_src)
        return deduped


class VirHostNetManager(KnowledgebaseManager):
    """Knowledgebase manager for statements obtained from VirHostNet."""
    name = 'virhostnet'
    source = 'virhostnet'

    def _get_statements(self):
        """Process VirHostNet from the web and return its statements.

        The processor's statements are passed through `_expanded` and
        collected into a list.
        """
        # Imported locally, as in the rest of this method's original form, so
        # the INDRA source module is only loaded when this manager runs.
        from indra.sources import virhostnet
        processor = virhostnet.process_from_web()
        return list(_expanded(processor.statements))


class PhosphoElmManager(KnowledgebaseManager):
    """Knowledgebase manager for Phospho.ELM statements.

    The Phospho.ELM dump file is downloaded from the `bigmech` S3 bucket,
    processed with `indra.sources.phosphoelm`, and the resulting statements
    are de-duplicated before being returned.
    """
    name = 'phosphoelm'
    source = 'phosphoelm'

    def _get_statements(self):
        """Return the de-duplicated statements extracted from the dump.

        The dump is downloaded into a temporary directory that is removed
        as soon as processing has finished, so no files are left behind on
        disk after the call (whether it succeeds or raises).
        """
        from indra.sources import phosphoelm
        logger.info('Fetching PhosphoElm dump from S3...')
        s3 = boto3.resource('s3')
        # A self-cleaning temporary directory replaces the bare mkdtemp call,
        # which left the directory and the downloaded dump on disk forever.
        with tempfile.TemporaryDirectory(suffix='phosphoelm_files') as tmp_dir:
            dump_file = os.path.join(tmp_dir, 'phosphoelm.dump')
            s3.meta.client.download_file(
                'bigmech',
                'indra-db/phosphoELM_all_2015-04.dump',
                dump_file
            )
            logger.info('Processing PhosphoElm dump...')
            # NOTE(review): assumes process_from_dump reads the file fully
            # before returning, so the file is not needed afterwards.
            pp = phosphoelm.process_from_dump(dump_file)
        logger.info('Expanding evidences on PhosphoElm statements...')
        # Expand evidences just in case, though this processor always
        # produces a single evidence per statement.
        stmts = list(_expanded(pp.statements))
        # Make sure exact duplicates are not included: return exactly one of
        # multiple statements that are exactly the same in terms of content
        # and evidence.
        unique_stmts, _ = extract_duplicates(stmts, KeyFunc.mk_and_one_ev_src)
        return unique_stmts


class HPRDManager(KnowledgebaseManager):
name = 'hprd'
source = 'hprd'
Expand Down

0 comments on commit 937734d

Please sign in to comment.