Merge pull request #166 from indralab/kbs

Add new knowledge sources
gyorilab · May 11, 2021 · 5717000 · 5717000
2 parents d33d156 + d077b2a
commit 5717000
Show file tree

Hide file tree

Showing 2 changed files with 70 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -56,6 +56,9 @@ We include the information from these pre-existing databases:
 - [CTD](http://ctdbase.org/)
 - [Phospho.ELM](http://phospho.elm.eu.org/)
 - [DrugBank](https://www.drugbank.ca/)
+- [CONIB](https://pharmacome.github.io/conib/)
+- [CRoG](https://github.com/chemical-roles/chemical-roles)
+- [DGI](https://www.dgidb.org/)
 
 These databases are retrieved primarily using the tools in `indra.sources`. The
 statements extracted from all of these sources are stored and updated in the

diff --git a/indra_db/managers/knowledgebase_manager.py b/indra_db/managers/knowledgebase_manager.py
@@ -436,6 +436,73 @@ def _expanded(stmts):
             yield stmt
 
 
+class DgiManager(KnowledgebaseManager):
+    """Knowledgebase manager that retrieves and processes the DGI dataset."""
+    name = 'DGI'
+    short_name = 'dgi'
+    source = 'dgi'
+
+    def _get_statements(self):
+        """Process DGI version 2020-Nov and return de-duplicated statements."""
+        from indra.sources import dgi
+        logger.info('Processing DGI from web')
+        processor = dgi.process_version('2020-Nov')
+        logger.info('Expanding evidences and deduplicating')
+        expanded = list(_expanded(processor.statements))
+        deduped, _ = extract_duplicates(expanded, KeyFunc.mk_and_one_ev_src)
+        return deduped
+
+
+class CrogManager(KnowledgebaseManager):
+    """Knowledgebase manager that retrieves and processes the CRoG dataset."""
+    name = 'CRoG'
+    short_name = 'crog'
+    source = 'crog'
+
+    def _get_statements(self):
+        """Process CRoG from the web and return de-duplicated statements."""
+        from indra.sources import crog
+        logger.info('Processing CRoG from web')
+        processor = crog.process_from_web()
+        logger.info('Expanding evidences and deduplicating')
+        expanded = list(_expanded(processor.statements))
+        deduped, _ = extract_duplicates(expanded, KeyFunc.mk_and_one_ev_src)
+        return deduped
+
+
+class ConibManager(KnowledgebaseManager):
+    """This manager handles retrieval and processing of the CONIB dataset."""
+    name = 'CONIB'
+    short_name = 'conib'
+    source = 'bel'
+
+    def _get_statements(self):
+        """Download the CONIB BEL graph and return de-duplicated statements."""
+        import pybel
+        import requests
+        from indra.sources.bel import process_pybel_graph
+        logger.info('Processing CONIB from web')
+        url = 'https://github.com/pharmacome/conib/raw/master/conib' \
+            '/_cache.bel.nodelink.json'
+        res = requests.get(url)
+        # Fail on an HTTP error rather than parsing an error page as JSON
+        res.raise_for_status()
+        pbp = process_pybel_graph(pybel.from_nodelink(res.json()))
+
+        # Fix an issue with PMID spaces
+        for stmt in pbp.statements:
+            for ev in stmt.evidence:
+                if ev.pmid:
+                    ev.pmid = ev.pmid.strip()
+                if ev.text_refs.get('PMID'):
+                    ev.text_refs['PMID'] = ev.text_refs['PMID'].strip()
+
+        logger.info('Expanding evidences and deduplicating')
+        expanded = list(_expanded(pbp.statements))
+        unique, _ = extract_duplicates(expanded, KeyFunc.mk_and_one_ev_src)
+        return unique
+
+
 if __name__ == '__main__':
     import sys
     from indra_db.util import get_db