Merge pull request #107 from pagreene/ro-dump-update
Update the Readonly Dump to include resource files.
pagreene committed Jun 1, 2020
2 parents 4199051 + 8832ae8, commit 6536a97
Showing 5 changed files with 288 additions and 61 deletions.
indra_db/belief.py: 76 changes (61 additions, 15 deletions)
@@ -11,6 +11,7 @@
from datetime import datetime

from indra_db import util as dbu
from indra_db.util import S3Path
from indra_db.util.dump_sif import upload_pickle_to_s3, S3_SUBDIR

logger = logging.getLogger('db_belief')
@@ -46,6 +47,9 @@ def __init__(self, mk_hash, evidence=None, supports=None,
def matches_key(self):
return self.__mk_hash

def get_hash(self):
return self.__mk_hash


class MockEvidence(object):
"""A class to imitate real INDRA Evidence for calculating belief."""
@@ -88,7 +92,7 @@ def populate_support(stmts, links):
return


def load_mock_statements(db):
def load_mock_statements(db, hashes=None, sup_links=None):
"""Generate a list of mock statements from the pa statement table."""
# Initialize a dictionary of evidence keyed by hash.
stmts_dict = {}
@@ -103,25 +107,44 @@ def add_evidence(src_api, mk_hash, sid, db_name=None):
stmts_dict[mk_hash].evidence.append(mev)

# Handle the evidence from reading.
res_rdg = db.select_all([db.Reading.reader,
q_rdg = db.filter_query([db.Reading.reader,
db.RawUniqueLinks.pa_stmt_mk_hash,
db.RawUniqueLinks.raw_stmt_id],
*db.link(db.Reading, db.RawUniqueLinks))
if hashes is not None:
q_rdg = q_rdg.filter(db.RawUniqueLinks.pa_stmt_mk_hash.in_(hashes))
res_rdg = q_rdg.all()

for src_api, mk_hash, sid in res_rdg:
add_evidence(src_api, mk_hash, sid)

# Handle the evidence from knowledge bases.
res_dbs = db.select_all([db.DBInfo.source_api,
q_dbs = db.filter_query([db.DBInfo.source_api,
db.DBInfo.db_name,
db.RawUniqueLinks.pa_stmt_mk_hash,
db.RawUniqueLinks.raw_stmt_id],
*db.link(db.DBInfo, db.RawUniqueLinks))
if hashes is not None:
q_dbs = q_dbs.filter(db.RawUniqueLinks.pa_stmt_mk_hash.in_(hashes))
res_dbs = q_dbs.all()

for src_api, db_name, mk_hash, sid in res_dbs:
add_evidence(src_api, mk_hash, sid, db_name)

# Get the support links and populate
sup_links = db.select_all([db.PASupportLinks.supported_mk_hash,
db.PASupportLinks.supporting_mk_hash])
if sup_links is None:
sup_link_q = db.filter_query([db.PASupportLinks.supported_mk_hash,
db.PASupportLinks.supporting_mk_hash])
if hashes is not None:
# I only use a constraint on the supported hash. If this is used in
# the usual way, where the hashes form a connected component, adding
# a constraint on the supporting hash as well would have no effect
# other than to slow down the query. Otherwise the choice is a bit
# arbitrary.
sup_link_q = sup_link_q.filter(
db.PASupportLinks.supported_mk_hash.in_(hashes)
)
sup_links = sup_link_q.all()

populate_support(stmts_dict, sup_links)

return list(stmts_dict.values())
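For reference, the new hashes and sup_links arguments let a caller restrict the load to one batch of pre-assembled statement hashes, which is how get_belief uses this function below when it partitions the corpus. A minimal usage sketch, assuming a primary database handle and made-up hash values:

    from indra_db import util as dbu
    from indra_db.belief import load_mock_statements

    db = dbu.get_db('primary')
    batch = {1234567890, -987654321}    # hypothetical statement hashes
    links = [(1234567890, -987654321)]  # hypothetical (supported, supporting) pairs
    stmts = load_mock_statements(db, hashes=batch, sup_links=links)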
@@ -134,14 +157,38 @@ def calculate_belief(stmts):
be = BeliefEngine(scorer=scorer)
be.set_prior_probs(stmts)
be.set_hierarchy_probs(stmts)
return {s.matches_key(): s.belief for s in stmts}
return {str(s.get_hash()): s.belief for s in stmts}


def get_belief(db=None):
def get_belief(db=None, partition=True):
if db is None:
db = dbu.get_db('primary')
stmts = load_mock_statements(db)
return calculate_belief(stmts)

if partition:
import networkx as nx
hashes = {h for h, in db.select_all(db.PAStatements.mk_hash)}
link_pair = [db.PASupportLinks.supporting_mk_hash,
db.PASupportLinks.supported_mk_hash]
links = {tuple(link) for link in db.select_all(link_pair)}
g = nx.Graph()
g.add_nodes_from(hashes)
g.add_edges_from(links)

group = set()
beliefs = {}
for c in nx.connected_components(g):
group |= c

if len(group) >= 10000:
sg = g.subgraph(group)
stmts = load_mock_statements(db, hashes=group,
sup_links=list(sg.edges))
beliefs.update(calculate_belief(stmts))
group = set()
return beliefs
else:
stmts = load_mock_statements(db)
return calculate_belief(stmts)
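The partition branch builds an undirected graph whose nodes are all pre-assembled statement hashes and whose edges are support links, then walks its connected components, accumulating them into batches of at least 10,000 hashes so that each batch carries its complete support structure into the belief calculation. A standalone sketch of that batching pattern, with toy values standing in for real hashes and support links:

    import networkx as nx

    nodes = range(10)                 # stand-ins for statement hashes
    edges = [(0, 1), (1, 2), (5, 6)]  # stand-ins for support links
    g = nx.Graph()
    g.add_nodes_from(nodes)
    g.add_edges_from(edges)

    batches, batch = [], set()
    for component in nx.connected_components(g):
        batch |= component
        if len(batch) >= 4:           # the real code uses a 10000 threshold
            batches.append(batch)
            batch = set()
    if batch:                         # flush any remaining partial batch
        batches.append(batch)

Note that the loop in the diff above resets group only after a full batch is processed, and as shown it returns without flushing a trailing group smaller than the threshold; the sketch adds that explicit flush.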


if __name__ == '__main__':
@@ -157,12 +204,11 @@ def get_belief(db=None):
help='Upload belief dict to the bigmech s3 bucket '
'instead of saving it locally')
args = parser.parse_args()
fname = 's3:' + '/'.join([S3_SUBDIR,
datetime.utcnow().strftime('%Y-%m-%d'),
args.fname]) if args.s3 else args.fname
belief_dict = get_belief()
if fname.startswith('s3:'):
upload_pickle_to_s3(obj=belief_dict, key=fname)
if args.s3:
key = '/'.join([datetime.utcnow().strftime('%Y-%m-%d'), args.fname])
s3_path = S3Path(S3_SUBDIR, key)
upload_pickle_to_s3(obj=belief_dict, s3_path=s3_path)
else:
with open(fname, 'wb') as f:
with open(args.fname, 'wb') as f:
pickle.dump(belief_dict, f)
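With the change in calculate_belief, the saved dictionary maps the string form of each pre-assembled statement hash to its belief score. A sketch of reading a locally saved result back, where 'belief_dict.pkl' stands in for whatever name was passed as args.fname:

    import pickle

    with open('belief_dict.pkl', 'rb') as f:
        belief_dict = pickle.load(f)

    # Keys are str(statement hash); values are belief scores between 0 and 1.
    some_hash = next(iter(belief_dict))
    print(some_hash, belief_dict[some_hash])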
