In [21]:
from collections import Counter
from collections import defaultdict
import itertools
import json

import numpy as np

In [39]:
# Path to file with JSON provenance documents, one per line.
# NOTE(review): absolute, machine-specific path -- point this at your own copy
# of the log before running the notebook.
PATH_SAMPLE_DATA = "/Users/vr24/logstash-5.0.0/make_history_output.log"
# Record key whose value names the provenance relationship type.
# A u"" literal gives the same value as unicode("prov") on Python 2 and is
# also valid on Python 3.3+, where the `unicode` builtin no longer exists.
RELATIONSHIP_TYPE_KEYNAME = u"prov"
# Seed handed to NumPy's global RNG so the random sample is repeatable.
SEED = 0
# Top-level record keys inspected in the cells below.
FIELDS_KEY = "@fields"
INSTANCE_KEY = "instance"
DOCUMENT_KEY = "document"
MESSAGE_KEY = "@message"

In [3]:
# Load and count log records (one JSON document per line).
# Blank lines -- e.g. a trailing newline at the end of the file -- are skipped,
# because json.loads("") raises ValueError.
with open(PATH_SAMPLE_DATA, "r") as logfile:
    stripped_lines = (line.strip() for line in logfile)
    logs = [json.loads(text) for text in stripped_lines if text]
print("There are {} logs.".format(len(logs)))

There are 4360 logs.


In [4]:
# Collect the distinct relationship-type values that occur across all records.
relationship_type_names = set(
    record[RELATIONSHIP_TYPE_KEYNAME] for record in logs
)
relationship_type_names

{u'activity',
 u'agent',
 u'entity',
 u'hadMember',
 u'prefix',
 u'used',
 u'wasAssociatedWith',
 u'wasGeneratedBy',
 u'wasInfluencedBy'}

In [13]:
# Peek at five randomly chosen records; seeding first makes the pick repeatable.
np.random.seed(SEED)
# Indices are drawn independently, so the same record can appear more than once.
sample_indices = np.random.randint(0, len(logs), 5)
sample_logs = np.asarray(logs)[sample_indices]
sample_logs

array([ {u'@fields': {u'_:id2': {u'prov:agent': u'people:vr24', u'prov:activity': u'is:97e270b8-7fd6-48b0-a8e2-13e1bdde8771'}}, u'prov': u'wasAssociatedWith', u'@timestamp': u'2016-11-06T00:34:29.709Z', u'instance': u'_:id2', u'host': u'127.0.0.1', u'@source_host': u'withme', u'@message': u'create_file3', u'document': u'is:97e270b8-7fd6-48b0-a8e2-13e1bdde8771', u'@version': 1, u'port': 65079},
       {u'@fields': {u'doc:gbd-read/schema/table': {}, u'code:tests/make_history.py': {u'unk:version_remote': u'https://vr24@stash.ihme.washington.edu/scm/~adolgert/provda.git', u'unk:version_branch_hash': u'372d74f21713f47642fc424e7e3289f38b2ed5a0', u'unk:script': u'/Users/vr24/code/provda/tests/make_history.py', u'unk:version_branch': u'tinkering'}, u'doc:gbd/first_history_test2/cvd_ihd.hdf': {}, u'doc:paf/first_history_test2/cvd_ihd.hdf': {}}, u'prov': u'entity', u'@timestamp': u'2016-11-06T00:34:26.834Z', u'instance': u'doc:gbd-read/schema/table', u'host': u'127.0.0.1', u'@source_host': u'wit

In [16]:
# Enumerate every top-level field name that occurs in any record and count
# how many records contain each one.
records_fields_histogram = Counter(
    field_name for record in logs for field_name in record.keys()
)
records_fields_histogram

Counter({u'@fields': 4360,
         u'@message': 4360,
         u'@source_host': 4360,
         u'@timestamp': 4360,
         u'@version': 4360,
         u'document': 4360,
         u'host': 4360,
         u'instance': 4360,
         u'port': 4360,
         u'prov': 4360})

In [18]:
# Every record carries the same top-level fields, so the layout is standard
# at the record level.
# Next: how are the records distributed over provenance document type names?
log_type_histogram = Counter(
    map(str, (record[RELATIONSHIP_TYPE_KEYNAME] for record in logs))
)
log_type_histogram

Counter({'activity': 220,
         'agent': 220,
         'entity': 1080,
         'hadMember': 200,
         'prefix': 1540,
         'used': 620,
         'wasAssociatedWith': 220,
         'wasGeneratedBy': 240,
         'wasInfluencedBy': 20})

In [20]:
# The provenance document type is a candidate for the type within the ES index.
# Group the records by that type, then keep the first three of each group
# as samples.
logs_by_type = defaultdict(list)
for log in logs:
    doc_type = log[RELATIONSHIP_TYPE_KEYNAME]
    logs_by_type[doc_type].append(log)
# `type_logs` (rather than `logs`) avoids shadowing the full record list.
log_trio_by_type = dict(
    (type_name, type_logs[:3]) for type_name, type_logs in logs_by_type.items()
)
log_trio_by_type

{u'activity': [{u'@fields': {u'is:21090197-6cce-4b66-8bc9-12e03029d84e': {u'unk:args': u'--child calculate_pafs --tag first_history_test0',
     u'unk:command': u'/Users/vr24/virtualenvs/general_personal_dev_env/bin/python',
     u'unk:date': u'2016-11-05T16:53:06-07:00',
     u'unk:group_id': {u'$': 37253, u'type': u'xsd:int'},
     u'unk:hostname': u'Gladstone.domain',
     u'unk:interpreter': u'2.7.10 (default, Oct 23 2015, 19:19:21) ',
     u'unk:platform': u'Darwin-15.5.0-x86_64-i386-64bit',
     u'unk:process_id': {u'$': 37260, u'type': u'xsd:int'},
     u'unk:sge_job_id': u'100'}},
   u'@message': u'create_file3',
   u'@source_host': u'withme',
   u'@timestamp': u'2016-11-05T23:53:07.028Z',
   u'@version': 1,
   u'document': u'is:21090197-6cce-4b66-8bc9-12e03029d84e',
   u'host': u'127.0.0.1',
   u'instance': u'is:21090197-6cce-4b66-8bc9-12e03029d84e',
   u'port': 64675,
   u'prov': u'activity'},
  {u'@fields': {u'is:88a9ac0e-c9aa-408f-8f16-5b2e964d3661': {u'unk:args': u'--child

In [29]:
# Compare the '@fields' payload across document types, taking the first record
# of each type as its representative sample.
fields_sample_by_doctype = dict(
    (doctype, type_logs[0][FIELDS_KEY])
    for doctype, type_logs in logs_by_type.items()
)
fields_sample_by_doctype

{u'activity': {u'is:21090197-6cce-4b66-8bc9-12e03029d84e': {u'unk:args': u'--child calculate_pafs --tag first_history_test0',
   u'unk:command': u'/Users/vr24/virtualenvs/general_personal_dev_env/bin/python',
   u'unk:date': u'2016-11-05T16:53:06-07:00',
   u'unk:group_id': {u'$': 37253, u'type': u'xsd:int'},
   u'unk:hostname': u'Gladstone.domain',
   u'unk:interpreter': u'2.7.10 (default, Oct 23 2015, 19:19:21) ',
   u'unk:platform': u'Darwin-15.5.0-x86_64-i386-64bit',
   u'unk:process_id': {u'$': 37260, u'type': u'xsd:int'},
   u'unk:sge_job_id': u'100'}},
 u'agent': {u'people:vr24': {u'unk:fullname': u'Vincent Reuter',
   u'unk:homedir': u'/Users/vr24'}},
 u'entity': {u'code:tests/make_history.py': {u'unk:script': u'/Users/vr24/code/provda/tests/make_history.py',
   u'unk:version_branch': u'tinkering',
   u'unk:version_branch_hash': u'372d74f21713f47642fc424e7e3289f38b2ed5a0',
   u'unk:version_remote': u'https://vr24@stash.ihme.washington.edu/scm/~adolgert/provda.git'},
  u'doc:gbd

In [40]:
def get_unique_values(doc_key, logs):
    """Return the set of distinct values stored under ``doc_key``.

    Each element of ``logs`` is indexed with ``doc_key``, so an element that
    lacks the key raises (``KeyError`` for dicts). The values are collected in
    a set and therefore must be hashable.
    """
    unique_values = set()
    for record in logs:
        unique_values.add(record[doc_key])
    return unique_values

# Gather the distinct values found under each of the three keys, then report
# how many distinct values each key has.
instances, documents, messages = (
    get_unique_values(key_name, logs)
    for key_name in (INSTANCE_KEY, DOCUMENT_KEY, MESSAGE_KEY)
)
num_values_by_key = dict([
    (INSTANCE_KEY, len(instances)),
    (DOCUMENT_KEY, len(documents)),
    (MESSAGE_KEY, len(messages)),
])
num_values_by_key

{'@message': 1, 'document': 220, 'instance': 385}

In [41]:
# Display the distinct values found under MESSAGE_KEY.
messages

{u'create_file3'}