In [59]:
from collections import Counter
from collections import defaultdict
import datetime
import itertools
import json

import numpy as np

In [2]:
# Path to file with JSON provenance documents, one per line.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
PATH_SAMPLE_DATA = "/Users/vr24/logstash-5.0.0/make_history_output.log"
# Record key under which the provenance document/relationship type is stored.
# A u"" literal is the same unicode value that unicode("prov") produces on
# Python 2, but unlike the unicode() builtin it also exists on Python 3.
RELATIONSHIP_TYPE_KEYNAME = u"prov"
# Seed handed to numpy's random generator before sampling records.
SEED = 0
# Names of record-level fields that the analysis looks up repeatedly.
FIELDS_KEY = "@fields"
INSTANCE_KEY = "instance"
DOCUMENT_KEY = "document"
MESSAGE_KEY = "@message"

In [3]:
# Load and count log records.
# Each non-blank line is parsed as one JSON document. Blank lines (for example
# a trailing empty line at the end of the file) are skipped, because
# json.loads("") raises ValueError.
with open(PATH_SAMPLE_DATA, "r") as logfile:
    logs = [json.loads(line) for line in logfile if line.strip()]
print("There are {} logs.".format(len(logs)))

There are 7103 logs.


In [4]:
# What's the space of depicted relationships look like?
relationship_type_names = {log[RELATIONSHIP_TYPE_KEYNAME] for log in logs}
relationship_type_names

{u'activity',
 u'agent',
 u'entity',
 u'hadMember',
 u'prefix',
 u'used',
 u'wasAssociatedWith',
 u'wasGeneratedBy',
 u'wasInfluencedBy'}

In [5]:
# Let's view some random records to see what we're up against.
np.random.seed(SEED)
sample_logs = np.asarray(logs)[np.random.randint(0, len(logs), 5)]
sample_logs

array([ {u'@fields': {u'_:id5': {u'prov:collection': u'unk:processcollection', u'prov:entity': u'doc:102'}, u'_:id4': {u'prov:collection': u'unk:processcollection', u'prov:entity': u'doc:101'}, u'_:id6': {u'prov:collection': u'unk:processcollection', u'prov:entity': u'doc:103'}, u'_:id3': {u'prov:collection': u'unk:processcollection', u'prov:entity': u'doc:100'}, u'_:id9': {u'prov:collection': u'unk:processcollection', u'prov:entity': u'doc:111'}, u'_:id8': {u'prov:collection': u'unk:processcollection', u'prov:entity': u'doc:110'}, u'_:id11': {u'prov:collection': u'unk:processcollection', u'prov:entity': u'doc:113'}, u'_:id10': {u'prov:collection': u'unk:processcollection', u'prov:entity': u'doc:112'}, u'_:id13': {u'prov:collection': u'unk:processcollection', u'prov:entity': u'doc:121'}, u'_:id12': {u'prov:collection': u'unk:processcollection', u'prov:entity': u'doc:120'}}, u'prov': u'hadMember', u'@timestamp': u'2016-11-06T11:00:33.990Z', u'instance': u'_:id8', u'host': u'127.0.0.1', 

In [6]:
# What's the total space of possible record fields?
# In how many records is each field present?
records_fields_histogram = Counter()
for log in logs:
    records_fields_histogram.update(log.keys())
records_fields_histogram

Counter({u'@fields': 7103,
         u'@message': 7103,
         u'@source_host': 7103,
         u'@timestamp': 7103,
         u'@version': 7103,
         u'document': 7103,
         u'host': 7103,
         u'instance': 7103,
         u'port': 7103,
         u'prov': 7103})

In [7]:
# Excellent, we have a standard at the record level!
# What does the distribution of provenance document type name look like?
log_type_histogram = Counter(str(log[RELATIONSHIP_TYPE_KEYNAME]) for log in logs)
log_type_histogram

Counter({'activity': 358,
         'agent': 361,
         'entity': 1748,
         'hadMember': 330,
         'prefix': 2506,
         'used': 1017,
         'wasAssociatedWith': 361,
         'wasGeneratedBy': 392,
         'wasInfluencedBy': 30})

In [8]:
# The provenance document type is a candidate for the type within the ES index.
# Bucket every record under its document type, then keep the first three
# records of each bucket as samples.
logs_by_type = defaultdict(list)
for log in logs:
    logs_by_type[log[RELATIONSHIP_TYPE_KEYNAME]].append(log)
# `type_logs` (rather than `logs`) avoids shadowing the full record list.
log_trio_by_type = {
    doc_type: type_logs[:3]
    for doc_type, type_logs in logs_by_type.items()
}
log_trio_by_type

{u'activity': [{u'@fields': {u'is:0a16324a-0017-47e9-a727-199d1f3e0fce': {u'unk:args': u'--child calculate_pafs --tag first_history_test0',
     u'unk:command': u'/Users/vr24/virtualenvs/general_personal_dev_env/bin/python',
     u'unk:date': u'2016-11-06T02:45:51-08:00',
     u'unk:group_id': {u'$': 42057, u'type': u'xsd:int'},
     u'unk:hostname': u'Gladstone.domain',
     u'unk:interpreter': u'2.7.10 (default, Oct 23 2015, 19:19:21) ',
     u'unk:platform': u'Darwin-15.5.0-x86_64-i386-64bit',
     u'unk:process_id': {u'$': 42064, u'type': u'xsd:int'},
     u'unk:sge_job_id': u'100'}},
   u'@message': u'create_file3',
   u'@source_host': u'withme',
   u'@timestamp': u'2016-11-06T10:45:51.928Z',
   u'@version': 1,
   u'document': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
   u'host': u'127.0.0.1',
   u'instance': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
   u'port': 56339,
   u'prov': u'activity'},
  {u'@fields': {u'is:5a4e5132-cf82-4bb1-99b6-d4a673391512': {u'unk:args': u'--child

In [9]:
# Go type by type to see how the '@fields' field differs:
# take the '@fields' object of the first record of each document type.
fields_sample_by_doctype = {
    doctype: type_logs[0][FIELDS_KEY]
    for doctype, type_logs in logs_by_type.items()
}
fields_sample_by_doctype

{u'activity': {u'is:0a16324a-0017-47e9-a727-199d1f3e0fce': {u'unk:args': u'--child calculate_pafs --tag first_history_test0',
   u'unk:command': u'/Users/vr24/virtualenvs/general_personal_dev_env/bin/python',
   u'unk:date': u'2016-11-06T02:45:51-08:00',
   u'unk:group_id': {u'$': 42057, u'type': u'xsd:int'},
   u'unk:hostname': u'Gladstone.domain',
   u'unk:interpreter': u'2.7.10 (default, Oct 23 2015, 19:19:21) ',
   u'unk:platform': u'Darwin-15.5.0-x86_64-i386-64bit',
   u'unk:process_id': {u'$': 42064, u'type': u'xsd:int'},
   u'unk:sge_job_id': u'100'}},
 u'agent': {u'people:vr24': {u'unk:fullname': u'Vincent Reuter',
   u'unk:homedir': u'/Users/vr24'}},
 u'entity': {u'code:tests/make_history.py': {u'unk:script': u'/Users/vr24/code/provda/tests/make_history.py',
   u'unk:version_branch': u'tinkering',
   u'unk:version_branch_hash': u'372d74f21713f47642fc424e7e3289f38b2ed5a0',
   u'unk:version_remote': u'https://vr24@stash.ihme.washington.edu/scm/~adolgert/provda.git'},
  u'doc:gbd

In [10]:
def get_unique_values(doc_key, logs):
    """Collect the distinct values stored under `doc_key` across `logs`.

    :param doc_key: key to look up in every record
    :param logs: iterable of dict-like records
    :return: set of the values found under `doc_key`
    :raises KeyError: if a record does not contain `doc_key`
    """
    unique_values = set()
    for record in logs:
        unique_values.add(record[doc_key])
    return unique_values

instances = get_unique_values(INSTANCE_KEY, logs)
documents = get_unique_values(DOCUMENT_KEY, logs)
messages = get_unique_values(MESSAGE_KEY, logs)
num_values_by_key = {
    INSTANCE_KEY: len(instances), 
    DOCUMENT_KEY: len(documents), 
    MESSAGE_KEY: len(messages)
}
num_values_by_key

{'@message': 1, 'document': 361, 'instance': 523}

In [11]:
# OK, let's actually go type-wise and try to define a mapping for each "@fields".
# Map every document type to the list of "@fields" objects of its records.
# Uses the FIELDS_KEY constant (as the per-type sample cell does) and a
# comprehension variable that does not shadow the full `logs` list.
fields_field_by_type = {doc_type: [log[FIELDS_KEY] for log in type_logs]
                        for doc_type, type_logs in logs_by_type.items()}

In [12]:
# Let's check out the possibilities for an "activity."
activity_fields_keys = set()
for fields in fields_field_by_type["activity"]:
    activity_fields_keys |= set(fields.keys())
activity_fields_keys

{u'is:00129b99-6eea-4b4d-9b61-8c6153c41b5d',
 u'is:005dde4e-5d43-4d5a-b761-4a336a03a0f2',
 u'is:00a2325e-95a0-4bb8-a14b-5f133a0f1608',
 u'is:00e6a7e2-5cd6-418b-b4de-d77e321d8e9c',
 u'is:0284f0ad-5357-4273-9c1f-0e2462581d4d',
 u'is:0416b8ba-89a3-4aad-ac51-9dcaa54dfb72',
 u'is:061806c9-32e1-44d1-aeaf-73aa90aa03ef',
 u'is:0629bae4-689b-44e6-be78-01ca5140ad08',
 u'is:076da3c1-8c6e-4038-afc5-fed7550c2587',
 u'is:07bb8d38-966d-4de2-a038-40e13ce4f0cc',
 u'is:07c2b9f3-e318-4344-9d61-7c47fcfd4cca',
 u'is:08be2178-0314-425e-8d3c-1ee510c5f93e',
 u'is:08ebd293-b123-46c2-91ed-4ad5384b94c4',
 u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
 u'is:0aefb9f0-c849-4bf6-b519-4d2dbcd92fd7',
 u'is:0bf11b61-d464-4621-a1ea-3c5704142255',
 u'is:0c135172-dd66-4a53-9d7d-332a376d8c38',
 u'is:0c3f93a9-9927-41b5-90b3-6c84dfa38adb',
 u'is:0cc6b959-1950-4b4b-9387-215fdf711d40',
 u'is:0cce5da9-f531-4717-9e1a-1ba0af3e8105',
 u'is:0d153790-4efd-4ac9-8ed4-ffbf3becb140',
 u'is:0da325e9-e56e-4003-b888-6db11eef436a',
 u'is:0ef5

In [13]:
# OK, each "@fields" value for an activity appears to be a single K-V pair mapping (object).
# Let's check that hypothesis.
assert all([1 == len(fields_object) for fields_object in fields_field_by_type["activity"]])

In [14]:
# OK, let's investigate the values.
activity_fields_object_values_keys = set()
for instance in fields_field_by_type["activity"]:
    for kv_pairs in instance.values():
        activity_fields_object_values_keys |= set(kv_pairs.keys())
activity_fields_object_values_keys

{u'unk:args',
 u'unk:command',
 u'unk:date',
 u'unk:group_id',
 u'unk:hostname',
 u'unk:interpreter',
 u'unk:platform',
 u'unk:process_id',
 u'unk:sge_job_id'}

In [15]:
# Cool! It appears that each "@fields" instance is an object with uniform schema.
# Let's look back at the space of document types to bifurcate the document types
# as either relationship (edge) type or node type.
logs_by_type.keys()

[u'wasAssociatedWith',
 u'hadMember',
 u'used',
 u'agent',
 u'entity',
 u'prefix',
 u'activity',
 u'wasInfluencedBy',
 u'wasGeneratedBy']

In [16]:
# What is going on with "prefix?"
len(logs_by_type["prefix"])

2506

In [17]:
len({log["document"] for log in logs_by_type["prefix"]})

358

In [18]:
len(logs)

7103

In [19]:
# Hmm, how many document tags are there within the collection of logs?
unique_document_ids = {log["document"] for log in logs}
len(unique_document_ids)

361

In [20]:
# How many activities are there?
len(logs_by_type["activity"])

358

In [21]:
# What agents are out there?
len(logs_by_type["agent"])

361

In [22]:
# What defines an agent?
log_trio_by_type["agent"]

[{u'@fields': {u'people:vr24': {u'unk:fullname': u'Vincent Reuter',
    u'unk:homedir': u'/Users/vr24'}},
  u'@message': u'create_file3',
  u'@source_host': u'withme',
  u'@timestamp': u'2016-11-06T10:45:51.927Z',
  u'@version': 1,
  u'document': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
  u'host': u'127.0.0.1',
  u'instance': u'people:vr24',
  u'port': 56339,
  u'prov': u'agent'},
 {u'@fields': {u'people:vr24': {u'unk:fullname': u'Vincent Reuter',
    u'unk:homedir': u'/Users/vr24'}},
  u'@message': u'create_file3',
  u'@source_host': u'withme',
  u'@timestamp': u'2016-11-06T10:45:52.471Z',
  u'@version': 1,
  u'document': u'is:5a4e5132-cf82-4bb1-99b6-d4a673391512',
  u'host': u'127.0.0.1',
  u'instance': u'people:vr24',
  u'port': 56340,
  u'prov': u'agent'},
 {u'@fields': {u'people:vr24': {u'unk:fullname': u'Vincent Reuter',
    u'unk:homedir': u'/Users/vr24'}},
  u'@message': u'create_file3',
  u'@source_host': u'withme',
  u'@timestamp': u'2016-11-06T10:45:53.581Z',
  u'@version

In [23]:
# Is an agent defined by its fields?
agent_fields_lengths = {len(log["@fields"]) for log in logs_by_type["agent"]}
agent_fields_lengths

{1}

In [24]:
# OK, let's assume that an agent is defined by its fields.
# An agent also seems to be keyed on 'instance' (e.g., "people:vr24")
# The fields is then a single-object key-value pair mapping, with key matching 'instance', 
# e.g. {"people:vr24": {"unk:fullname": "Vincent Reuter", "unk:homedir": "/Users/vr24"}}

In [25]:
len(logs_by_type)

9

In [26]:
# Let's look at the other nodes, specifically, "activity" and "entity."

In [27]:
log_trio_by_type["activity"]

[{u'@fields': {u'is:0a16324a-0017-47e9-a727-199d1f3e0fce': {u'unk:args': u'--child calculate_pafs --tag first_history_test0',
    u'unk:command': u'/Users/vr24/virtualenvs/general_personal_dev_env/bin/python',
    u'unk:date': u'2016-11-06T02:45:51-08:00',
    u'unk:group_id': {u'$': 42057, u'type': u'xsd:int'},
    u'unk:hostname': u'Gladstone.domain',
    u'unk:interpreter': u'2.7.10 (default, Oct 23 2015, 19:19:21) ',
    u'unk:platform': u'Darwin-15.5.0-x86_64-i386-64bit',
    u'unk:process_id': {u'$': 42064, u'type': u'xsd:int'},
    u'unk:sge_job_id': u'100'}},
  u'@message': u'create_file3',
  u'@source_host': u'withme',
  u'@timestamp': u'2016-11-06T10:45:51.928Z',
  u'@version': 1,
  u'document': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
  u'host': u'127.0.0.1',
  u'instance': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
  u'port': 56339,
  u'prov': u'activity'},
 {u'@fields': {u'is:5a4e5132-cf82-4bb1-99b6-d4a673391512': {u'unk:args': u'--child calculate_pafs --tag first_hist

In [28]:
# For activities, what does the instance space look like?
activity_instances = {log["instance"] for log in logs_by_type["activity"]}
len(activity_instances)

358

In [29]:
activity_instances

{u'is:00129b99-6eea-4b4d-9b61-8c6153c41b5d',
 u'is:005dde4e-5d43-4d5a-b761-4a336a03a0f2',
 u'is:00a2325e-95a0-4bb8-a14b-5f133a0f1608',
 u'is:00e6a7e2-5cd6-418b-b4de-d77e321d8e9c',
 u'is:0284f0ad-5357-4273-9c1f-0e2462581d4d',
 u'is:0416b8ba-89a3-4aad-ac51-9dcaa54dfb72',
 u'is:061806c9-32e1-44d1-aeaf-73aa90aa03ef',
 u'is:0629bae4-689b-44e6-be78-01ca5140ad08',
 u'is:076da3c1-8c6e-4038-afc5-fed7550c2587',
 u'is:07bb8d38-966d-4de2-a038-40e13ce4f0cc',
 u'is:07c2b9f3-e318-4344-9d61-7c47fcfd4cca',
 u'is:08be2178-0314-425e-8d3c-1ee510c5f93e',
 u'is:08ebd293-b123-46c2-91ed-4ad5384b94c4',
 u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
 u'is:0aefb9f0-c849-4bf6-b519-4d2dbcd92fd7',
 u'is:0bf11b61-d464-4621-a1ea-3c5704142255',
 u'is:0c135172-dd66-4a53-9d7d-332a376d8c38',
 u'is:0c3f93a9-9927-41b5-90b3-6c84dfa38adb',
 u'is:0cc6b959-1950-4b4b-9387-215fdf711d40',
 u'is:0cce5da9-f531-4717-9e1a-1ba0af3e8105',
 u'is:0d153790-4efd-4ac9-8ed4-ffbf3becb140',
 u'is:0da325e9-e56e-4003-b888-6db11eef436a',
 u'is:0ef5

In [30]:
# OK great, each activity instance tag seems to be a single, hash-like string prefixed with 'is:' (perhaps namespace?)
# Let's investigate the fields for an activity
logs_by_type["activity"][0]

{u'@fields': {u'is:0a16324a-0017-47e9-a727-199d1f3e0fce': {u'unk:args': u'--child calculate_pafs --tag first_history_test0',
   u'unk:command': u'/Users/vr24/virtualenvs/general_personal_dev_env/bin/python',
   u'unk:date': u'2016-11-06T02:45:51-08:00',
   u'unk:group_id': {u'$': 42057, u'type': u'xsd:int'},
   u'unk:hostname': u'Gladstone.domain',
   u'unk:interpreter': u'2.7.10 (default, Oct 23 2015, 19:19:21) ',
   u'unk:platform': u'Darwin-15.5.0-x86_64-i386-64bit',
   u'unk:process_id': {u'$': 42064, u'type': u'xsd:int'},
   u'unk:sge_job_id': u'100'}},
 u'@message': u'create_file3',
 u'@source_host': u'withme',
 u'@timestamp': u'2016-11-06T10:45:51.928Z',
 u'@version': 1,
 u'document': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
 u'host': u'127.0.0.1',
 u'instance': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
 u'port': 56339,
 u'prov': u'activity'}

In [31]:
# OK, there are many more properties than for an agent, but this is manageable.
# Activity is defined by "@fields" object, too.
# Is there any variability in the prefix for each of the field names for an activity instance?
activity_fields_object_values_keys = set()
for activity_document in logs_by_type["activity"]:
    # Materialize the values as a list: on Python 3 dict.values() is a view
    # that cannot be indexed.
    fields_object_values = list(activity_document["@fields"].values())
    # Every activity's "@fields" is expected to hold exactly one object.
    assert 1 == len(fields_object_values)
    activity_fields_object_values_keys.update(fields_object_values[0].keys())
len(activity_fields_object_values_keys)

9

In [32]:
activity_fields_object_values_keys

{u'unk:args',
 u'unk:command',
 u'unk:date',
 u'unk:group_id',
 u'unk:hostname',
 u'unk:interpreter',
 u'unk:platform',
 u'unk:process_id',
 u'unk:sge_job_id'}

In [33]:
# What about an entity?
log_trio_by_type["entity"]

[{u'@fields': {u'code:tests/make_history.py': {u'unk:script': u'/Users/vr24/code/provda/tests/make_history.py',
    u'unk:version_branch': u'tinkering',
    u'unk:version_branch_hash': u'372d74f21713f47642fc424e7e3289f38b2ed5a0',
    u'unk:version_remote': u'https://vr24@stash.ihme.washington.edu/scm/~adolgert/provda.git'},
   u'doc:gbd-read/schema/table': {},
   u'doc:gbd/first_history_test0/cvd_ihd.hdf': {},
   u'doc:paf/first_history_test0/cvd_ihd.hdf': {}},
  u'@message': u'create_file3',
  u'@source_host': u'withme',
  u'@timestamp': u'2016-11-06T10:45:51.927Z',
  u'@version': 1,
  u'document': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
  u'host': u'127.0.0.1',
  u'instance': u'doc:gbd-read/schema/table',
  u'port': 56339,
  u'prov': u'entity'},
 {u'@fields': {u'code:tests/make_history.py': {u'unk:script': u'/Users/vr24/code/provda/tests/make_history.py',
    u'unk:version_branch': u'tinkering',
    u'unk:version_branch_hash': u'372d74f21713f47642fc424e7e3289f38b2ed5a0',
    u'un

In [34]:
logs[0]

{u'@fields': {u'_:id2': {u'prov:activity': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
   u'prov:agent': u'people:vr24'}},
 u'@message': u'create_file3',
 u'@source_host': u'withme',
 u'@timestamp': u'2016-11-06T10:45:51.927Z',
 u'@version': 1,
 u'document': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
 u'host': u'127.0.0.1',
 u'instance': u'_:id2',
 u'port': 56339,
 u'prov': u'wasAssociatedWith'}

In [39]:
# Select the records whose 'instance' equals one particular activity ID.
instance_id = 'is:0a16324a-0017-47e9-a727-199d1f3e0fce'
# A list comprehension always yields a list, so len() and indexing also work
# on Python 3, where filter() returns a lazy iterator instead.
specific_instance_logs = [log for log in logs if log["instance"] == instance_id]
len(specific_instance_logs)

1

In [40]:
specific_instance_logs[0]

{u'@fields': {u'is:0a16324a-0017-47e9-a727-199d1f3e0fce': {u'unk:args': u'--child calculate_pafs --tag first_history_test0',
   u'unk:command': u'/Users/vr24/virtualenvs/general_personal_dev_env/bin/python',
   u'unk:date': u'2016-11-06T02:45:51-08:00',
   u'unk:group_id': {u'$': 42057, u'type': u'xsd:int'},
   u'unk:hostname': u'Gladstone.domain',
   u'unk:interpreter': u'2.7.10 (default, Oct 23 2015, 19:19:21) ',
   u'unk:platform': u'Darwin-15.5.0-x86_64-i386-64bit',
   u'unk:process_id': {u'$': 42064, u'type': u'xsd:int'},
   u'unk:sge_job_id': u'100'}},
 u'@message': u'create_file3',
 u'@source_host': u'withme',
 u'@timestamp': u'2016-11-06T10:45:51.928Z',
 u'@version': 1,
 u'document': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
 u'host': u'127.0.0.1',
 u'instance': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
 u'port': 56339,
 u'prov': u'activity'}

In [42]:
# Select the records that belong to one particular document ID.
document_id = 'is:0a16324a-0017-47e9-a727-199d1f3e0fce'
# A list comprehension always yields a list, so len() also works on
# Python 3, where filter() returns a lazy iterator instead.
specific_document_logs = [log for log in logs if log["document"] == document_id]
len(specific_document_logs)

18

In [43]:
specific_document_logs

[{u'@fields': {u'_:id2': {u'prov:activity': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
    u'prov:agent': u'people:vr24'}},
  u'@message': u'create_file3',
  u'@source_host': u'withme',
  u'@timestamp': u'2016-11-06T10:45:51.927Z',
  u'@version': 1,
  u'document': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
  u'host': u'127.0.0.1',
  u'instance': u'_:id2',
  u'port': 56339,
  u'prov': u'wasAssociatedWith'},
 {u'@fields': {u'_:id1': {u'prov:activity': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
    u'prov:entity': u'code:tests/make_history.py'},
   u'_:id3': {u'prov:activity': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
    u'prov:entity': u'doc:gbd/first_history_test0/cvd_ihd.hdf'},
   u'_:id4': {u'prov:activity': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
    u'prov:entity': u'doc:gbd-read/schema/table'}},
  u'@message': u'create_file3',
  u'@source_host': u'withme',
  u'@timestamp': u'2016-11-06T10:45:51.927Z',
  u'@version': 1,
  u'document': u'is:0a16324a-0017-47e9-a727-199d1f3e0

In [44]:
# Does the space of documents account for the total log collection? Let's investigate!
document_ids = {log["document"] for log in logs}
len(document_ids)

361

In [45]:
def filter_records(logs, field, match_value):
    """Return the records of `logs` whose `field` value equals `match_value`.

    :param logs: iterable of dict-like records
    :param field: key to compare in every record
    :param match_value: value a record's `field` must equal to be kept
    :return: list of matching records, in their original order
    :raises KeyError: if a record does not contain `field`
    """
    matching = []
    for record in logs:
        if record[field] == match_value:
            matching.append(record)
    return matching

# What's the distribution of number of records per document ID?
# Count the records of every document in a single pass over the logs, instead
# of rescanning the whole log list once per document ID.
records_per_document = Counter(log["document"] for log in logs)
# Histogram: record count -> number of documents having that many records.
num_docs_by_record_count = defaultdict(int)
for record_count in records_per_document.values():
    num_docs_by_record_count[record_count] += 1
num_docs_by_record_count

defaultdict(int, {17: 3, 18: 264, 20: 64, 34: 30})

In [46]:
{log["prov"] for log in logs}

{u'activity',
 u'agent',
 u'entity',
 u'hadMember',
 u'prefix',
 u'used',
 u'wasAssociatedWith',
 u'wasGeneratedBy',
 u'wasInfluencedBy'}

In [50]:
# What's the range on timestamps?
timestamp_histogram = Counter([log["@timestamp"].split(".")[0] for log in logs])

In [51]:
timestamp_histogram

Counter({u'2016-11-06T10:45:51': 18,
         u'2016-11-06T10:45:52': 18,
         u'2016-11-06T10:45:53': 18,
         u'2016-11-06T10:45:54': 36,
         u'2016-11-06T10:45:55': 18,
         u'2016-11-06T10:45:56': 36,
         u'2016-11-06T10:45:57': 20,
         u'2016-11-06T10:45:58': 38,
         u'2016-11-06T10:45:59': 36,
         u'2016-11-06T10:46:00': 36,
         u'2016-11-06T10:46:01': 36,
         u'2016-11-06T10:46:02': 18,
         u'2016-11-06T10:46:03': 40,
         u'2016-11-06T10:46:05': 36,
         u'2016-11-06T10:46:06': 36,
         u'2016-11-06T10:46:07': 36,
         u'2016-11-06T10:46:08': 36,
         u'2016-11-06T10:46:09': 40,
         u'2016-11-06T10:46:10': 18,
         u'2016-11-06T10:46:11': 36,
         u'2016-11-06T10:46:12': 36,
         u'2016-11-06T10:46:13': 54,
         u'2016-11-06T10:46:14': 20,
         u'2016-11-06T10:46:15': 38,
         u'2016-11-06T10:46:16': 36,
         u'2016-11-06T10:46:17': 36,
         u'2016-11-06T10:46:18': 54,
 

In [52]:
# Records whose 'instance' starts with "code".
# Built as a list so that len() and code_logs[0] also work on Python 3,
# where filter() returns a lazy iterator instead of a list.
code_logs = [log for log in logs if log["instance"].startswith("code")]
len(code_logs)

718

In [53]:
len(logs)

7103

In [56]:
code_logs[0]

{u'@fields': {u'code:tests/make_history.py': {u'unk:script': u'/Users/vr24/code/provda/tests/make_history.py',
   u'unk:version_branch': u'tinkering',
   u'unk:version_branch_hash': u'372d74f21713f47642fc424e7e3289f38b2ed5a0',
   u'unk:version_remote': u'https://vr24@stash.ihme.washington.edu/scm/~adolgert/provda.git'},
  u'doc:gbd-read/schema/table': {},
  u'doc:gbd/first_history_test0/cvd_ihd.hdf': {},
  u'doc:paf/first_history_test0/cvd_ihd.hdf': {}},
 u'@message': u'create_file3',
 u'@source_host': u'withme',
 u'@timestamp': u'2016-11-06T10:45:51.927Z',
 u'@version': 1,
 u'document': u'is:0a16324a-0017-47e9-a727-199d1f3e0fce',
 u'host': u'127.0.0.1',
 u'instance': u'code:tests/make_history.py',
 u'port': 56339,
 u'prov': u'entity'}

In [67]:
def dt_from_record(record):
    """Parse a record's "@timestamp" string into a naive datetime.

    Accepts timestamps with fractional seconds
    (e.g. "2016-11-06T10:45:51.927Z") and whole-second timestamps
    (e.g. "2016-11-06T10:45:51Z").

    :param record: dict-like log record holding an "@timestamp" string
    :return: datetime.datetime without tzinfo; the trailing "Z" is matched
        as a literal character and is not converted into a timezone
    :raises KeyError: if the record has no "@timestamp"
    :raises ValueError: if the text matches neither supported layout
    """
    dt_text = record["@timestamp"]
    # "%f" requires a fractional part, so choose the layout based on whether
    # the text carries one.
    if "." in dt_text:
        dt_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    else:
        dt_format = "%Y-%m-%dT%H:%M:%SZ"
    return datetime.datetime.strptime(dt_text, dt_format)

In [68]:
dt_from_record(code_logs[0])

datetime.datetime(2016, 11, 6, 10, 45, 51, 927000)

In [66]:
datetime.datetime.strptime('2016-11-06T10:45:51Z', '%Y-%m-%dT%H:%M:%SZ')

datetime.datetime(2016, 11, 6, 10, 45, 51)

In [70]:
code_logs_times = sorted(map(dt_from_record, code_logs))
code_logs_times[0]

datetime.datetime(2016, 11, 6, 10, 45, 51, 927000)

In [71]:
min(code_logs_times)

datetime.datetime(2016, 11, 6, 10, 45, 51, 927000)

In [72]:
max(code_logs_times)

datetime.datetime(2016, 11, 6, 11, 5, 0, 596000)