In [2]:
# imports
import itertools
import os
import random
import six

from ml_metadata.proto import metadata_store_pb2

In [3]:
# Import shared utils.
# NOTE(review): names used below but not defined in this notebook
# (`delete_sqlite_db`, `get_metadata_store`, and the `TFX_ARTIFACT_*` /
# `TFX_EXECUTION_*` constants) are presumably provided by this %run — confirm.
%run 'mlmd_utils.ipynb'

In [4]:
# Run TFX to generate TFMA and Tensorboard events.
# cd ~
# git clone https://github.com/tensorflow/tfx.git
# sudo apt-get install python-pip python-virtualenv python-dev build-essential
# cd tfx/examples/chicago_taxi/
# pip install -r requirements.txt
# jupyter nbextension install --py --symlink --sys-prefix tensorflow_model_analysis
# jupyter nbextension enable --py --sys-prefix tensorflow_model_analysis
# 
# !! Restart your jupyter notebook to pick up these tfma notebook extensions.!!
#
# bash ./tfdv_analyze_and_validate_local.sh
# bash ./preprocess_local.sh
# bash ./train_local.sh
# vim process_tfma.py +64
# Change tfma.slicer.SingleSliceSpec to tfma.slicer.slicer.SingleSliceSpec on lines 64 and 65.
# bash ./process_tfma_local.sh

In [5]:
# NOTE(review): `delete_sqlite_db` is not defined in this notebook; presumably it
# comes from mlmd_utils.ipynb and removes the existing SQLite DB so the cells
# below populate an empty store — confirm.
delete_sqlite_db()

In [6]:
# util methods specific to this notebook (mlmd_test_data).

def _set_artifact_or_execution_type_properties(mlmd_type, **properties):
    """Declares one typed property on `mlmd_type` per keyword argument.

    Only the Python type of each value is inspected; the value itself is not
    stored. Strings are declared as STRING, integers as INT and exact `float`
    instances as DOUBLE.

    Args:
        mlmd_type: the ArtifactType/ExecutionType proto to update in place.
        **properties: property name -> sample value of the desired type.

    Raises:
        ValueError: if a sample value is not a string, an integer or a float.
    """
    for name, sample in properties.items():
        # Work out the MLMD property type first, then assign it in one place.
        if isinstance(sample, six.string_types):
            property_type = metadata_store_pb2.STRING
        elif isinstance(sample, six.integer_types):
            property_type = metadata_store_pb2.INT
        elif type(sample) is float:
            property_type = metadata_store_pb2.DOUBLE
        else:
            raise ValueError(
                '{}\'s type {} must be a string/int/float'.format(name, type(sample)))
        mlmd_type.properties[name] = property_type


def create_artifact_type(store, type_name, **properties):
    """Registers an artifact type named `type_name` in `store`.

    Args:
        store: object exposing `put_artifact_type`.
        type_name: name given to the new artifact type.
        **properties: property name -> sample value; only the value's Python
            type is used, to declare the property type.

    Returns:
        Whatever `store.put_artifact_type` returns (used as the type id by the
        rest of this notebook).
    """
    artifact_type = metadata_store_pb2.ArtifactType()
    artifact_type.name = type_name
    _set_artifact_or_execution_type_properties(artifact_type, **properties)
    type_id = store.put_artifact_type(artifact_type)
    return type_id


def create_execution_type(store, type_name, **properties):
    """Registers an execution type named `type_name` in `store`.

    Args:
        store: object exposing `put_execution_type`.
        type_name: name given to the new execution type.
        **properties: property name -> sample value; only the value's Python
            type is used, to declare the property type.

    Returns:
        Whatever `store.put_execution_type` returns (used as the type id by the
        rest of this notebook).
    """
    execution_type = metadata_store_pb2.ExecutionType()
    execution_type.name = type_name
    _set_artifact_or_execution_type_properties(execution_type, **properties)
    type_id = store.put_execution_type(execution_type)
    return type_id

def _set_artifact_or_execution_properties(obj, is_artifact, **properties):
    """Copies the given keyword arguments onto an artifact/execution proto `obj`.

    Args:
        obj: the Artifact or Execution proto to update in place.
        is_artifact: when true, a property named 'uri' is written to `obj.uri`
            instead of the `properties` map.
        **properties: property name -> value. Strings, integers and exact
            `float` instances are supported.

    Raises:
        ValueError: if a value is not a string, an integer or a float.
    """
    for k, v in properties.items():
        if is_artifact and k == 'uri':
            # `uri` is a dedicated field on the artifact, not a map entry.
            obj.uri = v
        elif isinstance(v, six.string_types):
            obj.properties[k].string_value = v
        elif isinstance(v, six.integer_types):
            obj.properties[k].int_value = v
        elif type(v) is float:
            # The MLMD `Value` message stores floating point numbers in
            # `double_value` (matching the DOUBLE property type declared by
            # `_set_artifact_or_execution_type_properties`); it has no
            # `float_value` field.
            obj.properties[k].double_value = v
        else:
            raise ValueError(
                '{}\'s type {} must be a string/int/float'.format(k, type(v)))

def create_artifact(type_name, **properties):
    """Stores a new artifact of type `type_name` and returns its id.

    Relies on the notebook-level globals `tfx_artifact_types` (type name ->
    type id) and `store`.

    Args:
        type_name: key into `tfx_artifact_types`.
        **properties: property name -> value; 'uri' is written to the
            artifact's `uri` field, everything else to its properties map.

    Returns:
        The single id returned by `store.put_artifacts`.
    """
    artifact = metadata_store_pb2.Artifact()
    artifact.type_id = tfx_artifact_types[type_name]
    _set_artifact_or_execution_properties(artifact, True, **properties)

    # Attach a random custom tag whenever a draw from 0..10 comes out even.
    attach_tag = random.randint(0, 10) % 2 == 0
    if attach_tag:
        artifact.custom_properties['optional_tag'].int_value = random.randint(0, 100)

    # Exactly one artifact was submitted, so exactly one id is expected back.
    (artifact_id,) = store.put_artifacts([artifact])
    return artifact_id

def create_execution(type_name, **properties):
    """Stores a new execution of type `type_name` and returns its id.

    Relies on the notebook-level globals `tfx_execution_types` (type name ->
    type id) and `store`.

    Args:
        type_name: key into `tfx_execution_types`.
        **properties: property name -> value written to the execution's
            properties map.

    Returns:
        The single id returned by `store.put_executions`.
    """
    execution = metadata_store_pb2.Execution()
    execution.type_id = tfx_execution_types[type_name]
    _set_artifact_or_execution_properties(execution, False, **properties)

    # Attach a random custom tag whenever a draw from 0..10 comes out even.
    attach_tag = random.randint(0, 10) % 2 == 0
    if attach_tag:
        execution.custom_properties['optional_tag'].int_value = random.randint(0, 100)

    # Exactly one execution was submitted, so exactly one id is expected back.
    (execution_id,) = store.put_executions([execution])
    return execution_id

def create_events(execution_id, input_artifact_ids, output_artifact_ids):
    """Records which artifacts an execution consumed and produced.

    One DECLARED_INPUT event is built per id in `input_artifact_ids`, followed
    by one DECLARED_OUTPUT event per id in `output_artifact_ids`; all of them
    are written through the notebook-level `store` in a single call.

    Args:
        execution_id: id of the execution every event is attached to.
        input_artifact_ids: iterable of artifact ids read by the execution.
        output_artifact_ids: iterable of artifact ids written by the execution.

    Returns:
        Whatever `store.put_events` returns.
    """
    def _new_event(artifact_id, event_type):
        # Builds one event linking `artifact_id` to the enclosing execution.
        event = metadata_store_pb2.Event()
        event.artifact_id = artifact_id
        event.execution_id = execution_id
        event.type = event_type
        return event

    input_events = [
        _new_event(artifact_id, metadata_store_pb2.Event.DECLARED_INPUT)
        for artifact_id in input_artifact_ids
    ]
    output_events = [
        _new_event(artifact_id, metadata_store_pb2.Event.DECLARED_OUTPUT)
        for artifact_id in output_artifact_ids
    ]

    # Inputs first, then outputs.
    return store.put_events(input_events + output_events)

In [7]:
# Setup a `metadata_store.MetadataStore` that connects to a SQLITE backend based off
# ~/tfx_metadata_sqlite.db.
# `store` is a notebook-level global: `create_artifact`, `create_execution` and
# `create_events` use it implicitly, so this cell must run before they are called.
store = get_metadata_store()

In [8]:
# Create TFX artifact and execution types.
def create_tfx_artifact_types():
    """Registers every TFX artifact type used by this notebook.

    Each entry pairs a type name with sample property values; only the Python
    type of each sample matters (it selects the declared property type).

    Returns:
        A dict mapping each type name to the value returned by
        `create_artifact_type` for it.
    """
    type_specs = [
        (TFX_ARTIFACT_EXAMPLES, dict(span=0, split='', version=0)),
        (TFX_ARTIFACT_SCHEMA, dict(version=0)),
        (TFX_ARTIFACT_EXAMPLE_VALIDATION, {}),
        (TFX_ARTIFACT_EXAMPLE_STATS,
         dict(num_numeric_features=0, num_categorical_features=0)),
        (TFX_ARTIFACT_TRANSFORMED_EXAMPLES,
         dict(span=0, split='', version=0, transform_name="")),
        (TFX_ARTIFACT_MODEL, dict(model_type='', model_disk_size=0)),
        (TFX_ARTIFACT_MODEL_EVAL, dict(evaluation_name="")),
    ]
    # Types are registered in the order listed above.
    return {
        type_name: create_artifact_type(store, type_name, **sample_properties)
        for type_name, sample_properties in type_specs
    }

def create_tfx_execution_types():
    """Registers every TFX execution type used by this notebook.

    Each entry pairs a type name with sample property values; only the Python
    type of each sample matters (it selects the declared property type).

    Returns:
        A dict mapping each type name to the value returned by
        `create_execution_type` for it.
    """
    type_specs = [
        (TFX_EXECUTION_EXAMPLE_GEN, dict(start_time=0, end_time=0)),
        (TFX_EXECUTION_STATISTICS_GEN, dict(start_time=0, end_time=0)),
        (TFX_EXECUTION_SCHEMA_GEN, dict(start_time=0, end_time=0)),
        (TFX_EXECUTION_EXAMPLE_VALIDATION, dict(start_time=0, end_time=0)),
        (TFX_EXECUTION_TRANSFORM, dict(name='', start_time=0, end_time=0)),
        (TFX_EXECUTION_TRAINER,
         dict(algorithm='', hparams_csv='', start_time=0, end_time=0)),
        (TFX_EXECUTION_EVALUATOR, dict(start_time=0, end_time=0)),
    ]
    # Types are registered in the order listed above.
    return {
        type_name: create_execution_type(store, type_name, **sample_properties)
        for type_name, sample_properties in type_specs
    }

# Notebook-level lookup tables (type name -> value returned by the store when the
# type was registered). `create_artifact` and `create_execution` read them to
# fill in `type_id`, so this cell must run before those helpers are called.
tfx_artifact_types = create_tfx_artifact_types()
tfx_execution_types = create_tfx_execution_types()

# Uncomment to inspect the registered types.
# print(tfx_artifact_types)
# print(tfx_execution_types)

In [9]:
# Fake some artifacts, executions and events.
# One full simulated pipeline run (example gen -> stats -> schema -> validation ->
# transform -> trainer -> evaluator) is recorded per (span, split, version)
# combination below.

data_combinations = [
    [1],  # spans
    ["train", "eval"],  # splits
    [1, 2]  # versions
]

# Resolve the home directory once instead of reading os.environ['HOME'] for every
# URI. `os.path.expanduser('~')` uses $HOME when it is set and falls back to the
# platform's own lookup otherwise, so this cell does not raise KeyError when HOME
# is missing from the environment.
home_dir = os.path.expanduser('~')

for span, split, version in itertools.product(*data_combinations):
    # Simulate a run of TFX_EXECUTION_EXAMPLE_GEN.
    # This run generates the examples.
    example_gen_execution_id = create_execution(
        TFX_EXECUTION_EXAMPLE_GEN, start_time=0, end_time=1)
    examples_artifact_id = create_artifact(
        TFX_ARTIFACT_EXAMPLES, span=span, split=split, version=version)
    create_events(example_gen_execution_id, [], [examples_artifact_id])

    # Simulate a run of TFX_EXECUTION_STATISTICS_GEN.
    # This run consumes data to generate stats.
    # NOTE(review): the URI always points at eval_stats.tfrecord, whatever
    # `split` is — confirm this is intended.
    stats_gen_execution_id = create_execution(
        TFX_EXECUTION_STATISTICS_GEN, start_time=1, end_time=2)
    example_stats_artifact_id = create_artifact(
        TFX_ARTIFACT_EXAMPLE_STATS, num_numeric_features=random.randrange(0, 10),
        num_categorical_features=random.randrange(0, 10),
        uri=os.path.join(
            home_dir,
            'tfx/examples/chicago_taxi/data/local_tfdv_output/eval_stats.tfrecord'))
    create_events(
        stats_gen_execution_id, [examples_artifact_id],
        [example_stats_artifact_id])

    # Simulate a run of TFX_EXECUTION_SCHEMA_GEN.
    # This run consumes stats and data to generate schema.
    # NOTE(review): the schema version is fixed at 1 and does not follow the
    # loop's `version` — confirm this is intended.
    schema_gen_execution_id = create_execution(
        TFX_EXECUTION_SCHEMA_GEN, start_time=2, end_time=3)
    schema_artifact_id = create_artifact(TFX_ARTIFACT_SCHEMA, version=1)
    create_events(
        schema_gen_execution_id, [examples_artifact_id, example_stats_artifact_id],
        [schema_artifact_id])

    # Simulate a run of TFX_EXECUTION_EXAMPLE_VALIDATION.
    # This run consumes schema and data to validate the data.
    example_validation_execution_id = create_execution(
        TFX_EXECUTION_EXAMPLE_VALIDATION, start_time=3, end_time=4)
    example_validation_artifact_id = create_artifact(TFX_ARTIFACT_EXAMPLE_VALIDATION)
    create_events(
        example_validation_execution_id, [examples_artifact_id, schema_artifact_id],
        [example_validation_artifact_id])

    # Simulate a run of TFX_EXECUTION_TRANSFORM.
    # This run consumes data and schema to generate transformed data.
    transform_execution_id = create_execution(
        TFX_EXECUTION_TRANSFORM, name='transform', start_time=4, end_time=5)
    transformed_examples_artifact_id = create_artifact(
        TFX_ARTIFACT_TRANSFORMED_EXAMPLES, span=span, split=split, version=version,
        transform_name='transform')
    create_events(
        transform_execution_id, [examples_artifact_id, schema_artifact_id],
        [transformed_examples_artifact_id])

    # Simulate a run of TFX_EXECUTION_TRAINER.
    # This run consumes transformed data to generate a model and checkpoints/events along the way.
    trainer_execution_id = create_execution(
        TFX_EXECUTION_TRAINER, algorithm='DNNLinearCombined',
        hparams_csv="dnn_hidden_units=[10, 10],dropout=0.1,activation=relu",
        start_time=5, end_time=6)
    model_artifact_id = create_artifact(
        TFX_ARTIFACT_MODEL, model_type="tensorflow",
        model_disk_size=random.randrange(0, 100*1000*1000),
        uri=os.path.join(
            home_dir,
            'tfx/examples/chicago_taxi/data/train/local_chicago_taxi_output/serving_model_dir'))
    create_events(
        trainer_execution_id, [transformed_examples_artifact_id],
        [model_artifact_id])

    # Simulate a run of TFX_EXECUTION_EVALUATOR.
    # This run consumes data and model to generate an eval result.
    evaluator_execution_id = create_execution(
        TFX_EXECUTION_EVALUATOR,
        start_time=6, end_time=7)
    model_eval_artifact_id = create_artifact(
        TFX_ARTIFACT_MODEL_EVAL,
        uri=os.path.join(
            home_dir,
            'tfx/examples/chicago_taxi/data/train/local_chicago_taxi_output/eval_result'),
        evaluation_name='tfma_eval_run')
    create_events(
        evaluator_execution_id, [examples_artifact_id, model_artifact_id],
        [model_eval_artifact_id])

In [10]:
# test read only mode.
# Opens a second handle using the READONLY connection mode and issues a query
# through it.
r_store = get_metadata_store(
    connection_mode=metadata_store_pb2.SqliteMetadataSourceConfig.READONLY)
#print(r_store.get_artifacts_by_type(TFX_ARTIFACT_MODEL))
#print(r_store.get_events_by_execution_ids([6]))
# NOTE(review): artifact id 5 is hard-coded. Judging by the recorded output
# (declared output of execution 5, declared input of execution 6) it is
# presumably the first transformed-examples artifact, which only holds if ids
# were assigned sequentially from 1 on a freshly created DB — confirm.
print(r_store.get_events_by_artifact_ids([5]))

[artifact_id: 5
execution_id: 5
type: DECLARED_OUTPUT
milliseconds_since_epoch: 1550115700689
, artifact_id: 5
execution_id: 6
type: DECLARED_INPUT
milliseconds_since_epoch: 1550115700706
]
