### Install the _Kubeflow-metadata_ library

In [1]:
# Install the latest published `kubeflow-metadata` library:
!pip install kubeflow-metadata --user
# Install the other packages used by this notebook:
!pip install pandas --user
# Then restart the notebook kernel before running the cells below.

Collecting kubeflow-metadata
  Downloading kubeflow-metadata-0.3.1.tar.gz (10 kB)
Collecting ml-metadata==0.21.1
  Downloading ml_metadata-0.21.1-cp36-cp36m-manylinux2010_x86_64.whl (4.9 MB)
     |████████████████████████████████| 4.9 MB 43.9 MB/s eta 0:00:01
Collecting retrying
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: kubeflow-metadata, retrying
  Building wheel for kubeflow-metadata (setup.py) ... done
  Created wheel for kubeflow-metadata: filename=kubeflow_metadata-0.3.1-py3-none-any.whl size=12490 sha256=4145ab4fde8f292a67b55d8ecac8f2dc1dbad1fe1b56625df3d808f987e45653
  Stored in directory: /home/jovyan/.cache/pip/wheels/04/e9/b7/ef010eb3ef0e48343f764fbf1b2eafa6cd967fd468dc9808e5
  Building wheel for retrying (setup.py) ... done
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=9530 sha256=4e1003b8e8e6b46b0051c5da07bb8035b095dc95d483773119eb88247a1bdd8a
  Stored in directory: /hom

In [2]:
import pandas
from kubeflow.metadata import metadata
from datetime import datetime
from uuid import uuid4

In [3]:
# Default DNS name of the Kubeflow Metadata gRPC service.
METADATA_STORE_HOST = "metadata-grpc-service.kubeflow"
# gRPC port used together with the host name above.
METADATA_STORE_PORT = 8080

### Create a workspace

In [4]:
# Connect to the metadata service in namespace kubeflow in the k8s cluster,
# using the host and port configured in the previous cell.
metadata_store = metadata.Store(
    grpc_host=METADATA_STORE_HOST,
    grpc_port=METADATA_STORE_PORT,
)

# Workspace that the run, executions and artifacts created below are attached to.
ws1 = metadata.Workspace(
    store=metadata_store,
    name="workspace_2",
    description="a workspace for testing2",
    labels={"n1": "v1"},
)

### Create a run in a workspace

In [12]:
# The run name embeds the current UTC time in ISO 8601 form.
run_name = "run-" + datetime.utcnow().isoformat("T")

# `r` is referenced by the execution cell below, so the short name is kept.
r = metadata.Run(
    workspace=ws1,
    name=run_name,
    description="a run in ws_1",
)

### Create an execution in a run

In [13]:
# NOTE(review): `exec` shadows Python's built-in `exec()` from here on. The name
# is kept because later cells call `exec.log_input` / `exec.log_output` and read
# `exec.id`.
execution_name = "execution" + datetime.utcnow().isoformat("T")

# Execution that belongs to the run created above, inside the same workspace.
exec = metadata.Execution(
    name=execution_name,
    workspace=ws1,
    run=r,
    description="execution example",
)
print("An execution was created with id %s" % exec.id)

An execution was created with id 6


### Log a data set

In [14]:
# Version string for the data set; the random UUID suffix makes it differ on
# every execution of this cell.
# (Renamed from the misspelled `date_set_version`, which was only used here.)
data_set_version = "data_set_version_" + str(uuid4())

# Describe the data set and record it as an input of the execution. The value
# returned by `log_input` exposes the `id` and `version` printed below.
data_set = exec.log_input(
    metadata.DataSet(
        description="an example data",
        name="mytable-dump",
        owner="owner@my-company.org",
        uri="file://path/to/dataset",
        version=data_set_version,
        query="SELECT * FROM mytable",
    )
)
print("Data set id is {0.id} with version '{0.version}'".format(data_set))

Data set id is 9 with version 'data_set_version_e9ac739e-7590-4f3f-918d-822aaf669a77'


### Log a model

In [15]:
# Version string for the model; the UUID suffix differs on every execution of
# this cell.
model_version = "model_version_" + str(uuid4())

# Describe the model first ...
mnist_model = metadata.Model(
    name="MNIST",
    description="model to recognize handwritten digits",
    owner="someone@kubeflow.org",
    uri="gcs://my-bucket/mnist",
    model_type="neural network",
    training_framework={
        "name": "tensorflow",
        "version": "v1.0",
    },
    hyperparameters={
        "learning_rate": 0.5,
        "layers": [10, 3, 1],
        "early_stop": True,
    },
    version=model_version,
    labels={"mylabel": "l1"},
)

# ... then record it as an output of the execution.
model = exec.log_output(mnist_model)

print(model)
print("\nModel id is {0.id} and version is {0.version}".format(model))

kubeflow.metadata.metadata.Model(workspace=None, name='MNIST', description='model to recognize handwritten digits', owner='someone@kubeflow.org', uri='gcs://my-bucket/mnist', version='model_version_acf1dd72-e9f1-4c38-8b3e-9c5dc5937c69', model_type='neural network', training_framework={'name': 'tensorflow', 'version': 'v1.0'}, hyperparameters={'learning_rate': 0.5, 'layers': [10, 3, 1], 'early_stop': True}, labels={'mylabel': 'l1'}, id=10, create_time='2020-11-06T08:26:09.262462Z', kwargs={})

Model id is 10 and version is model_version_acf1dd72-e9f1-4c38-8b3e-9c5dc5937c69


### Log the evaluation of a model

In [16]:
# Evaluation result that links the logged data set and the logged model through
# their ids (passed as strings).
mnist_metrics = metadata.Metrics(
    name="MNIST-evaluation",
    description="validating the MNIST model to recognize handwritten digits",
    owner="someone@kubeflow.org",
    uri="gcs://my-bucket/mnist-eval.csv",
    data_set_id=str(data_set.id),
    model_id=str(model.id),
    metrics_type=metadata.Metrics.VALIDATION,
    values={"accuracy": 0.95},
    labels={"mylabel": "l1"},
)

# Record the metrics as another output of the same execution.
metrics = exec.log_output(mnist_metrics)
print("Metrics id is %s" % metrics.id)

Metrics id is 11


### Add Metadata for serving the model

In [17]:
# A second execution in the same workspace that stands for the model-serving
# component; no run is passed for it.
serving_application = metadata.Execution(
    name="serving model",
    workspace=ws1,
    description="an execution to represent model serving component",
)
# Note that the model's name, version and uri together uniquely identify an
# existing model, so only those three fields are supplied here.
served_model = metadata.Model(
    name="MNIST",
    uri="gcs://my-bucket/mnist",
    version=model.version,
)
# Record the model as an input of the serving execution and report which
# artifact it was matched to.
m = serving_application.log_input(served_model)
print("Found the model with id {0.id} and version '{0.version}'.".format(m))

Found the mode with id 10 and version 'model_version_acf1dd72-e9f1-4c38-8b3e-9c5dc5937c69'.


### List all models in the workspace

In [18]:
# List the artifacts of the Model type recorded in the workspace ...
workspace_models = ws1.list(metadata.Model.ARTIFACT_TYPE_NAME)
# ... and render them as a table (last expression, so the notebook displays it).
pandas.DataFrame.from_dict(workspace_models)

Unnamed: 0,id,workspace,run,create_time,version,owner,description,name,model_type,uri,training_framework,hyperparameters,labels,kwargs
0,7,workspace_2,run-2020-11-06T08:22:14.887534,2020-11-06T08:23:24.769561Z,model_version_16898c29-3a53-4215-831e-4037b86e...,someone@kubeflow.org,model to recognize handwritten digits,MNIST,neural network,gcs://my-bucket/mnist,"{'name': 'tensorflow', 'version': 'v1.0'}","{'learning_rate': 0.5, 'layers': [10, 3, 1], '...",{'mylabel': 'l1'},{}
1,10,workspace_2,run-2020-11-06T08:26:06.965447,2020-11-06T08:26:09.262462Z,model_version_acf1dd72-e9f1-4c38-8b3e-9c5dc593...,someone@kubeflow.org,model to recognize handwritten digits,MNIST,neural network,gcs://my-bucket/mnist,"{'name': 'tensorflow', 'version': 'v1.0'}","{'learning_rate': 0.5, 'layers': [10, 3, 1], '...",{'mylabel': 'l1'},{}


### Basic Lineage Tracking

In [20]:
print("Model id is %s\n" % model.id)

# From the model artifact, collect the ids of the executions that have an
# event involving it.
model_events = ws1.store.get_events_by_artifact_ids([model.id])
execution_ids = {event.execution_id for event in model_events}
print("All executions related to the model are {}".format(execution_ids))
# assert execution_ids == set([serving_application.id, exec.id])

# From the training execution, collect the ids of the artifacts that have an
# event involving it.
trainer_events = ws1.store.get_events_by_execution_ids([exec.id])
artifact_ids = {event.artifact_id for event in trainer_events}
print("All artifacts related to the training event are {}".format(artifact_ids))
# assert artifact_ids == set([model.id, metrics.id, data_set.id])

Model id is 10

All executions related to the model are {6, 7}
All artifacts related to the training event are {9, 10, 11}
