### Install the _Kubeflow-metadata_ library (Load prereqs)
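_**Note:** Before running this notebook, make sure the metadata gRPC server is port-forwarded to `localhost:8080` (the address the workspaces below connect to) by running:_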

```bash
kubectl port-forward --namespace kubeflow $(kubectl get pod --namespace kubeflow --selector="component=grpc-server,kustomize.component=metadata" --output jsonpath='{.items[0].metadata.name}') 8080:8080
```

In [ ]:
# To use the latest publish `kubeflow-metadata` library, you can run:
!pip install kubeflow-metadata --user
# Install other packages:
!pip install pandas --user
# Then restart the Notebook kernel.

In [1]:
import pandas
from kubeflow.metadata import metadata
from datetime import datetime
from uuid import uuid4
import numpy as np

### Load all test cases
- All 5 columns
- 7 Columns worth of data
- Partials:
  - Active Execution
  - 3 columns
  - 4 columns
- Multiples
  - Multi-Input
  - Multi-Output
  - Multi-Execution

In [2]:
# One entry per lineage test case; each string is also used as the description
# of the workspace created for that case.
cases = [
    "All 5 columns",
    "7 Columns worth of data",
    "Active Execution",
    "3 columns",
    "4 columns",
]

# One workspace per test case, named "test_case_<index>". Every workspace gets
# its own Store connected to the gRPC endpoint at localhost:8080.
ws = [
    metadata.Workspace(
        name="test_case_" + str(case_index),
        description=case_description,
        labels={"n1": "v1"},
        store=metadata.Store(grpc_host="localhost", grpc_port=8080),
    )
    for case_index, case_description in enumerate(cases)
]

In [3]:
# One run per workspace. The run name embeds the current UTC timestamp and the
# description records the index of the owning workspace.
runs = [
    metadata.Run(
        name="run-{}".format(datetime.utcnow().isoformat("T")),
        description="a run in ws_" + str(ws_index),
        workspace=workspace,
    )
    for ws_index, workspace in enumerate(ws)
]

In [4]:
# One execution per workspace. `ws`, `runs` and `cases` are index-aligned, so
# each execution is attached to its workspace's run and described by the
# matching test-case text.
execs = [
    metadata.Execution(
        name="execution-{}".format(datetime.utcnow().isoformat("T")),
        workspace=workspace,
        run=run,
        description=case_description,
    )
    for workspace, run, case_description in zip(ws, runs, cases)
]

In [5]:
# Show the ids the metadata store assigned to the executions created above.
print('Created executions:', [execution.id for execution in execs])

Created executions: [61, 62, 63, 64, 65]


### Let's create fake data sources that can be shared by our executions

In [6]:
def get_data_set_version():
    """Return a new version string: "data_set_version_" followed by a random UUID4."""
    return "data_set_version_" + str(uuid4())


# Backward-compatible alias for the original (misspelled) helper name.
get_date_set_version = get_data_set_version

# Two sample data sets, each with its own freshly generated version string.
fileSources = [
    metadata.DataSet(
        description="Sample file set 1",
        name="table-dump",
        owner="ap@kubeflow.org",
        uri="file://datasets/dump1",
        version=get_data_set_version(),
        query="SELECT * FROM mytable"),
    metadata.DataSet(
        description="Sample file set 2",
        name="cloud-table",
        owner="ap@kubeflow.org",
        uri="gs://cloud/table.csv",
        version=get_data_set_version(),
        query="SELECT * FROM mytable"),
]

# For every execution, draw a random index into `fileSources`; the execution
# then logs fileSources[0..index] as inputs, i.e. one or two data sets.
# NOTE: the draw is unseeded, so the assignment differs between runs.
how_many_sources = np.random.choice(len(fileSources), len(execs))
data_sets = []

# The loop variable is named `execution` rather than `exec` so the builtin
# `exec` is not shadowed in the kernel namespace.
for execution, src_count in zip(execs, how_many_sources):
    # log_input returns the logged artifact, which carries the id printed below.
    ds = [execution.log_input(source) for source in fileSources[:src_count + 1]]
    print("Data sets:", ["{{id: {0.id}, version: '{0.version}'}}".format(d) for d in ds])
    data_sets.append(ds)

Data sets: ["{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}"]
Data sets: ["{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}"]
Data sets: ["{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}", "{id: 96, version: 'data_set_version_7cbfcb57-101f-4a7b-9d37-72547e7314ca'}"]
Data sets: ["{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}"]
Data sets: ["{id: 95, version: 'data_set_version_980a618d-0b0f-431a-b58d-d31e6f792ba7'}", "{id: 96, version: 'data_set_version_7cbfcb57-101f-4a7b-9d37-72547e7314ca'}"]


### Log a model

In [7]:
# models[i] holds the model artifacts logged as outputs of execs[i].
models = []
# The loop variable is named `execution` rather than `exec` so the builtin
# `exec` is not shadowed in the kernel namespace.
for i, execution in enumerate(execs):
    model_version = "model_version_{}".format(i)
    # Every execution outputs an MNIST model.
    logged_models = [
        execution.log_output(
            metadata.Model(
                name="MNIST",
                description="model to recognize handwritten digits",
                owner="someone@kubeflow.org",
                uri="gcs://my-bucket/mnist",
                model_type="neural network",
                training_framework={
                    "name": "tensorflow",
                    "version": "v1.0"
                },
                hyperparameters={
                    "learning_rate": 0.5,
                    "layers": [10, 3, 1],
                    "early_stop": True
                },
                version=model_version,
                labels={"mylabel": "l1"}))
    ]
    # With probability 0.4 the execution also outputs an SVHN model that shares
    # the same version string. A scalar draw is used so the condition does not
    # rely on the truthiness of a one-element array.
    if np.random.choice(2, p=[.6, .4]) == 1:
        logged_models.append(execution.log_output(
            metadata.Model(
                name="SVHN",
                description="model to recognize house numbers on map images",
                owner="ap@kubeflow.org",
                uri="gcs://my-bucket/svhn",
                model_type="neural network",
                training_framework={
                    "name": "pytorch",
                    "version": "v1.0"
                },
                hyperparameters={
                    "learning_rate": 0.0001,
                    "layers": [10, 3, 1],
                    "early_stop": True
                },
                version=model_version,
                labels={"mylabel": "l1"})))
    print("Models:", ["{{id: {0.id}, version: '{0.version}'}}".format(d) for d in logged_models])
    models.append(logged_models)

Models: ["{id: 11, version: 'model_version_0'}"]
Models: ["{id: 12, version: 'model_version_1'}", "{id: 30, version: 'model_version_1'}"]
Models: ["{id: 13, version: 'model_version_2'}"]
Models: ["{id: 15, version: 'model_version_3'}", "{id: 74, version: 'model_version_3'}"]
Models: ["{id: 16, version: 'model_version_4'}"]


### Log an evaluation of a model

In [8]:
# Log one validation-metrics artifact for every (model, data set) pair of each
# execution. `execs`, `models` and `data_sets` are index-aligned. The loop
# variable is named `execution` rather than `exec` so the builtin `exec` is not
# shadowed in the kernel namespace.
for execution, execution_models, execution_data_sets in zip(execs, models, data_sets):
    for model in execution_models:
        for data_set in execution_data_sets:
            metrics = execution.log_output(
                metadata.Metrics(
                    name="{}-evaluation".format(model.name),
                    description="validating the {0.name} model to {0.description}".format(model),
                    owner=model.owner,
                    uri="gcs://my-bucket/{}-eval.csv".format(model.name.lower()),
                    data_set_id=str(data_set.id),
                    model_id=str(model.id),
                    metrics_type=metadata.Metrics.VALIDATION,
                    # Fake accuracy drawn uniformly from [0.6, 1.0).
                    values={"accuracy": np.random.uniform(low=.6)},
                    labels={"mylabel": "l1"}))
print('Metrics created!')

Metrics created!


### Add Metadata for serving the model

In [13]:
# Indexes of the workspaces whose models are retrained before being served.
RETRAIN_CASES = (2, 3)

for i, w in enumerate(ws):
    is_retrain_case = i in RETRAIN_CASES
    if is_retrain_case:
        print('Retrain for', i)
        serving_application = metadata.Execution(
            name="Retrain step",
            workspace=w,
            run=runs[i],
            description="retrain model to be more accurate on a scoped problem",
        )
    else:
        serving_application = metadata.Execution(
            name="serving model",
            workspace=w,
            description="an execution to represent model serving component",
        )
    for model in models[i]:
        # Note that model name, version and uri are used to uniquely identify an
        # existing model, so they are taken from the model being iterated over
        # (the previous hard-coded MNIST values re-logged the MNIST model for
        # every entry and never referenced an SVHN model).
        served_model = metadata.Model(
            name=model.name,
            uri=model.uri,
            version=model.version,
        )
        serving_application.log_input(served_model)
        if is_retrain_case:
            print('Attaching new model', i)
            o_model = metadata.Model(
                name="Retrained MNIST",
                description="better recognition of slanted digits",
                owner="ap@kubeflow.org",
                uri="gcs://my-bucket/mnist-slanted",
                model_type="neural network",
                training_framework={
                    "name": "pytorch",
                    "version": "v1.0"
                },
                hyperparameters={
                    "learning_rate": 0.01,
                    "layers": [5, 3, 1],
                    "early_stop": True
                },
                # Use the version of the model being retrained instead of the
                # `model_version` global left over from an earlier cell's loop.
                version=model.version,
                labels={"mylabel": "l2"}
            )
            serving_application.log_output(o_model)
            # A separate execution represents serving of the retrained model.
            # It is not bound to the name `exec`, which would shadow the builtin.
            retrained_serving = metadata.Execution(
                name="serving model",
                workspace=w,
                run=runs[i],
                description="an execution to represent model serving component",
            )
            retrained_serving.log_input(o_model)

### List all models in the workspace

In [19]:
# List every model artifact recorded in workspace index 2 and show it as a table.
workspace_models = ws[2].list(metadata.Model.ARTIFACT_TYPE_NAME)
pandas.DataFrame.from_dict(workspace_models)

Unnamed: 0,id,workspace,run,version,owner,description,name,model_type,create_time,uri,training_framework,hyperparameters,labels,kwargs
0,13,test_case_2,run-2019-12-13T23:05:35.516946,model_version_2,someone@kubeflow.org,model to recognize handwritten digits,MNIST,neural network,2019-12-13T23:05:38.064051Z,gcs://my-bucket/mnist,"{'name': 'tensorflow', 'version': 'v1.0'}","{'learning_rate': 0.5, 'layers': [10, 3, 1], '...",{'mylabel': 'l1'},{}
1,14,test_case_2,run-2019-12-13T23:05:35.516946,model_version_2,ap@kubeflow.org,model to recognize house numbers on map images,SVHN,neural network,2019-12-13T23:05:38.641097Z,gcs://my-bucket/svhn,"{'name': 'pytorch', 'version': 'v1.0'}","{'learning_rate': 0.0001, 'layers': [10, 3, 1]...",{'mylabel': 'l1'},{}


### Basic Lineage Tracking

In [ ]:
# NOTE(review): this cell is entirely commented out. It refers to `ws1`, which
# is not defined anywhere in the visible notebook -- presumably it should be
# one of the workspaces in `ws`; confirm before re-enabling.

# print("model id is %s\n" % model.id)

# model_events = ws1.store.get_events_by_artifact_ids([model.id])

# execution_ids = set(e.execution_id for e in model_events)
# print("All executions related to the model are {}".format(execution_ids))
# # assert execution_ids == set([serving_application.id, exec.id])

# trainer_events = ws1.store.get_events_by_execution_ids([exec.id])
# artifact_ids = set(e.artifact_id for e in trainer_events)
# print("All artifacts related to the training event are {}".format(artifact_ids))
# # assert artifact_ids == set([model.id, metrics.id, data_set.id])