In [None]:
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Reusable components and pre-built components

This tutorial describes the manual way of writing a full component program (in any language) and a component definition for it. Below is a summary of the steps involved in creating and using a component:

- Write the program that contains your component’s logic. The program must use files and command-line arguments to pass data to and from the component.
- Containerize the program.
- Write a component specification in YAML format that describes the component for the Kubeflow Pipelines system.
- Use the Kubeflow Pipelines SDK to load your component, use it in a pipeline and run that pipeline.

Moreover, we will combine the component we build with a pre-built component and a lightweight component to compose a pipeline with three steps:
- Train an MNIST model and export it to GCS
- Deploy the exported tensorflow model on AI Platform
- Test the deployment by calling the end point

**Note: Make sure that you have docker installed in the local environment if you want to build the image locally**

In [1]:
import kfp
import kfp.gcp as gcp
import kfp.dsl as dsl
import kfp.compiler as compiler
import kfp.components as comp
import datetime

import kubernetes as k8s

In [2]:
# Required parameters: the GCP project and the GCS location used by the
# image build and by the pipeline defaults below.
PROJECT_ID = '<ADD GCP PROJECT HERE>'
GCS_BUCKET = 'gs://<ADD STORAGE LOCATION HERE>'

In [3]:
# Optional parameters.
# They are only needed when the notebook runs outside the Kubeflow cluster,
# where the client must be given the host and the OAuth credentials.
HOST = '<ADD HOST NAME TO TALK TO KUBEFLOW PIPELINE HERE>'

# OAuth client ID used by Identity-Aware Proxy (IAP).
CLIENT_ID = '<ADD OAuth CLIENT ID USED BY IAP HERE>'

# OAuth client ID / secret used to obtain the auth codes.
OTHER_CLIENT_ID = '<ADD OAuth CLIENT ID USED TO OBTAIN AUTH CODES HERE>'
OTHER_CLIENT_SECRET = '<ADD OAuth CLIENT SECRET USED TO OBTAIN AUTH CODES HERE>'

## Create client

**If you submit from outside the Kubeflow cluster, you need the following:**
- `host`: the host name to use to talk to Kubeflow Pipelines, i.e., "https://`<your-deployment>`.endpoints.`<your-project>`.cloud.goog/pipeline"
- `client_id`: The client ID used by Identity-Aware Proxy
- `other_client_id`: The client ID used to obtain the auth codes and refresh tokens.
- `other_client_secret`: The client secret used to obtain the auth codes and refresh tokens.
- To get `other_client_id` and `other_client_secret`, you'll need to create OAuth client ID credentials of type `Other` according to the tutorial [here](https://cloud.google.com/iap/docs/authentication-howto#authenticating_from_a_desktop_app)

```python
client = kfp.Client(host, client_id, other_client_id, other_client_secret)
```

**If you run and submit within the kubeflow cluster**, the following is enough
```python
client = kfp.Client()
```

In [5]:
# Create the kfp client.
# Loading the in-cluster Kubernetes config only succeeds when this notebook
# runs inside the cluster; a ConfigException means we are outside of it.
# Only that exception is caught so that unrelated errors (and Ctrl-C) are not
# silently swallowed.
try:
    k8s.config.load_incluster_config()
    in_cluster = True
except k8s.config.ConfigException:
    in_cluster = False

if in_cluster:
    # Inside the cluster no extra connection details are passed.
    client = kfp.Client()
else:
    # Outside the cluster the host and OAuth credentials from the
    # optional-parameters cell are passed explicitly.
    client = kfp.Client(host=HOST,
                        client_id=CLIENT_ID,
                        other_client_id=OTHER_CLIENT_ID,
                        other_client_secret=OTHER_CLIENT_SECRET)

# Build reusable components

## Writing the program code

The following cell creates a file `app.py` that contains a Python script. The script takes a GCS bucket name and a model path as input arguments, trains a Keras model on the MNIST dataset, exports the trained model to `<bucket>/<model_path>` on GCS, and writes that GCS path to the output file `/output.txt`.

In [6]:
%%bash

# Create the component folder if it doesn't exist.
mkdir -p tmp/reuse_components_pipeline/minist_training

# Create the Python training program. It trains a Keras MNIST classifier,
# exports it as a SavedModel to <bucket>/<model_path> and writes that path to
# /output.txt.
# The heredoc delimiter is quoted so the shell copies the script verbatim
# (no variable or command expansion inside the Python source).
cat > ./tmp/reuse_components_pipeline/minist_training/app.py <<'HERE'
import argparse
from datetime import datetime

import tensorflow as tf
from tensorflow import gfile

parser = argparse.ArgumentParser()
parser.add_argument(
    '--model_path', type=str, required=True, help='Name of the model file.')
parser.add_argument(
    '--bucket', type=str, required=True, help='GCS bucket name.')
args = parser.parse_args()

bucket = args.bucket
model_path = args.model_path

model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)),
  tf.keras.layers.Dense(512, activation=tf.nn.relu),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(10, activation=tf.nn.softmax)
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Scale pixel values from [0, 255] to [0, 1].
x_train, x_test = x_train / 255.0, x_test / 255.0

callbacks = [
  # TensorBoard logs go to a per-day folder under the bucket.
  tf.keras.callbacks.TensorBoard(log_dir=bucket + '/logs/' + str(datetime.now().date())),
  # Interrupt training if val_loss stops improving for over 2 epochs
  tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_loss'),
]

model.fit(x_train, y_train, batch_size=32, epochs=5, callbacks=callbacks,
          validation_data=(x_test, y_test))

gcs_path = bucket + "/" + model_path
# The export requires that the target folder does not exist yet.
if gfile.Exists(gcs_path):
    gfile.DeleteRecursively(gcs_path)
tf.keras.experimental.export_saved_model(model, gcs_path)

# Write the exported model path to the component's output file.
with open('/output.txt', 'w') as f:
  f.write(gcs_path)
HERE

## Create a Docker container
Create your own container image that includes your program. 
- If your component creates some outputs to be fed as inputs to the downstream components, each separate output must be written as a string to a separate local text file inside the container image. 
- For example, if a trainer component needs to output the trained model path, it can write the path to a local file `/output.txt`. 
- The string written to an output file cannot be too big. If it is too big (>> 100 kB), it is recommended to save the output to an external persistent storage and pass the storage path to the next component.

### Create docker file

Now create a container that runs the script. Start by creating a `Dockerfile`. A `Dockerfile` contains the instructions to assemble a Docker image. The `FROM` statement specifies the Base Image from which you are building. `WORKDIR` sets the working directory. When you assemble the Docker image, `COPY` will copy the required files and directories (for example, `app.py`) to the filesystem of the container. `RUN` will execute a command (for example, install the dependencies) and commits the results. 

In [7]:
%%bash

# Write the Dockerfile for the training image into the component folder.
# The base image is pinned to TensorFlow 1.14 because AI Platform only
# supports TensorFlow 1.14.
COMPONENT_DIR=./tmp/reuse_components_pipeline/minist_training
cat > "${COMPONENT_DIR}/Dockerfile" <<'EOF'
FROM tensorflow/tensorflow:1.14.0-py3
WORKDIR /app
COPY . /app
EOF

### Build docker image

Now that we have created our Dockerfile we can create our Docker image. Then we need to push the image to a registry to host the image. 
- We are going to use the `kfp.containers.build_image_from_working_dir` to build the image and push to the Google Container Registry (GCR), which makes use of [kaniko](https://cloud.google.com/blog/products/gcp/introducing-kaniko-build-container-images-in-kubernetes-and-google-container-builder-even-without-root-access).
- It is definitely possible to build the image using Docker and push to GCR.

**Note**:
If you run the following code from a notebook **within a Kubeflow cluster** and **with Kubeflow version >= 0.7**, you need to make sure that there is a valid credential under your notebook's namespace, since the namespace of the notebook server is no longer `kubeflow`. 
- With Kubeflow version >= 0.7, the credential is supposed to be copied automatically while creating the notebook through `Configurations`, which doesn't work properly at the time of creating this notebook. 
- You can also add credentials to the new namespace by either copying them from an existing Kubeflow namespace or by creating a new service account as explained [here](https://www.kubeflow.org/docs/gke/authentication/#kubeflow-v0-6-and-before-gcp-service-account-key-as-secret).
- The following cell demonstrates how to copy the default secret to your own namespace.

```bash
%%bash

NAMESPACE=<your notebook name space>
SOURCE=kubeflow
NAME=user-gcp-sa
SECRET=$(kubectl get secrets \${NAME} -n \${SOURCE} -o jsonpath="{.data.\${NAME}\.json}" | base64 -D)
kubectl create -n \${NAMESPACE} secret generic \${NAME} --from-literal="\${NAME}.json=\${SECRET}"
```

In [8]:
# Name and tag of the training image, and its full URI in Google Container Registry.
IMAGE_NAME = "minist_training_kf_pipeline"
TAG = "latest"  # "v_$(date +%Y%m%d_%H%M%S)"
GCR_IMAGE = "gcr.io/{PROJECT_ID}/{IMAGE_NAME}:{TAG}".format(
    PROJECT_ID=PROJECT_ID, IMAGE_NAME=IMAGE_NAME, TAG=TAG)

# Container builder configured with a staging location under the GCS bucket.
staging_location = GCS_BUCKET + "/kfp_container_build_staging"
builder = kfp.containers._container_builder.ContainerBuilder(
    gcs_staging=staging_location)

# Build the image from the component folder; the returned image reference is
# reused when writing the component definition.
image_name = kfp.containers.build_image_from_working_dir(
    working_dir='./tmp/reuse_components_pipeline/minist_training/',
    image_name=GCR_IMAGE,
    builder=builder
)

# Show the resulting image reference as the cell output.
image_name



'gcr.io/kubeflow-pipeline-fantasy/minist_training_kf_pipeline@sha256:d3e28ca79dcbd61149fedfdbe7aa25b4ccf8f802da66337b61e3e1630a99e045'

#### If you want to use docker to build the image
Run the following in a cell
```bash
%%bash -s "{PROJECT_ID}"

IMAGE_NAME="minist_training_kf_pipeline"
TAG="latest" # "v_$(date +%Y%m%d_%H%M%S)"

# Create script to build docker image and push it.
cat > ./tmp/components/minist_training/build_image.sh <<HERE
PROJECT_ID="${1}"
IMAGE_NAME="${IMAGE_NAME}"
TAG="${TAG}"
GCR_IMAGE="gcr.io/\${PROJECT_ID}/\${IMAGE_NAME}:\${TAG}"
docker build -t \${IMAGE_NAME} .
docker tag \${IMAGE_NAME} \${GCR_IMAGE}
docker push \${GCR_IMAGE}
docker image rm \${IMAGE_NAME}
docker image rm \${GCR_IMAGE}
HERE

cd tmp/components/minist_training
bash build_image.sh
```

**Remember to set the image_name after the image is built**
```python
image_name = <the image uri>
```

## Writing your component definition file
To create a component from your containerized program you need to write component specification in YAML format that describes the component for the Kubeflow Pipelines system.

For the complete definition of a Kubeflow Pipelines component, see the [component specification](https://www.kubeflow.org/docs/pipelines/reference/component-spec/). However, for this tutorial you don’t need to know the full schema of the component specification. The tutorial provides enough information about the relevant parts of the specification.

Start writing the component definition (component.yaml) by specifying your container image in the component’s implementation section:

In [9]:
%%bash -s "{image_name}"

# The image URI is passed in as the first script argument.
GCR_IMAGE="${1}"
# Quoted so the value is printed as-is, without word splitting or globbing.
echo "${GCR_IMAGE}"

# Create the component definition (YAML).
# The heredoc delimiter is intentionally left unquoted so that ${GCR_IMAGE}
# in the `image:` field expands to the image URI received above.

cat > minist_pipeline_component.yaml <<HERE
name: Minist training
description: Train a minist model and save to GCS
inputs:
  - name: model_path
    description: 'Path of the tf model.'
    type: String
  - name: bucket
    description: 'GCS bucket name.'
    type: String
outputs:
  - name: gcs_model_path
    description: 'Trained model path.'
    type: GCSPath
implementation:
  container:
    image: ${GCR_IMAGE}
    command: [
      python, /app/app.py,
      --model_path, {inputValue: model_path},
      --bucket,     {inputValue: bucket},
    ]
    fileOutputs:
      gcs_model_path: /output.txt
HERE

gcr.io/kubeflow-pipeline-fantasy/minist_training_kf_pipeline@sha256:d3e28ca79dcbd61149fedfdbe7aa25b4ccf8f802da66337b61e3e1630a99e045


In [10]:
import os

# Load the training component from the YAML definition in the current folder.
component_spec_path = os.path.join('./', 'minist_pipeline_component.yaml')
minist_train_op = kfp.components.load_component_from_file(component_spec_path)

In [11]:
# Display the component specification of the loaded training component.
minist_train_op.component_spec

ComponentSpec(name='Minist training', description='Train a minist model and save to GCS', metadata=None, inputs=[InputSpec(name='model_path', type='String', description='Path of the tf model.', default=None, optional=False), InputSpec(name='bucket', type='String', description='GCS bucket name.', default=None, optional=False)], outputs=[OutputSpec(name='gcs_model_path', type='GCSPath', description='Trained model path.')], implementation=ContainerImplementation(container=ContainerSpec(image='gcr.io/kubeflow-pipeline-fantasy/minist_training_kf_pipeline@sha256:d3e28ca79dcbd61149fedfdbe7aa25b4ccf8f802da66337b61e3e1630a99e045', command=['python', '/app/app.py', '--model_path', InputValuePlaceholder(input_name='model_path'), '--bucket', InputValuePlaceholder(input_name='bucket')], args=None, env=None, file_outputs={'gcs_model_path': '/output.txt'})), version='google.com/cloud/pipelines/component/v1')

# Define deployment operation on AI Platform

In [12]:
# Load the ML Engine deploy component definition from the Kubeflow Pipelines
# repository (master branch).
MLENGINE_DEPLOY_COMPONENT_URL = (
    'https://raw.githubusercontent.com/kubeflow/pipelines/master/'
    'components/gcp/ml_engine/deploy/component.yaml')
mlengine_deploy_op = comp.load_component_from_url(MLENGINE_DEPLOY_COMPONENT_URL)

def deploy(
    project_id,
    model_uri,
    model_id,
    runtime_version,
    python_version):
    """Create a deployment step from the loaded ML Engine deploy component.

    All arguments are forwarded unchanged to `mlengine_deploy_op` under the
    same names. `replace_existing_version` and `set_default` are always
    passed as True.

    Args:
        project_id: Forwarded as the component's `project_id` input.
        model_uri: Forwarded as the component's `model_uri` input.
        model_id: Forwarded as the component's `model_id` input.
        runtime_version: Forwarded as the component's `runtime_version` input.
        python_version: Forwarded as the component's `python_version` input.

    Returns:
        Whatever `mlengine_deploy_op` returns for these inputs.
    """
    deploy_inputs = dict(
        model_uri=model_uri,
        project_id=project_id,
        model_id=model_id,
        runtime_version=runtime_version,
        python_version=python_version,
        # Fixed options: always overwrite and promote the deployed version.
        replace_existing_version=True,
        set_default=True,
    )
    return mlengine_deploy_op(**deploy_inputs)

The Kubeflow serving deployment component is available as an option. **Note that the deployed endpoint URI is not available as an output of this component.**
```python
kubeflow_deploy_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/master/components/gcp/ml_engine/deploy/component.yaml')

def deploy_kubeflow(
    model_dir,
    tf_server_name):
    return kubeflow_deploy_op(
        model_dir=model_dir,
        server_name=tf_server_name,
        cluster_name='kubeflow', 
        namespace='kubeflow',
        pvc_name='', 
        service_type='ClusterIP')
```

# Create a lightweight component for testing the deployment

In [13]:
def deployment_test(project_id: str, model_name: str, version: str) -> str:
    """Send two MNIST test images to a deployed model version and return the predictions.

    The imports live inside the function body because this function is passed
    to `comp.func_to_container_op`, so it has to be self-contained.

    Args:
        project_id: Project where the model is deployed.
        model_name: Model name, or a '/'-separated path whose last segment is
            the model name.
        version: Version name, or a '/'-separated path whose last segment is
            the version name.

    Returns:
        The predictions returned by the service, serialized as a JSON string.

    Raises:
        RuntimeError: If the prediction response contains an 'error' entry.
    """
    import json

    import googleapiclient.discovery
    import tensorflow as tf

    # Accept full resource paths by keeping only the last path segment.
    model_name = model_name.split("/")[-1]
    version = version.split("/")[-1]

    def predict(project, model, data, version=None):
        """Run predictions on a list of instances.

        Args:
          project: (str), project where the Cloud ML Engine Model is deployed.
          model: (str), model name.
          data: ([[any]]), list of input instances, where each input instance is a
            list of attributes.
          version: str, version of the model to target.

        Returns:
          Mapping[str: any]: dictionary of prediction results defined by the model.
        """
        service = googleapiclient.discovery.build('ml', 'v1')
        name = 'projects/{}/models/{}'.format(project, model)

        if version is not None:
            name += '/versions/{}'.format(version)

        response = service.projects().predict(
            name=name, body={
                'instances': data
            }).execute()

        if 'error' in response:
            raise RuntimeError(response['error'])

        return response['predictions']

    # Only the test images are used; the training split and the labels are
    # ignored, and only the two images that are sent get normalized.
    _, (x_test, _) = tf.keras.datasets.mnist.load_data()
    instances = (x_test[0:2] / 255.0).tolist()

    result = predict(
        project=project_id,
        model=model_name,
        data=instances,
        version=version)
    print(result)

    return json.dumps(result)

In [17]:
# # Test the function with already deployed version
# deployment_test(
#     project_id=PROJECT_ID,
#     model_name="minist",
#     version='ver_bb1ebd2a06ab7f321ad3db6b3b3d83e6' # previous deployed version for testing
# )

In [14]:
# Turn the `deployment_test` function into a container-based component.
# The function's `googleapiclient` import is provided by the extra package
# installed on top of the TensorFlow base image.
deployment_test_op = comp.func_to_container_op(
    deployment_test,
    base_image="tensorflow/tensorflow:1.15.0-py3",
    packages_to_install=["google-api-python-client==1.7.8"],
)

# Create your workflow as a Python function

Define your pipeline as a Python function. `@kfp.dsl.pipeline` is a required decorator that takes the `name` and `description` properties. Then compile the pipeline function. After the compilation is completed, a pipeline file is created.

In [15]:
# Define the pipeline
@dsl.pipeline(
   name='Minist pipeline',
   description='A toy pipeline that performs minist model training.'
)
def minist_reuse_component_pipeline(
    project_id: str = PROJECT_ID,
    model_path: str = 'mnist_model', 
    bucket: str = GCS_BUCKET
):
    """Three-step pipeline: train the model, deploy it, then test the deployment.

    Args:
        project_id: Project passed to the deploy and test steps.
        model_path: Passed to the training step as `model_path`.
        bucket: Passed to the training step as `bucket`.

    Returns:
        Always True.
    """
    # Every step gets the same GCP secret applied.
    secret_name = 'user-gcp-sa'

    # Step 1: train with the reusable training component.
    train_task = minist_train_op(model_path=model_path, bucket=bucket)
    train_task = train_task.apply(gcp.use_gcp_secret(secret_name))

    # Step 2: deploy the model found at the path emitted by the training step.
    deploy_task = deploy(
        project_id=project_id,
        model_uri=train_task.outputs['gcs_model_path'],
        model_id="minist",
        runtime_version="1.14",
        python_version="3.5",
    )
    deploy_task = deploy_task.apply(gcp.use_gcp_secret(secret_name))

    # Step 3: call the deployed model version with the lightweight test component.
    deploy_test_task = deployment_test_op(
        project_id=project_id,
        model_name=deploy_task.outputs["model_name"],
        version=deploy_task.outputs["version_name"],
    )
    deploy_test_task = deploy_test_task.apply(gcp.use_gcp_secret(secret_name))

    return True

In [16]:
# Compile the pipeline function into a pipeline package file.
pipeline_func = minist_reuse_component_pipeline
pipeline_filename = pipeline_func.__name__ + '.pipeline.zip'
compiler.Compiler().compile(pipeline_func, pipeline_filename)

# Submit a pipeline run. `project_id` is not listed here, so the default
# declared on the pipeline function applies.
arguments = dict(model_path="mnist_model", bucket=GCS_BUCKET)
run_name = pipeline_func.__name__ + ' run'
experiment = client.create_experiment('python-functions-minist')

run_result = client.run_pipeline(
    experiment.id,
    run_name,
    pipeline_package_path=pipeline_filename,
    params=arguments)

## Alternatively, submit the pipeline directly from the pipeline function:
# client.create_run_from_pipeline_func(pipeline_func, arguments=arguments, run_name=run_name, experiment_name='python-functions-minist')

  serialized_value),
