# Overview

Demonstrate AI capabilities using the LightGBM framework, Vertex AI custom training containers, and FastAPI as the web server (custom prediction container), all orchestrated in a Vertex AI Pipeline.

### Set Variables

In [31]:
# Google Cloud project ID; passed to the pipeline and embedded in the image path.
PROJECT_ID = "jchavezar-demo"

# Cloud Storage locations: input CSV, pipeline root, and model directory.
DATASET_URI = "gs://vtx-datasets-public/breast_cancer_data.csv"
PIPELINE_ROOT_PATH = "gs://vtx-root-path"
MODELS_URI = "gs://vtx-models/lightgbm"

# Image URI of the custom prediction container built later in this notebook.
PRED_IMAGE_URI = (
    f"us-central1-docker.pkg.dev/{PROJECT_ID}/predictions/pred_lightgbm_cpu:latest"
)

### Get Dataset component

In [6]:
from kfp.v2.dsl import (component, Output, Artifact)

@component(
    packages_to_install=[
        "pandas",
        "gcsfs",
        # The PyPI distribution is named "scikit-learn"; "sklearn" is a
        # deprecated placeholder package and should not be installed.
        "scikit-learn",
    ]
)
def get_data(
    datasource: str,
    dataset_xtrain: Output[Artifact],
    dataset_ytrain: Output[Artifact],
    dataset_xtest: Output[Artifact],
    dataset_ytest: Output[Artifact],
):
    """Read the source CSV, split it 70/30 and write the four splits as CSV.

    Args:
        datasource: Location of the CSV file, passed straight to
            ``pandas.read_csv``.
        dataset_xtrain: Output artifact for the training features.
        dataset_ytrain: Output artifact for the training labels.
        dataset_xtest: Output artifact for the test features.
        dataset_ytest: Output artifact for the test labels.

    Each split is written to ``<artifact.path>.csv``; downstream components
    must append the same ``.csv`` suffix when reading.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split

    feature_columns = [
        'mean_radius',
        'mean_texture',
        'mean_perimeter',
        'mean_area',
        'mean_smoothness',
    ]

    df = pd.read_csv(datasource)
    X = df[feature_columns]
    y = df['diagnosis']

    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0
    )

    splits = (
        (X_train, dataset_xtrain),
        (X_test, dataset_xtest),
        (y_train, dataset_ytrain),
        (y_test, dataset_ytest),
    )
    for frame, artifact in splits:
        frame.to_csv(artifact.path + ".csv", index=False, encoding='utf-8-sig')


### Training Component

In [62]:
from kfp.v2.dsl import (Input)

@component(
    packages_to_install=[
        "pandas",
        "gcsfs",
        "lightgbm",
        "google-cloud-storage"]
)
def train(
    project_id: str,
    dataset_xtrain: Input[Artifact],
    dataset_ytrain: Input[Artifact],
    model_uri: str
    ):
    """Fit a default ``LGBMClassifier`` and upload it as ``model.pkl``.

    Args:
        project_id: Project used to create the Cloud Storage client.
        dataset_xtrain: Artifact whose ``<path>.csv`` holds the training features.
        dataset_ytrain: Artifact whose ``<path>.csv`` holds a ``diagnosis`` column.
        model_uri: ``gs://<bucket>/<prefix>`` directory; the pickled model is
            uploaded to ``<prefix>/model.pkl`` in that bucket.
    """
    import pickle
    import pandas as pd
    import lightgbm as lgb
    from google.cloud import storage

    X_train = pd.read_csv(dataset_xtrain.path+".csv")
    y_train = pd.read_csv(dataset_ytrain.path+".csv").diagnosis
    clf = lgb.LGBMClassifier()
    clf.fit(X_train, y_train)

    # Serialize locally first; the same path is reused for the upload below.
    file_name = "/tmp/model.pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(clf, file)

    # "gs://bucket/a/b" splits into ['gs:', '', 'bucket', 'a', 'b'].
    uri_parts = model_uri.split('/')
    _bucket = uri_parts[2]
    _suffix = "/".join(uri_parts[3:]).rstrip("/")

    storage_client = storage.Client(project=project_id)
    bucket = storage_client.get_bucket(_bucket)
    print(bucket)
    print(_suffix)
    blob = bucket.blob(f'{_suffix}/model.pkl')
    blob.upload_from_filename(file_name)

### Evaluation Component

In [85]:
from kfp.v2.dsl import ClassificationMetrics, Metrics
from typing import NamedTuple

@component(
    packages_to_install=[
        "pandas",
        "gcsfs",
        "lightgbm",
        # The PyPI distribution is named "scikit-learn"; "sklearn" is a
        # deprecated placeholder package and should not be installed.
        "scikit-learn",
        "google-cloud-storage"
        ]
)
def evaluate_model(
    project_id: str,
    dataset_xtest: Input[Artifact],
    dataset_ytest: Input[Artifact],
    model_uri: str,
    metrics: Output[ClassificationMetrics],
    kpi: Output[Metrics]
    ) -> NamedTuple(
    "Outputs", [("eval_metric", float)]
):
    """Score the stored model on the test split and log its metrics.

    Args:
        project_id: Project used to create the Cloud Storage client.
        dataset_xtest: Artifact whose ``<path>.csv`` holds the test features.
        dataset_ytest: Artifact whose ``<path>.csv`` holds a ``diagnosis`` column.
        model_uri: ``gs://<bucket>/<prefix>`` directory containing ``model.pkl``.
        metrics: Receives the ROC curve and the confusion matrix.
        kpi: Receives the scalar ``accuracy`` metric.

    Returns:
        A named tuple whose single field ``eval_metric`` is the accuracy.
    """
    import pandas as pd
    import pickle
    from google.cloud import storage
    from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score
    from collections import namedtuple

    X_test = pd.read_csv(dataset_xtest.path+".csv")
    # Select the label column so y_test is a 1-D Series, matching how the
    # training component reads its labels, instead of a one-column DataFrame.
    y_test = pd.read_csv(dataset_ytest.path+".csv").diagnosis

    # Load Model File

    file_name = '/tmp/model.pkl'

    # "gs://bucket/a/b" splits into ['gs:', '', 'bucket', 'a', 'b'].
    uri_parts = model_uri.split('/')
    _bucket = uri_parts[2]
    _suffix = "/".join(uri_parts[3:]).rstrip("/")

    storage_client = storage.Client(project=project_id)
    bucket = storage_client.get_bucket(_bucket)
    blob = bucket.blob(f'{_suffix}/model.pkl')
    blob.download_to_filename(file_name)

    # NOTE(review): pickle.load can execute arbitrary code; the bucket behind
    # model_uri must only be writable by trusted principals.
    with open(file_name, 'rb') as file:
        model = pickle.load(file)

    y_pred = model.predict(X_test)
    # scikit-learn metric signature is (y_true, y_pred).
    accuracy = accuracy_score(y_test, y_pred)
    print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

    # Column 1 of predict_proba is the score of the second class.
    y_scores = model.predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = roc_curve(
        y_true=y_test.to_numpy(),
        y_score=y_scores,
        pos_label=True)
    metrics.log_roc_curve(fpr.tolist(), tpr.tolist(), thresholds.tolist())

    metrics.log_confusion_matrix(
        ["False", "True"],
        confusion_matrix(
            y_test, y_pred
        ).tolist(),
    )

    kpi.log_metric("accuracy", float(accuracy))
    outputs = namedtuple("Outputs", ["eval_metric"])

    return outputs(float(accuracy))

### Custom Prediction Server (FastAPI)

In [9]:
# Recreate the build context directory (custom_6/app) from scratch.
!rm -rf custom_6
!mkdir -p custom_6/app

In [144]:
%%writefile custom_6/app/main.py

from google.cloud import storage
from fastapi import Request, FastAPI
import json
import os
import pickle
import argparse
import sys

app = FastAPI()

# Model directory URI, read from the AIP_STORAGE_URI environment variable.
storage_uri = os.environ['AIP_STORAGE_URI']
print(f'[INFO] ------ {storage_uri}', file=sys.stderr)

# Loading Model File

file_name = 'model.pkl'
client = storage.Client(project=os.environ['PROJECT_ID'])

# Download <AIP_STORAGE_URI>/model.pkl into the working directory ...
model_blob_uri = f"{storage_uri}/{file_name}"
with open(file_name, "wb") as local_copy:
    client.download_blob_to_file(model_blob_uri, local_copy)

# ... then unpickle it into the module-level `model` used by the predict route.
with open(file_name, 'rb') as pickled_model:
    model = pickle.load(pickled_model)

# Webserver methods

@app.get('/')
def get_root():
    # Static greeting served at the root path.
    greeting = {'message': 'Welcome to Breast Cancer Prediction'}
    return greeting
@app.get('/health_check')
def health():
    # Health endpoint; the response body is the bare integer 200.
    status = 200
    return status
# Serve predictions on AIP_PREDICT_ROUTE when it is set, otherwise on /predict.
method = os.environ.get('AIP_PREDICT_ROUTE', '/predict')
print(method)
@app.post(method)
async def predict(request: Request):
    # Expects a JSON body with an "instances" key and answers with
    # {"predictions": [...]} built from model.predict().
    print("----------------- PREDICTING -----------------")
    payload = await request.json()
    predictions = model.predict(payload["instances"])
    print(f'[INFO] ------ {predictions}, {type(predictions)}', file=sys.stderr)
    # Convert to a plain list so the result can be returned as JSON.
    prediction_list = predictions.tolist()
    print("----------------- OUTPUTS -----------------")
    return {"predictions": prediction_list}

Overwriting custom_6/app/main.py


In [145]:
%%writefile custom_6/Dockerfile

# FastAPI base image.
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.7

# Application code (main.py) lives in /app.
WORKDIR /app
COPY app /app

# Extra runtime dependencies installed on top of the base image.
RUN pip install joblib google-cloud-storage lightgbm

# The server listens on 8080, matching the containerPort declared in the pipeline.
EXPOSE 8080
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]

Overwriting custom_6/Dockerfile


In [146]:
# Build the prediction image with Cloud Build and tag it as PRED_IMAGE_URI.
!gcloud builds submit --tag $PRED_IMAGE_URI custom_6/.

Creating temporary tarball archive of 2 file(s) totalling 1.4 KiB before compression.
Uploading tarball of [custom_6/.] to [gs://jchavezar-demo_cloudbuild/source/1660573188.406273-cd70d93f30544fd5b6d88e5fea92fb17.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/jchavezar-demo/locations/global/builds/bee7f0d7-11f6-44ce-9681-1f211a1de2dd].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/bee7f0d7-11f6-44ce-9681-1f211a1de2dd?project=569083142710 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "bee7f0d7-11f6-44ce-9681-1f211a1de2dd"

FETCHSOURCE
Fetching storage object: gs://jchavezar-demo_cloudbuild/source/1660573188.406273-cd70d93f30544fd5b6d88e5fea92fb17.tgz#1660573189140732
Copying gs://jchavezar-demo_cloudbuild/source/1660573188.406273-cd70d93f30544fd5b6d88e5fea92fb17.tgz#1660573189140732...
/ [1 files][  942.0 B/  942.0 B]                                                
Operation completed over 1 

### Create Pipeline

In [138]:
from kfp.v2.dsl import pipeline, Condition
from kfp.v2.components import importer_node
from google_cloud_pipeline_components import aiplatform as gcc
from google_cloud_pipeline_components.types import artifact_types

@pipeline(name='lightgbm-light')
def pipeline(
    datasource: str,
    project_id: str,
    model_uri: str,
    eval_acc_threshold: float,
    ):
    """Train, evaluate and conditionally deploy the LightGBM model.

    Args:
        datasource: CSV location handed to the ``get_data`` component.
        project_id: Project passed to the training and evaluation components.
        model_uri: ``gs://`` directory where training stores ``model.pkl``;
            the same directory is imported as the model artifact to deploy.
        eval_acc_threshold: Deployment only runs when the evaluation accuracy
            is strictly greater than this value.

    ``PRED_IMAGE_URI`` and ``PROJECT_ID`` are notebook globals that are baked
    into the compiled pipeline spec; they are not run-time parameters.
    """
    # Keyword arguments make the component wiring explicit.
    get_data_task = get_data(datasource=datasource)
    train_task = train(
        project_id=project_id,
        dataset_xtrain=get_data_task.outputs["dataset_xtrain"],
        dataset_ytrain=get_data_task.outputs["dataset_ytrain"],
        model_uri=model_uri,
        )
    # Evaluation reads the model from Cloud Storage rather than from a task
    # output, so the ordering has to be declared explicitly.
    eval_task = evaluate_model(
        project_id=project_id,
        dataset_xtest=get_data_task.outputs["dataset_xtest"],
        dataset_ytest=get_data_task.outputs["dataset_ytest"],
        model_uri=model_uri,
        ).after(train_task)

    with Condition(
        eval_task.outputs["eval_metric"] > eval_acc_threshold,
        name="model-deploy-decision",
    ):
        import_unmanaged_model_op = importer_node.importer(
            # Import the directory the training step actually wrote to,
            # instead of a hard-coded bucket path.
            artifact_uri=model_uri,
            artifact_class=artifact_types.UnmanagedContainerModel,
            metadata={
                "containerSpec": {
                    "imageUri": PRED_IMAGE_URI,
                    "env": [
                        {
                            # Read by main.py to create its storage client.
                            "name": "PROJECT_ID",
                            "value": PROJECT_ID},
                    ],
                    "predictRoute": "/predict",
                    "healthRoute": "/health_check",
                    "ports": [
                        {
                            "containerPort": 8080
                        }
                    ]
                },
            },
        )
        custom_model_upload_job = gcc.ModelUploadOp(
            project=PROJECT_ID,
            display_name="lightgbm-model",
            unmanaged_container_model=import_unmanaged_model_op.outputs["artifact"],
            ).after(import_unmanaged_model_op)

        endpoint_create_job = gcc.EndpointCreateOp(
            project=PROJECT_ID,
            display_name="pipelines-created-endpoint",
        )

        custom_model_deploy_job = (gcc.ModelDeployOp(
            model=custom_model_upload_job.outputs["model"],
            endpoint=endpoint_create_job.outputs["endpoint"],
            deployed_model_display_name="lightgbm_model_end",
            traffic_split={"0":"100"},
            dedicated_resources_machine_type="n1-standard-2",
            dedicated_resources_min_replica_count=1,
            dedicated_resources_max_replica_count=1
        )).set_caching_options(False)
    

### Compile the Pipeline

In [139]:
from kfp.v2 import compiler
import warnings
warnings.filterwarnings('ignore')

# Compile the pipeline function into a JSON spec in the working directory.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=pipeline,
    package_path='lightgbm-light.json',
)

In [143]:
import google.cloud.aiplatform as aip

# Run-time values for the parameters declared on the pipeline function.
pipeline_parameters = {
    "datasource": DATASET_URI,
    "project_id": PROJECT_ID,
    "model_uri": MODELS_URI,
    "eval_acc_threshold": 0.5,
}

job = aip.PipelineJob(
    display_name="lightgbm-light",
    template_path="lightgbm-light.json",
    pipeline_root=PIPELINE_ROOT_PATH,
    parameter_values=pipeline_parameters,
    enable_caching=False,
)

# Submit the run under the given service account.
job.submit(service_account='vtx-pipe@jchavezar-demo.iam.gserviceaccount.com')

Creating PipelineJob
PipelineJob created. Resource name: projects/569083142710/locations/us-central1/pipelineJobs/lightgbm-light-20220815085002
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/569083142710/locations/us-central1/pipelineJobs/lightgbm-light-20220815085002')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/lightgbm-light-20220815085002?project=569083142710


![](images/vertex-pipe-lightgbm-cpu.png)