# 03 - Model Serving

The purpose of the notebook is to show how to serve both AutoML Tables and Custom models for online and batch prediction.
The notebook covers the following tasks:
1. Create an AI Platform Endpoint.
2. Deploy the AutoML Tables and the custom models to the endpoint.
3. Test the endpoint for online prediction.
4. Get online explanations from the AutoML Tables model.
5. Use the uploaded custom model for batch prediction.

## Setup

In [None]:
import os
import time
from datetime import datetime
import tensorflow as tf
from datetime import datetime
from google.cloud.aiplatform import gapic as aip

In [None]:
# Google Cloud project, region and Cloud Storage bucket used by this notebook.
PROJECT = 'ksalama-cloudml'
REGION = 'us-central1'
BUCKET = 'ksalama-cloudml-us'

# Display names used to look up the endpoint and the two models.
MODEL_ENDPOINT_DISPLAYNAME = 'chicago_taxi_tips_classifier'
AUTOML_MODEL_DISPLAYNAME = 'chicago_taxi_tips_classifier_automl'
CUSTOM_MODEL_DISPLAYNAME = 'chicago_taxi_tips_classifier_custom'

# Regional API host, the parent resource path, and the options dict that is
# passed to every service client created below.
API_ENDPOINT = "{}-aiplatform.googleapis.com".format(REGION)
PARENT = "projects/{}/locations/{}".format(PROJECT, REGION)
client_options = dict(api_endpoint=API_ENDPOINT)

## 1. Create AI Platform Endpoint

In [None]:
# Clients for the Endpoint and Model services, both configured with the
# notebook-level client_options.
endpoint_client, model_client = (
    aip.EndpointServiceClient(client_options=client_options),
    aip.ModelServiceClient(client_options=client_options),
)

In [None]:
# Describe the endpoint to create; only its display name is set here.
endpoint_spec = aip.Endpoint(display_name=MODEL_ENDPOINT_DISPLAYNAME)

response = endpoint_client.create_endpoint(parent=PARENT, endpoint=endpoint_spec)

# result() is evaluated last so its value is shown as the cell output.
response.result()

In [None]:
# Look up the endpoint by display name; the first match in the listing is used.
model_endpoint = next(
    (
        entry
        for entry in endpoint_client.list_endpoints(parent=PARENT)
        if entry.display_name == MODEL_ENDPOINT_DISPLAYNAME
    ),
    None,
)

# Fail loudly instead of leaving `model_endpoint` undefined (or silently
# reusing a stale value from an earlier run of this cell).
if model_endpoint is None:
    raise LookupError(
        f"No endpoint with display name '{MODEL_ENDPOINT_DISPLAYNAME}' found under {PARENT}."
    )

model_endpoint

## 2. Deploy AI Platform Model to Endpoint

We assume that both the AutoML Tables model and the custom model have the same serving signature to be deployed under the same Endpoint. In this case, we can split the traffic between them (for example, to perform A/B testing).

In [None]:
def deploy_model_to_endpoint(
    model_display_name,
    endpoint_display_name,
    model_client,
    endpoint_client,
    parent,
    traffic_split
):
    """Deploy a model to an endpoint, both identified by display name.

    The model is served on dedicated 'n1-standard-2' machines with between
    1 and 5 replicas, container logging left enabled and access logging
    disabled.

    Args:
        model_display_name: Display name of the model to deploy. If several
            listed models share this name, the last one listed is used.
        endpoint_display_name: Display name of the target endpoint. The first
            listed endpoint with this name is used.
        model_client: Client exposing ``list_models(parent=...)``.
        endpoint_client: Client exposing ``list_endpoints(parent=...)`` and
            ``deploy_model(...)``.
        parent: Resource path under which models and endpoints are listed.
        traffic_split: Mapping passed through unchanged to ``deploy_model``.

    Returns:
        Whatever ``endpoint_client.deploy_model`` returns; callers in this
        notebook call ``.result()`` on it.

    Raises:
        LookupError: If no model or no endpoint has the requested display name.
    """
    # No `break` on purpose: keeps the original "last match wins" behaviour.
    aip_model = None
    for entry in model_client.list_models(parent=parent):
        if entry.display_name == model_display_name:
            aip_model = entry
    if aip_model is None:
        raise LookupError(
            f"No model with display name '{model_display_name}' found under {parent}."
        )

    model_endpoint = next(
        (
            entry
            for entry in endpoint_client.list_endpoints(parent=parent)
            if entry.display_name == endpoint_display_name
        ),
        None,
    )
    if model_endpoint is None:
        raise LookupError(
            f"No endpoint with display name '{endpoint_display_name}' found under {parent}."
        )

    serving_machine_spec = aip.MachineSpec(
        machine_type='n1-standard-2',
        # Accelerators are left disabled:
        #accelerator_count=1,
        #accelerator_type=aip.AcceleratorType.NVIDIA_TESLA_T4
    )

    dedicated_serving_resources = aip.DedicatedResources(
        machine_spec=serving_machine_spec,
        min_replica_count=1,
        max_replica_count=5
    )

    deployed_model = aip.DeployedModel(
        model=aip_model.name,
        disable_container_logging=False,
        enable_access_logging=False,
        #automatic_resources=None,
        dedicated_resources=dedicated_serving_resources
    )

    response = endpoint_client.deploy_model(
        endpoint=model_endpoint.name,
        deployed_model=deployed_model,
        traffic_split=traffic_split
    )

    return response

### Deploy AutoML Model

In [None]:
# Deploy AutoML Model
# Deploy the AutoML model and route all of the endpoint's traffic to it.
# NOTE(review): per the DeployModel API reference, key "0" stands for the
# model being deployed by this call — confirm against the API docs.
response = deploy_model_to_endpoint(
    model_display_name=AUTOML_MODEL_DISPLAYNAME,
    endpoint_display_name=MODEL_ENDPOINT_DISPLAYNAME,
    model_client=model_client,
    endpoint_client=endpoint_client,
    parent=PARENT,
    traffic_split={"0": 100},
)

response.result()

### Deploy Custom Model

In [None]:
# The traffic split is keyed by DeployedModel ID. "0" stands for the model
# being deployed by this call; the model that is already serving must be
# referenced by its actual ID, which is generated at deployment time, so it
# is read back from the endpoint instead of being hard-coded.
endpoint_state = endpoint_client.get_endpoint(name=model_endpoint.name)
existing_deployed_model_ids = [
    deployed.id for deployed in endpoint_state.deployed_models
]
if len(existing_deployed_model_ids) != 1:
    raise RuntimeError(
        "Expected exactly one model (the AutoML model) to be deployed to the "
        f"endpoint before adding the custom model, found {len(existing_deployed_model_ids)}."
    )

# Split traffic evenly between the new custom model and the existing model.
response = deploy_model_to_endpoint(
    CUSTOM_MODEL_DISPLAYNAME,
    MODEL_ENDPOINT_DISPLAYNAME,
    model_client,
    endpoint_client,
    PARENT,
    {"0": 50, existing_deployed_model_ids[0]: 50}
)

response.result()

**TODO:** How can the traffic split be updated programmatically?

## 3. Making Online Predictions

**Note:** Currently the AutoML Tables model and the custom model don't have the same serving signature, so they expect two different types of input instances. However, the endpoint would only accept the instance format of the first deployed model.

In [None]:
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value


def predict_tabular_classifier(
    client_options,
    endpoint,
    instance
):
    """Request an online prediction for a single instance.

    Args:
        client_options: Options used to construct the prediction client.
        endpoint: Resource name of the endpoint to query.
        instance: Dict describing one instance; it is converted to a
            protobuf ``Value`` before being sent.

    Returns:
        The response returned by ``PredictionServiceClient.predict``.
    """
    # Convert first, so a malformed instance fails before a client is built.
    instance_value = json_format.ParseDict(instance, Value())
    client = aip.PredictionServiceClient(client_options=client_options)
    return client.predict(endpoint=endpoint, instances=[instance_value])

In [None]:
# Instance in the format sent to the AutoML model: bare scalar values, with
# the trip_* date/time fields and trip_seconds given as strings.
instance_automl = dict(
    dropoff_grid="POINT(-87.6 41.9)",
    euclidean=2064.2696,
    loc_cross="",
    payment_type="Credit Card",
    pickup_grid="POINT(-87.6 41.9)",
    trip_miles=1.37,
    trip_day="12",
    trip_hour="16",
    trip_month="2",
    trip_day_of_week="4",
    trip_seconds="555",
)

# Instance in the format sent to the custom model: same keys, but every value
# is wrapped in a single-element list and the trip_* fields are integers.
# (A scalar-valued variant with integer trip_* fields was previously kept
# here as commented-out code.)
instance_custom = dict(
    dropoff_grid=["POINT(-87.6 41.9)"],
    euclidean=[2064.2696],
    loc_cross=[""],
    payment_type=["Credit Card"],
    pickup_grid=["POINT(-87.6 41.9)"],
    trip_miles=[1.37],
    trip_day=[12],
    trip_hour=[16],
    trip_month=[2],
    trip_day_of_week=[4],
    trip_seconds=[555],
)

In [None]:
# Send 10 requests to the endpoint. Each request is first sent in the AutoML
# instance format; if that call raises, the same request is retried in the
# custom-model instance format.
for i in range(10):
    try:
        response = predict_tabular_classifier(
            client_options,
            model_endpoint.name,
            instance_automl
        )
        responder = "AutoML"
    except Exception as automl_error:
        # `Exception` (not a bare `except:`) so that KeyboardInterrupt and
        # SystemExit still stop the cell; the reason is reported, not hidden.
        print(
            "AutoML-format request failed "
            f"({type(automl_error).__name__}); retrying with the custom-model format."
        )
        response = predict_tabular_classifier(
            client_options,
            model_endpoint.name,
            instance_custom
        )
        responder = "Custom"

    # Printing happens outside the try block so that a display problem is
    # never mistaken for a rejected request.
    print(f"{responder} model (id: {response.deployed_model_id}) responded:")
    for prediction in response.predictions:
        print(dict(prediction))
    

## 4. Getting Online Explanations (AutoML)

In [None]:
from google.cloud import aiplatform_v1beta1 as aip_beta

In [None]:
def explain_tabular_classifier(
    client_options,
    endpoint,
    instance
):
    """Request an online explanation for a single instance.

    Uses the v1beta1 prediction client (``aip_beta``).

    Args:
        client_options: Options used to construct the prediction client.
        endpoint: Resource name of the endpoint to query.
        instance: Dict describing one instance; it is converted to a
            protobuf ``Value`` before being sent.

    Returns:
        The response returned by ``PredictionServiceClient.explain``.
    """
    # Convert first, so a malformed instance fails before a client is built.
    instance_value = json_format.ParseDict(instance, Value())
    client = aip_beta.PredictionServiceClient(client_options=client_options)
    return client.explain(endpoint=endpoint, instances=[instance_value])

In [None]:
# Ask the endpoint for an explanation of the AutoML-format instance.
try:
    response = explain_tabular_classifier(
        client_options,
        model_endpoint.name,
        instance_automl,
    )
    print("AutoML model responded:")
    # The response field is spelled `explanations`. The previous misspelling
    # raised AttributeError, which the bare `except:` then reported as
    # "no support for explanation" on every run.
    for explanation in response.explanations:
        # Printed as-is: each item is an API message object, so no dict()
        # conversion is attempted.
        print(explanation)
except Exception as error:
    # `Exception` (not a bare `except:`) so KeyboardInterrupt still stops the
    # cell; the underlying error is shown so real failures are not hidden.
    print("Custom model responded: No support for explaination.")
    print(f"Details: {type(error).__name__}: {error}")

## 5. Batch Prediction (Custom Model)

In [None]:
# Cloud Storage locations for batch prediction. The URIs are joined with an
# explicit "/" rather than os.path.join, which uses the local OS separator
# and would produce invalid gs:// URIs on Windows.
GCS_WORKSPACE = f"gs://{BUCKET}/ucaip_demo/chicago_taxi"
SERVING_DATA_DIR = f"{GCS_WORKSPACE}/serving_data"
SERVING_INPUT_DATA_DIR = f"{SERVING_DATA_DIR}/input_data"
SERVING_OUTPUT_DATA_DIR = f"{SERVING_DATA_DIR}/output_predictions"

# Local path of the raw schema file and display name of the source dataset.
RAW_SCHEMA_DIR = 'model_src/raw_schema/schema.pbtxt'
DATASET_DISPLAYNAME = 'chicago_taxi_tips'

In [None]:
# Start from an empty serving-data directory: anything left under
# SERVING_DATA_DIR (inputs and previous predictions) is deleted first.
if tf.io.gfile.exists(SERVING_DATA_DIR):
    print("Removing previous serving data...")
    tf.io.gfile.rmtree(SERVING_DATA_DIR)
# Re-create the (now empty) directory.
print("Creating preprocessing serving data directory...")
tf.io.gfile.mkdir(SERVING_DATA_DIR)

### Extract serving data to Cloud Storage as TFRecords

In [None]:
# Data split to extract and the maximum number of rows to read.
DATA_SPLIT = 'TEST'
LIMIT = 10000

def get_source_query(dataset_display_name, data_split, limit):
    """Build the BigQuery query that extracts serving data for a dataset.

    The managed dataset is looked up by display name and the BigQuery table
    behind it is read from ``metadata['inputConfig']['bigquerySource']['uri']``.
    NULL values are replaced by -1 (numeric columns) or 'NA' (string columns).

    Relies on the notebook-level ``client_options`` and ``PARENT``.

    Args:
        dataset_display_name: Display name of the managed dataset.
        data_split: Value the ``data_split`` column must equal.
        limit: Value used in the query's ``LIMIT`` clause.

    Returns:
        The SQL query as a string.

    Raises:
        LookupError: If no dataset has the requested display name.
        ValueError: If the BigQuery source URI is not of the form
            ``bq://<project>.<dataset>.<table>``.
    """
    dataset_client = aip.DatasetServiceClient(client_options=client_options)
    dataset_uri = next(
        (
            candidate.name
            for candidate in dataset_client.list_datasets(parent=PARENT)
            if candidate.display_name == dataset_display_name
        ),
        None,
    )
    if dataset_uri is None:
        raise LookupError(
            f"No dataset with display name '{dataset_display_name}' found under {PARENT}."
        )

    dataset = dataset_client.get_dataset(name=dataset_uri)
    bq_source_uri = dataset.metadata['inputConfig']['bigquerySource']['uri']

    # Strip the scheme. The previous code replaced "g://", which never occurs
    # in a "bq://" URI and only worked because the first component is unused.
    bq_scheme = "bq://"
    table_path = bq_source_uri
    if table_path.startswith(bq_scheme):
        table_path = table_path[len(bq_scheme):]

    # Split from the right so a project ID that itself contains dots
    # does not break the unpacking.
    path_parts = table_path.rsplit('.', 2)
    if len(path_parts) != 3:
        raise ValueError(
            f"Unexpected BigQuery source URI '{bq_source_uri}'; "
            "expected 'bq://<project>.<dataset>.<table>'."
        )
    _, bq_dataset_name, bq_table_name = path_parts

    query = f'''
        SELECT 
            IF(trip_month IS NULL, -1, trip_month) trip_month,	
            IF(trip_day IS NULL, -1, trip_day) trip_day,
            IF(trip_day_of_week IS NULL, -1, trip_day_of_week) trip_day_of_week,
            IF(trip_hour IS NULL, -1, trip_hour) trip_hour,	
            IF(trip_seconds IS NULL, -1, trip_seconds) trip_seconds,
            IF(trip_miles IS NULL, -1, trip_miles) trip_miles,
            IF(payment_type IS NULL, 'NA', payment_type) payment_type,
            IF(pickup_grid IS NULL, 'NA', pickup_grid) pickup_grid,
            IF(dropoff_grid IS NULL, 'NA', dropoff_grid) dropoff_grid,
            IF(euclidean IS NULL, -1, euclidean) euclidean,
            IF(loc_cross IS NULL, 'NA', loc_cross) loc_cross
        FROM {bq_dataset_name}.{bq_table_name} 
        WHERE data_split = '{data_split}' LIMIT {limit}
    '''
    return query


# Arguments handed to the data-extraction pipeline. Cloud Storage URIs are
# joined with an explicit "/" rather than os.path.join, which uses the local
# OS separator and would produce invalid gs:// URIs on Windows.
args = {
    # The runner option is left disabled:
    #'runner': 'DataflowRunner',
    'raw_schema_location': RAW_SCHEMA_DIR,
    'raw_data_query': get_source_query(DATASET_DISPLAYNAME, DATA_SPLIT, LIMIT),
    'exported_data_prefix': f"{SERVING_INPUT_DATA_DIR}/data-",
    'temporary_dir': f"{GCS_WORKSPACE}/tmp",
    'gcs_location': f"{GCS_WORKSPACE}/bq_tmp",
    'project': PROJECT,
    'region': REGION,
    'setup_file': './setup.py'
}

In [None]:
from dataflow_src import data_prep

In [None]:
# From here on, only show TensorFlow log records at ERROR level.
tf_logger = tf.get_logger()
tf_logger.setLevel('ERROR')

# Run the extraction pipeline with the arguments assembled above.
print("Data extraction started...")
data_prep.run_extract_pipeline(args)
print("Data extraction completed.")

In [None]:
# List the objects under the serving input location.
!gsutil ls {SERVING_INPUT_DATA_DIR}

### Prepare the batch prediction job

In [None]:
def create_batch_prediction_job(
    job_client,
    model_display_name,
    gcs_data_uri_pattern,
    gcs_output_uri,
    parent
):
    """Create a batch prediction job for a model identified by display name.

    Input files are resolved by globbing ``gcs_data_uri_pattern``; inputs and
    predictions both use the "jsonl" format. The job runs on dedicated
    'n1-standard-2' machines, starting with 1 replica and scaling to at
    most 10.

    Relies on the notebook-level ``model_client`` to look up the model.

    Args:
        job_client: Client exposing ``create_batch_prediction_job(...)``.
        model_display_name: Display name of the model to use. If several
            listed models share this name, the last one listed is used.
        gcs_data_uri_pattern: Glob pattern matching the input files.
        gcs_output_uri: Output URI prefix for the predictions.
        parent: Resource path under which models are listed and the job
            is created.

    Returns:
        Whatever ``job_client.create_batch_prediction_job`` returns.

    Raises:
        ValueError: If no file matches ``gcs_data_uri_pattern``.
        LookupError: If no model has the requested display name.
    """
    serving_data_uris = tf.io.gfile.glob(gcs_data_uri_pattern)
    if not serving_data_uris:
        raise ValueError(f"No input files match '{gcs_data_uri_pattern}'.")

    # No `break` on purpose: keeps the original "last match wins" behaviour.
    aip_model = None
    for entry in model_client.list_models(parent=parent):
        if entry.display_name == model_display_name:
            aip_model = entry
    if aip_model is None:
        raise LookupError(
            f"No model with display name '{model_display_name}' found under {parent}."
        )

    # Timestamp suffix keeps the display name distinct between runs.
    job_name = f"batch_predict_{model_display_name}_{datetime.now().strftime('%Y%m%d%H%M%S')}"

    machine_spec = {
        "machine_type": 'n1-standard-2',
        # Accelerators are left disabled:
        #"accelerator_count": 1,
        #"accelerator_type": aip.AcceleratorType.NVIDIA_TESLA_T4,
    }

    batch_prediction_job = {
        "display_name": job_name,
        "model": aip_model.name,
        #"model_parameters": json_format.ParseDict(model_parameters, Value()),
        "input_config": {
            "instances_format": "jsonl",
            "gcs_source": {"uris": serving_data_uris},
        },
        "output_config": {
            "predictions_format": "jsonl",
            "gcs_destination": {"output_uri_prefix": gcs_output_uri},
        },
        "dedicated_resources": {
            "machine_spec": machine_spec,
            "starting_replica_count": 1,
            "max_replica_count": 10,
        },
    }

    # Use the `parent` argument; the previous code ignored it in favour of the
    # global PARENT, so the job could be created under a different parent than
    # the one the model was looked up in.
    response = job_client.create_batch_prediction_job(
        parent=parent, batch_prediction_job=batch_prediction_job
    )
    return response


### Submit the batch prediction job

In [None]:
# Client for the Job service, configured like the other service clients.
job_client = aip.JobServiceClient(client_options=client_options)

# Submit a batch prediction job for the custom model over every .jsonl file
# directly under the serving input location.
batch_prediction_job = create_batch_prediction_job(
    job_client=job_client,
    model_display_name=CUSTOM_MODEL_DISPLAYNAME,
    gcs_data_uri_pattern=f"{SERVING_INPUT_DATA_DIR}/*.jsonl",
    gcs_output_uri=SERVING_OUTPUT_DATA_DIR,
    parent=PARENT,
)

batch_prediction_job

In [None]:
# Poll the job once a minute until it reaches a terminal state.
while True:
    response = job_client.get_batch_prediction_job(name=batch_prediction_job.name)

    if response.state == aip.JobState.JOB_STATE_SUCCEEDED:
        print(
            "Batch prediction job completed. - Elapsed time:",
            response.update_time - response.create_time
        )
        break

    if response.state == aip.JobState.JOB_STATE_FAILED:
        print("Batch prediction job failed!")
        print(response.error)
        break

    # A cancelled job never succeeds or fails; without this branch the loop
    # would poll forever.
    if response.state == aip.JobState.JOB_STATE_CANCELLED:
        print("Batch prediction job was cancelled.")
        break

    # Report the actual state (queued, pending, running, ...).
    print("Batch prediction job has not completed yet:", response.state)
    time.sleep(60)

In [None]:
# List the objects under the batch prediction output location.
!gsutil ls {SERVING_OUTPUT_DATA_DIR}