# 03 - Model Serving

The purpose of the notebook is to show how to serve both AutoML Tables and Custom models for online and batch prediction.
The notebook covers the following tasks:
1. Create an AI Platform Endpoint.
2. Deploy the AutoML Tables and the custom models to the endpoint.
3. Test the endpoint for online prediction.
4. Get online explanations from the AutoML Tables model.
5. Use the uploaded custom model for batch prediction.

## Setup

In [None]:
import os
import time
from datetime import datetime
import tensorflow as tf

In [None]:
# --- Google Cloud environment (edit these for your own setup) ---
PROJECT = "ksalama-cloudml"  # Change to your project Id.
REGION = "us-central1"
BUCKET = "ksalama-cloudml-us"  # Change to your bucket.

# --- Display names used to look up the dataset, endpoint and models ---
DATASET_DISPLAY_NAME = "chicago_taxi_tips"
MODEL_ENDPOINT_DISPLAY_NAME = "chicago_taxi_tips_classifier"
AUTOML_MODEL_DISPLAY_NAME = "chicago_taxi_tips_classifier_automl"
CUSTOM_MODEL_DISPLAY_NAME = "chicago_taxi_tips_classifier_custom"

In [None]:
# Project-local helper class; the endpoint, deployment, prediction and batch
# job calls in this notebook all go through the instance created here.
from utils.ucaip_utils import AIPUtils
# Bind the helper to the configured project and region.
aip_utils = AIPUtils(PROJECT, REGION)

## 1. Create AI Platform Endpoint

In [None]:
# Request a new endpoint with the configured display name.
# NOTE(review): the returned object exposes .result(); presumably a
# long-running operation handle - confirm against AIPUtils.create_endpoint.
response = aip_utils.create_endpoint(MODEL_ENDPOINT_DISPLAY_NAME)
# Kept as the cell's last expression so the notebook displays the outcome.
response.result()

## 2. Deploy AI Platform Model to Endpoint

We assume that both the AutoML Tables model and the custom model have the same serving signature to be deployed under the same Endpoint. In this case, we can split the traffic between them (for example, to perform A/B testing).

In [None]:
# Serving resources passed to both model deployments below: one machine type
# plus the replica bounds for the deployment.
dedicated_serving_resources_spec = dict(
    machine_spec=dict(
        machine_type='n1-standard-2',
        # Uncomment to attach an accelerator:
        # accelerator_count=1,
        # accelerator_type='NVIDIA_TESLA_T4',
    ),
    min_replica_count=1,
    max_replica_count=5,
)

### Deploy AutoML Model

In [None]:
# Deploy the AutoML Tables model to the endpoint, using the serving
# resources defined in the previous cell. No traffic split is passed here.
response = aip_utils.deploy_model(
    model_display_name=AUTOML_MODEL_DISPLAY_NAME,
    endpoint_display_name=MODEL_ENDPOINT_DISPLAY_NAME,
    dedicated_serving_resources_spec=dedicated_serving_resources_spec,
)

# Kept as the cell's last expression so the notebook displays the outcome.
response.result()

### Deploy Custom Model

In [None]:
# Deploy the custom model to the same endpoint and request a 50/50 traffic
# split between two entries.
# NOTE(review): in the AI Platform DeployModel API the key "0" stands for the
# model being deployed and other keys are existing deployed-model ids. Whether
# AIPUtils.deploy_model maps "1" to the AutoML deployment is not visible
# here - confirm.
response = aip_utils.deploy_model(
    model_display_name=CUSTOM_MODEL_DISPLAY_NAME,
    endpoint_display_name=MODEL_ENDPOINT_DISPLAY_NAME,
    dedicated_serving_resources_spec=dedicated_serving_resources_spec,
    traffic_split={
        "0": 50,
        "1": 50,
    },
)

# Kept as the cell's last expression so the notebook displays the outcome.
response.result()

**TODO:** How to update the traffic split programmatically?

## 3. Making Online Predictions

**Note:** Currently, the AutoML Tables model and the custom model do not have the same serving signature, so they expect two different types of input instances. However, the endpoint only accepts instances in the format of the first deployed model.

In [None]:
# Look up the endpoint by display name; later cells pass `endpoint.name`
# to the prediction and explanation helpers.
endpoint = aip_utils.get_endpoint_by_display_name(MODEL_ENDPOINT_DISPLAY_NAME)

In [None]:
# Sample request in the format sent to the AutoML Tables model: one scalar per
# feature. The trip_* date parts and trip_seconds are passed as strings, while
# euclidean and trip_miles are floats.
instance_automl = dict(
    dropoff_grid="POINT(-87.6 41.9)",
    euclidean=2064.2696,
    loc_cross="",
    payment_type="Credit Card",
    pickup_grid="POINT(-87.6 41.9)",
    trip_miles=1.37,
    trip_day="12",
    trip_hour="16",
    trip_month="2",
    trip_day_of_week="4",
    trip_seconds="555",
)

# Sample request in the format sent to the custom model: every feature value
# is wrapped in a single-element list, and the trip_* fields are integers
# rather than strings.
instance_custom = dict(
    dropoff_grid=["POINT(-87.6 41.9)"],
    euclidean=[2064.2696],
    loc_cross=[""],
    payment_type=["Credit Card"],
    pickup_grid=["POINT(-87.6 41.9)"],
    trip_miles=[1.37],
    trip_day=[12],
    trip_hour=[16],
    trip_month=[2],
    trip_day_of_week=[4],
    trip_seconds=[555],
)

In [None]:
def _print_predictions(model_label, response):
    """Print which deployed model answered, followed by each prediction.

    Args:
        model_label: Model family shown in the header line ("AutoML" or "Custom").
        response: Prediction response exposing `deployed_model_id` and an
            iterable `predictions` whose items can be converted with `dict()`.
    """
    print(f"{model_label} model (id: {response.deployed_model_id}) responded:")
    for prediction in response.predictions:
        print(dict(prediction))


# Send 10 requests. Each one is first tried with the AutoML instance format;
# if that fails, the same request is retried with the custom-model format.
for _ in range(10):
    try:
        response = aip_utils.predict_tabular_classifier(
            endpoint.name,
            instance_automl
        )
        _print_predictions("AutoML", response)

    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the cell, and show the error
    # that triggered the fallback instead of hiding it.
    except Exception as error:
        print(f"Request with the AutoML instance format failed: {error!r}")
        response = aip_utils.predict_tabular_classifier(
            endpoint.name,
            instance_custom
        )
        _print_predictions("Custom", response)
    

## 4. Getting Online Explanation (AutoML)

In [None]:
# Request an explanation using the AutoML instance format. Any failure is
# reported as "no explanation support", so the underlying error is printed as
# well, letting the reader tell that case apart from other failures.
try:
    response = aip_utils.explain_tabular_classifier(
        endpoint.name,
        instance_automl,
    )
    print("AutoML model responded:")
    print(response.explanations)
# Exception (not a bare `except:`) so KeyboardInterrupt / SystemExit still
# stop the cell.
except Exception as error:
    print("Custom model responded: No support for explaination.")
    print(f"Underlying error: {error!r}")
    
#     response = explain_tabular_classifier(
#         client_options, 
#         model_endpoint.name, 
#         instance_custom,
#     )
#     print("Custom model responded:")
#     for explaination in response.explainations:
#         print(dict(explaination))

## 5. Batch Prediction (Custom Model)

In [None]:
# Cloud Storage locations used by the batch prediction section.
# GCS URIs always use "/" as the separator, so they are built with string
# formatting instead of os.path.join, which uses the local OS separator.
WORKSPACE = f"gs://{BUCKET}/ucaip_demo/chicago_taxi"
SERVING_DATA_DIR = f"{WORKSPACE}/serving_data"
SERVING_INPUT_DATA_DIR = f"{SERVING_DATA_DIR}/input_data"
SERVING_OUTPUT_DATA_DIR = f"{SERVING_DATA_DIR}/output_predictions"

# Despite the _DIR suffix, this is the relative path of the schema file itself.
RAW_SCHEMA_DIR = 'model_src/raw_schema/schema.pbtxt'

In [None]:
# Start from a clean slate: if a previous run left anything under the serving
# data directory, delete it recursively before recreating the directory.
if tf.io.gfile.exists(SERVING_DATA_DIR):
    print("Removing previous serving data...")
    tf.io.gfile.rmtree(SERVING_DATA_DIR)
print("Creating preprocessing serving data directory...")
tf.io.gfile.mkdir(SERVING_DATA_DIR)

### Extract serving data to Cloud Storage as TFRecords

In [None]:
from utils import datasource_utils

In [None]:
# Parameters forwarded to the project-local query builder.
# NOTE(review): presumably DATA_SPLIT selects the held-out split and LIMIT caps
# the number of rows - confirm in datasource_utils.get_source_query.
DATA_SPLIT = 'TEST'
LIMIT = 10000

raw_data_query = datasource_utils.get_source_query(
    project=PROJECT, 
    region=REGION, 
    dataset_display_name=DATASET_DISPLAY_NAME, 
    data_split=DATA_SPLIT, 
    limit=LIMIT
)

# Show the generated query so it can be inspected before the pipeline runs.
print(raw_data_query)

In [None]:
# Arguments handed to data_prep.run_extract_pipeline.
# The GCS locations are built with "/" via f-strings rather than os.path.join,
# because os.path.join uses the local OS separator and GCS URIs always use "/".
args = {
    # Uncomment to run on Dataflow instead of the default runner.
    #'runner': 'DataflowRunner',
    'raw_schema_location': RAW_SCHEMA_DIR,
    'raw_data_query': raw_data_query,
    # Prefix (not a directory): output file names start with "data-".
    'exported_data_prefix': f"{SERVING_INPUT_DATA_DIR}/data-",
    'temporary_dir': f"{WORKSPACE}/tmp",
    'gcs_location': f"{WORKSPACE}/bq_tmp",
    'project': PROJECT,
    'region': REGION,
    'setup_file': './setup.py'
}

In [None]:
from dataflow_src import data_prep

In [None]:
# Only show TensorFlow log records at ERROR level or above while the pipeline runs.
tf.get_logger().setLevel('ERROR')

print("Data extraction started...")
# NOTE(review): the "completed" message below assumes this call returns only
# after the pipeline has finished - confirm in data_prep.run_extract_pipeline.
data_prep.run_extract_pipeline(args)
print("Data extraction completed.")

In [None]:
!gsutil ls {SERVING_INPUT_DATA_DIR}

### Prepare the batch prediction job

In [None]:
# Compute resources for the batch prediction job: one machine type plus the
# starting and maximum replica counts.
dedicated_resources = dict(
    machine_spec=dict(
        machine_type='n1-standard-2',
        # Uncomment to attach an accelerator:
        # accelerator_count=1,
        # accelerator_type='NVIDIA_TESLA_T4',
    ),
    starting_replica_count=1,
    max_replica_count=10,
)

### Submit the batch prediction job

In [None]:
# Submit a batch prediction job for the custom model: read every *.jsonl file
# under the serving input directory and write predictions to the output directory.
# NOTE(review): the extraction step exports files with the prefix "data-" and
# its section heading mentions TFRecords; verify that the exported files
# actually match the '/*.jsonl' pattern and the 'jsonl' instances format.
batch_prediction_job = aip_utils.submit_batch_prediction_job(
        model_display_name=CUSTOM_MODEL_DISPLAY_NAME, 
        gcs_data_uri_pattern=SERVING_INPUT_DATA_DIR + '/*.jsonl', 
        gcs_output_uri=SERVING_OUTPUT_DATA_DIR,
        dedicated_resources=dedicated_resources,
        instances_format='jsonl',
        predictions_format='jsonl'
)

### Monitor job state

In [None]:
# Poll the job once a minute until it reaches a terminal state.
# Besides SUCCEEDED and FAILED, a job can also end as CANCELLED, EXPIRED or
# PARTIALLY_SUCCEEDED; without handling those the loop would never exit.
_other_terminal_states = (
    'JOB_STATE_CANCELLED',
    'JOB_STATE_EXPIRED',
    'JOB_STATE_PARTIALLY_SUCCEEDED',
)

while True:
    response = aip_utils.get_batch_prediction_job_by_uri(batch_prediction_job.name)
    state_name = response.state.name
    if state_name == 'JOB_STATE_SUCCEEDED':
        # This is a batch prediction job, so report elapsed time rather than
        # "Training Time".
        print("Batch prediction completed. - Elapsed time:", response.update_time - response.create_time)
        break
    elif state_name == 'JOB_STATE_FAILED':
        print("Batch prediction failed!")
        break
    elif state_name in _other_terminal_states:
        print(f"Batch prediction ended with state: {state_name}.")
        break
    else:
        print(f"Batch prediction state is: {state_name}.")
    time.sleep(60)

In [None]:
!gsutil ls {SERVING_OUTPUT_DATA_DIR}