# Monitoring deployed models with Vertex Model Monitoring

In [1]:
import copy
import numpy as np
import os
import pprint
import pandas as pd
import random
import tensorflow as tf
import time

from google.cloud import aiplatform
from google.cloud import bigquery_datatransfer
from google.cloud import bigquery

from google.cloud.aiplatform_v1beta1.services.endpoint_service import \
    EndpointServiceClient
from google.cloud.aiplatform_v1beta1.services.job_service import \
    JobServiceClient
from google.cloud.aiplatform_v1beta1.services.prediction_service import \
    PredictionServiceClient
from google.cloud.aiplatform_v1beta1.types.io import BigQuerySource
from google.cloud.aiplatform_v1beta1.types.model_deployment_monitoring_job import (
    ModelDeploymentMonitoringJob, ModelDeploymentMonitoringObjectiveConfig,
    ModelDeploymentMonitoringScheduleConfig)
from google.cloud.aiplatform_v1beta1.types.model_monitoring import (
    ModelMonitoringAlertConfig, ModelMonitoringObjectiveConfig,
    SamplingStrategy, ThresholdConfig)
from google.cloud.aiplatform_v1beta1.types.prediction_service import \
    PredictRequest

from google.protobuf import json_format
from google.protobuf.duration_pb2 import Duration
from google.protobuf.struct_pb2 import Value

## Configure lab settings

In [7]:
# --- Project-level identifiers -------------------------------------------
PROJECT_ID = "jk-wst1"
REGION = "us-central1"
PREFIX = "jkwst1"  # common prefix for the resource names derived below

# --- Names derived from the prefix / project -----------------------------
STAGING_BUCKET = f"gs://{PREFIX}-bucket"
# Service-account e-mail built from the prefix and the project id.
VERTEX_SA = f"{PREFIX}-training-sa@{PROJECT_ID}.iam.gserviceaccount.com"

# --- Regional API hosts ---------------------------------------------------
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"
PREDICT_API_ENDPOINT = f"{REGION}-prediction-aiplatform.googleapis.com"

# Display name used to look up the deployed endpoint.
ENDPOINT_DISPLAY_NAME = f"{PREFIX} Chicago taxi endpoint"

# --- BigQuery dataset and split tables -----------------------------------
BQ_DATASET_NAME = f"{PREFIX}_dataset"
BQ_TRAIN_SPLIT_NAME = "training"
BQ_VALID_SPLIT_NAME = "validation"
BQ_TEST_SPLIT_NAME = "testing"
BQ_LOCATION = REGION

## Test the deployed model

In [8]:
# Look up the deployed endpoint by its display name.  The filter expression is
# stored in `endpoint_filter` (not `filter`) so the Python builtin of that
# name is not shadowed for the rest of the notebook.
endpoint_filter = f'display_name="{ENDPOINT_DISPLAY_NAME}"'

# Print every match; after the loop `endpoint_info` holds the last endpoint
# returned, or stays None when nothing matched.
endpoint_info = None
for endpoint_info in aiplatform.Endpoint.list(filter=endpoint_filter):
    print(endpoint_info)

if endpoint_info is not None:
    endpoint = aiplatform.Endpoint(endpoint_info.resource_name)
    # Full resource name of the endpoint; later cells pass it to the
    # endpoint and job service clients.
    ENDPOINT_ID = endpoint.resource_name
else:
    # `endpoint` and `ENDPOINT_ID` are left undefined in this case, so the
    # cells that follow cannot run until an endpoint exists.
    print('No endpoints found')

<google.cloud.aiplatform.models.Endpoint object at 0x7f00b5a58510> 
resource name: projects/630263135640/locations/us-central1/endpoints/3418003418755629056


In [9]:
# A single hand-written request instance; every feature value is wrapped in a
# one-element list.
test_instances = [  
    
    {
        "dropoff_grid": ["POINT(-87.6 41.9)"],
        "euclidean": [2064.2696],
        "payment_type": ["Credit Card"],
        "pickup_grid": ["POINT(-87.6 41.9)"],
        "trip_miles": [1.37],
        "trip_day": [12],
        "trip_hour": [16],
        "trip_month": [2],
        "trip_day_of_week": [4],
        "trip_seconds": [555]
    }
]

# Send the instance to the deployed endpoint.
predictions = endpoint.predict(instances=test_instances)
# Index 0 of the returned Prediction is its `predictions` field; the raw model
# output is passed through a sigmoid before being reported as a probability.
prob = tf.nn.sigmoid(predictions[0])
print('Probability of tip > 20%:', prob.numpy())

Probability of tip > 20%: [[0.7745298]]


## Starting monitoring jobs

### Define helper functions

In [10]:
def list_monitoring_jobs():
    """Return the model deployment monitoring jobs of the configured project/region.

    The result is whatever ``list_model_deployment_monitoring_jobs`` returns;
    callers in this notebook iterate over it.
    """
    location_path = f"projects/{PROJECT_ID}/locations/{REGION}"
    job_client = JobServiceClient(client_options={"api_endpoint": API_ENDPOINT})
    return job_client.list_model_deployment_monitoring_jobs(parent=location_path)

def get_monitoring_job(job):
    """Fetch a single model deployment monitoring job.

    Args:
        job: Full resource name of the monitoring job, e.g.
            ``projects/<project>/locations/<region>/modelDeploymentMonitoringJobs/<id>``
            (the value of ``response.name`` when the job is created).

    Returns:
        The response of ``get_model_deployment_monitoring_job`` for that job.
    """
    client_options = dict(api_endpoint=API_ENDPOINT)
    client = JobServiceClient(client_options=client_options)
    # The getter identifies the job through `name`; it has no `parent`
    # argument, so the job's own resource name must be passed here.
    response = client.get_model_deployment_monitoring_job(name=job)
    return response

def pause_monitoring_job(job):
    """Pause the monitoring job named ``job`` and print the service response.

    Args:
        job: Resource name of the monitoring job, passed as ``name``.
    """
    job_client = JobServiceClient(client_options={"api_endpoint": API_ENDPOINT})
    print(job_client.pause_model_deployment_monitoring_job(name=job))

def delete_monitoring_job(job):
    """Delete the monitoring job named ``job`` and print the service response.

    Args:
        job: Resource name of the monitoring job, passed as ``name``.
    """
    job_client = JobServiceClient(client_options={"api_endpoint": API_ENDPOINT})
    print(job_client.delete_model_deployment_monitoring_job(name=job))


### Configure the job

#### Configure skew and drift thresholds

In [11]:
# Per-feature thresholds used to build the training/prediction skew
# detection config.
SKEW_THRESHOLDS = {
    'trip_month': 0.3,
    'trip_day': 0.3,
    'trip_day_of_week': 0.3,
    'trip_hour': 0.3,
    'trip_seconds': 0.3,
    'trip_miles': 0.3,
    'payment_type': 0.3,
    'pickup_grid': 0.3,
    'dropoff_grid': 0.3,
    'euclidean': 0.3,
}

# Per-feature thresholds used to build the prediction drift detection config.
DRIFT_THRESHOLDS = {
    'trip_month': 0.3,
    'trip_day': 0.3,
    'trip_day_of_week': 0.3,
    'trip_hour': 0.3,
    'trip_seconds': 0.3,
    'trip_miles': 0.3,
    'payment_type': 0.3,
    'pickup_grid': 0.3,
    'dropoff_grid': 0.3,
    'euclidean': 0.3,
}

# Backward-compatible alias: the constant was originally spelled
# DIRFT_THRESHOLDS and is still referenced under that name elsewhere in the
# notebook.
DIRFT_THRESHOLDS = DRIFT_THRESHOLDS


In [12]:
# Wrap every raw skew threshold in a ThresholdConfig message, keyed by feature.
skew_thresholds = dict(
    (name, ThresholdConfig(value=float(limit)))
    for name, limit in SKEW_THRESHOLDS.items()
)
skew_config = ModelMonitoringObjectiveConfig.TrainingPredictionSkewDetectionConfig(
    skew_thresholds=skew_thresholds
)

# Same wrapping for the drift thresholds.
drift_thresholds = dict(
    (name, ThresholdConfig(value=float(limit)))
    for name, limit in DIRFT_THRESHOLDS.items()
)
drift_config = ModelMonitoringObjectiveConfig.PredictionDriftDetectionConfig(
    drift_thresholds=drift_thresholds
)

#### Configure training dataset source location
The training dataset location is used for schema generation.

In [13]:
# Label column of the training table.
TARGET = "tip_bin"
# bq:// URI of the training split table.
BQ_TRAINING_DATA = f"bq://{PROJECT_ID}.{BQ_DATASET_NAME}.{BQ_TRAIN_SPLIT_NAME}"

# Training dataset descriptor: target column plus the BigQuery source table.
training_dataset = ModelMonitoringObjectiveConfig.TrainingDataset(
    target_field=TARGET,
    bigquery_source=BigQuerySource(input_uri=BQ_TRAINING_DATA),
)

#### Configure the model monitoring objective

In [14]:
# Combine the training dataset reference with the skew and drift detection
# settings into one monitoring objective.
objective_config = ModelMonitoringObjectiveConfig(
    training_dataset=training_dataset,
    training_prediction_skew_detection_config=skew_config,
    prediction_drift_detection_config=drift_config,
)
# Wrap the objective in the per-deployment message.  No deployed model id is
# set here; this object serves as a template that is copied once per deployed
# model when the objectives are assigned.
objective_template = ModelDeploymentMonitoringObjectiveConfig(
    objective_config=objective_config
)

#### Get all deployed model ids on the monitored endpoint

In [15]:
# Fetch the endpoint and collect the id of every model deployed on it.
client = EndpointServiceClient(client_options={"api_endpoint": API_ENDPOINT})
response = client.get_endpoint(name=ENDPOINT_ID)
model_ids = [deployed.id for deployed in response.deployed_models]
model_ids

['4309082826277388288']

#### Set objectives for each deployed model

In [16]:
# Build one monitoring objective per deployed model: each entry is a deep copy
# of the template with that model's id filled in.
objective_configs = []
for model_id in model_ids:
    # NOTE(review): this rebinds the notebook-level name `objective_config`,
    # which previously held the ModelMonitoringObjectiveConfig used to build
    # the template.
    objective_config = copy.deepcopy(objective_template)
    objective_config.deployed_model_id = model_id
    objective_configs.append(objective_config)

#### Configure sampling strategy

In [17]:
# Sample rate handed to RandomSampleConfig.
# NOTE(review): presumably the fraction of prediction requests that is logged
# for monitoring - confirm against the API documentation.
LOG_SAMPLE_RATE = 0.8

random_sampling = SamplingStrategy.RandomSampleConfig(sample_rate=LOG_SAMPLE_RATE)
sampling_config = SamplingStrategy(random_sample_config=random_sampling)

#### Configure monitoring schedule

In [18]:
# Monitoring interval in seconds (3600 s = one hour); converted to a protobuf
# Duration below.
MONITOR_INTERVAL = 3600

duration = Duration(seconds=MONITOR_INTERVAL)
schedule_config = ModelDeploymentMonitoringScheduleConfig(monitor_interval=duration)

#### Configure alerting

In [19]:
# Address placed in the e-mail alert configuration of the monitoring job.
NOTIFY_EMAIL = "jarekk@gcp.solutions"
emails = [NOTIFY_EMAIL]

# Alerting configuration with e-mail as the only configured channel.
email_config = ModelMonitoringAlertConfig.EmailAlertConfig(user_emails=emails)
alerting_config = ModelMonitoringAlertConfig(email_alert_config=email_config)

#### Create a monitoring job

In [20]:
# Timestamped display name so that repeated runs create distinct jobs.
job_name = f"TAXI_MONITORING_{time.strftime('%Y%m%d_%H%M%S')}"

# Both schema URIs are passed as empty strings.
predict_schema = analysis_schema = ""

# Assemble the job from the objectives, sampling, schedule and alerting
# settings configured above.
job = ModelDeploymentMonitoringJob(
    display_name=job_name,
    endpoint=ENDPOINT_ID,
    model_deployment_monitoring_objective_configs=objective_configs,
    logging_sampling_strategy=sampling_config,
    model_deployment_monitoring_schedule_config=schedule_config,
    model_monitoring_alert_config=alerting_config,
    predict_instance_schema_uri=predict_schema,
    analysis_instance_schema_uri=analysis_schema,
)

options = {"api_endpoint": API_ENDPOINT}
client = JobServiceClient(client_options=options)

# Submit the job under the project/region location.
parent = f"projects/{PROJECT_ID}/locations/{REGION}"
response = client.create_model_deployment_monitoring_job(
    parent=parent,
    model_deployment_monitoring_job=job,
)

# Report the new job; its resource name is kept in `job_id`.
print("Created monitoring job:", response.name, response.state, sep="\n")
job_id = response.name

Created monitoring job:
projects/630263135640/locations/us-central1/modelDeploymentMonitoringJobs/1260326198454517760
JobState.JOB_STATE_PENDING


#### List monitoring jobs

In [21]:
# Print the resource name and state of every monitoring job in the project.
# NOTE(review): the loop variable rebinds the notebook-level name `job`, which
# held the ModelDeploymentMonitoringJob definition submitted earlier.
for job in list_monitoring_jobs():
    print(job.name, job.state)

projects/630263135640/locations/us-central1/modelDeploymentMonitoringJobs/1260326198454517760 JobState.JOB_STATE_PENDING


## Generate simulated workload

In [90]:
# Query selecting every row and column of the testing split table.
sql_script = f'''
SELECT * 
FROM {PROJECT_ID}.{BQ_DATASET_NAME}.{BQ_TEST_SPLIT_NAME}
'''

# NOTE: `client` is rebound here to a BigQuery client; it previously referred
# to the Vertex AI JobServiceClient.
client = bigquery.Client()
# Run the query and load the full result into a pandas DataFrame.
data = client.query(sql_script).to_dataframe()

data.head()

Unnamed: 0,trip_month,trip_day,trip_day_of_week,trip_hour,trip_seconds,trip_miles,payment_type,pickup_grid,dropoff_grid,euclidean,tip_bin
0,2,1,7,21,39,0.01,Cash,POINT(-87.7 41.9),POINT(-87.7 41.9),0.0,0
1,6,1,2,6,25,0.06,Cash,POINT(-87.7 41.9),POINT(-87.7 41.9),0.0,0
2,2,1,7,10,329,0.7,Cash,POINT(-87.6 41.9),POINT(-87.6 41.9),0.0,0
3,2,1,7,10,134,0.36,Cash,POINT(-87.6 41.9),POINT(-87.6 41.9),0.0,0
4,6,1,2,11,481,0.43,Cash,POINT(-87.7 42),POINT(-87.7 42),0.0,0


In [221]:
# Distinct values observed in the test split for every categorical feature.
# Integer-coded columns are cast to int, the remaining ones to str; key order
# matches the column order listed below.
CATEGORICAL_FEATURES = {
    **{
        column: [int(val) for val in data[column].unique()]
        for column in ('trip_month', 'trip_day', 'trip_hour', 'trip_day_of_week')
    },
    **{
        column: [str(val) for val in data[column].unique()]
        for column in ('payment_type', 'pickup_grid', 'dropoff_grid')
    },
}

# (mean, standard deviation) of every numeric feature in the test split.
NUMERIC_FEATURES = {
    column: (data[column].mean(), data[column].std())
    for column in ('trip_miles', 'trip_seconds', 'euclidean')
}


def monitoring_test(endpoint, count=1, seconds=3, perturb_num=None, perturb_cat=None):
    """Send randomly generated prediction requests to ``endpoint``.

    Every request carries one instance.  Categorical features are picked with
    ``random.choices`` from the value lists in ``CATEGORICAL_FEATURES``;
    numeric features are sampled from a normal distribution parameterised by
    the ``(mean, std)`` pairs in ``NUMERIC_FEATURES`` and clipped at zero.
    ``trip_seconds`` is additionally truncated to an int and floored at 60.
    Each response is printed.

    Args:
        endpoint: Object exposing ``predict(instances=...)``.
        count: Number of requests to send.
        seconds: Pause, in seconds, after every request.
        perturb_num: Optional mapping ``feature -> (mean_fn, std_fn)``; the two
            callables transform the feature's mean and standard deviation
            before sampling.
        perturb_cat: Optional mapping ``feature -> list`` of extra values
            appended to the feature's candidate values before picking.
    """
    # None instead of `{}` as default avoids sharing one mutable dict object
    # between calls.
    perturb_num = {} if perturb_num is None else perturb_num
    perturb_cat = {} if perturb_cat is None else perturb_cat

    for _ in range(count):
        instance = {}

        for key, values in CATEGORICAL_FEATURES.items():
            candidates = values
            if key in perturb_cat:
                candidates = values + perturb_cat[key]
            # random.choices returns a one-element list, which is the
            # list-wrapped form used for every feature of an instance.
            instance[key] = random.choices(candidates)

        for key, stats in NUMERIC_FEATURES.items():
            mean = stats[0]
            std = stats[1]
            if key in perturb_num:
                mean = perturb_num[key][0](mean)
                std = perturb_num[key][1](std)

            # Draw a scalar directly: calling float() on a 1-element array is
            # deprecated in recent NumPy releases.
            value = float(np.random.normal(mean, std))
            value = max(value, 0)
            if key == 'trip_seconds':
                value = max(int(value), 60)
            instance[key] = [value]

        predictions = endpoint.predict(instances=[instance])
        print(predictions)
        time.sleep(seconds)
        
    

In [None]:
# Numeric perturbation for trip_seconds: the first callable is applied to the
# feature's mean (tripled), the second to its standard deviation (divided by
# three) before values are sampled.
perturb_num = {
    'trip_seconds': (lambda x: x * 3, lambda x: x / 3)
}

# NOTE(review): `perturb_cat` is defined but not passed to monitoring_test
# below, so it has no effect on the generated requests.  Its values are also
# strings, whereas CATEGORICAL_FEATURES casts trip_day_of_week to int -
# confirm the intended type before wiring it in.
perturb_cat = {
    'trip_day_of_week': ['1', '1', '1', '1']
}

# 1000 requests with a 3-second pause after each one.
monitoring_test(endpoint, count=1000, perturb_num=perturb_num, seconds=3)

Prediction(predictions=[[-10.0271759]], deployed_model_id='3328564744905818112', explanations=None)
Prediction(predictions=[[-5.17274475]], deployed_model_id='3328564744905818112', explanations=None)
Prediction(predictions=[[-7.02776146]], deployed_model_id='3328564744905818112', explanations=None)
Prediction(predictions=[[-7.47582626]], deployed_model_id='3328564744905818112', explanations=None)
Prediction(predictions=[[-9.60560417]], deployed_model_id='3328564744905818112', explanations=None)
Prediction(predictions=[[-0.753469]], deployed_model_id='3328564744905818112', explanations=None)
Prediction(predictions=[[0.545983076]], deployed_model_id='3328564744905818112', explanations=None)
Prediction(predictions=[[-9.26857853]], deployed_model_id='3328564744905818112', explanations=None)
Prediction(predictions=[[-0.242768645]], deployed_model_id='3328564744905818112', explanations=None)
Prediction(predictions=[[-12.4206724]], deployed_model_id='3328564744905818112', explanations=None)
P