# Vertex Tabular Binary Classification with .CustomJob()

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/jchavezar/vertex-ai-mlops/blob/main/vertex-custom-ml/tensorflow/custom_jobs/container-xai-tabclass.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
</table>

In [None]:
! pip3 install --upgrade google-cloud-aiplatform -q google-cloud-bigquery db-dtypes

## Colab only: Uncomment the following cell to restart the kernel.

In [1]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

In [2]:
# from google.colab import auth
# auth.authenticate_user()

<img src="../../../images/tf-custom-container-tabclass.png"/>

## Set Constants

In [3]:
# Notebook-wide settings (each line carries a Colab `# @param` form annotation).
# PROJECT_ID is used for `gcloud config set project` and, with REGION, for `aip.init`.
PROJECT_ID = 'jchavezar-demo' # @param {type:"string"}
REGION = 'us-central1' # @param {type:"string"}
# GCS prefix holding train.csv / val.csv / test.csv, passed as arguments to the training job.
DATASETS_URI = 'gs://vtx-datasets-public/ecommerce' # @param {type:"string"}
# base_output_dir of the CustomJob; the trained model is later loaded from f"{MODEL_URI}/model".
MODEL_URI = 'gs://vtx-models/ecommerce/03cb' # @param {type:"string"}
# staging_bucket of the CustomJob.
STAGING_URI = 'gs://vtx-staging/ecommerce/03cb' # @param {type:"string"}
# Training image: built with `gcloud builds submit` and run by the CustomJob.
TRAIN_IMAGE_URI = f'gcr.io/{PROJECT_ID}/03cb-tf-hpt-xai-train:latest' # @param {type:"string"}
# Serving container passed to aip.Model.upload.
PREDICTION_IMAGE_URI = 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-9:latest' # @param {type:"string"}
# Value of the `series` label used to find or create the Tensorboard instance.
SERIES = '03cb' # @param {type:"string"}
# Experiment name passed to aip.init.
EXPERIMENT_NAME = 'ecommerce-exp' # @param {type:"string"}

In [4]:
# Point the gcloud CLI at PROJECT_ID; the later `gcloud builds submit` call runs under this configuration.
! gcloud config set project $PROJECT_ID

Updated property [core/project].


## Create Folder Structure

```
source
|  Dockerfile
|
└─── trainer
     |  train.py
     |

```

In [5]:
# Start from a clean `source/` tree so files from a previous run are not packaged into the image.
!rm -fr source
!mkdir -p source/trainer

## Create Python Source Distribution Files 

In [6]:
%%writefile source/trainer/train.py

import os
import time
import warnings
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
warnings.filterwarnings('ignore')

################################### ARGUMENTS #######################################


parser = argparse.ArgumentParser()
parser.add_argument(
    '--train_data_uri', 
    help = 'dataset to train',
    type = str
)
parser.add_argument(
    '--val_data_uri', 
    help = 'val to train',
    type = str
)
parser.add_argument(
    '--test_data_uri', 
    help = 'test to train',
    type = str
)
args = parser.parse_args()

# Load the three CSV splits into DataFrames. The notebook passes gs:// URIs;
# presumably pandas reads those through gcsfs, which the Dockerfile installs (verify).
train_df = pd.read_csv(args.train_data_uri)
val_df = pd.read_csv(args.val_data_uri)
test_df = pd.read_csv(args.test_data_uri)

################################### PREPROCESSING #######################################

## Convert pandas dataframe to tensor data (from GCS to TF.data.Data)
init_start = time.process_time()
def df_to_dataset(dataframe, shuffle=None):
    """Turn a pandas DataFrame into a batched ``tf.data.Dataset`` of (features, labels).

    Args:
        dataframe: DataFrame holding the feature columns plus the
            'will_buy_on_return_visit' label column. It is not modified.
        shuffle: When truthy, rows are shuffled with a buffer as large as the frame.

    Returns:
        A dataset yielding ``(features, labels)`` batches, where ``features`` is a
        dict mapping each remaining column name to a tensor with a trailing axis
        of size 1.

    Note:
        The batch size is read from the module-level ``batch_size`` name, which
        must be assigned before this function is called.
    """
    df = dataframe.copy()
    labels = df.pop('will_buy_on_return_visit')
    # Build the features from `df` (label already popped), not from `dataframe`,
    # so the label never ends up inside the feature dict.
    # `.to_numpy()` is required because multi-dimensional indexing directly on a
    # Series (`series[:, None]`) is rejected by pandas >= 2.0.
    features = {key: value.to_numpy()[:, tf.newaxis] for key, value in df.items()}
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds
      
## Normalization / Standardization
def get_normalization_layer(name, dataset):
    """Return a ``layers.Normalization`` layer adapted to one feature of ``dataset``.

    Args:
        name: Key of the feature inside each element's feature dict.
        dataset: Dataset of ``(features, label)`` pairs; only ``features[name]``
            is used to adapt the layer.

    Returns:
        The adapted normalization layer. The CPU time spent is printed.
    """
    t0 = time.process_time()
    # Keep only the requested feature; the label is dropped.
    feature_only = dataset.map(lambda features, _label: features[name])
    norm_layer = layers.Normalization(axis=None)
    norm_layer.adapt(feature_only)
    elapsed = time.process_time() - t0
    print(f'Normalization time for {name}: {elapsed}')
    return norm_layer

# Performs feature-wise categorical encoding of input features
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Build a callable that indexes and then category-encodes one feature.

    Args:
        name: Key of the feature inside each element's feature dict.
        dataset: Dataset of ``(features, label)`` pairs used to adapt the lookup.
        dtype: ``'string'`` selects ``layers.StringLookup``; any other value
            selects ``layers.IntegerLookup``.
        max_tokens: Forwarded to the lookup layer's ``max_tokens`` argument.

    Returns:
        A callable applying the adapted lookup followed by a
        ``layers.CategoryEncoding`` sized to the lookup's vocabulary.
        The CPU time spent is printed.
    """
    t0 = time.process_time()
    lookup_cls = layers.StringLookup if dtype == 'string' else layers.IntegerLookup
    lookup = lookup_cls(max_tokens=max_tokens)
    # Adapt the vocabulary on the requested feature only; the label is dropped.
    lookup.adapt(dataset.map(lambda features, _label: features[name]))
    category_encoder = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())
    print(f'Encoding time for {name}: {time.process_time() - t0}')

    def encode(feature):
        return category_encoder(lookup(feature))

    return encode

# Batch size; df_to_dataset() reads this module-level name when it is called.
batch_size = 256
train_ds, val_ds, test_ds = (
    df_to_dataset(frame) for frame in (train_df, val_df, test_df)
)

## Identify Numerical and Categorical columns:
num_columns = ['latest_ecommerce_progress', 'time_on_site', 'pageviews']
cat_columns = ['source', 'medium', 'channel_grouping', 'device_category', 'country']
# A single integer column that is encoded as a category (see below).
num_cat_columns = 'bounces'

# Keras inputs and their preprocessed tensors, kept in matching order.
all_inputs = []
encoded_features = []

# Numerical Features: normalized with statistics adapted on the training set.
for feature_name in num_columns:
    numeric_input = tf.keras.Input(shape=(1,), name=feature_name)
    normalizer = get_normalization_layer(feature_name, train_ds)
    all_inputs.append(numeric_input)
    encoded_features.append(normalizer(numeric_input))

# Categorical Features: string lookup + category encoding adapted on the training set.
for feature_name in cat_columns:
    string_input = tf.keras.Input(shape=(1,), name=feature_name, dtype='string')
    encode_category = get_category_encoding_layer(
        name=feature_name,
        dataset=train_ds,
        dtype='string',
        max_tokens=5,
    )
    all_inputs.append(string_input)
    encoded_features.append(encode_category(string_input))

## Integer values into integer indices.
bounces_input = tf.keras.Input(shape=(1,), name=num_cat_columns, dtype='int64')
encode_bounces = get_category_encoding_layer(
    name=num_cat_columns,
    dataset=train_ds,
    dtype='int64',
    max_tokens=5,
)
all_inputs.append(bounces_input)
encoded_features.append(encode_bounces(bounces_input))

print(f'Total preprocessing time: {time.process_time() - init_start}')

#########################################################################################


################################### CREATE, COMPILE AND TRAIN MODEL #####################

# Concatenate every preprocessed feature and feed a small dense network.
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
# No activation on the last layer: the model outputs a raw logit.
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              # The output is a logit, so the decision boundary is 0.0 (probability 0.5).
              # The plain 'accuracy' string would threshold the logit at 0.5 instead.
              # The metric keeps the name 'accuracy' so logs and evaluate() output are unchanged.
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])
print('compile pass')
################################## SETUP TENSORBOARD LOGS AND TRAIN #####################

# Both directories are read from environment variables; a missing one raises KeyError.
tb_log_dir = os.environ['AIP_TENSORBOARD_LOG_DIR']
print(tb_log_dir)
print('---------------------')
model_dir = os.environ['AIP_MODEL_DIR']
print(model_dir)

# Write TensorBoard summaries after every batch.
tb_callback = tf.keras.callbacks.TensorBoard(log_dir=tb_log_dir, update_freq='batch')
model.fit(train_ds, epochs=15, validation_data=val_ds, callbacks=[tb_callback])

# evaluate() returns the loss followed by the single compiled metric.
test_metrics = model.evaluate(test_ds)
loss, accuracy = test_metrics
print("Accuracy", accuracy)

################################### SAVE MODEL ##########################################

model.save(model_dir)

Writing source/trainer/train.py


In [7]:
%%writefile source/Dockerfile
FROM python:3.9.12

# Install dependencies before copying the sources so this layer stays cached
# when only the training code changes.
# TensorFlow is pinned to the 2.9 line to match the TF 2.9 serving image used for
# prediction (tf2-cpu.2-9); numpy is kept below 2 for that TensorFlow release.
# A single pip invocation lets the resolver pick mutually compatible versions.
RUN pip install --no-cache-dir \
        "tensorflow==2.9.*" \
        "numpy<2" \
        google-cloud-storage \
        pandas \
        gcsfs

COPY . /

ENTRYPOINT ["python", "trainer/train.py"]

Writing source/Dockerfile


In [8]:
# Build the image from the `source/` directory with Cloud Build and tag it as TRAIN_IMAGE_URI.
!gcloud builds submit -t $TRAIN_IMAGE_URI source/.

Creating temporary tarball archive of 2 file(s) totalling 5.4 KiB before compression.
Uploading tarball of [source/.] to [gs://jchavezar-demo_cloudbuild/source/1675970142.126684-4ed6f7aa453e42dc9fc5ab6607a81d31.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/jchavezar-demo/locations/global/builds/e4c00a24-d99b-4bd8-8f81-14709eaa69d4].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/e4c00a24-d99b-4bd8-8f81-14709eaa69d4?project=569083142710 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "e4c00a24-d99b-4bd8-8f81-14709eaa69d4"

FETCHSOURCE
Fetching storage object: gs://jchavezar-demo_cloudbuild/source/1675970142.126684-4ed6f7aa453e42dc9fc5ab6607a81d31.tgz#1675970142414481
Copying gs://jchavezar-demo_cloudbuild/source/1675970142.126684-4ed6f7aa453e42dc9fc5ab6607a81d31.tgz#1675970142414481...
/ [1 files][  2.0 KiB/  2.0 KiB]                                                
Operation completed over 1 ob

## Create Tensorboard

In [None]:
from google.cloud import aiplatform as aip

# Pin the SDK to the configured project and region before the first API call, so the
# Tensorboard is looked up / created in the same place the training job will run.
aip.init(project=PROJECT_ID, location=REGION)

# Reuse the Tensorboard labelled with this series if one exists; otherwise create it.
existing_tensorboards = aip.Tensorboard.list(filter=f"labels.series={SERIES}")
if existing_tensorboards:
    tb = existing_tensorboards[0]
else:
    tb = aip.Tensorboard.create(display_name=SERIES, labels={'series': SERIES})
print(tb)

## Create Vertex Training from Code [CustomJob]

In [12]:
# Attach the experiment and its backing Tensorboard to the SDK session.
aip.init(
    experiment=EXPERIMENT_NAME,
    experiment_tensorboard=tb.resource_name,
    project=PROJECT_ID,
    location=REGION)


# One worker running the custom training image; the three CSV splits are passed
# as the command-line arguments that trainer/train.py parses.
worker_pool_specs = [
    {
        'machine_spec' : {
            'machine_type': 'n1-standard-4'
        },
        'replica_count': 1,
        'container_spec': {
            'image_uri': TRAIN_IMAGE_URI,
            'args': [
                f'--train_data_uri={DATASETS_URI}/train.csv',
                f'--val_data_uri={DATASETS_URI}/val.csv',
                f'--test_data_uri={DATASETS_URI}/test.csv',
            ]
        }
    }
]

job = aip.CustomJob(
    display_name = f'{SERIES}-ecommerce_tf',
    worker_pool_specs = worker_pool_specs,
    base_output_dir = MODEL_URI,
    staging_bucket = STAGING_URI
)

# The result is intentionally not bound to a name: the trained model is loaded
# from MODEL_URI in a later cell, not from this call.
# The service account is derived from PROJECT_ID so it follows the configured project.
job.run(
    service_account = f'vtx-pipe@{PROJECT_ID}.iam.gserviceaccount.com',
    tensorboard = tb.resource_name
)

Creating CustomJob
CustomJob created. Resource name: projects/569083142710/locations/us-central1/customJobs/1534311989151531008
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/569083142710/locations/us-central1/customJobs/1534311989151531008')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/1534311989151531008?project=569083142710
CustomJob projects/569083142710/locations/us-central1/customJobs/1534311989151531008 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/1534311989151531008 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/1534311989151531008 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/1534311989151531008 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/1534311

## Get Model Information from Tensorflow Graph

In [None]:
import tensorflow as tf
import warnings
# Suppress Python warnings emitted from here on.
warnings.filterwarnings('ignore')

# Load the model written by the training job. The job saved it to AIP_MODEL_DIR;
# presumably Vertex AI sets that to "<base_output_dir>/model", hence the suffix (verify).
loaded_model = tf.keras.models.load_model(f"{MODEL_URI}/model")
# Draw the layer graph left-to-right, including tensor shapes.
tf.keras.utils.plot_model(loaded_model, show_shapes=True, rankdir="LR")

2023-02-09 19:23:36.521977: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-09 19:23:52.153542: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-02-09 19:23:52.155105: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

## Build Explainable AI Metadata from Graph

In [None]:
# Print the input tensors and the output structure of the default serving signature;
# these names help fill in the explanation metadata in the next cell.
infer = loaded_model.signatures['serving_default']
print(infer.inputs)
print()
print(infer.structured_outputs)

In [None]:
# Explanation metadata: one output ("logit") bound to the tensor name "dense_1",
# and one entry per model input, each left empty.
# NOTE(review): "dense_1" is presumably the auto-generated name of the final Dense
# layer built in train.py; confirm against the signature printed above.
EXPLANATION_METADATA = {
    "outputs": {
        "logit": { 
            "output_tensor_name": "dense_1"
        }
    },
    "inputs": { 
        "latest_ecommerce_progress": {},
        "bounces": {},
        "time_on_site": {},
        "pageviews": {},
        "source": {},
        "medium": {},
        "channel_grouping": {},
        "device_category": {},
        "country": {},
    }
}
# Sampled Shapley attribution configured with path_count=5.
EXPLANATION_PARAMS = {"sampled_shapley_attribution": {"path_count": 5}}

## Upload Model

In [None]:
# Register the trained artifacts (f'{MODEL_URI}/model') as a Vertex AI Model served by
# PREDICTION_IMAGE_URI, with the explanation parameters and metadata defined above.
model = aip.Model.upload(
    display_name = '03-cb-ecommerce_tf_v2',
    serving_container_image_uri = PREDICTION_IMAGE_URI,
    artifact_uri = f'{MODEL_URI}/model',
    explanation_parameters=EXPLANATION_PARAMS,
    explanation_metadata=EXPLANATION_METADATA,
)

## Deploy Model On Endpoint

In [None]:
# Deploy the model on exactly one n1-standard-4 replica (min = max = 1) receiving all
# traffic; the same explanation settings used at upload time are passed again.
# No endpoint argument is given; presumably the SDK creates a new endpoint (verify).
endpoint = model.deploy(
    deployed_model_display_name = '03cb-ecommerce_tf_ep_dep',
    traffic_percentage = 100,
    machine_type = 'n1-standard-4',
    min_replica_count = 1,
    max_replica_count = 1,
    explanation_metadata=EXPLANATION_METADATA,
    explanation_parameters=EXPLANATION_PARAMS
)

## Testing Predictions

In [None]:
# A single test instance keyed by the model's input names. Each value is wrapped in a
# one-element list, presumably to match the (1,)-shaped Keras inputs defined in train.py.
instance = {
    'latest_ecommerce_progress': [0],
    'bounces': [0],
    'time_on_site': [103],
    'pageviews': [3],
    'source': ['youtube.com'],
    'medium': ['referral'],
    'channel_grouping': ['Social'],
    'device_category': ['desktop'],
    'country': ['Vietnam'],
}

In [None]:
# Request an online prediction for the single test instance; the response is shown as the cell output.
endpoint.predict([instance])

In [None]:
# Request a prediction together with feature attributions for the same instance.
explanations = endpoint.explain([instance])
print("Explainable predictions:", explanations)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the explanations by field name instead of tuple position: the position of
# `explanations` inside the response tuple depends on the installed SDK version.
feature_attributions = (
    explanations.explanations[0].attributions[0].feature_attributions
)
names = list(feature_attributions.keys())
# Each attribution is a sequence; take its last element without mutating the response.
values = [list(attribution)[-1] for attribution in feature_attributions.values()]

fig, ax = plt.subplots()
ax.barh(range(len(names)), values, tick_label=names)
ax.set_xlabel('Attribution')
ax.set_title('Feature attributions for the test instance');

## Cleaning

In [None]:
# Remove the local build artifacts created by this notebook.
# NOTE(review): only local files are removed here; the deployed endpoint, the uploaded
# model and the Tensorboard instance created above are not deleted by this cell.
!rm -f source.tar source.tar.gz
!rm -fr source