# Starting and monitoring AI Platform Training jobs.

In [1]:
import base64
import os
import json
import time
import numpy as np
import tensorflow as tf

import google.auth

from google.auth.credentials import Credentials
from google.auth.transport.requests import AuthorizedSession

from typing import List, Optional, Text, Tuple


## Setting the notebook's parameters

In [2]:
# Regional API host and region used to build the REST URL prefix.
CAIP_ENDPOINT = "us-central1-aiplatform.googleapis.com"
CAIP_REGION = "us-central1"
# Display name given to the TensorBoard resource created in this notebook.
TENSORBOARD_NAME = "demo-tensorboard"
# GCS path exported as AIP_TENSORBOARD_LOG_DIR for the local training run.
TENSORBOARD_LOG_DIR = "gs://jk-mlops-dev-tensorboard-logs-us-central1/test1"

## Preparing Managed TensorBoard 

### Create an authorized session

In [None]:
# Obtain default credentials together with the project ID associated with them.
# NOTE(review): google.auth.default() can return None for project_id when no
# default project can be determined — confirm the environment provides one.
credentials, project_id = google.auth.default()
# Session that authorizes outgoing requests with the credentials above.
authed_session = AuthorizedSession(credentials)

# Common URL prefix for the REST calls made by the following cells.
caip_api_prefix = f'https://{CAIP_ENDPOINT}/v1alpha1/projects/{project_id}/locations/{CAIP_REGION}'

### Create a TensorBoard resource

In [None]:
# Create a TensorBoard resource with the configured display name.
api_url = f'{caip_api_prefix}/tensorboards'

request_body = {
    "display_name": TENSORBOARD_NAME
}

# `json=` lets requests serialize the body and set the
# `Content-Type: application/json` header, which `data=json.dumps(...)` omits.
response = authed_session.post(api_url, json=request_body)
response.json()

### List all TensorBoards with a given display name

In [None]:
# List the TensorBoard resources whose display name equals TENSORBOARD_NAME.
api_url = f'{caip_api_prefix}/tensorboards'

# Pass the filter through `params` so requests URL-encodes the expression;
# interpolating it into the URL breaks for names that contain reserved
# characters such as '&', '#' or spaces.
response = authed_session.get(
    api_url,
    params={'filter': f'display_name={TENSORBOARD_NAME}'},
)
response.json()

## Preparing a training container

In [42]:
%%writefile train.py

# Copyright 2020 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from datetime import datetime
import hypertune
import tensorflow as tf
import sys
import os
import time



def _get_model(input_shape, num_classes):
    """
    Builds and compiles a small convolutional classifier.

    The network stacks two 3x3 convolutions (32 and 64 filters), 2x2 max
    pooling, dropout, a 128-unit dense layer, dropout, and a softmax output.

    Args:
        input_shape: Shape of a single input example; passed to the first
            convolution layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled tf.keras.Sequential model (categorical cross-entropy
        loss, Adadelta optimizer, accuracy metric).
    """

    layers = tf.keras.layers

    network = tf.keras.Sequential([
        layers.Conv2D(
            32,
            kernel_size=(3, 3),
            activation='relu',
            input_shape=input_shape),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax'),
    ])

    network.compile(
        loss=tf.keras.losses.categorical_crossentropy,
        optimizer=tf.keras.optimizers.Adadelta(),
        metrics=['accuracy'])

    return network
  

def _get_datasets():
    """
    Loads MNIST and prepares it for the convolutional model.

    Images are reshaped to 28x28x1, cast to float32 and divided by 255;
    labels are one-hot encoded over 10 classes.

    Returns:
        A tuple (input_shape, num_classes, x_train, y_train, x_test, y_test).
    """

    num_classes = 10
    input_shape = (28, 28, 1)

    (train_images, train_labels), (test_images, test_labels) = (
        tf.keras.datasets.mnist.load_data())

    # Add the trailing channel axis, convert to float32 and divide by 255.
    train_images = train_images.reshape(
        train_images.shape[0], *input_shape).astype('float32') / 255
    test_images = test_images.reshape(
        test_images.shape[0], *input_shape).astype('float32') / 255

    # One-hot encode the integer class labels.
    train_labels = tf.keras.utils.to_categorical(train_labels, num_classes)
    test_labels = tf.keras.utils.to_categorical(test_labels, num_classes)

    return (input_shape, num_classes,
            train_images, train_labels, test_images, test_labels)


class _HptuneCallback(tf.keras.callbacks.Callback):
    """
    Keras callback that forwards one entry of the epoch logs to the
    hypertune reporter after every epoch.

    Args:
        metric_tag: Tag under which the metric is reported.
        metric_value: Key into the Keras `logs` dict whose value is reported.
    """

    def __init__(self, metric_tag, metric_value):
        super().__init__()
        self.hpt = hypertune.HyperTune()
        self.metric_tag = metric_tag
        self.metric_value = metric_value

    def on_epoch_end(self, epoch, logs=None):
        # Look up the configured metric in this epoch's logs and report it
        # with the epoch index as the global step.
        reported_value = logs[self.metric_value]
        self.hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag=self.metric_tag,
            metric_value=reported_value,
            global_step=epoch)


def train(batch_size, epochs, verbosity):
    """
    Trains the MNIST model and returns the Keras training history.

    Args:
        batch_size: Batch size passed to `model.fit`.
        epochs: Number of epochs passed to `model.fit`.
        verbosity: Accepted for interface compatibility; not used by this
            function (Keras progress output is fixed at `verbose=1`).

    Returns:
        The History object returned by `model.fit`, so callers can inspect
        the per-epoch metrics.
    """

    # Prepare datasets
    input_shape, num_classes, x_train, y_train, x_test, y_test = _get_datasets()

    # Create model
    model = _get_model(input_shape, num_classes)

    # Configure Hypertuner callback: report `val_accuracy` under the
    # 'accuracy' tag at the end of each epoch.
    callbacks = [_HptuneCallback('accuracy', 'val_accuracy')]

    # Configure TensorBoard callback only when a log directory is provided
    # through the environment.
    log_dir = os.environ.get('AIP_TENSORBOARD_LOG_DIR')
    if log_dir is not None:
        callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=log_dir))

    # Start training
    history = model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(x_test, y_test),
        callbacks=callbacks,
    )

    # Previously the history was computed and dropped, so callers doing
    # `history = train(...)` received None.
    return history
    

    
def get_args(argv=None):
    """
    Parses the training script's command-line arguments.

    Unrecognized arguments are ignored rather than rejected
    (`parse_known_args`).

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default), argparse reads the arguments from `sys.argv[1:]`.

    Returns:
        An argparse.Namespace with `num_epochs`, `batch_size` and
        `verbosity` attributes.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--num-epochs',
        type=int,
        default=20,
        help='number of times to go through the data, default=20')
    parser.add_argument(
        '--batch-size',
        default=128,
        type=int,
        help='number of records to read during each training step, default=128')
    parser.add_argument(
        '--verbosity',
        choices=['DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARN'],
        default='INFO')
    args, _ = parser.parse_known_args(argv)
    return args



if __name__ == "__main__":
    # Script entry point: parse the command-line flags and start training.
    cli_args = get_args()
    train(
        batch_size=cli_args.batch_size,
        epochs=cli_args.num_epochs,
        verbosity=cli_args.verbosity,
    )

Writing train.py


In [41]:
# Export the log directory that train() reads from AIP_TENSORBOARD_LOG_DIR.
%env AIP_TENSORBOARD_LOG_DIR={TENSORBOARD_LOG_DIR}
#%env AIP_TENSORBOARD_LOG_DIR

# Short local run (3 epochs, batch size 32); the result is kept for inspection.
# NOTE(review): the %%writefile cell only writes train.py to disk, so `train`
# must already be defined in the kernel (e.g. `from train import train`) — confirm.
history = train(batch_size=32, epochs=3, verbosity=1)

env: AIP_TENSORBOARD_LOG_DIR=gs://jk-mlops-dev-tensorboard-logs-us-central1/test1
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [17]:
history

<tensorflow.python.keras.callbacks.History at 0x7f0a28c9ae90>

In [20]:
history.history

{'loss': [2.2472245693206787, 2.065563201904297, 1.7723687887191772],
 'accuracy': [0.22290000319480896, 0.4487000107765198, 0.5751500129699707],
 'val_loss': [2.156482696533203, 1.8973174095153809, 1.5119750499725342],
 'val_accuracy': [0.5813000202178955, 0.7179999947547913, 0.7728000283241272]}

## Cleaning up

### List all tensorboards in the project

In [None]:
# Collection URL for the TensorBoard resources under the configured
# project and location.
api_url = f'{caip_api_prefix}/tensorboards'

# Issue the GET request and display the JSON response.
response = authed_session.get(api_url)
response.json()

### Delete a TensorBoard resource

In [None]:
# ID of the TensorBoard resource to delete. This value is hard-coded;
# replace it with the ID of the resource you want to remove.
tensorboard_id = '4655314639817539584'

api_url = f'{caip_api_prefix}/tensorboards/{tensorboard_id}'

# Issue the DELETE request and display the JSON response.
response = authed_session.delete(api_url)
response.json()