In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
%pip install -U fire cloudml-hypertune

In [1]:
import fire
import hypertune
import numpy as np
import os
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

tf.keras.backend.clear_session()  # For easy reset of notebook state.

tf.__version__

'2.1.0'

## Configure environment settings

In [2]:
# Capture the active gcloud project. The `!` capture returns the command's
# output as a list of lines, so the project id is taken from the first line.
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

# NOTE(review): this bucket is specific to the original author's environment;
# replace it with a GCS bucket you own before running the notebook.
ARTIFACT_STORE = 'gs://hostedkfp-default-l2iv13wnek'

## Define a training app for a toy ResNet model

In [3]:
# Local folder that receives the files written by the %%writefile cells below
# (train.py, Dockerfile, hptuning_config.yaml).
training_app_folder = 'training_app'
os.makedirs(training_app_folder, exist_ok=True)

In [4]:
%%writefile {training_app_folder}/train.py

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import fire
import hypertune
import os
import subprocess
import sys

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


def _create_toy_resnet(dropout_rate=0.5, learning_rate=0.001):
    """Build and compile the toy ResNet image classifier.

    Args:
        dropout_rate: Rate given to the Dropout layer that sits in front of
            the final Dense layer.
        learning_rate: Learning rate given to the RMSprop optimizer.

    Returns:
        A compiled `keras.Model` named 'toy_resnet' that takes (32, 32, 3)
        inputs and emits 10 raw logits (the loss is configured with
        `from_logits=True`).
    """

    def _residual_block(block_input):
        # Two 'same'-padded 3x3 convolutions followed by a skip connection.
        branch = layers.Conv2D(64, 3, activation='relu', padding='same')(block_input)
        branch = layers.Conv2D(64, 3, activation='relu', padding='same')(branch)
        return layers.add([branch, block_input])

    image = keras.Input(shape=(32, 32, 3), name='img')

    # Stem: two unpadded convolutions and a max-pool.
    stem = layers.Conv2D(32, 3, activation='relu')(image)
    stem = layers.Conv2D(64, 3, activation='relu')(stem)
    features = layers.MaxPooling2D(3)(stem)

    # Two identical residual blocks stacked on top of the stem.
    for _ in range(2):
        features = _residual_block(features)

    # Classification head.
    head = layers.Conv2D(64, 3, activation='relu')(features)
    head = layers.GlobalAveragePooling2D()(head)
    head = layers.Dense(256, activation='relu')(head)
    head = layers.Dropout(dropout_rate)(head)
    logits = layers.Dense(10)(head)

    resnet = keras.Model(image, logits, name='toy_resnet')
    resnet.compile(
        optimizer=keras.optimizers.RMSprop(learning_rate),
        loss=keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=['acc'],
    )
    return resnet

class _HptuneCallback(tf.keras.callbacks.Callback):
    """Keras callback that reports validation accuracy through hypertune.

    At the end of every epoch the 'val_acc' entry of the Keras logs is
    reported under the metric tag 'accuracy', with the epoch index as the
    global step.
    """

    # A single HyperTune reporter shared by every instance of the callback.
    hpt = hypertune.HyperTune()

    def on_epoch_end(self, epoch, logs=None):
        """Report this epoch's validation accuracy.

        Args:
            epoch: Index of the epoch that just finished; used as the
                global step of the report.
            logs: Metrics dict supplied by Keras. May be None or lack
                'val_acc' (for example when fit() runs without validation
                data); in that case nothing is reported.
        """
        val_acc = (logs or {}).get('val_acc')
        if val_acc is None:
            # Warn instead of failing with TypeError/KeyError so that a run
            # without validation data can still finish training.
            print("'val_acc' not found in logs for epoch {}; "
                  "skipping hyperparameter metric report.".format(epoch),
                  file=sys.stderr)
            return

        _HptuneCallback.hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='accuracy',
            metric_value=val_acc,
            global_step=epoch,
        )
        
        
def train_evaluate(job_dir, dropout_rate, learning_rate, batch_size, num_epochs):
    """Train the toy ResNet on CIFAR-10 and evaluate it on the test split.

    Args:
        job_dir: Job directory supplied by the launcher. It is accepted so the
            command line can carry it, but this function does not read or
            write it.
        dropout_rate: Dropout rate forwarded to `_create_toy_resnet`.
        learning_rate: Learning rate forwarded to `_create_toy_resnet`.
        batch_size: Batch size used for both training and evaluation.
        num_epochs: Number of training epochs.

    Returns:
        None. Training progress, the model summary and the final test
        metrics are printed to stdout.
    """
    toy_resnet = _create_toy_resnet(dropout_rate, learning_rate)
    toy_resnet.summary()

    (x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()

    # Scale pixels to [0, 1] and one-hot encode the 10 class labels, which is
    # what the CategoricalCrossentropy loss expects.
    x_train = x_train.astype('float32') / 255.
    x_test = x_test.astype('float32') / 255.
    y_train = keras.utils.to_categorical(y_train, 10)
    y_test = keras.utils.to_categorical(y_test, 10)

    # 20% of the training data is held out so that 'val_acc' is available to
    # _HptuneCallback at the end of every epoch.
    toy_resnet.fit(x_train, y_train,
                   batch_size=batch_size,
                   epochs=num_epochs,
                   validation_split=0.2,
                   verbose=2,
                   callbacks=[_HptuneCallback()])

    # Previously the test split was prepared but never used; evaluate on it so
    # the function actually performs the "evaluate" half of its name.
    test_loss, test_acc = toy_resnet.evaluate(x_test, y_test,
                                              batch_size=batch_size,
                                              verbose=0)
    print('Test loss: {:.4f} - test acc: {:.4f}'.format(test_loss, test_acc))
    
if __name__ == '__main__':
    # Let Fire map command-line arguments onto train_evaluate's parameters.
    fire.Fire(train_evaluate)

Overwriting training_app/train.py


In [5]:
# Smoke-test the training script locally before packaging it into an image.
# NOTE(review): job_dir is an absolute path from the original author's
# machine; train.py accepts it but does not currently use it.
job_dir = '/home/jupyter/jobs/job1'
dropout_rate = 0.5
learning_rate = 0.001
batch_size = 64
num_epochs = 5

# Use the training_app_folder variable (as the other cells do) instead of a
# hard-coded path, and pass arguments by name so the call does not depend on
# the parameter order of train_evaluate.
!python {training_app_folder}/train.py \
--job_dir={job_dir} \
--dropout_rate={dropout_rate} \
--learning_rate={learning_rate} \
--batch_size={batch_size} \
--num_epochs={num_epochs}

2020-04-01 05:02:56.618551: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-04-01 05:02:59.103990: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libnvinfer.so.6
2020-04-01 05:02:59.105581: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libnvinfer_plugin.so.6
2020-04-01 05:03:01.044874: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-04-01 05:03:03.006699: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-04-01 05:03:03.007715: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1555] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla K80 computeCapability: 3.7
coreClock: 0.8235G

## Package the training app into a Docker image

In [6]:
%%writefile {training_app_folder}/Dockerfile

# Base image from the Deep Learning Containers registry
# (presumably TensorFlow 2.1 with GPU support, judging by the tag).
FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-1
# Extra packages imported by train.py.
# NOTE(review): versions are not pinned, so rebuilds may pick up newer releases.
RUN pip install -U fire cloudml-hypertune
WORKDIR /app
COPY train.py .

# Arguments given to the container are appended to this command and parsed
# by Fire inside train.py.
ENTRYPOINT ["python", "train.py"]

Overwriting training_app/Dockerfile


## Build the Docker image

In [7]:
# NOTE(review): 'resent' looks like a typo for 'resnet'. The name is kept
# as-is because the recorded job output references this exact image name.
image_name = 'toy_resent_trainer_image'
image_tag = 'latest'
# Fully qualified Container Registry URI: gcr.io/<project>/<image>:<tag>.
image_uri = 'gcr.io/{}/{}:{}'.format(PROJECT_ID, image_name, image_tag)

In [8]:
# Upload the training app folder to Cloud Build, build the image remotely
# and tag it with image_uri.
!gcloud builds submit --tag $image_uri $training_app_folder

Creating temporary tarball archive of 3 file(s) totalling 4.2 KiB before compression.
Uploading tarball of [training_app] to [gs://mlops-workshop_cloudbuild/source/1585717466.3-670f0fae3e8a437b9a0a1f798ef38a0b.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/mlops-workshop/builds/e7c02d20-cf3f-44a5-b23d-57b8b6dc98b0].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/e7c02d20-cf3f-44a5-b23d-57b8b6dc98b0?project=745302968357].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "e7c02d20-cf3f-44a5-b23d-57b8b6dc98b0"

FETCHSOURCE
Fetching storage object: gs://mlops-workshop_cloudbuild/source/1585717466.3-670f0fae3e8a437b9a0a1f798ef38a0b.tgz#1585717466671540
Copying gs://mlops-workshop_cloudbuild/source/1585717466.3-670f0fae3e8a437b9a0a1f798ef38a0b.tgz#1585717466671540...
/ [1 files][  1.7 KiB/  1.7 KiB]                                                
Operation completed over 1 objects/1.7 KiB.                

## Submit an AI Platform Training job

In [9]:
# Timestamped job name, so re-running this cell yields a new job id.
job_name = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
# Per-job folder under the artifact bucket, passed to gcloud as --job-dir.
job_dir = '{}/{}'.format(ARTIFACT_STORE, job_name)
scale_tier = 'BASIC_GPU'
region = 'us-central1'

# Hyperparameters forwarded to train.py for this single training run.
dropout_rate = 0.5
learning_rate = 0.001
batch_size = 64
num_epochs = 5

In [10]:
# Submit a custom-container training job. Flags before the bare `--` configure
# the job itself; flags after it are forwarded as arguments to the container's
# entry point (train.py).
!gcloud ai-platform jobs submit training {job_name} \
--region={region} \
--job-dir={job_dir} \
--master-image-uri={image_uri} \
--scale-tier={scale_tier} \
-- \
--dropout_rate={dropout_rate} \
--learning_rate={learning_rate} \
--batch_size={batch_size} \
--num_epochs={num_epochs}

Job [JOB_20200401_051015] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20200401_051015

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20200401_051015
jobId: JOB_20200401_051015
state: QUEUED


## Monitor the job

In [11]:
!gcloud ai-platform jobs describe {job_name}

createTime: '2020-04-01T05:10:18Z'
endTime: '2020-04-01T05:19:06Z'
etag: hL9SukdNePQ=
jobId: JOB_20200401_051015
startTime: '2020-04-01T05:15:01Z'
state: SUCCEEDED
trainingInput:
  args:
  - --dropout_rate=0.5
  - --learning_rate=0.001
  - --batch_size=64
  - --num_epochs=5
  jobDir: gs://hostedkfp-default-l2iv13wnek/JOB_20200401_051015
  masterConfig:
    imageUri: gcr.io/mlops-workshop/toy_resent_trainer_image:latest
  region: us-central1
  scaleTier: BASIC_GPU
trainingOutput:
  consumedMLUnits: 0.28

View job in the Cloud Console at:
https://console.cloud.google.com/mlengine/jobs/JOB_20200401_051015?project=mlops-workshop

View logs at:
https://console.cloud.google.com/logs?resource=ml.googleapis.com%2Fjob_id%2FJOB_20200401_051015&project=mlops-workshop


In [12]:
!gcloud ai-platform jobs stream-logs {job_name}

INFO	2020-04-01 05:10:17 +0000	service		Validating job requirements...
INFO	2020-04-01 05:10:18 +0000	service		Job creation request has been successfully validated.
INFO	2020-04-01 05:10:18 +0000	service		Job JOB_20200401_051015 is queued.
INFO	2020-04-01 05:10:18 +0000	service		Waiting for job to be provisioned.
INFO	2020-04-01 05:10:20 +0000	service		Waiting for training program to start.
ERROR	2020-04-01 05:14:41 +0000	master-replica-0		2020-04-01 05:14:41.771117: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
ERROR	2020-04-01 05:14:43 +0000	master-replica-0		2020-04-01 05:14:43.146988: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libnvinfer.so.6
ERROR	2020-04-01 05:14:43 +0000	master-replica-0		2020-04-01 05:14:43.149322: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libnvinfer_plugin.so.6
ERROR	2020-04-0

## Submit a hyperparameter tuning job

In [13]:
%%writefile {training_app_folder}/hptuning_config.yaml

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Hyperparameter tuning specification passed to
# `gcloud ai-platform jobs submit training` via --config.
trainingInput:
  hyperparameters:
    # Maximize the metric reported under the tag below. The tag must match
    # the hyperparameter_metric_tag used by _HptuneCallback in train.py.
    goal: MAXIMIZE
    maxTrials: 10
    maxParallelTrials: 4
    hyperparameterMetricTag: accuracy
    enableTrialEarlyStopping: TRUE 
    # Each parameterName matches a parameter of train_evaluate in train.py.
    params:
    - parameterName: batch_size
      type: DISCRETE
      discreteValues: [
          32,
          64
          ]
    - parameterName: dropout_rate
      type: DOUBLE
      minValue:  0.4
      maxValue:  0.6
      scaleType: UNIT_LINEAR_SCALE
    - parameterName: learning_rate
      type: DOUBLE
      minValue:  0.0005
      maxValue:  0.002
      scaleType: UNIT_LINEAR_SCALE

Overwriting training_app/hptuning_config.yaml


In [14]:
# Fresh timestamped job name and job directory for the tuning job.
job_name = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
job_dir = '{}/{}'.format(ARTIFACT_STORE, job_name)
scale_tier = 'BASIC_GPU'
region = 'us-central1'

# Only the epoch count is fixed here; batch_size, dropout_rate and
# learning_rate are listed as tunable parameters in hptuning_config.yaml.
num_epochs = 5

In [15]:
# Submit the hyperparameter tuning job. Only --num_epochs is passed explicitly;
# the remaining train.py arguments (batch_size, dropout_rate, learning_rate)
# are presumably supplied per trial by the tuning service from the params in
# hptuning_config.yaml — confirm against the AI Platform documentation.
!gcloud ai-platform jobs submit training {job_name} \
--region={region} \
--job-dir={job_dir} \
--master-image-uri={image_uri} \
--scale-tier={scale_tier} \
--config {training_app_folder}/hptuning_config.yaml \
-- \
--num_epochs={num_epochs}

Job [JOB_20200401_052221] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20200401_052221

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20200401_052221
jobId: JOB_20200401_052221
state: QUEUED
