In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
# Install Python Fire into the kernel's environment; train.py uses it to expose
# its training function as a command-line interface.
%pip install -U fire

In [None]:
import fire
import numpy as np
import os
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Discard any Keras state left over from earlier runs so the notebook can be
# re-executed from a clean slate.
keras.backend.clear_session()

# Display the TensorFlow version available to this kernel.
tf.__version__

## Configure environment settings

In [None]:
# Ask gcloud for the active project. The `!` capture yields a list of output
# lines, so it is kept under its own name instead of binding PROJECT_ID first
# to a list and then to a string.
_project_lookup = !(gcloud config get-value core/project)
PROJECT_ID = _project_lookup[0]

# Cloud Storage location under which per-job directories are created.
# NOTE(review): this bucket name looks environment-specific - replace it with
# a bucket you own before running the notebook.
ARTIFACT_STORE = 'gs://hostedkfp-default-l2iv13wnek'

## Define a training app for a toy ResNet model

In [None]:
# Local directory that collects the trainer script, the Dockerfile and the
# hyperparameter tuning config. Creating it is a no-op if it already exists.
training_app_folder = 'training_app'
os.makedirs(name=training_app_folder, exist_ok=True)

In [None]:
%%writefile {training_app_folder}/train.py

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import fire
import os
import subprocess
import sys

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


def _create_toy_resnet(dropout_rate=0.5, learning_rate=0.001):
    """Build and compile a small residual CNN for 32x32 RGB images.

    The network is a two-convolution stem followed by max pooling, two
    residual blocks (each a pair of 3x3 'same' convolutions added back to the
    block input), and a head made of one convolution, global average pooling,
    a 256-unit dense layer, dropout and a 10-unit output layer emitting logits.

    Args:
        dropout_rate: Rate for the dropout layer placed before the output layer.
        learning_rate: Learning rate handed to the RMSprop optimizer.

    Returns:
        A compiled `keras.Model` named 'toy_resnet'.
    """
    image = keras.Input(shape=(32, 32, 3), name='img')

    # Stem: two unpadded convolutions, then pooling.
    features = layers.Conv2D(32, 3, activation='relu')(image)
    features = layers.Conv2D(64, 3, activation='relu')(features)
    skip = layers.MaxPooling2D(3)(features)

    # Two identical residual blocks; 'same' padding keeps the shapes
    # compatible with the skip connection they are added to.
    for _ in range(2):
        branch = layers.Conv2D(64, 3, activation='relu', padding='same')(skip)
        branch = layers.Conv2D(64, 3, activation='relu', padding='same')(branch)
        skip = layers.add([branch, skip])

    # Classification head.
    features = layers.Conv2D(64, 3, activation='relu')(skip)
    features = layers.GlobalAveragePooling2D()(features)
    features = layers.Dense(256, activation='relu')(features)
    features = layers.Dropout(dropout_rate)(features)
    logits = layers.Dense(10)(features)

    model = keras.Model(image, logits, name='toy_resnet')
    # from_logits=True because the final Dense layer applies no softmax.
    model.compile(
        optimizer=keras.optimizers.RMSprop(learning_rate),
        loss=keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=['acc'],
    )
    return model

def train_evaluate(job_dir, dropout_rate, learning_rate, batch_size, num_epochs):
    """Train the toy ResNet on CIFAR-10, then evaluate it on the test split.

    Args:
        job_dir: Job output location supplied by the caller.
            NOTE(review): nothing in this function reads or writes job_dir;
            it is accepted only so the command-line interface takes it.
        dropout_rate: Dropout rate forwarded to the model builder.
        learning_rate: Optimizer learning rate forwarded to the model builder.
        batch_size: Mini-batch size used for training and evaluation.
        num_epochs: Number of training epochs.
    """
    # Command-line values may be parsed as floats (e.g. 64.0); Keras expects
    # integral batch sizes and epoch counts, so normalize them up front.
    batch_size = int(batch_size)
    num_epochs = int(num_epochs)

    toy_resnet = _create_toy_resnet(dropout_rate, learning_rate)
    toy_resnet.summary()

    (x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()

    # Convert images to float32 divided by 255 and one-hot encode the labels
    # into 10 classes, matching the model's categorical cross-entropy loss.
    x_train = x_train.astype('float32') / 255.
    x_test = x_test.astype('float32') / 255.
    y_train = keras.utils.to_categorical(y_train, 10)
    y_test = keras.utils.to_categorical(y_test, 10)

    toy_resnet.fit(x_train, y_train,
                   batch_size=batch_size,
                   epochs=num_epochs,
                   validation_split=0.2)

    # The test split was previously loaded and preprocessed but never used;
    # evaluate on it so the function does what its name says.
    test_loss, test_acc = toy_resnet.evaluate(x_test, y_test,
                                              batch_size=batch_size,
                                              verbose=0)
    print('Test loss: {:.4f} - test acc: {:.4f}'.format(test_loss, test_acc))
    
if __name__ == '__main__':
    # Let Python Fire map command-line arguments onto train_evaluate's parameters.
    fire.Fire(train_evaluate)

In [None]:
# Settings for a quick local run of the trainer before containerizing it.
job_dir = '/home/jupyter/jobs/job1'
dropout_rate = 0.5
learning_rate = 0.001
batch_size = 64
num_epochs = 5

# Arguments are positional, in train_evaluate's parameter order. The script
# path is built from training_app_folder so it always points at the file the
# earlier %%writefile cell produced.
!python {training_app_folder}/train.py {job_dir} {dropout_rate} {learning_rate} {batch_size} {num_epochs}

## Package the training app into a Docker image

In [None]:
%%writefile {training_app_folder}/Dockerfile

# Base image for the trainer (the tag suggests TensorFlow 2.1 with GPU support).
FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-1
# train.py imports fire, so install it on top of the base image.
RUN pip install -U fire
# Copy the training script into the working directory it is run from.
WORKDIR /app
COPY train.py .

# Exec-form entrypoint: arguments given to the container are appended after
# "python train.py".
ENTRYPOINT ["python", "train.py"]

## Build the Docker image

In [None]:
# Name and tag of the trainer image.
# NOTE(review): 'resent' looks like a typo for 'resnet'; left unchanged so any
# image already pushed under this name keeps resolving.
image_name = 'toy_resent_trainer_image'
image_tag = 'latest'
# Fully qualified image URI in the form gcr.io/<project>/<name>:<tag>.
image_uri = 'gcr.io/{}/{}:{}'.format(PROJECT_ID, image_name, image_tag)

In [None]:
# Build the image from the training app folder and tag it with image_uri.
!gcloud builds submit --tag {image_uri} {training_app_folder}

## Submit AI Platform Training job

In [None]:
# Where the job runs and on which machine configuration.
region = 'us-central1'
scale_tier = 'BASIC_GPU'

# Timestamped job name, with a matching directory under the artifact store.
job_name = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
job_dir = '{}/{}'.format(ARTIFACT_STORE, job_name)

# Hyperparameters forwarded to train.py.
num_epochs = 5
batch_size = 64
learning_rate = 0.001
dropout_rate = 0.5

In [None]:
# Submit a custom-container training job. The flags before the bare `--`
# configure the job itself; the flags after it are the arguments for train.py.
!gcloud ai-platform jobs submit training {job_name} \
--region={region} \
--job-dir={job_dir} \
--master-image-uri={image_uri} \
--scale-tier={scale_tier} \
-- \
--dropout_rate={dropout_rate} \
--learning_rate={learning_rate} \
--batch_size={batch_size} \
--num_epochs={num_epochs}

## Monitor the job

In [None]:

# Describe the job that was submitted most recently under job_name.
!gcloud ai-platform jobs describe {job_name}

In [None]:
# Stream the logs of the job named by job_name into the cell output.
!gcloud ai-platform jobs stream-logs {job_name}

## Submit hyperparameter tuning job

In [None]:
%%writefile {training_app_folder}/hptuning_config.yaml

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

trainingInput:
  hyperparameters:
    # Look for the parameter values that maximize the reported metric.
    goal: MAXIMIZE
    # Four trials in total, all of them allowed to run at the same time.
    maxTrials: 4
    maxParallelTrials: 4
    # NOTE(review): train.py compiles the model with metrics=['acc'] and shows
    # no explicit reporting of a metric named "accuracy" - confirm this tag
    # matches what the trainer actually emits.
    hyperparameterMetricTag: accuracy
    enableTrialEarlyStopping: TRUE 
    # Parameter names match train_evaluate's argument names in train.py.
    params:
    - parameterName: batch_size
      type: DISCRETE
      discreteValues: [
          32,
          64
          ]
    - parameterName: dropout_rate
      type: DOUBLE
      minValue:  0.4
      maxValue:  0.6
      scaleType: UNIT_LINEAR_SCALE
    - parameterName: learning_rate
      type: DOUBLE
      minValue:  0.0005
      maxValue:  0.002
      scaleType: UNIT_LINEAR_SCALE

In [None]:
# Where the tuning job runs and on which machine configuration.
region = 'us-central1'
scale_tier = 'BASIC_GPU'

# Fresh timestamped job name, with a matching directory under the artifact store.
job_name = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
job_dir = '{}/{}'.format(ARTIFACT_STORE, job_name)

# The only training argument set here; the others come from the tuning config.
num_epochs = 5

In [None]:
# Submit the hyperparameter tuning job. Every line of the command except the
# last must end with a backslash; otherwise the `--` separator and the
# trainer arguments after it are cut off from the gcloud invocation.
!gcloud ai-platform jobs submit training {job_name} \
--region={region} \
--job-dir={job_dir} \
--master-image-uri={image_uri} \
--scale-tier={scale_tier} \
--config {training_app_folder}/hptuning_config.yaml \
-- \
--num_epochs={num_epochs}