# Training on a single node

## Prepare a training script

In [40]:
import datetime

In [41]:
SCRIPT_FOLDER = './train'

In [56]:
!mkdir $SCRIPT_FOLDER

In [57]:
!touch $SCRIPT_FOLDER/__init__.py

In [3]:
%%writefile $SCRIPT_FOLDER/train.py

from absl import flags
from absl import app

import os
import tensorflow as tf
import numpy as np

from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout, Dense, add

#tf.enable_eager_execution()


IMAGE_SHAPE = (32, 32, 3)
NUM_CLASSES = 10

def toy_resnet_model():
    inputs = Input(shape=IMAGE_SHAPE, name='image')
    x = Conv2D(32, 3, activation='relu')(inputs)
    x = Conv2D(64, 3, activation='relu')(x)
    block_1_output = MaxPooling2D(3)(x)
    
    x = Conv2D(64, 3, activation='relu', padding='same')(block_1_output)
    x = Conv2D(64, 3, activation='relu', padding='same')(x)
    block_2_output = add([x, block_1_output])
    
    x = Conv2D(64, 3, activation='relu', padding='same')(x)
    x = Conv2D(64, 3, activation='relu', padding='same')(x)
    block_3_output = add([x, block_2_output])
    
    x = Conv2D(64, 3, activation='relu')(block_3_output)
    x = GlobalAveragePooling2D()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    outputs = Dense(10, activation='softmax')(x)
    
    model = Model(inputs, outputs, name='toy_resnet')
    
    return model


def prepare_datasets():
    def _parse_record(example_proto):
        features = {
            'image': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64, default_value=0)
        }
        
        parsed_features = tf.parse_single_example(example_proto, features)
        image = parsed_features['image']
        label = parsed_features['label']
        
        image = tf.image.decode_png(image, channels=3)
        image = tf.cast(image, tf.float32)
        image = image / 255
        
        label = tf.one_hot(label, NUM_CLASSES)
        
        return image, label

    
    train_dataset = tf.data.TFRecordDataset(FLAGS.train_files)
    eval_dataset = tf.data.TFRecordDataset(FLAGS.eval_files)
    
    train_dataset = train_dataset.map(_parse_record)
    eval_dataset = eval_dataset.map(_parse_record)
    
    train_dataset = train_dataset.shuffle(4096).batch(FLAGS.batch_size).repeat()
    eval_dataset = eval_dataset.batch(FLAGS.batch_size).repeat()
    
    return train_dataset, eval_dataset


def train_evaluate():
    
    train_dataset, eval_dataset = prepare_datasets()
    
    model = toy_resnet_model()
    
    model.compile(optimizer=tf.keras.optimizers.RMSprop(1e-3),
             loss="categorical_crossentropy",
             metrics=["accuracy"]
             )

    callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir=FLAGS['job-dir'].value, update_freq='epoch')
    ]
    
    model.fit(train_dataset,
         epochs=FLAGS.epochs,
         steps_per_epoch=1000,
         callbacks=callbacks,
         validation_data=eval_dataset,
         validation_steps=200)
    
    

FLAGS = flags.FLAGS
flags.DEFINE_list("train_files", None, "Training TFRecord files")
flags.DEFINE_list("eval_files", None, "Evaluation TFRecord files")

flags.DEFINE_integer("epochs", 5, "Number of epochs to train")
flags.DEFINE_integer("batch_size", 32, "Batch size")
flags.DEFINE_integer("steps_per_epoch", 1000, "Steps per epoch")
flags.DEFINE_integer("validation_steps", 20, "Batch size")

flags.DEFINE_string("job-dir", None, "Job dir")

# Required flags
flags.mark_flag_as_required("train_files")
flags.mark_flag_as_required("eval_files")


def main(argv):
    del argv #Unused
    
    train_evaluate()
     

if __name__ == '__main__':
    
    app.run(main)


Overwriting ./train/train.py


## Run the training script locally

In [4]:
TRAIN_DATA = 'gs://jkdatasets/cifar10/cifar10-train.tfrecord-00000-of-00010,gs://jkdatasets/cifar10/cifar10-train.tfrecord-00001-of-00010'
EVAL_DATA = 'gs://jkdatasets/cifar10/cifar10-test.tfrecord-00000-of-00001'
BUCKET_NAME = 'gs://jkcmle/'

In [5]:
# Define a timestamped job name
JOB_NAME = "toyresnet_{}".format(int(time.time()))
JOB_DIR = BUCKET_NAME + JOB_NAME

In [None]:
!gcloud ml-engine local train \
--module-name train.train \
--package-path train \
--job-dir $JOB_DIR \
-- \
--train_files $TRAIN_DATA \
--eval_files $EVAL_DATA \
--epochs 2

### Run the training script on CMLE

In [5]:
# Define a timestamped job name
JOB_NAME = "toyresnet_" + datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
JOB_DIR = BUCKET_NAME + JOB_NAME
REGION = 'us-west1'

!gcloud ml-engine jobs submit training $JOB_NAME \
--module-name train.train \
--package-path train \
--runtime-version 1.13 \
--python-version 3.5 \
--region $REGION \
--scale-tier basic-gpu \
--job-dir $JOB_DIR \
-- \
--train_files $TRAIN_DATA \
--eval_files $EVAL_DATA \
--epochs 2

Job [toyresnet_20190331_161436] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ml-engine jobs describe toyresnet_20190331_161436

or continue streaming the logs with the command

  $ gcloud ml-engine jobs stream-logs toyresnet_20190331_161436
jobId: toyresnet_20190331_161436
state: QUEUED


### Run the training script in a container.

```
export REGION=us-west1
export BUCKET_NAME=gs://jkcmle
export PROJECT_ID=$(gcloud config list project --format "value(core.project)")
export IMAGE_REPO_NAME=toyresnet
export IMAGE_TAG=gpu
export IMAGE_URI=gcr.io/$PROJECT_ID/$IMAGE_REPO_NAME:$IMAGE_TAG
export TRAIN_DATA=gs://jkdatasets/cifar10/cifar10-train.tfrecord-00000-of-00010,gs://jkdatasets/cifar10/cifar10-train.tfrecord-00001-of-00010
export EVAL_DATA=gs://jkdatasets/cifar10/cifar10-test.tfrecord-00000-of-00001
export JOB_NAME=J$(date +'%Y%M%d_%H%M%S')
export JOB_DIR=$BUCKET_NAME/jobs/$JOB_NAME
```




In [66]:
REGION='us-west1'
BUCKET_NAME='gs://jkcmle'
PROJECT_ID='sandbox-235500'
IMAGE_REPO_NAME='toyresnet'
IMAGE_TAG='gpu'
IMAGE_URI='gcr.io/' + PROJECT_ID + '/' + IMAGE_REPO_NAME + ':' + IMAGE_TAG
TRAIN_DATA='gs://jkdatasets/cifar10/cifar10-train.tfrecord-00000-of-00010,gs://jkdatasets/cifar10/cifar10-train.tfrecord-00001-of-00010'
EVAL_DATA='gs://jkdatasets/cifar10/cifar10-test.tfrecord-00000-of-00001'
JOB_NAME = 'toyresnet_' + datetime.datetime.today().strftime('%Y%M%d_%H%M%S')
JOB_DIR=BUCKET_NAME + '/jobs/' + JOB_NAME

In [63]:
IMAGE_URI

'gcr.io/sandbox-235500toyresnet:gpu'

In [68]:
!gcloud beta ml-engine jobs submit training $JOB_NAME \
--region $REGION \
--master-image-uri $IMAGE_URI \
--scale-tier BASIC_GPU \
-- \
--train_files $TRAIN_DATA \
--eval_files $EVAL_DATA \
--epochs 2


Job [toyresnet_20193001_203049] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ml-engine jobs describe toyresnet_20193001_203049

or continue streaming the logs with the command

  $ gcloud ml-engine jobs stream-logs toyresnet_20193001_203049
jobId: toyresnet_20193001_203049
state: QUEUED


In [67]:
!docker run --rm --runtime=nvidia $IMAGE_URI \
--train_files=$TRAIN_DATA \
--eval_files=$EVAL_DATA \
--epochs=2 \
--job-dir=$JOB_DIR

2019-04-01 20:30:57.938311: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-04-01 20:31:01.848627: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:964] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-04-01 20:31:01.849169: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: 
name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235
pciBusID: 0000:00:04.0
totalMemory: 11.17GiB freeMemory: 11.10GiB
2019-04-01 20:31:01.914276: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:964] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-04-01 20:31:01.914848: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 1 with properties: 
name: Tesla K80 major