## **Tune Tutorial**

https://github.com/ray-project/tutorial/blob/master/tune_exercises/Tune.ipynb

### **Outline**

0. Preprocess the Data
1. Create and train a model on a toy dataset
2. Integrating Tune into workflow
3. Trying out advanced features
4. Validating trained model
5. Try out a search algorithm

In [1]:
import numpy as np
from IPython.display import HTML
import matplotlib.pyplot as plt

In [2]:
import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator
from keras.datasets import mnist

Using TensorFlow backend.


### **PART 0: Preprocess the Data**

In [3]:
# Load MNIST and give every 28x28 image an explicit single-channel axis.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape((len(X_train), 28, 28, 1)).astype('float32')
X_test = X_test.reshape((len(X_test), 28, 28, 1)).astype('float32')

# Center both splits on the per-pixel mean of the *training* images only.
mean_img = X_train.mean(axis=0)
X_train -= mean_img
X_test -= mean_img

# One-hot encode the integer class labels.
num_classes = 10
y_train, y_test = (np_utils.to_categorical(labels, num_classes)
                   for labels in (y_train, y_test))

# Sanity check: one shape per line, in the order X_train, y_train, X_test, y_test.
print(*(arr.shape for arr in (X_train, y_train, X_test, y_test)), sep='\n')

(60000, 28, 28, 1)
(60000, 10)
(10000, 28, 28, 1)
(10000, 10)


In [4]:
def generate_batch(X_data, y_data, batch_size):
    """Return the next consecutive mini-batch of (X_data, y_data) as float32 arrays.

    The read position lives in the module-level ``counter``. Every call
    advances it by one batch; once ``X_data.shape[0] // batch_size`` batches
    have been served it is reset to 0, so the following call starts again
    from the beginning of the data (i.e. the next epoch).
    """
    global counter

    batches_per_epoch = X_data.shape[0] // batch_size

    # Rows [counter * batch_size, (counter + 1) * batch_size) form the current batch.
    window = slice(batch_size * counter, batch_size * (counter + 1))
    X_batch = np.array(X_data[window]).astype('float32')
    y_batch = np.array(y_data[window]).astype('float32')

    # Advance the cursor, wrapping around so the data can be yielded again next epoch.
    counter = 0 if counter + 1 >= batches_per_epoch else counter + 1

    return X_batch, y_batch

In [5]:
# Module-level batch cursor: generate_batch declares it `global`, reads it to pick the
# next slice, and advances/resets it on every call.
counter = 0

### **PART 1: Creating a model to be trained**

In [6]:
import argparse

# Hyperparameter defaults are declared through argparse so that every entry
# carries a type and a help string.
parser = argparse.ArgumentParser(description='Keras MNIST Example')

# Training-loop settings
parser.add_argument('--epochs', type=int, default=2, help='epoch')
parser.add_argument('--batch_size', type=int, default=128, help='batch_size')

# Optimizer and architecture settings
parser.add_argument('--lr', type=float, default=0.1, help='learning rate')
parser.add_argument('--momentum', type=float, default=0.0, help='SGD momentum')
parser.add_argument('--kernel1', type=int, default=3, help='Size of first kernel')
parser.add_argument('--kernel2', type=int, default=3, help='Size of second kernel')
parser.add_argument('--poolsize', type=int, default=2, help='Size of Pooling')
parser.add_argument('--dropout1', type=float, default=0.25, help='first kernel dropout rate')
parser.add_argument('--dropout2', type=float, default=0.5, help='second kernel dropout rate')
parser.add_argument('--hidden', type=int, default=32, help='Size of Hidden Layer')

# Parse an explicit empty argument list instead of sys.argv: inside a notebook
# sys.argv holds whatever the kernel was launched with, and the defaults must
# not depend on that for the notebook to be reproducible.
DEFAULT_ARGS = vars(parser.parse_known_args(args=[])[0])

In [7]:
# Display the parsed default hyperparameters as this cell's output.
DEFAULT_ARGS

{'batch_size': 128,
 'dropout1': 0.25,
 'dropout2': 0.5,
 'epochs': 2,
 'hidden': 32,
 'kernel1': 3,
 'kernel2': 3,
 'lr': 0.1,
 'momentum': 0.0,
 'poolsize': 2}

In [8]:
def make_model(parameters):
    """Build and compile the MNIST convolutional network.

    Args:
        parameters (dict): Hyperparameter overrides. Any key that is not
            given falls back to the value in DEFAULT_ARGS.

    Returns:
        A compiled keras ``Sequential`` model:
        Conv2D(32) -> Conv2D(64) -> MaxPooling2D -> Dropout -> Flatten
        -> Dense(hidden, relu) -> Dropout -> Dense(10, softmax),
        trained with categorical cross-entropy and SGD, tracking accuracy.
    """
    config = dict(DEFAULT_ARGS)
    config.update(parameters)

    n_outputs = 10
    k1, k2, pool = config['kernel1'], config['kernel2'], config['poolsize']

    # Layers are listed in forward order; Sequential adds them one by one.
    layers = [
        Conv2D(32, kernel_size=(k1, k1), activation='relu', input_shape=(28, 28, 1)),
        Conv2D(64, kernel_size=(k2, k2), activation='relu'),
        MaxPooling2D(pool_size=(pool, pool)),
        Dropout(config["dropout1"]),
        Flatten(),
        Dense(config['hidden'], activation='relu'),
        Dropout(config["dropout2"]),
        Dense(n_outputs, activation='softmax'),
    ]
    model = Sequential(layers)

    sgd = keras.optimizers.SGD(lr=config["lr"], momentum=config["momentum"])
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=sgd,
                  metrics=['accuracy'])

    return model

In [9]:
def train_mnist(args, X_train, y_train):
    """Build the CNN with make_model and fit it on the given training data.

    Args:
        args (dict): Hyperparameter overrides. Keys that are missing fall
            back to DEFAULT_ARGS, so a partial dict such as ``{'lr': 0.01}``
            is accepted (the same contract make_model has).
        X_train: Training inputs handed to ``model.fit``.
        y_train: Training targets handed to ``model.fit``.

    Validation after each epoch uses the module-level ``X_test`` / ``y_test``.
    Nothing is loaded from or saved to disk, and nothing is returned.
    """
    # Resolve against the defaults once so batch_size/epochs are always present.
    config = DEFAULT_ARGS.copy()
    config.update(args)

    model = make_model(config)
    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

    model.fit(X_train, y_train,
              batch_size=config['batch_size'],
              epochs=config['epochs'],
              verbose=1,
              validation_data=(X_test, y_test))

In [10]:
# Baseline run: train once with the default hyperparameters, without Tune.
train_mnist(DEFAULT_ARGS, X_train, y_train)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 24, 24, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 12, 12, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 12, 12, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 9216)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                294944    
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
__________

### **PART 2: Setting up Tune**

Tune runs on top of Ray, so Ray has to be initialized before any experiment is started.

In [11]:
import ray
from ray import tune

# Start Ray for this notebook session.
# NOTE(review): ignore_reinit_error=True presumably keeps a re-run of this cell
# from raising when Ray is already initialized — confirm against the Ray docs.
ray.init(ignore_reinit_error=True)

2019-03-27 23:27:59,082	INFO node.py:423 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-03-27_23-27-59_5655/logs.




2019-03-27 23:27:59,208	INFO services.py:363 -- Waiting for redis server at 127.0.0.1:50575 to respond...
2019-03-27 23:27:59,407	INFO services.py:363 -- Waiting for redis server at 127.0.0.1:34625 to respond...
2019-03-27 23:27:59,414	INFO services.py:760 -- Starting Redis shard with 3.32 GB max memory.
2019-03-27 23:27:59,524	INFO services.py:1384 -- Starting the Plasma object store with 4.98 GB memory using /dev/shm.


{'node_ip_address': None,
 'object_store_address': '/tmp/ray/session_2019-03-27_23-27-59_5655/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-03-27_23-27-59_5655/sockets/raylet',
 'redis_address': '192.168.25.47:50575',
 'webui_url': None}

#### **Two steps to use Tune**

Step 1) The training function must follow a specific signature: it receives a `config` dict and a **reporter object**. We pass that reporter on to the `train_mnist_tune` function defined below.

```python
def trainable(config, reporter):
    """
    Args:
        config (dict): Parameters provided from the search algorithm
            or variant generation.
        reporter (Reporter): Handle to report intermediate metrics to Tune.
    """
```

Step 2) We want to keep track of performance while the model is training, <br>
e.g. by calling the reporter with the mean accuracy after every batch.

```python

def train_func(config, reporter):  # add a reporter arg
    # ...
    for data, target in dataset:
        model.fit(data, target)
        save_model()
        accuracy = model.evaluate(x_batch, y_batch)[1]
        reporter(mean_accuracy=accuracy)  # report the metric to Tune
```

In [13]:
class TuneCallback(keras.callbacks.Callback):
    """Keras callback that forwards the training accuracy to a Tune reporter.

    After every batch the running accuracy is reported together with the
    number of batches seen so far (``timesteps_total``). When training ends,
    one final report flagged ``done`` is sent.

    Args:
        reporter: Callable accepting keyword metrics (the ``reporter``
            handle Tune passes to a function trainable).
        logs: Unused; kept only so existing call sites keep working.
    """

    # Keras logs the metric as "acc" in older releases and "accuracy" in newer ones.
    _ACCURACY_KEYS = ("acc", "accuracy")

    def __init__(self, reporter, logs=None):
        super().__init__()
        self.reporter = reporter
        self.iteration = 0
        # Most recent batch accuracy; reused for the final report because
        # Keras does not pass metrics to on_train_end.
        self.last_accuracy = None

    def _accuracy_from(self, logs):
        """Return the accuracy stored in ``logs``; raise KeyError if absent."""
        logs = logs or {}
        for key in self._ACCURACY_KEYS:
            if key in logs:
                return logs[key]
        raise KeyError(
            "no accuracy metric in Keras logs (looked for %s); got keys %s"
            % (", ".join(self._ACCURACY_KEYS), sorted(logs)))

    def on_train_end(self, logs=None):
        # Keras invokes on_train_end(logs) with a single argument. The former
        # (epoch, logs={}) signature bound that dict to `epoch`, left `logs`
        # empty, and therefore always failed on logs["acc"].
        logs = logs or {}
        accuracy = self.last_accuracy
        for key in self._ACCURACY_KEYS:
            if key in logs:
                accuracy = logs[key]
                break
        self.reporter(timesteps_total=self.iteration, done=1, mean_accuracy=accuracy)

    def on_batch_end(self, batch, logs=None):
        self.iteration += 1
        self.last_accuracy = self._accuracy_from(logs)
        self.reporter(timesteps_total=self.iteration, mean_accuracy=self.last_accuracy)

In [25]:
def train_mnist_tune(args, X_train, y_train, reporter):
    """Train the MNIST CNN while reporting progress to Tune.

    Args:
        args (dict): Hyperparameter overrides, e.g. the ``config`` dict of a
            Tune trial. Keys that are missing fall back to DEFAULT_ARGS, so a
            config that only samples ``lr`` still has batch_size and epochs.
        X_train: Training inputs handed to ``model.fit``.
        y_train: Training targets handed to ``model.fit``.
        reporter: Tune reporter; wrapped in a TuneCallback so the accuracy
            is reported after every batch.

    Validation after each epoch uses the module-level ``X_test`` / ``y_test``.
    Nothing is loaded from or saved to disk, and nothing is returned.
    """
    # Resolve against the defaults once so batch_size/epochs are always present.
    config = DEFAULT_ARGS.copy()
    config.update(args)

    model = make_model(config)

    model.fit(X_train, y_train,
              batch_size=config['batch_size'],
              epochs=config['epochs'],
              verbose=1,
              validation_data=(X_test, y_test),
              callbacks=[TuneCallback(reporter)])

In [26]:
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray import tune

# The scheduler compares trials on the metrics that TuneCallback reports:
# "timesteps_total" (batches seen so far) and "mean_accuracy".
scheduler = AsyncHyperBandScheduler(
        time_attr="timesteps_total",
        reward_attr="mean_accuracy",
        max_t=400,
        grace_period=20)


def _train_fn(config, reporter):
    """Tune entry point: train with the trial's sampled hyperparameters.

    The trial ``config`` is layered on top of DEFAULT_ARGS so that values
    Tune does not sample (batch_size, epochs, ...) keep their defaults.
    Previously the config was dropped and DEFAULT_ARGS was passed as-is,
    so every trial trained with the same default learning rate.
    """
    trial_args = DEFAULT_ARGS.copy()
    trial_args.update(config)
    train_mnist_tune(trial_args, X_train, y_train, reporter)


tune.register_trainable("TRAIN_FN", _train_fn)

In [27]:
# Launch the experiment for the trainable registered as "TRAIN_FN".
# NOTE(review): the recorded output of this cell ends in a worker crash
# (CUBLAS_STATUS_NOT_INITIALIZED). Presumably the notebook process still holds
# the GPU after the earlier in-process train_mnist run — confirm before
# requesting GPU resources for trials.
tune.run(
    "TRAIN_FN",
    name="exp",
    scheduler=scheduler,
    stop={"mean_accuracy": 0.99, "timesteps_total": 10},
    num_samples=1,
    resources_per_trial={"cpu": 4, "gpu": 0.5},
    config={
        "lr": tune.sample_from(lambda spec: np.random.uniform(0.001, 0.1))
        # Additional search dimensions, currently disabled:
        # "momentum": tune.sample_from(lambda spec: np.random.uniform(0.1, 0.9)),
        # "hidden": tune.sample_from(lambda spec: np.random.randint(32, 512)),
        # "dropout1": tune.sample_from(lambda spec: np.random.uniform(0.2, 0.8)),
    })

2019-03-27 23:46:44,583	INFO tune.py:60 -- Tip: to resume incomplete experiments, pass resume='prompt' or resume=True to run()
2019-03-27 23:46:44,584	INFO tune.py:211 -- Starting a new experiment.


== Status ==
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 180.000: None | Iter 60.000: None | Iter 20.000: None
Bracket: Iter 180.000: None | Iter 60.000: None
Bracket: Iter 180.000: None
Resources requested: 0/12 CPUs, 0/1 GPUs
Memory usage on this node: 6.6/16.6 GB





== Status ==
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 180.000: None | Iter 60.000: None | Iter 20.000: None
Bracket: Iter 180.000: None | Iter 60.000: None
Bracket: Iter 180.000: None
Resources requested: 4/12 CPUs, 0.5/1 GPUs
Memory usage on this node: 6.9/16.6 GB
Result logdir: /home/hojoon/ray_results/exp
Number of trials: 1 ({'RUNNING': 1})
RUNNING trials:
 - TRAIN_FN_0_lr=0.060213:	RUNNING

[2m[36m(pid=5774)[0m Using TensorFlow backend.
[2m[36m(pid=5774)[0m 2019-03-27 23:46:46.769405: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
[2m[36m(pid=5774)[0m Train on 60000 samples, validate on 10000 samples
[2m[36m(pid=5774)[0m Epoch 1/2
[2m[36m(pid=5774)[0m 2019-03-27 23:46:46.833682: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:897] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so

2019-03-27 23:46:47,274	ERROR trial_runner.py:460 -- Error processing event.
Traceback (most recent call last):
  File "/home/hojoon/anaconda3/envs/tf/lib/python3.5/site-packages/ray/tune/trial_runner.py", line 409, in _process_trial
    result = self.trial_executor.fetch_result(trial)
  File "/home/hojoon/anaconda3/envs/tf/lib/python3.5/site-packages/ray/tune/ray_trial_executor.py", line 314, in fetch_result
    result = ray.get(trial_future[0])
  File "/home/hojoon/anaconda3/envs/tf/lib/python3.5/site-packages/ray/worker.py", line 2316, in get
    raise value
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.
2019-03-27 23:46:47,276	ERROR worker.py:1780 -- A worker died or was killed while executing task 000000004c70be56092e5a6001cf44eab29aea88.
2019-03-27 23:46:47,276	INFO ray_trial_executor.py:178 -- Destroying actor for trial TRAIN_FN_0_lr=0.060213. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creati

[2m[36m(pid=5774)[0m 2019-03-27 23:46:47.162168: E tensorflow/stream_executor/cuda/cuda_blas.cc:459] failed to create cublas handle: CUBLAS_STATUS_NOT_INITIALIZED
[2m[36m(pid=5774)[0m 2019-03-27 23:46:47.166628: E tensorflow/stream_executor/cuda/cuda_dnn.cc:352] Could not create cudnn handle: CUDNN_STATUS_INTERNAL_ERROR
[2m[36m(pid=5774)[0m Fatal Python error: Segmentation fault
[2m[36m(pid=5774)[0m 
== Status ==
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 180.000: None | Iter 60.000: None | Iter 20.000: None
Bracket: Iter 180.000: None | Iter 60.000: None
Bracket: Iter 180.000: None
Resources requested: 0/12 CPUs, 0.0/1 GPUs
Memory usage on this node: 6.8/16.6 GB
Result logdir: /home/hojoon/ray_results/exp
Number of trials: 1 ({'ERROR': 1})
ERROR trials:
 - TRAIN_FN_0_lr=0.060213:	ERROR, 1 failures: /home/hojoon/ray_results/exp/TRAIN_FN_0_lr=0.060213_2019-03-27_23-46-45e2j0mdpp/error_2019-03-27_23-46-47.txt



TuneError: ('Trials did not complete', [TRAIN_FN_0_lr=0.060213])