In [1]:
import time

import tensorflow as tf
import logging
tf.get_logger().setLevel(logging.ERROR)

import numpy as np
from sklearn.model_selection import train_test_split

from model.ram import RecurrentAttentionModel

from data.augmented_mnist import minibatcher
from data.augmented_mnist import get_mnist

from bayes_opt import BayesianOptimization

In [2]:
# Load MNIST via the project helper, then carve a validation set out of the
# training portion (25% held out; fixed seed so the split is repeatable).
(X_train, y_train), (X_test, y_test) = get_mnist(True, True, False)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=42,
)

# Sanity check each split: feature shape, label shape, and value range.
for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(features.shape, labels.shape, np.max(features), np.min(features))


(45000, 28, 28, 1) (45000, 10) 1.0 0.0
(15000, 28, 28, 1) (15000, 10) 1.0 0.0
(10000, 28, 28, 1) (10000, 10) 1.0 0.0


In [3]:
def train(learning_rate, std, epochs=1, batch_size=200):
    """Train a fresh RecurrentAttentionModel and return its validation accuracy.

    This is the objective function handed to ``BayesianOptimization``: every
    call builds a new model, trains it on the notebook-level ``X_train`` /
    ``y_train`` and scores it on ``X_val`` / ``y_val``.

    Args:
        learning_rate: Learning rate for the Adam optimizer. May be a plain
            Python float or a NumPy scalar; it is cast to ``float32``.
        std: Value forwarded to the model's ``std`` argument (presumably the
            standard deviation of the glimpse-location policy -- confirm in
            ``model.ram``). Cast to ``float32`` like ``learning_rate``.
        epochs: Number of passes over the training data. The default of 1
            matches the original single-pass behaviour.
        batch_size: Mini-batch size used for both training and evaluation.

    Returns:
        The mean of the per-batch accuracies reported by ``ram.predict`` over
        the validation batches.
    """
    # np.float32(...) accepts Python floats as well as NumPy scalars, whereas
    # the previous `.astype(...)` calls raised AttributeError on plain floats
    # (e.g. when calling train(1e-3, 0.2) by hand).
    ram = RecurrentAttentionModel(time_steps=7,
                                  n_glimpses=1,
                                  glimpse_size=8,
                                  num_classes=10,
                                  max_gradient_norm=5.0,
                                  std=np.float32(std))
    # Named `adam` so it cannot be confused with the notebook-level
    # BayesianOptimization object, which is also called `optimizer`.
    adam = tf.keras.optimizers.Adam(np.float32(learning_rate))

    # training step
    for epoch in range(epochs):
        # NOTE(review): the fourth minibatcher argument is assumed to enable
        # shuffling -- confirm in data.augmented_mnist.
        for X, y in minibatcher(X_train, y_train, batch_size, True):
            # Only the forward pass and the loss need to be recorded.
            with tf.GradientTape() as tape:
                logits = ram(X)
                hybrid_loss, _, _, _ = ram.hybrid_loss(logits, y)
            # Compute and apply the gradients outside the tape context so the
            # update itself is not executed under an active tape.
            gradients = tape.gradient(hybrid_loss, ram.trainable_variables)
            adam.apply_gradients(zip(gradients, ram.trainable_variables))

    # testing step: score the trained model on the validation split
    accuracies = []
    for X, y in minibatcher(X_val, y_val, batch_size, True):
        logits = ram(X)
        accuracy, _, _ = ram.predict(logits, y)
        accuracies.append(accuracy.numpy())
    return np.mean(accuracies)

In [4]:
# Search space: (lower, upper) bound for every hyperparameter that train() accepts.
# NOTE(review): every recorded run scores ~0.10 (chance level for 10 classes);
# the learning-rate ceiling of 0.1 and the std floor of 0 may be worth
# revisiting -- confirm against the model implementation.
pbounds = dict(
    learning_rate=(1e-8, 0.1),
    std=(0, 1),
)

# Bayesian optimizer that maximizes the value returned by train() inside the
# bounds above; random_state seeds its sampling.
optimizer = BayesianOptimization(f=train, pbounds=pbounds, random_state=42)

In [5]:
# Run the hyperparameter search.
#   init_points: number of random-exploration steps performed first; random
#                exploration helps by diversifying the explored space.
#   n_iter:      number of Bayesian-optimization steps; the more steps, the
#                more likely a good maximum is found.
# NOTE(review): init_points=0 skips the random exploration recommended above,
# yet the recorded run shows 6 evaluations -- presumably the library still
# seeds one random point when nothing is queued. Consider raising init_points.
optimizer.maximize(init_points=0, n_iter=5)

|   iter    |  target   | learni... |    std    |
-------------------------------------------------
| 1         | 0.1012    | 0.03745   | 0.9507    |
| 2         | 0.1012    | 0.1       | 0.0       |
| 3         | 0.109     | 0.08725   | 0.8278    |
| 4         | 0.09633   | 0.1       | 1.0       |
| 5         | 0.1022    | 1e-08     | 0.3844    |
| 6         | 0.0986    | 0.1       | 0.4573    |


In [6]:
# Show the best observation found so far: its target value (validation
# accuracy returned by train) and the parameters that produced it.
print(optimizer.max)

{'target': 0.10900000482797623, 'params': {'std': 0.827801597161052, 'learning_rate': 0.08724767732947727}}
