In [1]:
import os 
import time
import tensorflow as tf
tf.enable_eager_execution()

  return f(*args, **kwds)


## Import MNIST dataset

In [2]:
# Fixed seed and mini-batch size for the training pipeline.
tf.set_random_seed(42)
batch_size = 32

# Load the MNIST images and store them in two separate tensorflow datasets (train and test).
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Divide the pixel values by 255; true division turns the image arrays into floats.
x_train = x_train / 255.0
x_test = x_test / 255.0

# Labels become int32 tensors.
y_train = tf.cast(y_train, dtype=tf.int32)
y_test = tf.cast(y_test, dtype=tf.int32)

# Both pipelines repeat endlessly, so consumers must stop iterating on their own.
dset_train = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(100, reshuffle_each_iteration=True)
    .repeat()
    .batch(batch_size)
)
dset_test = (
    tf.data.Dataset.from_tensor_slices((x_test, y_test))
    .shuffle(100, reshuffle_each_iteration=True)
    .repeat()
    .batch(1000)
)

## Model and training procedure

In [3]:
class Model(tf.keras.Model):
    '''Small MNIST classifier: flatten -> ReLU hidden layer -> 10-way logits layer.

    Deeper architectures reach higher accuracies, see http://yann.lecun.com/exdb/mnist/.

    Args:
        units: width of the hidden fully connected layer.
    '''
    def __init__(self, units=128):
        super().__init__()
        self.units = units
        # Hidden layer with ReLU, followed by a linear layer with one output per digit class.
        self.W1 = tf.layers.Dense(units=units, activation=tf.nn.relu, name="Layer1")
        self.W2 = tf.layers.Dense(units=10, name="Layer2")

    def call(self, _input):
        '''Run one forward pass.

        Args:
            _input: batch of MNIST images; everything but the leading batch
                dimension is flattened before the dense layers.

        Returns:
            Unnormalised class scores (logits), one row per image and 10 columns.
        '''
        flat_images = tf.layers.flatten(_input)
        return self.W2(self.W1(flat_images))
    

def loss(logits, labels):
    '''Mean sparse softmax cross-entropy between `logits` and the integer class `labels`.'''
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    return tf.reduce_mean(per_example)


def compute_accuracy(logits, labels):
    '''Fraction of rows of `logits` whose arg-max class equals the matching entry of `labels`.'''
    num_examples = int(logits.shape[0])
    predicted = tf.argmax(logits, axis=1, output_type=tf.int64)
    # Compare in int64 so predictions and labels share a dtype.
    hits = tf.equal(predicted, tf.cast(labels, tf.int64))
    num_correct = tf.reduce_sum(tf.cast(hits, dtype=tf.float32))
    return num_correct / num_examples


def train(model, optimizer, dataset, dataset_test, log_interval=100, stop=500,
          test_interval=100):
    """Trains model on `dataset` using `optimizer`.

    Args:
        model: callable mapping a batch of images to logits; its
            `trainable_variables` are the parameters being optimised.
        optimizer: object exposing `apply_gradients(grads_and_vars)`.
        dataset: iterable of `(images, labels)` training batches.
        dataset_test: dataset handed to `test` for the periodic evaluation.
        log_interval: print loss and throughput every this many steps;
            a falsy value disables logging.
        stop: index of the last training step (inclusive). `None` means
            "run until `dataset` is exhausted", which never happens for a
            dataset that repeats forever.
        test_interval: evaluate on `dataset_test` every this many steps;
            a falsy value disables evaluation.
    """
    window_start = time.time()
    for (batch, (images, labels)) in enumerate(dataset):

        # tape records operations for automatic differentiation
        with tf.GradientTape() as tape:
            logits = model(images)
            loss_value = loss(logits, labels)

        # Differentiate only w.r.t. trainable variables: a non-trainable
        # variable would yield a `None` gradient that optimizers cannot apply.
        variables = model.trainable_variables
        grads = tape.gradient(loss_value, variables)
        optimizer.apply_gradients(zip(grads, variables))

        if log_interval and batch % log_interval == 0:
            # Guard against a zero-length window on coarse clocks.
            elapsed = max(time.time() - window_start, 1e-9)
            # The very first window contains a single step.
            steps_in_window = log_interval if batch > 0 else 1
            rate = steps_in_window / elapsed
            print('Step #%d\tLoss: %.6f (%d steps/sec)' % (batch, loss_value, rate))
            window_start = time.time()
        if test_interval and batch % test_interval == 0:
            eval_start = time.time()
            test(model, dataset_test)
            # Shift the window so evaluation time is not counted as training time.
            window_start += time.time() - eval_start
        if stop is not None and batch >= stop:
            break
        
def test(model, dataset, num_batches=1):
    """Evaluate `model` on the first `num_batches` batches of `dataset` and print the result.

    Loss and accuracy are averaged over all evaluated examples, i.e. each
    batch is weighted by its number of rows.

    Args:
        model: callable mapping a batch of images to logits.
        dataset: iterable of `(images, labels)` batches; it may be endless,
            since at most `num_batches` batches are consumed.
        num_batches: how many batches to evaluate (at least 1).

    Raises:
        ValueError: if `num_batches` is smaller than 1 or `dataset` yields no batch.
    """
    if num_batches < 1:
        raise ValueError('num_batches must be at least 1, got {}'.format(num_batches))

    loss_sum = 0.0
    correct_sum = 0.0
    example_count = 0
    for batch_index, (images, labels) in enumerate(dataset):
        logits = model(images)
        rows = int(logits.shape[0])
        batch_loss = loss(logits, labels)
        predictions = tf.argmax(logits, axis=1, output_type=tf.int32)
        batch_correct = tf.reduce_sum(tf.cast(tf.equal(predictions, labels), tf.float32))

        # `loss` returns a per-batch mean, so scale it back to a sum before pooling.
        loss_sum += float(batch_loss.numpy()) * rows
        correct_sum += float(batch_correct.numpy())
        example_count += rows

        # Stop before pulling another batch out of a possibly endless dataset.
        if batch_index + 1 >= num_batches:
            break

    if example_count == 0:
        raise ValueError('dataset yielded no examples to evaluate')

    avg_loss = loss_sum / example_count
    accuracy = correct_sum / example_count
    print('Test set: Average loss: {0:.3f}, Accuracy: {1:.1f}%\n'.format(avg_loss, 100 * accuracy))

## Construct the optimizer
In this exercise you will implement three of the most widely used optimizers in deep learning from scratch: SGD, Adagrad, and Adam. This blog post (http://ruder.io/optimizing-gradient-descent/) gives a great overview of the different methods.

### SGD
$$ 
    \theta^+ = \theta - \eta \nabla L(x_{i:i+n}; \theta)
$$

In [4]:
class SGD:
    """Plain stochastic gradient descent: every variable moves against its gradient.

    Args:
        learning_rate: step size multiplied with each gradient.
    """
    def __init__(self, learning_rate=0.01):
        self.learning_rate = learning_rate

    def apply_gradients(self, grads_and_vars):
        """Subtract `learning_rate * gradient` from each variable in place.

        Args:
            grads_and_vars: iterable of `(gradient, variable)` pairs; each
                variable must provide `assign_sub`.
        """
        step_size = self.learning_rate
        for grad, var in grads_and_vars:
            delta = step_size * grad
            var.assign_sub(delta)

### Adagrad 
$$
    g_{t,i} = \nabla_\theta L(x_t; \theta_{t,i})\\
    G_{t,i} = \sum_{\tau=0}^t g_{\tau, i}^2\\
    \theta^+ = \theta - \frac{\eta}{\sqrt{G_{t,i} + \epsilon}} g_{t,i}
$$
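Here $\epsilon$ is the `initial_accumulator_value` argument: the accumulator starts at $\epsilon$ rather than at $0$, which keeps the denominator strictly positive.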

In [5]:
class Adagrad:
    """Adagrad optimizer implemented from scratch.

    Keeps one accumulator per variable that holds the running sum of squared
    gradients (starting at `initial_accumulator_value`) and scales every
    update element-wise by `learning_rate / sqrt(accumulator)`.

    Accumulators are matched to variables by position, so every call to
    `apply_gradients` has to pass the pairs in the same order.

    Args:
        learning_rate: global step size.
        initial_accumulator_value: starting value of every accumulator entry.
    """
    def __init__(self, learning_rate=0.001, initial_accumulator_value=0.1):
        self.learning_rate = learning_rate
        self.initial_accumulator_value = initial_accumulator_value
        # Per-variable accumulators; created lazily on the first step because
        # their shapes and dtypes are only known once gradients arrive.
        self.g = None

    def apply_gradients(self, grads_and_vars):
        """Accumulate the squared gradients and update every variable in place.

        Args:
            grads_and_vars: iterable of `(gradient, variable)` pairs; each
                variable must provide `assign_sub`.
        """
        # Materialise once: the pairs may arrive as a one-shot iterator such as `zip`.
        grads_and_vars = list(grads_and_vars)
        if self.g is None:
            # Accumulators take the gradient's dtype, so the optimizer is not
            # tied to one particular floating point precision.
            self.g = [tf.ones_like(gradient) * self.initial_accumulator_value
                      for (gradient, _) in grads_and_vars]

        for i, (gradient, variable) in enumerate(grads_and_vars):
            self.g[i] = self.g[i] + tf.square(gradient)
            variable.assign_sub(self.learning_rate / tf.sqrt(self.g[i]) * gradient)

### Adam
See the Adam paper (https://arxiv.org/pdf/1412.6980.pdf); the update below follows the improved, more efficient formulation given at the end of Section 2.
$$
    m_t = \beta_1 m_{t-1} + (1-\beta_1) \nabla_\theta L(x_t; \theta_t)\\
    v_t = \beta_2 v_{t-1} + (1-\beta_2) \left(\nabla_\theta L(x_t; \theta_t)\right) ^2\\
    \eta_t = \eta \cdot \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t}
$$
$$
    \theta_{t+1} = \theta_t - \frac{\eta_t}{\sqrt{v_t} + \epsilon}m_t
$$

In [6]:
class Adam:
    """Adam optimizer implemented from scratch.

    Tracks exponential moving averages of the gradient (`m`) and of the
    squared gradient (`v`) for every variable and folds both bias corrections
    into a single per-step learning rate.

    Moment buffers are matched to variables by position, so every call to
    `apply_gradients` has to pass the pairs in the same order.

    Args:
        learning_rate: base step size.
        beta1: decay rate of the first-moment average.
        beta2: decay rate of the second-moment average.
        epsilon: constant added to `sqrt(v)` in the denominator.
    """
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        # Moment buffers are created lazily on the first step because their
        # shapes and dtypes are only known once gradients arrive.
        self.m = None
        self.v = None
        self.iteration = 0  # number of completed calls to apply_gradients

    def apply_gradients(self, grads_and_vars):
        """Update the moment estimates and apply one Adam step to every variable.

        Args:
            grads_and_vars: iterable of `(gradient, variable)` pairs; each
                variable must provide `assign_sub`.
        """
        # Materialise once: the pairs may arrive as a one-shot iterator such as `zip`.
        grads_and_vars = list(grads_and_vars)
        if self.m is None:
            # Buffers take the gradient's dtype, so the optimizer is not tied
            # to one particular floating point precision.
            self.m = [tf.zeros_like(gradient) for (gradient, _) in grads_and_vars]
            self.v = [tf.zeros_like(gradient) for (gradient, _) in grads_and_vars]

        self.iteration += 1
        t = self.iteration
        # Bias-corrected step size, computed with Python floats so that it keeps
        # full double precision and combines with tensors of any float dtype.
        step_size = self.learning_rate * (1 - self.beta2 ** t) ** 0.5 / (1 - self.beta1 ** t)

        for i, (gradient, variable) in enumerate(grads_and_vars):
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * gradient
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * tf.square(gradient)
            update_step = step_size / (tf.sqrt(self.v[i]) + self.epsilon) * self.m[i]
            variable.assign_sub(update_step)

# Train the model

In [14]:
# Set a fixed random seed, so you can compare your implementation against the provided tensorflow optimizer.
tf.set_random_seed(42)

# Build a fresh model with the default hidden width (units=128).
model = Model()

"""Use your own implementation of the optimizer."""
# Exactly one `optimizer = ...` line in this cell should be active; to try a
# hand-written optimizer, uncomment it and comment out the tensorflow one below.
# optimizer = SGD(learning_rate=0.01)
# optimizer = Adagrad(learning_rate=0.1)
# optimizer = Adam(learning_rate=0.01)

"""Use a pre-built one from tensorflow. Compare your solution against these given optimizers."""
# Each pre-built optimizer is paired with the same learning rate as its
# hand-written counterpart above, so the printed curves are directly comparable.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
# optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
# optimizer = tf.train.AdamOptimizer(learning_rate=0.01)

# Train up to and including step 500; `train` logs and evaluates every 100 steps by default.
train(model, optimizer, dset_train, dset_test, stop=500)

Step #0	Loss: 2.352876 (23 steps/sec)
Test set: Average loss: 2.283, Accuracy: 35.2%

Step #100	Loss: 0.581958 (153 steps/sec)
Test set: Average loss: 0.373, Accuracy: 88.7%

Step #200	Loss: 0.258020 (149 steps/sec)
Test set: Average loss: 0.327, Accuracy: 90.0%

Step #300	Loss: 0.113682 (156 steps/sec)
Test set: Average loss: 0.338, Accuracy: 87.9%

Step #400	Loss: 0.198193 (154 steps/sec)
Test set: Average loss: 0.272, Accuracy: 91.2%

Step #500	Loss: 0.157352 (145 steps/sec)
Test set: Average loss: 0.278, Accuracy: 92.9%

