<a href="https://colab.research.google.com/github/leox1v/Optimization_tutorial/blob/master/Optimization_DL2019.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Imports
%tensorflow_version 2.x
import tensorflow as tf

# Load MNIST

In [0]:
# Fetch the MNIST train/test splits through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Rescale the images by 1/255 (the factor that maps 8-bit pixel values
# into the [0, 1] range).
x_train = x_train / 255.0
x_test = x_test / 255.0

## Model

In [0]:
def get_model():
  """Build a fresh, uncompiled classifier for 28x28 inputs.

  Architecture: Flatten -> Dense(128, relu) -> Dense(10, softmax).

  Returns:
    A new ``tf.keras.models.Sequential`` instance (new weights on every call).
  """
  layers = tf.keras.layers
  stack = [
      layers.Flatten(input_shape=(28, 28)),
      layers.Dense(128, activation='relu'),
      layers.Dense(10, activation='softmax'),
  ]
  return tf.keras.models.Sequential(stack)

# SGD
$$ 
    \theta^+ = \theta - \eta \nabla L(x_{i:i+n}; \theta)
$$

In [0]:
class MySGD(tf.keras.optimizers.Optimizer):
    """Plain mini-batch stochastic gradient descent.

    Every variable is updated as ``theta <- theta - learning_rate * grad``.
    Dense gradients are assumed.

    Args:
        learning_rate: step size (eta) that scales each gradient.
    """

    def __init__(self, learning_rate=0.01):
        super(MySGD, self).__init__(name='my_sgd')
        self.learning_rate = learning_rate

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        """Apply one SGD step to every variable that received a gradient.

        Args:
            grads_and_vars: iterable of ``(gradient, variable)`` pairs.
            name: accepted only for compatibility with callers that pass it;
                unused.
            **kwargs: further keyword arguments that some Keras training loops
                pass to ``apply_gradients``; ignored.

        Returns:
            The result of ``tf.group`` over all variable updates, so graph-mode
            callers get a single op to run.
        """
        updates = [
            var.assign_sub(self.learning_rate * grad)
            for grad, var in grads_and_vars
            # A ``None`` gradient means the variable did not influence the loss.
            if grad is not None
        ]
        return tf.group(*updates)

    def get_config(self):
        """Return the base optimizer config extended with this class's settings."""
        config = {
            'lr': self.learning_rate
        }
        # Was ``super(SGD, self)``: ``SGD`` is not defined in this notebook.
        base_config = super(MySGD, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

# Adagrad 
$$
    g_{t,i} = \left[\nabla_\theta L(x_t; \theta_t)\right]_i\\
    G_{t,i} = \sum_{\tau=0}^t g_{\tau, i}^2\\
    \theta_{t+1,i} = \theta_{t,i} - \frac{\eta}{\sqrt{G_{t,i} + \epsilon}} g_{t,i}
$$

In [0]:
class MyAdagrad(tf.keras.optimizers.Optimizer):
    """Adagrad: per-coordinate step sizes from accumulated squared gradients.

    For every variable the optimizer keeps a running sum ``G`` of squared
    gradients and updates::

        G     <- G + grad ** 2
        theta <- theta - learning_rate * grad / sqrt(G + epsilon)

    Dense gradients are assumed.

    Args:
        learning_rate: base step size (eta).
        epsilon: constant added to ``G`` inside the square root so the first
            steps do not divide by zero.
    """

    def __init__(self, learning_rate=0.01, epsilon=0.1):
        super(MyAdagrad, self).__init__(name='MyAdagrad')
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        # Accumulators are created lazily on the first ``apply_gradients`` call,
        # because the variables to optimise are unknown until then. Once
        # created this is a dict mapping ``id(variable)`` to its accumulator.
        self.g = None

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        """Apply one Adagrad step to every variable that received a gradient.

        Args:
            grads_and_vars: iterable of ``(gradient, variable)`` pairs.
            name: accepted only for compatibility with callers that pass it;
                unused.
            **kwargs: further keyword arguments that some Keras training loops
                pass to ``apply_gradients``; ignored.

        Returns:
            The result of ``tf.group`` over all accumulator and variable
            updates, so graph-mode callers get a single op to run.
        """
        if self.g is None:
            self.g = {}

        updates = []
        for grad, var in grads_and_vars:
            if grad is None:
                # The variable did not influence the loss; nothing to update.
                continue

            key = id(var)
            if key not in self.g:
                # Create state outside of any function/graph being traced so
                # the accumulator is a regular, persistent variable.
                with tf.init_scope():
                    self.g[key] = tf.Variable(
                        tf.zeros(var.shape, dtype=var.dtype.base_dtype),
                        trainable=False)
            accumulator = self.g[key]

            # Use the freshly computed tensor (not a re-read of the variable)
            # in the step below, so the result does not depend on the order in
            # which the assignments are executed.
            new_accumulator = accumulator + tf.square(grad)
            step = (self.learning_rate * grad
                    / tf.sqrt(new_accumulator + self.epsilon))

            updates.append(accumulator.assign(new_accumulator))
            updates.append(var.assign_sub(step))
        return tf.group(*updates)

    def get_config(self):
        """Return the base optimizer config extended with this class's settings."""
        config = {
            'lr': self.learning_rate,
            'epsilon': self.epsilon
        }
        # Was ``super(SGD, self)``: ``SGD`` is not defined in this notebook.
        base_config = super(MyAdagrad, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

# Adam
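Reference: Kingma & Ba, [Adam: A Method for Stochastic Optimization](https://arxiv.org/pdf/1412.6980.pdf). The update below is the more efficient formulation described at the end of Section 2 of the paper.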
$$
    m_t = \beta_1 m_{t-1} + (1-\beta_1) \nabla_\theta L(x_t; \theta_t)\\
    v_t = \beta_2 v_{t-1} + (1-\beta_2) \left(\nabla_\theta L(x_t; \theta_t)\right) ^2\\
    \eta_t = \eta \cdot \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t}
$$
$$
    \theta_{t+1} = \theta_t - \frac{\eta_t}{\sqrt{v_t} + \epsilon}m_t
$$

In [0]:
class MyAdam(tf.keras.optimizers.Optimizer):
    """Adam, using the bias-corrected step size formulation.

    For every variable the optimizer keeps first and second moment estimates
    ``m`` and ``v`` and, at step ``t`` (starting at 1), updates::

        m     <- beta1 * m + (1 - beta1) * grad
        v     <- beta2 * v + (1 - beta2) * grad ** 2
        lr_t  =  learning_rate * sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
        theta <- theta - lr_t * m / (sqrt(v) + epsilon)

    Dense gradients are assumed.

    Args:
        learning_rate: base step size (eta).
        beta1: decay rate of the first moment estimate.
        beta2: decay rate of the second moment estimate.
        epsilon: constant added to ``sqrt(v)`` to avoid division by zero.
    """

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-07):
        # The base-class constructor must run, as in the other optimizers of
        # this notebook; the original omitted it.
        super(MyAdam, self).__init__(name='MyAdam')
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        # Moment estimates are created lazily on the first ``apply_gradients``
        # call; afterwards each is a dict mapping ``id(variable)`` to its slot.
        self.m = None
        self.v = None
        # The step counter is a variable rather than a Python int: a Python
        # int would be frozen at its trace-time value when the update runs
        # inside a graph / ``tf.function``.
        self.iteration = tf.Variable(0, dtype=tf.int64, trainable=False)

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        """Apply one Adam step to every variable that received a gradient.

        Args:
            grads_and_vars: iterable of ``(gradient, variable)`` pairs.
            name: accepted only for compatibility with callers that pass it;
                unused.
            **kwargs: further keyword arguments that some Keras training loops
                pass to ``apply_gradients``; ignored.

        Returns:
            The result of ``tf.group`` over all moment and variable updates,
            so graph-mode callers get a single op to run.
        """
        if self.m is None:
            self.m = {}
        if self.v is None:
            self.v = {}

        # Increment first so that t == 1 on the very first step; t == 0 would
        # make the bias-correction denominator zero.
        new_iteration = self.iteration.assign_add(1)

        updates = []
        for grad, var in grads_and_vars:
            if grad is None:
                # The variable did not influence the loss; nothing to update.
                continue

            key = id(var)
            if key not in self.m:
                # Create state outside of any function/graph being traced so
                # the slots are regular, persistent variables.
                with tf.init_scope():
                    zeros = tf.zeros(var.shape, dtype=var.dtype.base_dtype)
                    self.m[key] = tf.Variable(zeros, trainable=False)
                    self.v[key] = tf.Variable(zeros, trainable=False)
            m_slot, v_slot = self.m[key], self.v[key]

            dtype = grad.dtype
            t = tf.cast(new_iteration, dtype)
            beta1 = tf.cast(self.beta1, dtype)
            beta2 = tf.cast(self.beta2, dtype)

            # Use the freshly computed tensors (not re-reads of the slots) in
            # the step below, so the result does not depend on the order in
            # which the assignments are executed.
            new_m = beta1 * m_slot + (1.0 - beta1) * grad
            new_v = beta2 * v_slot + (1.0 - beta2) * tf.square(grad)

            # Bias correction folded into the step size.
            lr_t = (self.learning_rate
                    * tf.sqrt(1.0 - tf.pow(beta2, t))
                    / (1.0 - tf.pow(beta1, t)))
            step = lr_t * new_m / (tf.sqrt(new_v) + self.epsilon)

            updates.append(m_slot.assign(new_m))
            updates.append(v_slot.assign(new_v))
            updates.append(var.assign_sub(step))
        return tf.group(*updates)

    def get_config(self):
        """Return the base optimizer config extended with this class's settings."""
        config = {
            'lr': self.learning_rate,
            'epsilon': self.epsilon,
            'beta1': self.beta1,
            'beta2': self.beta2
        }
        # Was ``super(SGD, self)``: ``SGD`` is not defined in this notebook.
        base_config = super(MyAdam, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

# Train the model

In [0]:
# Train a fresh model for three epochs with the hand-written SGD optimizer.
model = get_model()
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=MySGD(),
    metrics=['accuracy'],
)
history = model.fit(x=x_train, y=y_train, epochs=3)

In [0]:
# Train a fresh model for three epochs with the hand-written Adagrad optimizer.
model = get_model()
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=MyAdagrad(),
    metrics=['accuracy'],
)
history = model.fit(x=x_train, y=y_train, epochs=3)

In [0]:
# Train a fresh model for three epochs with the hand-written Adam optimizer.
model = get_model()
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=MyAdam(),
    metrics=['accuracy'],
)
history = model.fit(x=x_train, y=y_train, epochs=3)