In [2]:
import numpy as np

## Optimizers

In [3]:
class Optimizer:
    """Common interface shared by the optimizers defined in this notebook.

    The constructor only stores its two arguments; ``config`` and ``optimize``
    are hooks that do nothing here and are meant to be overridden.
    """

    def __init__(self, learning_rate=None, name=None):
        self.name = name
        self.learning_rate = learning_rate

    def config(self, layers):
        """Hook for preparing per-layer optimizer state; the base class does nothing."""
        return None

    def optimize(self, idx, layers: list, grads: dict, *args):
        """Hook for updating a single layer; the base class does nothing.

        Args:
            idx: index of the layer to update.
            layers: list of the layers; implementations update entries of it.
            grads: the gradients, as a dictionary.
            *args: extra positional values, accepted and ignored here.
        """
        return None

1. Batch Gradient Descent
  - $W_t = W_{t-1} - \eta \cdot \nabla L(W_{t-1})$
  - $b_t = b_{t-1} - \eta \cdot ∇ L(b_{t-1})$

2. Stochastic Gradient Descent (SGD)
   - for random sample i,
   - $L^*(θ) = \dfrac{1}{2}(y^i - f_θ(x^i))^2$
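   - $θ' = θ - \eta \nabla_θ L^*(θ) = θ + \eta (y^i - f_\theta(x^i)) \nabla_θ f_θ(x^i)$, which for a linear model ($\nabla_θ f_θ(x^i) = x^i$) is $θ + \eta (y^i - f_\theta(x^i))x^i$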

3. SGD with Momentum
  - $v_{t}^W = \beta \cdot v_{t-1}^W + \nabla_W L(W_{t-1})$
  - $W_t = W_{t-1} - \eta \cdot v_{t}^W$
  - $v_{t}^b = \beta \cdot v_{t-1}^b + \nabla_b L(b_{t-1})$
  - $b_t = b_{t-1} - \eta \cdot v_{t}^b$

In [4]:
class SGDM(Optimizer):
  """
  Stochastic gradient descent with momentum.

  For every parameter the velocity is updated as
  ``m = mu * m - learning_rate * grad`` and the parameter is then moved by ``m``.
  The momentum coefficient follows ``mu = mu_init * 1.2**(epoch_num - 1)``,
  capped at ``max_mu``.

  learning_rate : step size applied to the gradients
  mu_init   : initial momentum coefficient
  max_mu    : max mu

  demon     : decaying momentum (the flag is stored, the schedule is not implemented)
  beta_init : initial beta value of demon, controlling the rate of decaying
  """
  def __init__(self, learning_rate = 1e-2, mu_init = 0.5, max_mu = 0.99, beta_init = 0.9, demon = False, **kwargs):
    # learning_rate is a named parameter, so it is never inside **kwargs; it has to be
    # forwarded explicitly or the base class stores None.
    super().__init__(learning_rate=learning_rate, **kwargs)
    self.mu_init = mu_init
    self.max_mu = max_mu
    self.demon = demon
    if self.demon:
      self.beta = beta_init
    self.m = dict()  # velocity per parameter, keyed "W{i}" / "b{i}"

  def config(self, layers):
    """Reset the velocities to zero for keys "W1".."Wn" and "b1".."bn", n = len(layers)."""
    for i in range(1, len(layers)+1):
      self.m[f"W{i}"] = 0
      self.m[f"b{i}"] = 0

  def optimize(self, idx, layers, grads, epoch_num, steps):
    """Apply one momentum update to ``layers[idx].W`` and ``layers[idx].b`` in place.

    Args:
      idx: index of the layer; also selects the keys "dW{idx}" / "db{idx}" in ``grads``.
      layers: indexable collection of layers exposing ``W`` and ``b``.
      grads: dictionary holding the gradients of the layer.
      epoch_num: epoch number driving the momentum schedule.
      steps: accepted for a uniform optimizer signature; not used here.
    """
    # Scalar min on purpose: np.min(value, max_mu) would treat max_mu as the `axis`
    # argument and raise instead of capping the coefficient.
    mu = min(self.mu_init * 1.2**(epoch_num - 1), self.max_mu)

    if self.demon:
      # TODO: decaying momentum is not implemented; the regular schedule above is used.
      pass

    # Velocity form with the learning rate folded into the velocity. The equivalent
    # textbook form is  m = mu * m + grad ;  param -= learning_rate * m.
    self.m[f"W{idx}"] = self.m[f"W{idx}"] * mu - self.learning_rate * grads[f"dW{idx}"]
    self.m[f"b{idx}"] = self.m[f"b{idx}"] * mu - self.learning_rate * grads[f"db{idx}"]

    layers[idx].W += self.m[f"W{idx}"]
    layers[idx].b += self.m[f"b{idx}"]


4. Nesterov Accelerated Gradient
  - $v_{t}^θ = \beta \cdot v_{t-1}^θ - \eta \cdot \nabla_θ L(θ_{t-1} + \beta \cdot v_{t-1}^θ)$
  - $θ_{t} = θ_{t-1} + v_{t}^θ$


- $\theta_{t} \leftarrow \theta_{t} + \beta \cdot v_{t-1}^\theta$ (store the look-ahead point as the parameters, so the gradient is evaluated directly at $\theta_{t}$)
  
  - $v_{t} = \beta \cdot v_{t-1} - \eta \cdot \nabla_{\theta}L(\theta_{t})$

  - $\beta \cdot v_{t} = \beta ( \beta \cdot v_{t-1} - \eta \cdot
  \nabla_{\theta}L(\theta_{t}))$

  - $\theta_{t+1} = \theta_{t} - \eta \cdot \nabla_{\theta}L(\theta_{t}) + \beta ( \beta \cdot v_{t-1} - \eta \cdot
  \nabla_{\theta}L(\theta_{t}))$






In [5]:
class Nesterov(SGDM):
  """
  Nesterov accelerated gradient, written in the form that only needs the gradient
  at the stored parameters:

    m     = mu * m - learning_rate * grad
    param = param + mu * m - learning_rate * grad

  learning_rate : step size applied to the gradients
  **kwargs      : forwarded to SGDM (mu_init, max_mu, beta_init, demon, ...)
  """
  def __init__(self, learning_rate = 1e-2, **kwargs):

    super().__init__(learning_rate=learning_rate, **kwargs)
    # Assigned after the parent constructors have run, so none of them can
    # overwrite the value with their own default.
    self.learning_rate = learning_rate

  def optimize(self, idx, layers, grads, epoch_num, steps):
    """Apply one Nesterov update to ``layers[idx].W`` and ``layers[idx].b`` in place.

    Args:
      idx: index of the layer; also selects the keys "dW{idx}" / "db{idx}" in ``grads``.
      layers: indexable collection of layers exposing ``W`` and ``b``.
      grads: dictionary holding the gradients of the layer.
      epoch_num: epoch number driving the momentum schedule
        ``mu = mu_init * 1.2**(epoch_num - 1)``, capped at ``max_mu``.
      steps: accepted for a uniform optimizer signature; not used here.
    """
    # Scalar min on purpose: np.min(value, max_mu) would treat max_mu as the `axis`
    # argument and raise instead of capping the coefficient.
    mu = min(self.mu_init * 1.2**(epoch_num - 1), self.max_mu)

    if self.demon:
      # TODO: decaying momentum is not implemented; the regular schedule above is used.
      pass

    self.m[f"W{idx}"] = self.m[f"W{idx}"] * mu - self.learning_rate * grads[f"dW{idx}"]
    self.m[f"b{idx}"] = self.m[f"b{idx}"] * mu - self.learning_rate * grads[f"db{idx}"]

    # Look-ahead correction: step by the *new* velocity scaled by mu plus the plain
    # gradient step.
    layers[idx].W += mu * self.m[f"W{idx}"] - self.learning_rate * grads[f"dW{idx}"]
    layers[idx].b += mu * self.m[f"b{idx}"] - self.learning_rate * grads[f"db{idx}"]

5. AdaGrad
  - $v_{t}^\theta  = v_{t-1}^\theta + \nabla_{\theta} L(\theta_{t-1})^2$

  - $\theta_{t} = \theta_{t-1} - \dfrac{\eta}{\sqrt{v_{t}^{\theta} + {\epsilon}}} \cdot \nabla_{\theta} L(\theta_{t-1})$


In [6]:
class AdaGrad(Optimizer):
  """
  AdaGrad: scales each gradient by the root of the running sum of its squares.

    V     = V + grad**2
    param = param - learning_rate * grad / sqrt(V + epsilon)

  learning_rate : step size applied to the scaled gradients
  epsilon       : small constant added inside the square root to avoid division by zero
  """
  def __init__(self, learning_rate = 1e-2, epsilon = 1e-8, **kwargs):

    # Forward learning_rate explicitly; calling the base constructor without it
    # would reset self.learning_rate to None.
    super().__init__(learning_rate=learning_rate, **kwargs)
    self.epsilon = epsilon
    self.V = dict()  # accumulated squared gradients, keyed "W{i}" / "b{i}"

  # config/optimize are defined at class level so that they really override the
  # no-op hooks of the base class.
  def config(self, layers):
    """Reset the accumulators to zero for keys "W1".."Wn" and "b1".."bn", n = len(layers)."""
    for i in range(1, len(layers) + 1):
      self.V[f"W{i}"] = 0
      self.V[f"b{i}"] = 0

  def optimize(self, idx, layers, grads, epochs_num, steps):
    """Apply one AdaGrad update to ``layers[idx].W`` and ``layers[idx].b`` in place.

    Args:
      idx: index of the layer; also selects the keys "dW{idx}" / "db{idx}" in ``grads``.
      layers: indexable collection of layers exposing ``W`` and ``b``.
      grads: dictionary holding the gradients of the layer.
      epochs_num: accepted for a uniform optimizer signature; not used here.
      steps: accepted for a uniform optimizer signature; not used here.
    """
    self.V[f"W{idx}"] += grads[f"dW{idx}"]**2
    self.V[f"b{idx}"] += grads[f"db{idx}"]**2

    # Layers are addressed by position and expose W / b, as in the other optimizers.
    layers[idx].W -=  grads[f"dW{idx}"] * (self.learning_rate) / np.sqrt(self.V[f"W{idx}"] + self.epsilon)
    layers[idx].b -=  grads[f"db{idx}"] * (self.learning_rate) / np.sqrt(self.V[f"b{idx}"] + self.epsilon)

6. RMSProp
  - $v_{t}^{\theta} = \beta \cdot v_{t-1}^{\theta} + (1-\beta) \cdot \nabla_{θ}L(\theta_{t-1})^2$
  - $\theta_{t} = \theta_{t-1} - \dfrac{\eta}{\sqrt{v_{t}^{\theta} + {\epsilon}}} \cdot \nabla_{\theta} L(\theta_{t-1})$

In [7]:
class RMSProp(Optimizer):
  """
  RMSProp: scales each gradient by the root of an exponential moving average of
  its squares.

    V     = decay_rate * V + (1 - decay_rate) * grad**2
    param = param - learning_rate * grad / sqrt(V + epsilon)

  learning_rate : step size applied to the scaled gradients
  decay_rate    : weight of the previous average in the moving average
  epsilon       : small constant added inside the square root to avoid division by zero
  """
  def __init__(self, learning_rate = 1e-2, decay_rate = 0.9, epsilon = 1e-8, **kwargs):

    # Forward learning_rate explicitly; calling the base constructor without it
    # would reset self.learning_rate to None.
    super().__init__(learning_rate=learning_rate, **kwargs)
    self.decay_rate = decay_rate
    self.epsilon = epsilon
    self.V = dict()  # moving average of squared gradients, keyed "W{i}" / "b{i}"

  def config(self, layers):
    """Reset the averages to zero for keys "W1".."Wn" and "b1".."bn", n = len(layers)."""
    for i in range(1, len(layers) + 1):
      self.V[f"W{i}"] = 0
      self.V[f"b{i}"] = 0

  def optimize(self, idx, layers, grads, epochs_num, steps):
    """Apply one RMSProp update to ``layers[idx].W`` and ``layers[idx].b`` in place.

    Args:
      idx: index of the layer; also selects the keys "dW{idx}" / "db{idx}" in ``grads``.
      layers: indexable collection of layers exposing ``W`` and ``b``.
      grads: dictionary holding the gradients of the layer.
      epochs_num: accepted for a uniform optimizer signature; not used here.
      steps: accepted for a uniform optimizer signature; not used here.
    """
    self.V[f"W{idx}"] = self.decay_rate * self.V[f"W{idx}"] + (1 - self.decay_rate) * grads[f"dW{idx}"]**2
    self.V[f"b{idx}"] = self.decay_rate * self.V[f"b{idx}"] + (1 - self.decay_rate) * grads[f"db{idx}"]**2

    layers[idx].W -=  grads[f"dW{idx}"] * (self.learning_rate) / np.sqrt(self.V[f"W{idx}"] + self.epsilon)
    layers[idx].b -=  grads[f"db{idx}"] * (self.learning_rate) / np.sqrt(self.V[f"b{idx}"] + self.epsilon)


7. Adam
  
  > **Moment Vector**
  - $m_{t}^{\theta} = \beta_{1} \cdot m_{t-1}^{\theta} + (1-\beta_{1}) \cdot \nabla_{\theta}L(\theta_{t-1})$
  - $v_{t}^{\theta} \,= \beta_{2} \cdot v_{t-1}^{\theta}\, + (1-\beta_{2}) \cdot \nabla_{θ}L(\theta_{t-1})^2$
  
  > **Bias Correction**
  - $\widehat{m_{t}^{\theta}} = \dfrac {m_{t}^{\theta}}{1 - {\beta}_{1}^{t}}$
  - $\widehat{v_{t}^{\theta}} \, \, = \dfrac {v_{t}^{\theta}}{1 - {\beta}_{2}^{t}}$

  > **Update Parameters**
  - $\theta_{t} = \theta_{t-1} - \dfrac{\eta⋅\widehat{m_{t}^{\theta}}}{\sqrt{\widehat{v_{t}^{\theta}}} + {\epsilon}}$



In [10]:
class Adam(Optimizer):
  """
  Adam: bias-corrected first and second moment estimates of the gradient.

    M     = beta1 * M + (1 - beta1) * grad
    V     = beta2 * V + (1 - beta2) * grad**2
    param = param - learning_rate * M_hat / (sqrt(V_hat) + epsilon)

  where ``M_hat = M / (1 - beta1**t)`` and ``V_hat = V / (1 - beta2**t)``.

  learning_rate : step size
  beta1         : decay rate of the first moment estimate
  beta2         : decay rate of the second moment estimate
  epsilon       : small constant added to the denominator to avoid division by zero
  """
  def __init__(self, learning_rate = 1e-3, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8, **kwargs): ## default recommended from paper

    # Forward learning_rate explicitly; calling the base constructor without it
    # would reset self.learning_rate to None.
    super().__init__(learning_rate=learning_rate, **kwargs)
    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon

    self.t = 0      # highest number of updates applied to any single layer
    self.M = dict()
    self.V = dict()
    self.layer_steps = dict()  # number of updates applied so far, per layer index

  def config(self, layers):
    """Reset the step counters and both moment estimates for keys "W1".."Wn", "b1".."bn"."""
    self.t = 0
    self.layer_steps = dict()
    for i in range(1, len(layers) + 1):
      self.M[f"W{i}"] = 0
      self.M[f"b{i}"] = 0
      self.V[f"W{i}"] = 0
      self.V[f"b{i}"] = 0

  def optimize(self, idx, layers, grads, *args):
    """Apply one Adam update to ``layers[idx].W`` and ``layers[idx].b`` in place.

    Args:
      idx: index of the layer; also selects the keys "dW{idx}" / "db{idx}" in ``grads``.
      layers: indexable collection of layers exposing ``W`` and ``b``.
      grads: dictionary holding the gradients of the layer.
      *args: extra positional values (the other optimizers take ``epoch_num, steps``);
        accepted so all optimizers can be called the same way, and ignored.
    """
    # The bias correction must use the number of updates this layer has received.
    # A single shared counter would advance once per layer call and inflate the
    # timestep for networks with more than one layer.
    step = self.layer_steps.get(idx, 0) + 1
    self.layer_steps[idx] = step
    self.t = max(self.t, step)

    # Update first moment estimate
    self.M[f"W{idx}"] = self.beta1 * self.M[f"W{idx}"] + (1 - self.beta1) * grads[f"dW{idx}"]
    self.M[f"b{idx}"] = self.beta1 * self.M[f"b{idx}"] + (1 - self.beta1) * grads[f"db{idx}"]

    # Update second moment estimate
    self.V[f"W{idx}"] = self.beta2 * self.V[f"W{idx}"] + (1 - self.beta2) * grads[f"dW{idx}"]**2
    self.V[f"b{idx}"] = self.beta2 * self.V[f"b{idx}"] + (1 - self.beta2) * grads[f"db{idx}"]**2

    # bias correction
    first_correction = 1 - self.beta1**step
    second_correction = 1 - self.beta2**step

    M_hat_W = self.M[f"W{idx}"] / first_correction
    M_hat_b = self.M[f"b{idx}"] / first_correction

    V_hat_W = self.V[f"W{idx}"] / second_correction
    V_hat_b = self.V[f"b{idx}"] / second_correction

    # Update parameters
    layers[idx].W -= self.learning_rate * M_hat_W / (np.sqrt(V_hat_W) + self.epsilon)
    layers[idx].b -= self.learning_rate * M_hat_b / (np.sqrt(V_hat_b) + self.epsilon)

{1: 1}