# Task 3 - Optimizers

**Requirements:**
 - numpy (https://numpy.org/)
 - matplotlib (https://matplotlib.org/)

Let's continue with our framework. We reuse all of the previously implemented classes (with some modifications) and add a new component: **Optimizers**.

Watch out for the shape of the input data. We now work with mini-batches of shape $(B,nX,1)$, where $B$ is the number of samples in the mini-batch, $nX$ is the number of features, and the trailing $1$ makes every sample a column vector, so that vector/matrix multiplication acts on the last two dimensions while $B$ remains the sample dimension.

In [21]:
# Import
import numpy as np
from utils import Module

In [22]:
#------------------------------------------------------------------------------
#   Linear layer (Dense, Fully connected, Single Layer Perceptron)
#------------------------------------------------------------------------------
class Linear(Module):
    """Fully connected layer computing ``W @ x + b`` for a mini-batch.

    ``W`` has shape ``(out_features, in_features)`` and is drawn from a
    standard normal distribution; ``b`` has shape ``(out_features, 1)`` and
    starts at zero. Inputs are rank-3 mini-batches ``(B, in_features, 1)``
    (``backward`` transposes the last two of three axes).
    """

    def __init__(self, in_features, out_features):
        super(Linear, self).__init__()
        self.W = np.random.randn(out_features, in_features)
        self.b = np.zeros((out_features, 1))

    def forward(self, input: np.ndarray) -> np.ndarray:
        """Return ``W @ input + b`` and cache the input for ``backward``."""
        self.aPred = input
        # Batch size, used to average the gradients in backward().
        self.m = self.aPred.shape[0]
        net = np.matmul(self.W, input) + self.b
        return net

    def backward(self, dz: np.ndarray) -> np.ndarray:
        """Store batch-averaged ``dW``/``db`` and return the input gradient.

        ``dz`` is the gradient with respect to this layer's output.
        """
        # Per-sample outer products dz @ a^T, averaged over the batch axis.
        self.dW = (1.0/self.m) * np.sum(np.matmul(dz, self.aPred.transpose((0,2,1))), axis=0)
        self.db = (1.0/self.m) * np.sum(dz, axis=0)
        return np.matmul(self.W.transpose(), dz)

    def get_optimizer_context(self):
        """Return ``[[W, dW], [b, db]]`` for an optimizer.

        Returns ``None`` while no gradients exist yet (``backward`` has not
        been called), which the optimizers treat as "nothing to update".
        """
        dW = getattr(self, 'dW', None)
        db = getattr(self, 'db', None)
        if dW is None or db is None:
            return None
        return [[self.W, dW], [self.b, db]]

    def set_optimizer_context(self, params):
        """Install the updated ``[W, b]`` pair produced by an optimizer."""
        self.W, self.b = params
#------------------------------------------------------------------------------
#   SigmoidActivationFunction class
#------------------------------------------------------------------------------
class Sigmoid(Module):
    """Element-wise logistic sigmoid activation ``1 / (1 + exp(-x))``."""

    def __init__(self):
        super().__init__()

    def forward(self, input: np.ndarray) -> np.ndarray:
        """Apply the sigmoid and remember the input for ``backward``."""
        self.fw_input = input
        denominator = 1.0 + np.exp(-input)
        return 1.0 / denominator

    def backward(self, da) -> np.ndarray:
        """Scale the upstream gradient ``da`` by ``a * (1 - a)``."""
        # Re-evaluate the activation on the cached forward input.
        activation = self(self.fw_input)
        local_grad = np.multiply(activation, 1 - activation)
        return np.multiply(da, local_grad)

#------------------------------------------------------------------------------
#   HyperbolicTangentActivationFunction class
#------------------------------------------------------------------------------
class Tanh(Module):
    """Element-wise hyperbolic tangent activation."""

    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, input: np.ndarray) -> np.ndarray:
        """Apply tanh and remember the input for ``backward``."""
        self.fw_input = input
        # np.tanh saturates cleanly at +/-1; the explicit
        # (exp(2x) - 1) / (exp(2x) + 1) form overflows to inf/inf = NaN
        # for large positive inputs.
        return np.tanh(input)

    def backward(self, da) -> np.ndarray:
        """Scale the upstream gradient ``da`` by ``1 - tanh(x)**2``."""
        a = self(self.fw_input)
        return np.multiply(da, 1 - np.square(a))

#------------------------------------------------------------------------------
#   Model class
#------------------------------------------------------------------------------
class Model(Module):
    """Sequential container that chains its registered sub-modules."""

    def __init__(self):
        super().__init__()

    def forward(self, input) -> np.ndarray:
        """Feed ``input`` through every module in registration order."""
        output = input
        for _, layer in self.modules.items():
            output = layer(output)
        return output

    def backward(self, z: np.ndarray):
        """Propagate the gradient ``z`` through the modules in reverse order."""
        grad = z
        for _, layer in reversed(self.modules.items()):
            grad = layer.backward(grad)

## Loss Functions

As in standard deep learning frameworks, calling a loss function can return either the **cost** or the **loss**, depending on the parameter **reduce**.

In [23]:
#------------------------------------------------------------------------------
#   MeanSquareErrorLossFunction class
#------------------------------------------------------------------------------
class MSELoss(Module):
    """Mean squared error loss.

    ``reduce`` selects how the batch-averaged squared error is collapsed:
    ``"mean"`` (``np.mean``), ``"sum"`` (``np.sum``) or ``None`` (returned
    unchanged). Any other value raises ``AttributeError``.
    """

    def __init__(self, reduce="mean"):
        super(MSELoss, self).__init__()
        if reduce == "mean":
            self.reduce_fn = np.mean
        elif reduce == "sum":
            self.reduce_fn = np.sum
        elif reduce is None:
            # return identity (do nothing)
            self.reduce_fn = lambda x : x
        else:
            raise AttributeError(
                f"unsupported reduce={reduce!r}; expected 'mean', 'sum' or None"
            )

    def forward(self, input: np.ndarray, target: np.ndarray) -> np.ndarray:
        """Average the squared error over axis 0, then apply ``reduce_fn``."""
        return self.reduce_fn(np.mean(np.power(target - input, 2), axis=0, keepdims=True))

    def backward(self, input: np.ndarray, target: np.ndarray) -> np.ndarray:
        """Return the gradient of the mean squared error w.r.t. ``input``.

        The result has the same shape as ``input``: ``2 * (input - target)``
        divided by the size of axis 1 (the number of outputs). Averaging over
        axis 1 instead would collapse that axis, which only gives the right
        gradient when there is a single output; for a single output this
        expression yields exactly the same values. The division by the batch
        size is left to the layers (see ``Linear.backward``).
        """
        n_outputs = np.shape(input)[1]
        return 2.0 * (input - target) / n_outputs


#------------------------------------------------------------------------------
#   BinaryCrossEntropyLossFunction class
#------------------------------------------------------------------------------
class BCELoss(Module):
    """Binary cross-entropy loss on predicted probabilities.

    ``reduce`` selects how the element-wise loss is collapsed: ``"mean"``
    (``np.mean``), ``"sum"`` (``np.sum``) or ``None`` (returned unchanged).
    Any other value raises ``AttributeError``.
    """

    def __init__(self, reduce="mean"):
        super(BCELoss, self).__init__()
        if reduce == "mean":
            self.reduce_fn = np.mean
        elif reduce == "sum":
            self.reduce_fn = np.sum
        elif reduce is None:
            # return identity (do nothing)
            self.reduce_fn = lambda x : x
        else:
            raise AttributeError(
                f"unsupported reduce={reduce!r}; expected 'mean', 'sum' or None"
            )

    @staticmethod
    def _clip_probabilities(input):
        """Keep probabilities strictly inside (0, 1).

        Predictions that saturate to exactly 0 or 1 would otherwise produce
        ``log(0)`` / division by zero, i.e. NaN or inf losses and gradients.
        """
        probs = np.asarray(input)
        if not np.issubdtype(probs.dtype, np.floating):
            probs = probs.astype(np.float64)
        # Machine epsilon of the array's own dtype, so 1 - eps stays < 1.
        eps = float(np.finfo(probs.dtype).eps)
        return np.clip(probs, eps, 1.0 - eps)

    def forward(self, input: np.ndarray, target: np.ndarray) -> np.ndarray:
        """Return the reduced ``-(t*log(p) + (1-t)*log(1-p))``."""
        probs = self._clip_probabilities(input)
        return self.reduce_fn(-(target * np.log(probs) + np.multiply((1 - target), np.log(1 - probs))))

    def backward(self, input: np.ndarray, target: np.ndarray) -> np.ndarray:
        """Return the element-wise gradient ``-t/p + (1-t)/(1-p)``."""
        probs = self._clip_probabilities(input)
        return -np.divide(target, probs) + np.divide(1 - target, 1 - probs)

## Optimizers

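Each optimizer takes the **model** as its first required parameter (the constructors below receive the model itself rather than a separate `parameters_gen` function) and uses it to access all trainable parameters of the Model. The other attributes of an optimizer depend on the definition of that particular optimizer.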

Your task is to implement:
 - SGD with momentum
 - RMSProp: http://www.cs.toronto.edu/~hinton/coursera/lecture6/lec6.pdf
 - Adam: https://arxiv.org/pdf/1412.6980.pdf

All algorithms are in [https://www.deeplearningbook.org/contents/optimization.html](https://www.deeplearningbook.org/contents/optimization.html)


In [24]:
#------------------------------------------------------------------------------
#   AbstractOptimizer class
#------------------------------------------------------------------------------
class Optimizer:
    """Abstract base class for optimizers; subclasses override ``step``."""

    def __init__(self):
        pass

    def step(self):
        """Apply one parameter update.

        Raises:
            NotImplementedError: always, until a subclass overrides it.
        """
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it would fail with a confusing TypeError instead.
        raise NotImplementedError("Optimizer subclasses must implement step()")

#------------------------------------------------------------------------------
#   StochasticGradientDescentOptimizer class
#------------------------------------------------------------------------------
class SGD(Optimizer):
    """Plain (stochastic) gradient descent: ``theta <- theta - lr * grad``.

    Works on every module of ``model`` that exposes
    ``get_optimizer_context()`` returning ``[[W, dW], [b, db]]`` (or ``None``
    when there is nothing to update) and ``set_optimizer_context([W, b])``.
    """

    def __init__(self, model:Model, lr:float):
        super(SGD, self).__init__()
        self.model = model
        self.lr = lr

    def step(self):
        """Take one gradient-descent step on every trainable layer."""
        for name, layer in self.model.modules.items():
            # Activation modules carry no trainable parameters.
            if not hasattr(layer, 'get_optimizer_context'):
                continue
            params = layer.get_optimizer_context()
            if params is None:
                continue
            [[W, dW],[b,db]] = params
            # >>>> start here
            # Build new arrays instead of mutating the layer's in place.
            W = W - self.lr * dW
            b = b - self.lr * db
            # <<<< end here
            layer.set_optimizer_context([W,b])


#------------------------------------------------------------------------------
#   SGDMomentumOptimizer class
#------------------------------------------------------------------------------
class SGDMomentum(Optimizer):
    """Gradient descent with momentum.

    For every parameter a velocity ``v`` is kept between steps:
    ``v <- momentum * v - lr * grad`` followed by ``theta <- theta + v``.
    Works on every module of ``model`` that exposes
    ``get_optimizer_context()`` returning ``[[W, dW], [b, db]]`` (or ``None``)
    and ``set_optimizer_context([W, b])``.
    """

    def __init__(self, model, lr, momentum):
        super(SGDMomentum, self).__init__()
        self.model = model
        # >>>> start_solution
        self.lr = lr
        self.momentum = momentum
        # Velocities keyed by (layer name, 'W' | 'b'); created on first use.
        self.velocity = {}
        # <<<< end_solution

    def _update(self, key, param, grad):
        """Advance the velocity stored under ``key`` and return the new parameter."""
        # A missing velocity starts at zero (the scalar broadcasts).
        velocity = self.momentum * self.velocity.get(key, 0.0) - self.lr * grad
        self.velocity[key] = velocity
        return param + velocity

    def step(self):
        """Take one momentum step on every trainable layer."""
        for name, layer in self.model.modules.items():
            # Activation modules carry no trainable parameters.
            if not hasattr(layer, 'get_optimizer_context'):
                continue
            params = layer.get_optimizer_context()
            if params is None:
                continue
            [[W, dW],[b,db]] = params
            # >>>> start here
            W = self._update((name, 'W'), W, dW)
            b = self._update((name, 'b'), b, db)
            # <<<< end here

            layer.set_optimizer_context([W,b])

#------------------------------------------------------------------------------
#   RMSpropOptimizer class
#------------------------------------------------------------------------------
class RMSprop(Optimizer):
    """RMSProp optimizer.

    For every parameter a running average ``r`` of squared gradients is kept:
    ``r <- rho * r + (1 - rho) * grad**2`` followed by
    ``theta <- theta - lr * grad / sqrt(delta + r)``.
    ``delta`` is a small constant that keeps the division well defined.
    Works on every module of ``model`` that exposes
    ``get_optimizer_context()`` returning ``[[W, dW], [b, db]]`` (or ``None``)
    and ``set_optimizer_context([W, b])``.
    """

    def __init__(self, model, lr, rho, delta):
        super(RMSprop, self).__init__()
        self.model = model
        # >>>> start_solution
        self.lr = lr
        self.rho = rho
        self.delta = delta
        # Squared-gradient averages keyed by (layer name, 'W' | 'b').
        self.accumulators = {}
        # <<<< end_solution

    def _update(self, key, param, grad):
        """Advance the accumulator stored under ``key`` and return the new parameter."""
        # A missing accumulator starts at zero (the scalar broadcasts).
        previous = self.accumulators.get(key, 0.0)
        accumulated = self.rho * previous + (1.0 - self.rho) * np.square(grad)
        self.accumulators[key] = accumulated
        return param - self.lr * grad / np.sqrt(self.delta + accumulated)

    def step(self):
        """Take one RMSProp step on every trainable layer."""
        for name, layer in self.model.modules.items():
            # Activation modules carry no trainable parameters.
            if not hasattr(layer, 'get_optimizer_context'):
                continue
            params = layer.get_optimizer_context()
            if params is None:
                continue
            [[W, dW], [b, db]] = params
            # >>>> start here
            W = self._update((name, 'W'), W, dW)
            b = self._update((name, 'b'), b, db)
            # <<<< end here
            layer.set_optimizer_context([W, b])


#------------------------------------------------------------------------------
#   AdamOptimizer class
#------------------------------------------------------------------------------
class Adam(Optimizer):
    """Adam optimizer with bias-corrected moment estimates.

    For every parameter, first- and second-moment estimates ``s`` and ``r``
    are kept in ``self.context``:
    ``s <- rho1 * s + (1 - rho1) * grad``,
    ``r <- rho2 * r + (1 - rho2) * grad**2``,
    both are divided by ``1 - rho**t`` (``t`` = number of steps so far) and
    the update is ``theta <- theta - lr * s_hat / (sqrt(r_hat) + delta)``.
    Works on every module of ``model`` that exposes
    ``get_optimizer_context()`` returning ``[[W, dW], [b, db]]`` (or ``None``)
    and ``set_optimizer_context([W, b])``.
    """

    def __init__(self, model, lr, rho1, rho2, delta):
        super(Adam, self).__init__()
        self.model = model
        # Moment estimates keyed by (layer name, 'W' | 'b') -> (s, r).
        self.context = {}
        # >>>> start_solution
        self.lr = lr
        self.rho1 = rho1
        self.rho2 = rho2
        self.delta = delta
        # Step counter read by step(); it must exist before the first call.
        self.t = 0
        # <<<< end_solution

    def _update(self, key, param, grad):
        """Advance the moments stored under ``key`` and return the new parameter."""
        # Missing moments start at zero (the scalars broadcast).
        first, second = self.context.get(key, (0.0, 0.0))
        first = self.rho1 * first + (1.0 - self.rho1) * grad
        second = self.rho2 * second + (1.0 - self.rho2) * np.square(grad)
        self.context[key] = (first, second)
        # Bias correction compensates for the zero initialisation.
        first_hat = first / (1.0 - self.rho1 ** self.t)
        second_hat = second / (1.0 - self.rho2 ** self.t)
        return param - self.lr * first_hat / (np.sqrt(second_hat) + self.delta)

    def step(self):
        """Take one Adam step on every trainable layer."""
        self.t += 1
        for name, layer in self.model.modules.items():
            # Activation modules carry no trainable parameters.
            if not hasattr(layer, 'get_optimizer_context'):
                continue
            params = layer.get_optimizer_context()
            if params is None:
                continue
            [[W, dW], [b, db]] = params
            # >>>> start here
            W = self._update((name, 'W'), W, dW)
            b = self._update((name, 'b'), b, db)
            # <<<< end here
            layer.set_optimizer_context([W, b])



## Main Processing Cell

Watch out for the shape of each mini-batch: $(B, \text{Features}, 1)$.

 1. Initialize dataset (`dataset_Flower`).
 2. Declare a simple model.
 3. Initialize optimizer.
 4. Make mini-batches.
 5. Perform forward pass through the network.
 6. Compute loss.
 7. Backward prop loss.
 8. Track loss.
 9. Backward pass MLP.
 10. Use optimizer to modify model parameters.
 11. Repeat for $N$ epochs

In [25]:
from utils import gradient_check

In [26]:
from dataset import dataset_Flower, MakeBatches

In [27]:
# Wrap the noisy 512-sample flower dataset in a mini-batch provider.
# NOTE(review): 32 is presumably the batch size and True a shuffle flag --
# confirm against MakeBatches in dataset.py.
flower_data = dataset_Flower(m=512, noise=0.3)
dataset = MakeBatches(flower_data, 32, True)
###>>> start of solution
# 2 -> 3 -> 4 -> 5 -> 1 perceptron: tanh hidden layers, sigmoid output.
mlp = Model()
for layer_name, layer in (
    ('Dense_1', Linear(2, 3)),
    ('Tanh_1', Tanh()),
    ('Dense_2', Linear(3, 4)),
    ('Tanh_2', Tanh()),
    ('Dense_3', Linear(4, 5)),
    ('Tanh_3', Tanh()),
    ('Dense_4_out', Linear(5, 1)),
    ('Sigmoid', Sigmoid()),
):
    mlp.add_module(layer, layer_name)
loss_fn = MSELoss(reduce='mean')

optimizer = SGDMomentum(mlp, lr=0.001, momentum=0.5)


In [33]:
# Train for a fixed number of epochs and record the mean loss of each one.
N_epochs = 100
losses = []
for epoch in range(N_epochs):
    epoch_loss = []
    for mini_batch_X, mini_batch_Y in dataset:
        # Forward pass and loss for the current mini-batch.
        predicted_Y_hat = mlp.forward(mini_batch_X)
        loss = loss_fn(predicted_Y_hat, mini_batch_Y)
        epoch_loss.append(np.mean(loss))
        # Backward pass: loss gradient first, then through the network.
        dLoss = loss_fn.backward(predicted_Y_hat, mini_batch_Y)
        mlp.backward(dLoss)
        # gradient_check(mlp, loss_fn, mini_batch_X, mini_batch_Y)
        optimizer.step()
    losses.append(np.mean(epoch_loss))

In [29]:
import plotly.express as px

In [None]:
# Plot the per-epoch mean loss as a single line labelled by the optimizer.
loss_curves = {'SGDMomentum': losses}
fig = px.line(loss_curves)
fig.show()