This Jupyter notebook is imported into `homework_main-basic.ipynb` and `homework_main-advanced.ipynb` with the `%run homework_modules.ipynb` command,
so every cell of this notebook is executed.

Thus, if some of the tests are failing, just comment them out to check the rest of the functionality.

In [6]:
import numpy as np

In [7]:
# FOR TESTS ONLY!

import torch
from torch.autograd import Variable
import numpy
import traceback

# When True, the assertion helpers below also print an "OK!" line for every passing check.
VERBOSE = True

def assertAlmostEqual(expected, actual, msg: str="???", rtol=1e-05, atol=1e-08,):
    """
    Check that `actual` matches `expected` element-wise within a tolerance.

    The comparison is delegated to `np.allclose(expected, actual, rtol=rtol, atol=atol)`.

    Args:
        expected: reference value(s).
        actual: value(s) under test.
        msg: label printed in front of the verdict.
        rtol: relative tolerance forwarded to `np.allclose`.
        atol: absolute tolerance forwarded to `np.allclose`.

    Raises:
        AssertionError: if the values are not close.
        Exception: any error raised by `np.allclose` itself is reported and re-raised unchanged.
    """
    try:
        is_close = np.allclose(expected, actual, rtol=rtol, atol=atol)
    except Exception as err:
        print(f"{msg}: FAILED:\n  {err}")
        raise

    if not is_close:
        # Build the report once so the printed text and the exception text cannot drift apart.
        report = f"{msg}: FAILED:\n  expected={expected}\n  actual={actual}"
        print(report)
        raise AssertionError(report)

    if VERBOSE:
        print(f"{msg}: OK!")
        
def assertTrue(condition, msg: str="???"):
    """
    Check that `condition` is truthy.

    Args:
        condition: value to test.
        msg: label printed in front of the verdict.

    Raises:
        AssertionError: if `condition` is falsy.
    """
    if not condition:
        print(f"{msg}: FAILED:\n  expected=True")
        raise AssertionError(f"{msg}: FAILED:\n  expected=True")

    if VERBOSE:
        print(f"{msg}: OK!")

**Module** is an abstract class which defines the fundamental methods necessary for training a neural network. You do not need to change anything here, just read the comments.

In [8]:
class Module:
    """
    Abstract building block of a network.

    A module is a black box that maps `input` data to `output` data
    through its `forward` function:

        output = module.forward(input)

    It can also run a backward pass, i.e. differentiate `forward` as one
    link of a chain rule, given the gradient that arrives from the next link:

        gradInput = module.backward(input, gradOutput)
    """
    def __init__(self):
        # Results of the most recent forward / backward pass.
        self.output = None
        self.gradInput = None
        # Mode flag, switched by `train()` / `evaluate()`.
        self.training = True

    def forward(self, input):
        """
        Compute and return the output of the module for `input`.
        """
        result = self.updateOutput(input)
        return result

    def backward(self, input, gradOutput):
        """
        Run one backpropagation step through the module for the given `input`.

        Two things happen, in this order:
         - the gradient w.r.t. `input` is computed (needed to continue backprop),
         - the gradient w.r.t. the parameters is computed (needed by the optimizer).

        Returns the `gradInput` field.
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    def updateOutput(self, input):
        """
        Compute the output from `input` and the current parameters.

        Subclasses must both store the result in the `output` field and return it.
        The simplest possible implementation is the identity:

            self.output = input
            return self.output

        The base implementation does nothing and returns None.
        """
        return None

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient of the module with respect to its own input.

        The shape of `gradInput` is always the same as the shape of `input`.
        Subclasses must both store the result in the `gradInput` field and return it.
        The simplest possible implementation is the identity:

            self.gradInput = gradOutput
            return self.gradInput

        The base implementation does nothing and returns None.
        """
        return None

    def accGradParameters(self, input, gradOutput):
        """
        Compute the gradient of the module with respect to its own parameters.
        Modules without parameters (e.g. ReLU) do not need to override this.
        """
        return None

    def zeroGradParameters(self):
        """
        Reset the parameter gradients to zero, if the module has parameters.
        """
        return None

    def getParameters(self):
        """
        Return the list of parameters; empty for a parameter-free module.
        """
        return []

    def getGradParameters(self):
        """
        Return the list of gradients w.r.t. the parameters; empty for a parameter-free module.
        """
        return []

    def train(self):
        """
        Switch the module to training mode.
        Dropout and BatchNorm behave differently in training and in testing.
        """
        self.training = True

    def evaluate(self):
        """
        Switch the module to evaluation mode.
        Dropout and BatchNorm behave differently in training and in testing.
        """
        self.training = False

    def __repr__(self):
        """
        Pretty printing. Override in every module that should have
        a readable description.
        """
        return "Module"

# Sequential container

**Define** the forward and backward pass procedures.

In [9]:
class Sequential(Module):
    """
    Container that processes `input` data sequentially.

    `input` is passed through each module (layer) in `self.modules` in order;
    the result of the last module is the container's `output`.
    An empty container acts as the identity in both directions.
    """

    def __init__(self):
        super(Sequential, self).__init__()
        self.modules = []

    def add(self, module):
        """
        Append `module` to the end of the container.
        """
        self.modules.append(module)

    def updateOutput(self, input):
        """
        FORWARD PASS:

            y_0    = module[0].forward(input)
            y_1    = module[1].forward(y_0)
            ...
            output = module[n-1].forward(y_{n-2})

        Stores the final result in `self.output` and returns it.
        """
        current = input
        for module in self.modules:
            current = module.forward(current)

        self.output = current
        return self.output

    def backward(self, input, gradOutput):
        """
        BACKWARD PASS:

            g_{n-1}   = module[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2}   = module[n-2].backward(y_{n-3}, g_{n-1})
            ...
            g_1       = module[1].backward(y_0, g_2)
            gradInput = module[0].backward(input, g_1)

        Each module must receive the same input it saw during the forward pass:
        for `module[i]` with i > 0 that is the output of `module[i-1]`, and only
        `module[0]` receives the `input` of this container.

        Stores the final result in `self.gradInput` and returns it.
        With no modules the gradient is passed through unchanged.
        """
        # Input seen by each module in the forward pass, in forward order.
        layer_inputs = [input] + [module.output for module in self.modules[:-1]]

        grad = gradOutput
        for module, layer_input in zip(reversed(self.modules), reversed(layer_inputs)):
            grad = module.backward(layer_input, grad)

        self.gradInput = grad
        return self.gradInput

    def zeroGradParameters(self):
        """
        Zero the parameter gradients of every contained module.
        """
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """
        Return a list with one entry per module: that module's parameter list.
        """
        return [module.getParameters() for module in self.modules]

    def getGradParameters(self):
        """
        Return a list with one entry per module: that module's parameter-gradient list.
        """
        return [module.getGradParameters() for module in self.modules]

    def __repr__(self):
        # One module description per line, each terminated by a newline.
        return "".join(str(module) + '\n' for module in self.modules)

    def __getitem__(self, x):
        # Index or slice straight into the underlying module list.
        return self.modules.__getitem__(x)

    def train(self):
        """
        Switch the container and all contained modules to training mode.
        """
        self.training = True
        for module in self.modules:
            module.train()

    def evaluate(self):
        """
        Switch the container and all contained modules to evaluation mode.
        """
        self.training = False
        for module in self.modules:
            module.evaluate()

In [10]:
def test_Sequential():
    """
    Compare `Sequential(BatchNormalization, ChannelwiseScaling)` against
    `torch.nn.BatchNorm1d(affine=True)` on random data.

    Only the parameter gradients of the scaling layer are asserted; the output
    and input-gradient comparisons are currently commented out.
    """
    # This test cannot be run at this point of the notebook: it uses
    # BatchNormalization and ChannelwiseScaling, which are defined further below.
    # It would be best to run it after implementing the Linear layer and test it with Linear
    # instead of batch norm.
    
    # TODO: as a student you can try to fix it, or not. 

    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 4
    for i in range(100):
        print(f"Iter {i}")
        # layers initialization
        alpha = 0.9
        torch_layer = torch.nn.BatchNorm1d(n_in, eps=BatchNormalization.EPS, momentum=1.-alpha, affine=True)
        torch_layer.bias.data = torch.from_numpy(np.random.random(n_in).astype(np.float32))
        custom_layer = Sequential()
        bn_layer = BatchNormalization(alpha)
        # Start the custom layer from the same running statistics as the torch layer.
        bn_layer.moving_mean = torch_layer.running_mean.numpy().copy()
        bn_layer.moving_variance = torch_layer.running_var.numpy().copy()
        custom_layer.add(bn_layer)
        scaling_layer = ChannelwiseScaling(n_in)
        # Use the torch affine parameters as gamma / beta of the scaling layer.
        scaling_layer.gamma = torch_layer.weight.data.numpy()
        scaling_layer.beta = torch_layer.bias.data.numpy()
        custom_layer.add(scaling_layer)
        custom_layer.train()

        layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

        # 1. check layer output (comparison disabled)
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        #assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, msg='1. check layer output')

        # 2. check layer input grad (comparison disabled)
        custom_layer_grad = custom_layer.backward(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        #assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, msg='2. check layer input grad')

        # 3. check layer parameters grad
        # Index 1 is the second module added above, i.e. the ChannelwiseScaling layer.
        weight_grad, bias_grad = custom_layer.getGradParameters()[1]
        torch_weight_grad = torch_layer.weight.grad.data.numpy()
        torch_bias_grad = torch_layer.bias.grad.data.numpy()
        assertAlmostEqual(torch_weight_grad, weight_grad, msg='3. check layer parameters grad. weights')
        assertAlmostEqual(torch_bias_grad, bias_grad, msg='3. check layer parameters grad. bias')
        
#test_Sequential()

# Layers

## 1. Linear transform layer
Also known as dense layer, fully-connected layer, FC-layer, InnerProductLayer (in caffe), affine transform
- input:   **`batch_size x n_feats1`**
- output: **`batch_size x n_feats2`**

In [11]:
class Linear(Module):
    """
    Affine transformation `output = input @ W.T + b`.

    Also known as a fully-connected layer (InnerProductLayer in caffe).
    The module works with 2D input of shape (n_samples, n_feature).
    """
    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()

        # Uniform initialisation in [-1/sqrt(n_in), 1/sqrt(n_in)].
        bound = 1. / np.sqrt(n_in)
        self.W = np.random.uniform(-bound, bound, size=(n_out, n_in))
        self.b = np.random.uniform(-bound, bound, size=n_out)

        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, input):
        """
        Compute `input @ W.T + b`, store it in `self.output` and return it.
        """
        projected = np.dot(input, self.W.T)
        self.output = projected + self.b
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute `gradOutput @ W`, store it in `self.gradInput` and return it.
        """
        grad_wrt_input = np.dot(gradOutput, self.W)
        self.gradInput = grad_wrt_input
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Set `gradW` and `gradb` to the gradients for the current batch.

        Note that the stored gradients are replaced, not added to.
        """
        # Dot product with a vector of ones sums `gradOutput` over the batch axis.
        batch_ones = np.ones(input.shape[0])
        self.gradW = np.dot(np.transpose(gradOutput), input)
        self.gradb = np.dot(batch_ones, gradOutput)

    def zeroGradParameters(self):
        """
        Zero both gradient buffers in place.
        """
        for grad in (self.gradW, self.gradb):
            grad.fill(0)

    def getParameters(self):
        """
        Return `[W, b]`.
        """
        return [self.W, self.b]

    def getGradParameters(self):
        """
        Return `[gradW, gradb]`.
        """
        return [self.gradW, self.gradb]

    def __repr__(self):
        # W has shape (n_out, n_in); print as "n_in -> n_out".
        shape = self.W.shape
        return 'Linear %d -> %d' % (shape[1], shape[0])

In [12]:
def test_Linear():
    """
    Compare the custom `Linear` layer against `torch.nn.Linear` on random data.

    Only the forward output and the bias gradient are asserted; the
    input-gradient and weight-gradient comparisons are currently disabled.
    """
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in, n_out = 2, 3, 4
    for i in range(100):
        print(f"Iter {i}")
        # The layer under test takes its parameters from the torch reference layer.
        reference = torch.nn.Linear(n_in, n_out)
        layer = Linear(n_in, n_out)
        layer.W = reference.weight.data.numpy()
        layer.b = reference.bias.data.numpy()

        x = np.random.uniform(-10, 10, (batch_size, n_in)).astype(np.float32)
        grad_out = np.random.uniform(-10, 10, (batch_size, n_out)).astype(np.float32)

        # 1. check layer output
        actual_output = layer.updateOutput(x)
        x_var = Variable(torch.from_numpy(x), requires_grad=True)
        reference_output = reference(x_var)
        assertAlmostEqual(reference_output.data.numpy(), actual_output, msg="1. check layer output")

        # 2. check layer input grad (comparison disabled)
        actual_grad = layer.updateGradInput(x, grad_out)
        reference_output.backward(torch.from_numpy(grad_out))
        reference_grad = x_var.grad
        #assertAlmostEqual(reference_grad.data.numpy(), actual_grad, msg='2. check layer input grad')

        # 3. check layer parameters grad (weight comparison disabled)
        layer.accGradParameters(x, grad_out)
        actual_weight_grad = layer.gradW
        actual_bias_grad = layer.gradb
        reference_weight_grad = reference.weight.grad.data.numpy()
        reference_bias_grad = reference.bias.grad.data.numpy()
        #assertAlmostEqual(reference_weight_grad, actual_weight_grad, msg='3. check layer parameters grad. Weight')
        assertAlmostEqual(reference_bias_grad, actual_bias_grad, msg='3. check layer parameters grad. Bias')
            
#test_Linear()

## 2. SoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{softmax}(x)_i = \frac{\exp x_i} {\sum_j \exp x_j}$

Recall that $\text{softmax}(x) = \text{softmax}(x - \text{const})$. This makes it possible to avoid computing exp() of a large argument.

In [13]:
class SoftMax(Module):
    """
    Row-wise softmax: `output[i, j] = exp(x[i, j]) / sum_k exp(x[i, k])`.

    Works with 2D input of shape (batch_size, n_feats).
    """
    def __init__(self):
        super(SoftMax, self).__init__()

    def updateOutput(self, input):
        """
        Compute the softmax of every row, store it in `self.output` and return it.
        """
        # Subtract the row maximum first for numerical stability; softmax is
        # invariant to adding a constant to a row.
        shifted = input - input.max(axis=1, keepdims=True)

        # Fresh arrays (rather than in-place `out=` writes) so that
        # integer-typed input is promoted to float instead of raising.
        exps = np.exp(shifted)
        self.output = exps / exps.sum(axis=1, keepdims=True)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient w.r.t. `input` from the output stored by the last
        `updateOutput` call:

            gradInput = s * gradOutput - s * sum_j(s_j * gradOutput_j)

        where `s = self.output` and the sum runs over each row.
        Stores the result in `self.gradInput` and returns it.
        """
        weighted = gradOutput * self.output
        self.gradInput = weighted - self.output * np.sum(weighted, axis=1, keepdims=True)
        return self.gradInput

    def __repr__(self):
        return "SoftMax"

In [14]:
def test_SoftMax():
    """
    Compare the custom `SoftMax` layer against `torch.nn.Softmax(dim=1)`
    on random data: forward output and gradient w.r.t. the input.
    """
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 4
    for i in range(100):
        print(f"test_SoftMax. Iter {i}")
        # layers initialization
        reference = torch.nn.Softmax(dim=1)
        layer = SoftMax()

        x = np.random.uniform(-10, 10, (batch_size, n_in)).astype(np.float32)
        # Upstream gradient: reciprocal of a row-normalised, clipped random matrix.
        grad_out = np.random.random((batch_size, n_in)).astype(np.float32)
        grad_out /= grad_out.sum(axis=-1, keepdims=True)
        grad_out = 1. / grad_out.clip(1e-5, 1.)

        # 1. check layer output
        actual_output = layer.updateOutput(x)
        x_var = Variable(torch.from_numpy(x), requires_grad=True)
        reference_output = reference(x_var)
        assertAlmostEqual(reference_output.data.numpy(), actual_output, msg='1. check layer output')

        # 2. check layer input grad
        actual_grad = layer.updateGradInput(x, grad_out)
        reference_output.backward(torch.from_numpy(grad_out))
        reference_grad = x_var.grad
        assertAlmostEqual(reference_grad.data.numpy(), actual_grad, msg='2. check layer input grad')
        
#test_SoftMax()

## 3. LogSoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{logsoftmax}(x)_i = \log\text{softmax}(x)_i = x_i - \log {\sum_j \exp x_j}$

The main goal of this layer is to be used in computation of log-likelihood loss.

In [15]:
class LogSoftMax(Module):
    """
    Row-wise log-softmax: `output[i, j] = x[i, j] - log(sum_k exp(x[i, k]))`.

    Works with 2D input of shape (batch_size, n_feats).
    """
    def __init__(self):
        super(LogSoftMax, self).__init__()

    def updateOutput(self, input):
        """
        Compute the log-softmax of every row, store it in `self.output` and return it.
        """
        # Subtract the row maximum first for numerical stability; log-softmax is
        # invariant to adding a constant to a row.
        shifted = input - input.max(axis=1, keepdims=True)

        # A fresh array (rather than an in-place `out=` write) so that
        # integer-typed input is promoted to float instead of raising.
        log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        self.output = shifted - log_norm
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient w.r.t. `input` from the output stored by the last
        `updateOutput` call:

            gradInput = gradOutput - softmax(x) * sum_j(gradOutput_j)

        where the sum runs over each row. This is the closed form of
        `gradOutput[i] @ (I - softmax(x)[i])` evaluated for every row at once,
        so no per-row loop or identity matrix is needed.
        Stores the result in `self.gradInput` and returns it.
        """
        softmax = np.exp(self.output)
        # Accumulating in float64 keeps the result float64 for float32 / float64
        # data, as the previous zeros-buffer based implementation did.
        grad_sum = np.sum(gradOutput, axis=1, keepdims=True, dtype=np.float64)
        self.gradInput = gradOutput - softmax * grad_sum
        return self.gradInput

    def __repr__(self):
        return "LogSoftMax"

In [16]:
def test_LogSoftMax():
    """
    Compare the custom `LogSoftMax` layer against `torch.nn.LogSoftmax(dim=1)`
    on random data: forward output and gradient w.r.t. the input.
    """
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 4
    for i in range(100):
        print(f"test_LogSoftMax. Iter {i}")
        # layers initialization
        reference = torch.nn.LogSoftmax(dim=1)
        layer = LogSoftMax()

        x = np.random.uniform(-10, 10, (batch_size, n_in)).astype(np.float32)
        # Upstream gradient: random matrix with rows normalised to sum to one.
        grad_out = np.random.random((batch_size, n_in)).astype(np.float32)
        grad_out /= grad_out.sum(axis=-1, keepdims=True)

        # 1. check layer output
        actual_output = layer.updateOutput(x)
        x_var = Variable(torch.from_numpy(x), requires_grad=True)
        reference_output = reference(x_var)
        assertAlmostEqual(reference_output.data.numpy(), actual_output, msg='1. check layer output')

        # 2. check layer input grad
        actual_grad = layer.updateGradInput(x, grad_out)
        reference_output.backward(torch.from_numpy(grad_out))
        reference_grad = x_var.grad
        assertAlmostEqual(reference_grad.data.numpy(), actual_grad, msg='2. check layer input grad')
        
#test_LogSoftMax()

## 4. Batch normalization
One of the most significant recent ideas that impacted NNs a lot is [**Batch normalization**](http://arxiv.org/abs/1502.03167). The idea is simple, yet effective: the features should be whitened ($mean = 0$, $std = 1$) all the way through NN. This improves the convergence for deep models letting it train them for days but not weeks. **You are** to implement the first part of the layer: features normalization. The second part (`ChannelwiseScaling` layer) is implemented below.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

The layer should work as follows. While training (`self.training == True`) it transforms input as $$y = \frac{x - \mu}  {\sqrt{\sigma^2 + \epsilon}}$$
where $\mu$ and $\sigma^2$ are the mean and variance of feature values in the **batch** and $\epsilon$ is just a small number for numerical stability. Also during training, the layer should maintain exponential moving average values for mean and variance: 
```
    self.moving_mean = self.moving_mean * alpha + batch_mean * (1 - alpha)
    self.moving_variance = self.moving_variance * alpha + batch_variance * (1 - alpha)
```
During testing (`self.training == False`) the layer normalizes input using moving_mean and moving_variance. 

Note that decomposition of batch normalization on normalization itself and channelwise scaling here is just a common **implementation** choice. In general "batch normalization" always assumes normalization + scaling.

In [18]:
class BatchNormalization(Module):
    """
    Feature-wise normalisation `y = (x - mean) / sqrt(variance + EPS)`.

    In training mode the statistics of the current batch are used and
    exponential moving averages of them are maintained:

        moving = moving * alpha + batch * (1 - alpha)

    In evaluation mode the moving averages are used instead.
    Works with 2D input of shape (batch_size, n_feats).
    """
    EPS = 1e-3

    def __init__(self, alpha = 0.):
        super(BatchNormalization, self).__init__()
        self.alpha = alpha
        # Moving statistics; filled in by the first training batch unless
        # the caller assigns them beforehand.
        self.moving_mean = None
        self.moving_variance = None

    def _blend(self, moving, batch):
        # Exponential moving average step; the first batch seeds the average.
        if moving is None:
            return batch.copy()
        return moving * self.alpha + batch * (1 - self.alpha)

    def updateOutput(self, input):
        """
        Normalise `input`, store the result in `self.output` and return it.

        In training mode this also caches the batch statistics (needed by
        `updateGradInput`) and updates the moving averages.
        """
        self.n = input.shape[0]

        if self.training:
            self.batch_mean = np.mean(input, axis=0)
            self.batch_variance = np.var(input, axis=0)

            self.moving_mean = self._blend(self.moving_mean, self.batch_mean)
            self.moving_variance = self._blend(self.moving_variance, self.batch_variance)

            mean, variance = self.batch_mean, self.batch_variance
        else:
            mean, variance = self.moving_mean, self.moving_variance

        self.output = (input - mean) / np.sqrt(variance + self.EPS)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient w.r.t. `input`, store it in `self.gradInput` and return it.

        Training mode uses the batch statistics cached by the last training
        `updateOutput` call, through which the gradient also flows:

            gradInput = (n * g - sum(g) - x_hat * sum(g * x_hat)) / (n * std)

        with `g = gradOutput`, `x_hat` the normalised input and sums over the batch axis.
        In evaluation mode the moving statistics are constants, so the layer is
        a fixed affine map and the gradient is just `g / std`.
        """
        if not self.training:
            self.gradInput = gradOutput / np.sqrt(self.moving_variance + self.EPS)
            return self.gradInput

        n = input.shape[0]
        std = np.sqrt(self.batch_variance + self.EPS)
        normalized = (input - self.batch_mean) / std

        self.gradInput = (1 / (std * n)) * (
            n * gradOutput
            - gradOutput.sum(axis=0)
            - normalized * np.sum(gradOutput * normalized, axis=0)
        )
        return self.gradInput

    def __repr__(self):
        return "BatchNormalization"

In [21]:
def test_BatchNormalization():
    """
    Compare the custom `BatchNormalization` layer against
    `torch.nn.BatchNorm1d(affine=False)` on random data: training output,
    gradient w.r.t. the input, moving mean and evaluation-mode output.

    Every assertion carries its own label so the printed log identifies the check.
    """
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 32, 16
    for _ in range(100):
        # The drawn value is unused; the call stays so that the NumPy random
        # stream, and therefore the generated data, is unchanged.
        np.random.uniform(0.01, 0.05)

        # layers initialization
        alpha = 0.9
        custom_layer = BatchNormalization(alpha)
        custom_layer.train()
        torch_layer = torch.nn.BatchNorm1d(n_in, eps=custom_layer.EPS, momentum=1.-alpha, affine=False)
        # Start from the same running statistics as the torch layer.
        custom_layer.moving_mean = torch_layer.running_mean.numpy().copy()
        custom_layer.moving_variance = torch_layer.running_var.numpy().copy()

        layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output,
                          msg='1. check layer output', atol=1e-6)

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        # please, don't increase `atol` parameter, it's guaranteed that you can implement batch norm layer
        # with tolerance 1e-5
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad,
                          msg='2. check layer input grad', atol=1e-5)

        # 3. check moving mean
        assertAlmostEqual(custom_layer.moving_mean, torch_layer.running_mean.numpy(),
                          msg='3. check moving mean')
        # moving_variance is not checked because pytorch uses a slightly different formula for it:
        # it computes the moving average of the unbiased variance (i.e. var*N/(N-1))

        # 4. check evaluation mode
        # Copy torch's running variance so that both layers normalise with the same statistics.
        custom_layer.moving_variance = torch_layer.running_var.numpy().copy()
        custom_layer.evaluate()
        custom_layer_output = custom_layer.updateOutput(layer_input)
        torch_layer.eval()
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output,
                          msg='4. check evaluation mode', atol=1e-6)
            
# Runs whenever this cell is executed, including via `%run homework_modules.ipynb`;
# comment the call out to skip the check.
test_BatchNormalization()

???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
?

In [None]:
class ChannelwiseScaling(Module):
    r"""
    Implements the linear transform of input y = \gamma * x + \beta
    where \gamma, \beta are learnable vectors of length x.shape[-1].
    """
    def __init__(self, n_out):
        super(ChannelwiseScaling, self).__init__()

        # Uniform initialisation in [-1/sqrt(n_out), 1/sqrt(n_out)].
        stdv = 1./np.sqrt(n_out)
        self.gamma = np.random.uniform(-stdv, stdv, size=n_out)
        self.beta = np.random.uniform(-stdv, stdv, size=n_out)

        self.gradGamma = np.zeros_like(self.gamma)
        self.gradBeta = np.zeros_like(self.beta)

    def updateOutput(self, input):
        """
        Compute `input * gamma + beta`, store it in `self.output` and return it.
        """
        self.output = input * self.gamma + self.beta
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute `gradOutput * gamma`, store it in `self.gradInput` and return it.
        """
        self.gradInput = gradOutput * self.gamma
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Set `gradBeta` and `gradGamma` to the gradients for the current batch
        (sums over axis 0). The stored gradients are replaced, not added to.
        """
        self.gradBeta = np.sum(gradOutput, axis=0)
        self.gradGamma = np.sum(gradOutput*input, axis=0)

    def zeroGradParameters(self):
        """
        Zero both gradient buffers in place.
        """
        self.gradGamma.fill(0)
        self.gradBeta.fill(0)

    def getParameters(self):
        """
        Return `[gamma, beta]`.
        """
        return [self.gamma, self.beta]

    def getGradParameters(self):
        """
        Return `[gradGamma, gradBeta]`.
        """
        return [self.gradGamma, self.gradBeta]

    def __repr__(self):
        return "ChannelwiseScaling"

Practical notes. If BatchNormalization is placed after a linear transformation layer (including dense layer, convolutions, channelwise scaling) that implements a function like `y = weight * x + bias`, then adding the bias becomes useless and can be omitted, since its effect is discarded by the batch mean subtraction. If BatchNormalization (followed by `ChannelwiseScaling`) is placed before a layer that propagates scale (including ReLU, LeakyReLU) followed by any linear transformation layer, then the parameter `gamma` in `ChannelwiseScaling` can be frozen, since it can be absorbed into the linear transformation layer.

## 5. Dropout
Implement [**dropout**](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf). The idea and implementation are really simple: just multiply the input by a $Bernoulli(1 - p)$ mask. Here $p$ is the probability of an element being zeroed.

This has proven to be an effective technique for regularization and preventing the co-adaptation of neurons.

While training (`self.training == True`) it should sample a mask on each iteration (for every batch), zero out elements and multiply elements by $1 / (1 - p)$. The latter is needed for keeping mean values of features close to mean values which will be in test mode. When testing this module should implement identity transform i.e. `self.output = input`.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [None]:
class Dropout(Module):
    """
    Dropout with zeroing probability `p`.

    In training mode each input element is zeroed independently with
    probability `p` and the survivors are scaled by `1 / (1 - p)`.
    In evaluation mode the module is the identity.
    """
    def __init__(self, p=0.5):
        super(Dropout, self).__init__()

        self.p = p
        # Binary keep-mask of the most recent training forward pass.
        self.mask = None

    def updateOutput(self, input):
        """
        Apply dropout to `input`, store the result in `self.output` and return it.

        A new mask is sampled on every training call. In evaluation mode no
        random numbers are drawn and the stored mask is left untouched.
        """
        if not self.training:
            self.output = input
            return self.output

        keep_prob = 1. - self.p
        self.mask = np.random.binomial(1, keep_prob, input.shape)
        self.output = input * self.mask / keep_prob
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient w.r.t. `input`, store it in `self.gradInput` and return it.

        In training mode the mask sampled by the last `updateOutput` call is
        applied with the same `1 / (1 - p)` scaling; in evaluation mode the
        gradient passes through unchanged.
        """
        if not self.training:
            self.gradInput = gradOutput
            return self.gradInput

        self.gradInput = gradOutput * self.mask / (1. - self.p)
        return self.gradInput

    def __repr__(self):
        return "Dropout"

In [None]:
def test_Dropout():
    """
    Property checks for `Dropout` on random inputs (100 rounds).

    Verifies the training-mode output and gradient (each element is either
    zeroed or scaled by `1 / (1 - p)`), the identity behaviour in evaluation
    mode, consistency of the mask between forward and backward passes, and
    that the mask is sampled per element rather than per row or column.
    """
    np.random.seed(42)

    batch_size, n_in = 2, 4
    # Check 5 uses its own shape names. Reassigning `batch_size, n_in` inside
    # the loop would silently change the shape used by checks 1-4 in every
    # iteration after the first one.
    tall_batch_size, tall_n_in = 1000, 1
    for _ in range(100):
        # layers initialization
        p = np.random.uniform(0.3, 0.7)
        layer = Dropout(p)
        layer.train()

        layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

        # 1. check layer output
        layer_output = layer.updateOutput(layer_input)
        assertTrue(np.all(np.logical_or(np.isclose(layer_output, 0), 
                                    np.isclose(layer_output*(1.-p), layer_input))))

        # 2. check layer input grad
        layer_grad = layer.updateGradInput(layer_input, next_layer_grad)
        assertTrue(np.all(np.logical_or(np.isclose(layer_grad, 0), 
                                    np.isclose(layer_grad*(1.-p), next_layer_grad))))

        # 3. check evaluation mode
        layer.evaluate()
        layer_output = layer.updateOutput(layer_input)
        assertAlmostEqual(layer_output, layer_input)

        # 4. check mask
        p = 0.0
        layer = Dropout(p)
        layer.train()
        layer_output = layer.updateOutput(layer_input)
        assertAlmostEqual(layer_output, layer_input)

        p = 0.5
        layer = Dropout(p)
        layer.train()
        # strictly positive values, so a zero can only come from the mask
        layer_input = np.random.uniform(5, 10, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.uniform(5, 10, (batch_size, n_in)).astype(np.float32)
        layer_output = layer.updateOutput(layer_input)
        zeroed_elem_mask = np.isclose(layer_output, 0)
        layer_grad = layer.updateGradInput(layer_input, next_layer_grad)        
        assertTrue(np.all(zeroed_elem_mask == np.isclose(layer_grad, 0)))

        # 5. dropout mask should be generated independently for every input matrix element, not for row/column
        p = 0.8
        layer = Dropout(p)
        layer.train()

        layer_input = np.random.uniform(5, 10, (tall_batch_size, tall_n_in)).astype(np.float32)
        layer_output = layer.updateOutput(layer_input)
        assertTrue(np.sum(np.isclose(layer_output, 0)) != layer_input.size)

        layer_input = layer_input.T
        layer_output = layer.updateOutput(layer_input)
        assertTrue(np.sum(np.isclose(layer_output, 0)) != layer_input.size)
        
#test_Dropout()

???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
?

# Activation functions

Here's the complete example for the **Rectified Linear Unit** non-linearity (aka **ReLU**): 

In [None]:
class ReLU(Module):
    """
    Rectified Linear Unit: element-wise `max(x, 0)`.
    """
    def __init__(self):
        super().__init__()

    def updateOutput(self, input):
        # negative entries are clamped to zero, everything else passes through
        rectified = np.maximum(input, 0)
        self.output = rectified
        return rectified

    def updateGradInput(self, input, gradOutput):
        # the gradient flows only through strictly positive inputs
        is_active = input > 0
        self.gradInput = np.multiply(gradOutput, is_active)
        return self.gradInput

    def __repr__(self):
        return "ReLU"

## 6. Leaky ReLU
Implement [**Leaky Rectified Linear Unit**](https://en.wikipedia.org/wiki/Rectifier_%28neural_networks%29#Leaky_ReLUs). Experiment with slope. 

In [None]:
class LeakyReLU(Module):
    """
    Leaky ReLU: identity for non-negative inputs, `slope * x` for negative ones.
    """
    def __init__(self, slope = 0.03):
        super().__init__()

        self.slope = slope  # multiplier applied to negative inputs

    def updateOutput(self, input):
        # work on a copy so the caller's array is left untouched
        activated = input.copy()
        below_zero = activated < 0
        activated[below_zero] = activated[below_zero] * self.slope
        self.output = activated
        return self.output

    def updateGradInput(self, input, gradOutput):
        # derivative is 1 where input >= 0 and `slope` where input < 0
        passthrough_part = gradOutput * (input >= 0)
        leaky_part = gradOutput * (input < 0) * self.slope
        self.gradInput = passthrough_part + leaky_part
        return self.gradInput

    def __repr__(self):
        return "LeakyReLU"

In [None]:
def test_LeakyReLU():
    """
    Compare the custom `LeakyReLU` with `torch.nn.LeakyReLU` on random data:
    forward outputs and input gradients must agree up to `atol=1e-6`.
    """
    np.random.seed(42)
    torch.manual_seed(42)

    data_shape = (2, 4)  # batch_size x n_in
    for _ in range(100):
        # a random slope shared by the reference and the tested layer
        slope = np.random.uniform(0.01, 0.05)
        reference = torch.nn.LeakyReLU(slope)
        candidate = LeakyReLU(slope)

        x = np.random.uniform(-5, 5, data_shape).astype(np.float32)
        upstream_grad = np.random.uniform(-5, 5, data_shape).astype(np.float32)

        # 1. forward pass
        candidate_out = candidate.updateOutput(x)
        x_var = Variable(torch.from_numpy(x), requires_grad=True)
        reference_out = reference(x_var)
        assertAlmostEqual(reference_out.data.numpy(), candidate_out, atol=1e-6)

        # 2. gradient with respect to the input
        candidate_grad = candidate.updateGradInput(x, upstream_grad)
        reference_out.backward(torch.from_numpy(upstream_grad))
        reference_grad = x_var.grad
        assertAlmostEqual(reference_grad.data.numpy(), candidate_grad, atol=1e-6)
        
#test_LeakyReLU()

???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
?

## 7. ELU
Implement [**Exponential Linear Units**](http://arxiv.org/abs/1511.07289) activations.

In [None]:
class ELU(Module):
    """
    Exponential Linear Unit.

    Forward:  `x` for `x >= 0`, `alpha * (exp(x) - 1)` for `x < 0`.
    Backward: `1` for `x >= 0`, `alpha * exp(x)` for `x < 0`.
    """
    def __init__(self, alpha = 1.0):
        super(ELU, self).__init__()
        
        self.alpha = alpha  # scale of the negative branch
        
    def updateOutput(self, input):
        """
        Compute the activation on a copy of `input`; the exponent is evaluated
        for negative elements only.
        """
        self.output = input.copy()
        self.output[self.output < 0] = (np.exp(self.output[self.output < 0]) - 1) * self.alpha
        return  self.output
    
    def updateGradInput(self, input, gradOutput):
        """
        Multiply `gradOutput` by the element-wise derivative of ELU at `input`.
        """
        boolpos = input >= 0
        boolneg = input < 0

        pos = gradOutput * (boolpos)

        # The exponent argument is clamped to <= 0. The negative branch is
        # unaffected (its inputs are already < 0), but for large positive
        # inputs `np.exp(input)` would overflow to inf and `inf * False`
        # would turn the gradient into NaN.
        neg = gradOutput * np.exp(np.minimum(input, 0)) * boolneg * self.alpha

        self.gradInput = pos + neg
        return self.gradInput
    
    def __repr__(self):
        return "ELU"

In [None]:
def test_ELU():
    """
    Compare the custom `ELU` with `torch.nn.ELU` (alpha = 1) on random data:
    forward outputs and input gradients must agree up to `atol=1e-6`.
    """
    np.random.seed(42)
    torch.manual_seed(42)

    data_shape = (2, 4)  # batch_size x n_in
    alpha = 1.0
    for _ in range(100):
        reference = torch.nn.ELU(alpha)
        candidate = ELU(alpha)

        x = np.random.uniform(-5, 5, data_shape).astype(np.float32)
        upstream_grad = np.random.uniform(-5, 5, data_shape).astype(np.float32)

        # 1. forward pass
        candidate_out = candidate.updateOutput(x)
        x_var = Variable(torch.from_numpy(x), requires_grad=True)
        reference_out = reference(x_var)
        assertAlmostEqual(reference_out.data.numpy(), candidate_out, atol=1e-6)

        # 2. gradient with respect to the input
        candidate_grad = candidate.updateGradInput(x, upstream_grad)
        reference_out.backward(torch.from_numpy(upstream_grad))
        reference_grad = x_var.grad
        assertAlmostEqual(reference_grad.data.numpy(), candidate_grad, atol=1e-6)
        
#test_ELU()

???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
?

## 8. SoftPlus
Implement [**SoftPlus**](https://en.wikipedia.org/wiki/Rectifier_%28neural_networks%29) activations. Note how closely they resemble ReLU.

In [None]:
class SoftPlus(Module):
    """
    SoftPlus activation: `log(1 + exp(x))`, a smooth approximation of ReLU.

    Its derivative is the logistic sigmoid `1 / (1 + exp(-x))`.
    """
    def __init__(self):
        super(SoftPlus, self).__init__()
    
    def updateOutput(self, input):
        """
        Compute `log(exp(0) + exp(input))` element-wise.
        """
        # logaddexp never forms exp(input) explicitly, so large inputs do not
        # overflow to inf and very negative inputs are not rounded to log(1) = 0.
        self.output = np.logaddexp(0, input)
        return  self.output
    
    def updateGradInput(self, input, gradOutput):
        """
        Multiply `gradOutput` by sigmoid(input).
        """
        # sigmoid(x) = exp(-softplus(-x)); this form avoids the overflow of
        # exp(-x) that `1 / (1 + exp(-x))` hits for very negative inputs.
        sigmoid = np.exp(-np.logaddexp(0, -input))
        self.gradInput = np.multiply(gradOutput, sigmoid)
        return self.gradInput
    
    def __repr__(self):
        return "SoftPlus"

In [None]:
def test_SoftPlus():
    """
    Compare the custom `SoftPlus` with `torch.nn.Softplus` on random data:
    forward outputs and input gradients must agree up to `atol=1e-6`.
    """
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 4
    for _ in range(100):
        # layers initialization
        torch_layer = torch.nn.Softplus()
        custom_layer = SoftPlus()

        layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        # this assertion used to be commented out, so the output was never verified
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, atol=1e-6)

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        # same (expected, actual) form as the other activation tests
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, atol=1e-6)
        
#test_SoftPlus()

???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!


# Criterions

Criterions are used to score the model's answers. 

In [None]:
class Criterion:
    """
    Base class for loss functions.

    Subclasses override `updateOutput` (loss value) and `updateGradInput`
    (gradient of the loss with respect to `input`); `forward` and `backward`
    are thin entry points that delegate to them.
    """
    def __init__ (self):
        self.gradInput = None
        self.output = None

    def forward(self, input, target):
        """
            Compute the loss for the given `input` and `target`
            and return the result.

            For consistency this function should not be overridden,
            all the code goes in `updateOutput`.
        """
        loss = self.updateOutput(input, target)
        return loss

    def backward(self, input, target):
        """
            Compute the gradient of the loss for the given `input`
            and `target` and return the result.

            For consistency this function should not be overridden,
            all the code goes in `updateGradInput`.
        """
        grad = self.updateGradInput(input, target)
        return grad

    def updateOutput(self, input, target):
        """
        Function to override. The base version returns the stored `output`.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Function to override. The base version returns the stored `gradInput`.
        """
        return self.gradInput

    def __repr__(self):
        """
        Pretty printing. Should be overridden in every subclass if you want
        to have a readable description.
        """
        return "Criterion"

The **MSECriterion**, which is basic L2 norm usually used for regression, is implemented here for you.
- input:   **`batch_size x n_feats`**
- target: **`batch_size x n_feats`**
- output: **scalar**

In [None]:
class MSECriterion(Criterion):
    """
    Squared-error loss: the sum of squared differences between `input` and
    `target`, divided by the size of the first (batch) dimension.
    """
    def __init__(self):
        super().__init__()

    def updateOutput(self, input, target):
        residual = input - target
        n_rows = input.shape[0]
        self.output = np.sum(np.power(residual, 2)) / n_rows
        return self.output

    def updateGradInput(self, input, target):
        # derivative of the loss above with respect to `input`
        n_rows = input.shape[0]
        residual = input - target
        self.gradInput = residual * 2 / n_rows
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

## 9. Negative LogLikelihood criterion (numerically unstable)
Your task is to implement the **ClassNLLCriterion**. It should implement [multiclass log loss](http://scikit-learn.org/stable/modules/model_evaluation.html#log-loss). Although there is a sum over `y` (target) in that formula, 
remember that targets are one-hot encoded. This fact simplifies the computations a lot. Note that criterions are the only places where you divide by the batch size. Also, there is a small hack with adding a small number to probabilities to avoid computing log(0).
- input:   **`batch_size x n_feats`** - probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**



In [None]:
class ClassNLLCriterionUnstable(Criterion):
    """
    Negative log-likelihood on probabilities (numerically unstable variant).

    - input:  `batch_size x n_feats` - probabilities
    - target: `batch_size x n_feats` - one-hot representation of ground truth
    - output: scalar, `-sum(target * log(input)) / batch_size`
    """
    EPS = 1e-15  # probabilities are clipped to [EPS, 1 - EPS] before log / division

    def __init__(self):
        super(ClassNLLCriterionUnstable, self).__init__()
        
    def updateOutput(self, input, target): 
        """
        Return the batch-averaged negative log-likelihood.
        """
        # Use this trick to avoid numerical errors
        input_clamp = np.clip(input, self.EPS, 1 - self.EPS)
        
        self.N = input.shape[0]
        self.output = np.negative(np.divide(np.sum(np.multiply(target, np.log(input_clamp))), self.N))
        return self.output

    def updateGradInput(self, input, target):
        """
        Return the gradient of the loss with respect to `input`.
        """
        # Use this trick to avoid numerical errors
        input_clamp = np.clip(input, self.EPS, 1 - self.EPS)
                
        # The batch size is taken from `input` itself, so this method does not
        # require a preceding `updateOutput` call and cannot pick up a stale
        # value left by a batch of a different size.
        self.N = input.shape[0]
        self.gradInput = np.negative(np.divide(np.divide(target, input_clamp), self.N))
        return self.gradInput
    
    def __repr__(self):
        return "ClassNLLCriterionUnstable"

In [None]:
def test_ClassNLLCriterionUnstable():
    """
    Compare `ClassNLLCriterionUnstable` with `torch.nn.NLLLoss` applied to
    `log(probabilities)`: loss values and input gradients must agree.
    """
    np.random.seed(42)
    torch.manual_seed(42)

    n_samples, n_classes = 2, 4
    for i in range(100):
        print(f"Iter {i}")
        reference = torch.nn.NLLLoss()
        criterion = ClassNLLCriterionUnstable()

        # random rows normalized to sum to one, clipped the same way the criterion clips
        probs = np.random.uniform(0, 1, (n_samples, n_classes)).astype(np.float32)
        probs /= probs.sum(axis=-1, keepdims=True)
        probs = probs.clip(criterion.EPS, 1. - criterion.EPS)  # unifies input
        labels = np.random.choice(n_classes, n_samples)
        one_hot = np.zeros((n_samples, n_classes), np.float32)
        one_hot[np.arange(n_samples), labels] = 1

        # 1. loss value
        criterion_loss = criterion.updateOutput(probs, one_hot)
        probs_var = Variable(torch.from_numpy(probs), requires_grad=True)
        labels_var = Variable(torch.from_numpy(labels), requires_grad=False)
        reference_loss = reference(torch.log(probs_var), labels_var)
        assertAlmostEqual(reference_loss.data.numpy(), criterion_loss, msg="1. check layer output")

        # 2. gradient with respect to the probabilities
        criterion_grad = criterion.updateGradInput(probs, one_hot)
        reference_loss.backward()
        reference_grad = probs_var.grad
        assertAlmostEqual(reference_grad.data.numpy(), criterion_grad, msg='2. check layer input grad')
        
#test_ClassNLLCriterionUnstable()

Iter 0
1. check layer output: OK!
2. check layer input grad: OK!
Iter 1
1. check layer output: OK!
2. check layer input grad: OK!
Iter 2
1. check layer output: OK!
2. check layer input grad: OK!
Iter 3
1. check layer output: OK!
2. check layer input grad: OK!
Iter 4
1. check layer output: OK!
2. check layer input grad: OK!
Iter 5
1. check layer output: OK!
2. check layer input grad: OK!
Iter 6
1. check layer output: OK!
2. check layer input grad: OK!
Iter 7
1. check layer output: OK!
2. check layer input grad: OK!
Iter 8
1. check layer output: OK!
2. check layer input grad: OK!
Iter 9
1. check layer output: OK!
2. check layer input grad: OK!
Iter 10
1. check layer output: OK!
2. check layer input grad: OK!
Iter 11
1. check layer output: OK!
2. check layer input grad: OK!
Iter 12
1. check layer output: OK!
2. check layer input grad: OK!
Iter 13
1. check layer output: OK!
2. check layer input grad: OK!
Iter 14
1. check layer output: OK!
2. check layer input grad: OK!
Iter 15
1. check lay

## 10. Negative LogLikelihood criterion (numerically stable)
- input:   **`batch_size x n_feats`** - log probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**

Task is similar to the previous one, but now the criterion input is the output of log-softmax layer. This decomposition allows us to avoid problems with computation of forward and backward of log().

In [None]:
class ClassNLLCriterion(Criterion):
    """
    Negative log-likelihood on log-probabilities (numerically stable variant).

    - input:  `batch_size x n_feats` - log probabilities
    - target: `batch_size x n_feats` - one-hot representation of ground truth
    - output: scalar, `-sum(target * input) / batch_size`
    """
    def __init__(self):
        super(ClassNLLCriterion, self).__init__()
        
    def updateOutput(self, input, target): 
        """
        Return the batch-averaged negative log-likelihood.
        """
        self.N = input.shape[0]
        self.output = np.negative(np.divide(np.sum(np.multiply(target, input)), self.N))
        return self.output

    def updateGradInput(self, input, target):
        """
        Return the gradient of the loss with respect to `input`.
        """
        # The batch size is taken from `input` itself, so this method does not
        # require a preceding `updateOutput` call and cannot pick up a stale
        # value left by a batch of a different size.
        self.N = input.shape[0]
        self.gradInput = np.negative(np.divide(target, self.N))
        return self.gradInput
    
    def __repr__(self):
        return "ClassNLLCriterion"

In [None]:
def test_ClassNLLCriterion():
    """
    Compare `ClassNLLCriterion` with `torch.nn.NLLLoss` on log-softmax
    outputs: loss values and input gradients must agree.
    """
    np.random.seed(42)
    torch.manual_seed(42)

    n_samples, n_classes = 2, 4
    for i in range(100):
        print(f"test_ClassNLLCriterion. Iter {i}")
        reference = torch.nn.NLLLoss()
        criterion = ClassNLLCriterion()

        # random scores turned into log-probabilities by torch's LogSoftmax
        scores = np.random.uniform(-5, 5, (n_samples, n_classes)).astype(np.float32)
        log_probs = torch.nn.LogSoftmax(dim=1)(Variable(torch.from_numpy(scores))).data.numpy()
        labels = np.random.choice(n_classes, n_samples)
        one_hot = np.zeros((n_samples, n_classes), np.float32)
        one_hot[np.arange(n_samples), labels] = 1

        # 1. loss value
        criterion_loss = criterion.updateOutput(log_probs, one_hot)
        log_probs_var = Variable(torch.from_numpy(log_probs), requires_grad=True)
        labels_var = Variable(torch.from_numpy(labels), requires_grad=False)
        reference_loss = reference(log_probs_var, labels_var)
        assertAlmostEqual(reference_loss.data.numpy(), criterion_loss, msg='1. check layer output')

        # 2. gradient with respect to the log-probabilities
        criterion_grad = criterion.updateGradInput(log_probs, one_hot)
        reference_loss.backward()
        reference_grad = log_probs_var.grad
        assertAlmostEqual(reference_grad.data.numpy(), criterion_grad, msg='2. check layer input grad')
        
#test_ClassNLLCriterion()

test_ClassNLLCriterion. Iter 0
1. check layer output: OK!
2. check layer input grad: OK!
test_ClassNLLCriterion. Iter 1
1. check layer output: OK!
2. check layer input grad: OK!
test_ClassNLLCriterion. Iter 2
1. check layer output: OK!
2. check layer input grad: OK!
test_ClassNLLCriterion. Iter 3
1. check layer output: OK!
2. check layer input grad: OK!
test_ClassNLLCriterion. Iter 4
1. check layer output: OK!
2. check layer input grad: OK!
test_ClassNLLCriterion. Iter 5
1. check layer output: OK!
2. check layer input grad: OK!
test_ClassNLLCriterion. Iter 6
1. check layer output: OK!
2. check layer input grad: OK!
test_ClassNLLCriterion. Iter 7
1. check layer output: OK!
2. check layer input grad: OK!
test_ClassNLLCriterion. Iter 8
1. check layer output: OK!
2. check layer input grad: OK!
test_ClassNLLCriterion. Iter 9
1. check layer output: OK!
2. check layer input grad: OK!
test_ClassNLLCriterion. Iter 10
1. check layer output: OK!
2. check layer input grad: OK!
test_ClassNLLCriteri

# Optimizers

### SGD optimizer with momentum
- `variables` - list of lists of variables (one list per layer)
- `gradients` - list of lists of current gradients (same structure as for `variables`, one array for each var)
- `config` - dict with optimization parameters (`learning_rate` and `momentum`)
- `state` - dict with optimizer state (used to save accumulated gradients)

In [None]:
def sgd_momentum(variables, gradients, config, state):  
    """
    One step of SGD with momentum; variables are updated in place.

    For every variable:
    `velocity = momentum * velocity + learning_rate * grad`, then
    `variable -= velocity`.

    - `variables`, `gradients` - lists of lists of arrays (one list per layer)
    - `config` - dict with `learning_rate` and `momentum`
    - `state` - dict; velocities are kept in `state['accumulated_grads']`,
      keyed by the running index of the variable
    """
    velocities = state.setdefault('accumulated_grads', {})

    # walk the nested layer structure as one flat stream of (variable, gradient) pairs
    flat_pairs = (
        pair
        for layer_vars, layer_grads in zip(variables, gradients)
        for pair in zip(layer_vars, layer_grads)
    )
    for var_index, (current_var, current_grad) in enumerate(flat_pairs):
        velocity = velocities.setdefault(var_index, np.zeros_like(current_grad))

        momentum_term = config['momentum'] * velocity
        gradient_term = config['learning_rate'] * current_grad
        # write into the stored array so the state keeps the same object
        np.add(momentum_term, gradient_term, out=velocity)

        current_var -= velocity

## 11. [Adam](https://arxiv.org/pdf/1412.6980.pdf) optimizer
- `variables` - list of lists of variables (one list per layer)
- `gradients` - list of lists of current gradients (same structure as for `variables`, one array for each var)
- `config` - dict with optimization parameters (`learning_rate`, `beta1`, `beta2`, `epsilon`)
- `state` - dict with optimizer state (used to save 1st and 2nd moment for vars)

Formulas for optimizer:

Current step learning rate: $$\text{lr}_t = \text{learning_rate} * \frac{\sqrt{1-\beta_2^t}} {1-\beta_1^t}$$
First moment of var: $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1)*g$$ 
Second moment of var: $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2)*g*g$$
New values of var: $$\text{variable} = \text{variable} - \text{lr}_t * \frac{m_t}{\sqrt{v_t} + \epsilon}$$

In [None]:
def adam_optimizer(variables, gradients, config, state):  
    """
    One step of the Adam optimizer; variables are updated in place.

    - `variables`, `gradients` - lists of lists of arrays (one list per layer)
    - `config` - dict with `learning_rate`, `beta1`, `beta2`, `epsilon`
    - `state` - dict holding the first moments (`'m'`), second moments (`'v'`),
      both keyed by the running index of the variable, and the step counter `'t'`
    """
    first_moments = state.setdefault('m', {})   # first moment vars
    second_moments = state.setdefault('v', {})  # second moment vars
    state.setdefault('t', 0)                    # timestamp
    state['t'] += 1
    for k in ['learning_rate', 'beta1', 'beta2', 'epsilon']:
        assert k in config, config.keys()

    beta1 = config['beta1']
    beta2 = config['beta2']
    step = state['t']
    # learning rate with the bias correction for the current step folded in
    lr_t = config['learning_rate'] * np.sqrt(1 - beta2**step) / (1 - beta1**step)

    # walk the nested layer structure as one flat stream of (variable, gradient) pairs
    flat_pairs = (
        pair
        for layer_vars, layer_grads in zip(variables, gradients)
        for pair in zip(layer_vars, layer_grads)
    )
    for var_index, (current_var, current_grad) in enumerate(flat_pairs):
        var_first_moment = first_moments.setdefault(var_index, np.zeros_like(current_grad))
        var_second_moment = second_moments.setdefault(var_index, np.zeros_like(current_grad))

        # exponential moving averages, written into the stored arrays
        np.add(beta1 * var_first_moment,
               (1 - beta1) * current_grad,
               out=var_first_moment)
        np.add(beta2 * var_second_moment,
               (1 - beta2) * (current_grad * current_grad),
               out=var_second_moment)

        denominator = np.sqrt(var_second_moment) + config['epsilon']
        current_var -= lr_t * (var_first_moment / denominator)

        # the state must still reference the very arrays that were just updated
        assert var_first_moment is state['m'].get(var_index)
        assert var_second_moment is state['v'].get(var_index)


In [None]:
def test_adam_optimizer():
    """
    Run two Adam steps on a single variable `[0, 1, ..., 9]` whose gradient
    equals its initial value, and compare the moments, the step counter and
    the updated variable with precomputed values after each step.
    """
    config = {'learning_rate': 1e-3, 'beta1': 0.9, 'beta2':0.999, 'epsilon':1e-8}
    state = {}
    variables = [[np.arange(10).astype(np.float64)]]
    gradients = [[np.arange(10).astype(np.float64)]]

    # first step
    adam_optimizer(variables, gradients, config, state)
    expected_m = np.array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
    expected_v = np.array([0., 0.001, 0.004, 0.009, 0.016, 0.025, 0.036, 0.049, 0.064, 0.081])
    expected_var = np.array([0., 0.999, 1.999, 2.999, 3.999, 4.999, 5.999, 6.999, 7.999, 8.999])
    assertAlmostEqual(state['m'][0], expected_m)
    assertAlmostEqual(state['v'][0], expected_v)
    assertTrue(state['t'] == 1)
    assertAlmostEqual(variables[0][0], expected_var)

    # second step reuses the state accumulated by the first one
    adam_optimizer(variables, gradients, config, state)
    expected_m = np.array([0., 0.19, 0.38, 0.57, 0.76, 0.95, 1.14, 1.33, 1.52, 1.71])
    expected_v = np.array([0., 0.001999, 0.007996, 0.017991, 0.031984, 0.049975, 0.071964, 0.097951, 0.127936, 0.161919])
    expected_var = np.array([0., 0.998, 1.998, 2.998, 3.998, 4.998, 5.998, 6.998, 7.998, 8.998])
    assertAlmostEqual(state['m'][0], expected_m)
    assertAlmostEqual(state['v'][0], expected_v)
    assertTrue(state['t'] == 2)
    assertAlmostEqual(variables[0][0], expected_var)
    
#test_adam_optimizer()

???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!
???: OK!


# Layers for advanced track homework
You **don't need** to implement it if you are working on `homework_main-basic.ipynb`

## 12. Conv2d [Advanced]
- input:   **`batch_size x in_channels x h x w`**
- output: **`batch_size x out_channels x h x w`**

You should implement something like pytorch `Conv2d` layer with `stride=1` and zero-padding outside of image using `scipy.signal.correlate` function.

Practical notes:
- While the layer name is "convolution", most neural network frameworks (including tensorflow and pytorch) implement the operation that is called [correlation](https://en.wikipedia.org/wiki/Cross-correlation#Cross-correlation_of_deterministic_signals) in signal processing theory. So **don't use** `scipy.signal.convolve` since it implements [convolution](https://en.wikipedia.org/wiki/Convolution#Discrete_convolution) in terms of signal processing.
- It may be convenient to use `skimage.util.pad` for zero-padding.
- It's rather ok to implement convolution over 4d array using 2 nested loops: one over batch size dimension and another one over output filters dimension
- Having troubles with understanding how to implement the layer? 
 - Check the last year video of lecture 3 (starting from ~1:14:20)
 - May the google be with you

In [None]:
import scipy as sp
import scipy.signal
import skimage

class Conv2d(Module):
    """
    2d cross-correlation layer with `stride=1` and zero padding of
    `kernel_size // 2` on every side, so the spatial size is preserved.

    - input:  `batch_size x in_channels x h x w`
    - output: `batch_size x out_channels x h x w`

    `W` has shape `out_channels x in_channels x kernel_size x kernel_size`,
    `b` has shape `out_channels`. `kernel_size` must be odd.
    """
    def __init__(self, in_channels, out_channels, kernel_size):
        super(Conv2d, self).__init__()
        assert kernel_size % 2 == 1, kernel_size
       
        stdv = 1./np.sqrt(in_channels)
        self.W = np.random.uniform(-stdv, stdv, size = (out_channels, in_channels, kernel_size, kernel_size))
        self.b = np.random.uniform(-stdv, stdv, size=(out_channels,))
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        
        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def _zero_pad(self, array):
        """Zero-pad the two trailing (spatial) axes of a 4d array by `kernel_size // 2`."""
        pad_size = self.kernel_size // 2
        pad_width = ((0, 0), (0, 0), (pad_size, pad_size), (pad_size, pad_size))
        return np.pad(array, pad_width, mode='constant')
        
    def updateOutput(self, input):
        """
        Correlate every sample with every filter and add the filter's bias.
        """
        padded = self._zero_pad(input)
        batch_size, _, height, width = input.shape
        n_filters = self.W.shape[0]
        self.output = np.empty((batch_size, n_filters, height, width),
                               dtype=np.result_type(input.dtype, self.W.dtype, self.b.dtype))
        for sample in range(batch_size):
            for out_channel in range(n_filters):
                # (in_channels, h + 2p, w + 2p) against (in_channels, k, k) in
                # 'valid' mode leaves a single slice along the channel axis.
                response = sp.signal.correlate(padded[sample], self.W[out_channel], mode='valid')
                self.output[sample, out_channel] = response[0] + self.b[out_channel]
        
        return self.output
    
    def updateGradInput(self, input, gradOutput):
        """
        Gradient with respect to `input`: the zero-padded `gradOutput`
        correlated with the spatially flipped kernels, summed over the
        output channels.
        """
        padded_grad = self._zero_pad(gradOutput)
        flipped_W = self.W[:, :, ::-1, ::-1]
        self.gradInput = np.empty(input.shape,
                                  dtype=np.result_type(gradOutput.dtype, self.W.dtype))
        for sample in range(input.shape[0]):
            for in_channel in range(input.shape[1]):
                # (out_channels, h + 2p, w + 2p) against (out_channels, k, k):
                # the 'valid' correlation also performs the sum over output channels.
                response = sp.signal.correlate(padded_grad[sample], flipped_W[:, in_channel], mode='valid')
                self.gradInput[sample, in_channel] = response[0]
        
        return self.gradInput
    
    def accGradParameters(self, input, gradOutput):
        """
        Compute the gradients with respect to `W` and `b` for this batch and
        store them in `self.gradW` / `self.gradb` (previous values are overwritten).
        """
        padded = self._zero_pad(input)
        gradW = np.zeros_like(self.W)
        for sample in range(input.shape[0]):
            for out_channel in range(gradOutput.shape[1]):
                # (in_channels, h + 2p, w + 2p) against (1, h, w) in 'valid'
                # mode yields (in_channels, k, k): one kernel-sized map per
                # input channel.
                grad_map = gradOutput[sample, out_channel][np.newaxis]
                gradW[out_channel] += sp.signal.correlate(padded[sample], grad_map, mode='valid')
        self.gradW = gradW
        # the bias is added to every position of its output channel
        self.gradb = gradOutput.sum(axis=(0, 2, 3))
    
    def zeroGradParameters(self):
        self.gradW.fill(0)
        self.gradb.fill(0)
        
    def getParameters(self):
        return [self.W, self.b]
    
    def getGradParameters(self):
        return [self.gradW, self.gradb]
    
    def __repr__(self):
        s = self.W.shape
        q = 'Conv2d %d -> %d' %(s[1],s[0])
        return q

In [None]:
def test_Conv2d(self=None):
    """
    Check `Conv2d` against `torch.nn.Conv2d(..., padding=1)` on random data.

    For 100 random trials the custom layer receives the weights and bias of a
    freshly created torch layer, and its forward output, input gradient and
    parameter gradients are compared with the torch results.

    `self` is unused. It is a leftover from a unittest-style signature and is
    kept optional so that both `test_Conv2d()` and legacy calls passing one
    positional argument work.
    """
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in, n_out = 2, 3, 4
    h,w = 5,6
    kern_size = 3
    for _ in range(100):
        # layers initialization
        torch_layer = torch.nn.Conv2d(n_in, n_out, kern_size, padding=1)
        custom_layer = Conv2d(n_in, n_out, kern_size)
        # share the torch parameters so both layers compute the same function
        custom_layer.W = torch_layer.weight.data.numpy() # [n_out, n_in, kern, kern]
        custom_layer.b = torch_layer.bias.data.numpy()

        layer_input = np.random.uniform(-1, 1, (batch_size, n_in, h,w)).astype(np.float32)
        next_layer_grad = np.random.uniform(-1, 1, (batch_size, n_out, h, w)).astype(np.float32)

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, atol=1e-6)

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, atol=1e-6)

        # 3. check layer parameters grad
        custom_layer.accGradParameters(layer_input, next_layer_grad)
        weight_grad = custom_layer.gradW
        bias_grad = custom_layer.gradb
        torch_weight_grad = torch_layer.weight.grad.data.numpy()
        torch_bias_grad = torch_layer.bias.grad.data.numpy()
        assertAlmostEqual(torch_weight_grad, weight_grad, atol=1e-6)
        assertAlmostEqual(torch_bias_grad, bias_grad, atol=1e-6)
        
# test_Conv2d()

## 13. MaxPool2d [Advanced]
- input:   **`batch_size x n_input_channels x h x w`**
- output: **`batch_size x n_input_channels x h // kern_size x w // kern_size`** (pooling does not change the number of channels)

You are to implement a simplified version of the PyTorch `MaxPool2d` layer with stride = kernel_size. Please note that stride = kernel_size is not the common case: in AlexNet and ResNet the max-pooling kernel_size was set to 3, while the stride was set to 2. We introduce this restriction to make the implementation simpler.

Practical notes:
- During the forward pass, all you need to do is reshape the input tensor to `[n, c, h / kern_size, kern_size, w / kern_size, kern_size]`, swap two axes and take the maximum over the last two dimensions. Reshape + axes swap is sometimes called a space-to-batch transform.
- During the backward pass, you need to place the gradients at the positions of the maximal values taken during the forward pass.
- In real frameworks, the indices of the maximums are stored in memory during the forward pass. This is cheaper than keeping the layer input in memory and recomputing the maximums.

In [None]:
class MaxPool2d(Module):
    """
    Max pooling over non-overlapping square windows (stride = kernel_size).

    - input:  `batch_size x n_channels x h x w`, with `h` and `w` divisible
      by `kernel_size`
    - output: `batch_size x n_channels x h // kernel_size x w // kernel_size`

    The forward pass stores, for every window, the flat index of its maximum
    in `self.max_indices`; the backward pass routes each output gradient to
    that position and leaves zeros everywhere else.
    """
    def __init__(self, kernel_size):
        super(MaxPool2d, self).__init__()
        self.kernel_size = kernel_size
        self.gradInput = None
        # Filled by updateOutput: index (0 .. kernel_size**2 - 1) of the maximum
        # inside each pooling window, shape batch x channels x out_h x out_w.
        self.max_indices = None

    def updateOutput(self, input):
        """
        Forward pass: take the maximum of every `kernel_size x kernel_size`
        window. Stores the result in `self.output` and the positions of the
        maxima in `self.max_indices`; returns `self.output`.
        """
        input_h, input_w = input.shape[-2:]
        # you may remove these asserts and implement MaxPool2d with padding
        assert input_h % self.kernel_size == 0  
        assert input_w % self.kernel_size == 0

        k = self.kernel_size
        n, c = input.shape[:2]
        out_h, out_w = input_h // k, input_w // k

        # Split both spatial axes into (window index, offset in window), bring
        # the two offset axes to the end and merge them, so every window becomes
        # one row of length k * k.
        windows = (input.reshape(n, c, out_h, k, out_w, k)
                        .transpose(0, 1, 2, 4, 3, 5)
                        .reshape(n, c, out_h, out_w, k * k))

        # argmax returns the first maximum of each window (row-major order)
        self.max_indices = windows.argmax(axis=-1)
        self.output = np.take_along_axis(
            windows, self.max_indices[..., np.newaxis], axis=-1
        )[..., 0]
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Backward pass: place every element of `gradOutput` at the position of
        the maximum of its window; all other positions get zero. Stores the
        result (same shape as `input`) in `self.gradInput` and returns it.
        """
        # Recompute the indices if the forward pass was not run for this shape.
        if self.max_indices is None or self.max_indices.shape != gradOutput.shape:
            self.updateOutput(input)

        k = self.kernel_size
        n, c, out_h, out_w = gradOutput.shape

        # Scatter the gradients into the window layout used by updateOutput ...
        grad_windows = np.zeros((n, c, out_h, out_w, k * k), dtype=gradOutput.dtype)
        np.put_along_axis(
            grad_windows,
            self.max_indices[..., np.newaxis],
            gradOutput[..., np.newaxis],
            axis=-1,
        )

        # ... and undo the window transform to get back to the input layout.
        self.gradInput = (grad_windows.reshape(n, c, out_h, out_w, k, k)
                                      .transpose(0, 1, 2, 4, 3, 5)
                                      .reshape(input.shape))
        return self.gradInput

    def __repr__(self):
        q = 'MaxPool2d, kern %d, stride %d' %(self.kernel_size, self.kernel_size)
        return q

In [None]:
def test_MaxPool2d():
    """
    Check `MaxPool2d` against `torch.nn.MaxPool2d` on random data.

    Runs 100 random trials and compares the forward output and the gradient
    w.r.t. the input of the custom layer with the torch results.
    """
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 3
    h, w = 4, 6
    kern_size = 2
    input_shape = (batch_size, n_in, h, w)
    grad_shape = (batch_size, n_in, h // kern_size, w // kern_size)

    for _ in range(100):
        # reference and tested layers
        reference = torch.nn.MaxPool2d(kern_size)
        candidate = MaxPool2d(kern_size)

        # random input first, then random upstream gradient
        sample = np.random.uniform(-10, 10, input_shape).astype(np.float32)
        upstream = np.random.uniform(-10, 10, grad_shape).astype(np.float32)

        # 1. forward pass must match
        candidate_out = candidate.updateOutput(sample)
        sample_var = Variable(torch.from_numpy(sample), requires_grad=True)
        reference_out = reference(sample_var)
        assertAlmostEqual(reference_out.data.numpy(), candidate_out, atol=1e-6)

        # 2. gradient w.r.t. the input must match
        candidate_grad = candidate.updateGradInput(sample, upstream)
        reference_out.backward(torch.from_numpy(upstream))
        reference_grad = sample_var.grad
        assertAlmostEqual(reference_grad.data.numpy(), candidate_grad, atol=1e-6)
        
#test_MaxPool2d()

### Flatten layer
Just reshapes inputs and gradients. It's usually used as a proxy layer between Conv2d and Linear.

In [None]:
class Flatten(Module):
    """
    Reshapes the input to two dimensions, keeping the first (batch) axis and
    merging all the others; the backward pass reshapes the gradient back to
    the shape of the input.
    """
    def __init__(self):
        super().__init__()

    def updateOutput(self, input):
        """Collapse every axis after the first into one and return the result."""
        batch = len(input)
        self.output = input.reshape(batch, -1)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Give `gradOutput` the shape of `input` and return it."""
        original_shape = input.shape
        self.gradInput = gradOutput.reshape(original_shape)
        return self.gradInput

    def __repr__(self):
        return "Flatten"