# Deep Learning

In [1]:
import math
import numpy as np
import h5py
import matplotlib.pyplot as plt
import tensorflow as tf

from typing import List, Set, Dict, Tuple, Optional, Union

%matplotlib inline
np.random.seed(1)

import IPython
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

print(tf.__version__)

2.4.1


## Backpropagation

* https://kevinzakka.github.io/2016/09/14/batch_normalization/
* https://towardsdatascience.com/lets-code-a-neural-network-in-plain-numpy-ae7e74410795
* https://medium.com/@a.mirzaei69/implement-a-neural-network-from-scratch-with-python-numpy-backpropagation-e82b70caa9bb





### Implement simple regression first


1. One layer Dense
1. Regression

Steps:
    
* Implement parts 
    1. activations
    1. code one layer 


In [43]:
class Layer():
    """Base class for a layer: a neuron count plus an element-wise activation.

    Attributes:
        n_neurons: Number of units in the layer, stored exactly as given.
        activation_type: Name of the activation, ``'relu'`` or ``'sigmoid'``.
        activation: Callable that maps a pre-activation array ``z`` to the
            activated array ``a`` (same shape, element-wise).
    """

    def _get_activation_func(self, activation_type):
        """Return the element-wise activation function named by ``activation_type``.

        Args:
            activation_type: ``'relu'`` or ``'sigmoid'``.

        Returns:
            A function ``f(z) -> a`` operating element-wise on a NumPy array.

        Raises:
            ValueError: If ``activation_type`` is not a supported name. An
                unknown name used to fall back to sigmoid silently, which hid
                typos such as ``'Relu'``.
        """
        if activation_type == 'relu':

            def activation_func(z: np.ndarray) -> np.ndarray:
                """Rectified linear unit: ``max(0, z)`` element-wise."""
                return np.maximum(0.0, z)

        elif activation_type == 'sigmoid':

            def activation_func(z: np.ndarray) -> np.ndarray:
                """Logistic sigmoid: ``1 / (1 + exp(-z))`` element-wise."""
                return 1.0 / (1.0 + np.exp(-z))

        else:
            raise ValueError(
                f"Unsupported activation_type {activation_type!r}; "
                "expected 'relu' or 'sigmoid'"
            )

        return activation_func

    def __init__(self, n_neurons, activation_type):
        """Store the layer size and resolve the activation function.

        Args:
            n_neurons: Number of units in the layer.
            activation_type: ``'relu'`` or ``'sigmoid'``.

        Raises:
            ValueError: If ``activation_type`` is not supported.
        """
        self.n_neurons = n_neurons
        self.activation_type = activation_type
        self.activation = self._get_activation_func(activation_type)
            
class DenseLayer(Layer):
    """Fully connected layer without a bias term: ``a = activation(X_in . W)``.

    The weights are not stored on the layer; they are passed to ``forward``.
    """

    def __init__(self, n_neurons, activation_type):
        """Create the layer; all state is set up by ``Layer.__init__``.

        Args:
            n_neurons: Number of units in the layer.
            activation_type: Activation name understood by ``Layer``.
        """
        super().__init__(n_neurons, activation_type)

    def forward(self, X_in: np.ndarray, W: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Compute the layer output for ``X_in`` using weights ``W``.

        Args:
            X_in: Input array; it is multiplied on the right by ``W``
                (``X_in.dot(W)``), so its last axis must match ``W``'s first.
                Presumably rows are samples -- TODO confirm with callers.
            W: Weight array.

        Returns:
            A pair ``(a, z)`` where ``z = X_in.dot(W)`` is the pre-activation
            and ``a = self.activation(z)`` is the activated output.
        """
        z = X_in.dot(W)
        a = self.activation(z)

        return a, z

In [44]:
# Smoke test for DenseLayer: build one ReLU and one sigmoid layer and run both
# on the same tiny input. Each bare expression below is displayed because the
# setup cell sets InteractiveShell.ast_node_interactivity = "all".
l1 = DenseLayer(3, 'relu')
l1.activation_type

l2 = DenseLayer(3, 'sigmoid')
l2.activation_type

# 2x2 input array.
X = np.array([[1.0, 2.0], [1.0,4.0]])
X.shape
# NOTE: .T is a no-op on a 1-D array, so W keeps shape (2,).
W = np.array([-1.0, 0.5]).T
W.shape

# Reference pre-activation computed directly, to compare with the layers' z.
print("Expected Z")
X.dot(W)

# forward returns (a, z): z should equal the reference above for both layers,
# while a differs according to the activation.
print("Layers output")
l1.forward(X,W)
l2.forward(X,W)

'relu'

'sigmoid'

(2, 2)

(2,)

Expected Z


array([0., 1.])

Layers output


(array([0., 1.]), array([0., 1.]))

(array([0.5       , 0.73105858]), array([0., 1.]))

In [11]:
class NN():
    """Minimal feed-forward network of bias-free dense layers (forward pass only).

    ``layers`` is a sequence of layer descriptions in forward order. The forward
    pass expects each description to be a mapping with the keys ``"input_dim"``,
    ``"output_dim"``, ``"activation"`` and, optionally, ``"type"`` (only
    ``'dense'`` is implemented).

    Array convention used by this class: a weight matrix has shape
    ``(output_dim, input_dim)`` and an input has shape
    ``(input_dim, n_samples)``, so a dense layer computes ``W.dot(X_in)``.

    Attributes:
        layers: The layer descriptions, stored as given.
        n_layers: ``len(layers)``.
        params: Weight matrices keyed ``"W<k>"`` (``k`` is the 0-based layer
            index). Entries may be set by hand before the first forward pass;
            missing ones are created randomly on demand.
        cache: Values recorded by the most recent ``forward_propagation``:
            ``"A<k>"`` is the input fed to layer ``k`` and ``"Z<k>"`` is its
            pre-activation.
    """

    # Scale applied to standard-normal draws when creating random weights.
    _INIT_SCALE = 0.1

    def __init__(self, layers, seed=2021):
        """Store the architecture and set up the random generators.

        Args:
            layers: Sequence of layer descriptions (see class docstring).
            seed: Seed for NumPy's global generator and for the private
                generator used to draw weights.
        """
        self._seed = seed
        # Kept from the original behaviour: seeds NumPy's global generator.
        np.random.seed(seed)
        # Private generator so weights depend only on ``seed``, not on how much
        # global randomness is consumed between construction and first use.
        self._rng = np.random.default_rng(seed)

        self.layers = layers
        self.n_layers = len(layers)
        self.params = {}
        self.cache = {}

    def _init_layer_parameter_random(self, layer_k, layer):
        """Create small random weights for one layer and store them in ``params``.

        All biases are zero, so no bias vector is created.

        Args:
            layer_k: Index of the layer; the weights are stored as ``"W<layer_k>"``.
            layer: Layer description providing ``"input_dim"`` and ``"output_dim"``.

        Returns:
            The new weight matrix of shape ``(output_dim, input_dim)``.
        """
        shape = (layer["output_dim"], layer["input_dim"])
        W = self._rng.standard_normal(shape) * self._INIT_SCALE
        self.params[f"W{layer_k}"] = W

        return W

    def _init_layer_paremeter_xavier(self):
        """Placeholder: Xavier initialization is not implemented yet.

        The method name keeps its original spelling so existing references
        still resolve.
        """
        pass

    def activation(self, z, activation_type='relu') -> np.ndarray:
        """Apply an element-wise activation to ``z``.

        Only two activations are supported: relu and sigmoid.

        Args:
            z: Pre-activation array.
            activation_type: ``'relu'`` (default) or ``'sigmoid'``.

        Returns:
            The activated array, same shape as ``z``.

        Raises:
            ValueError: If ``activation_type`` is not supported. An unknown
                name used to fall back to sigmoid silently.
        """
        if activation_type == 'relu':
            return np.maximum(0.0, z)

        if activation_type == 'sigmoid':
            return 1.0 / (1.0 + np.exp(-z))

        raise ValueError(
            f"Unsupported activation_type {activation_type!r}; "
            "expected 'relu' or 'sigmoid'"
        )

    def forward_dense_layer(self, X_in: np.ndarray, W: np.ndarray, activation_type) -> Tuple[np.ndarray, np.ndarray]:
        """Forward pass of one bias-free dense layer.

        Args:
            X_in: Layer input, multiplied on the left by ``W``.
            W: Weight matrix.
            activation_type: ``'relu'`` or ``'sigmoid'``.

        Returns:
            A pair ``(a, z)`` with ``z = W.dot(X_in)`` and ``a = activation(z)``.
        """
        z = W.dot(X_in)
        # Must go through ``self``: there is no module-level ``activation``.
        a = self.activation(z, activation_type)

        return a, z

    def forward_batchnormalization_layer(self, X: np.ndarray) -> np.ndarray:
        """Placeholder: the batch-normalization forward pass is not implemented yet."""
        pass

    def forward_propagation(self, X_in: np.ndarray) -> np.ndarray:
        """Run ``X_in`` through every layer in ``self.layers`` and return the output.

        Weights are read from ``self.params``; a layer whose weights are
        missing gets random ones via ``_init_layer_parameter_random``. The
        per-layer inputs and pre-activations are recorded in ``self.cache``,
        which is reset at the start of each call.

        Args:
            X_in: Network input of shape ``(input_dim, n_samples)``.

        Returns:
            The activation of the last layer (``X_in`` itself if there are no
            layers).

        Raises:
            NotImplementedError: If a layer declares a ``"type"`` other than
                ``'dense'``.
            ValueError: If a layer names an unsupported activation.
        """
        self.cache = {}
        A_curr = X_in

        for idx, layer in enumerate(self.layers):
            layer_type = layer.get("type", "dense")
            if layer_type != "dense":
                raise NotImplementedError(
                    f"Layer {idx}: type {layer_type!r} is not implemented"
                )

            key = f"W{idx}"
            if key not in self.params:
                self._init_layer_parameter_random(idx, layer)
            W_curr = self.params[key]

            A_prev = A_curr
            A_curr, Z_curr = self.forward_dense_layer(A_prev, W_curr, layer["activation"])

            self.cache[f"A{idx}"] = A_prev
            self.cache[f"Z{idx}"] = Z_curr

        return A_curr

In [None]:
# Layer-by-layer description of the network, listed in forward order.
nn_architecture = [
    {
        "type": 'dense',
        "input_dim": 2,
        "output_dim": 1,
        "activation": "relu",
    },
]



## Batch normalization  

(Breakthrough in the area)
refs:
* https://towardsdatascience.com/understanding-batch-normalization-with-examples-in-numpy-and-tensorflow-with-interactive-code-7f59bb126642 <= very good and simple.
* https://kevinzakka.github.io/2016/09/14/batch_normalization/
* Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift (paper)
    * Authors: Sergey Ioffe (also the author of PLDA; works at Google) and Christian Szegedy (Google)
    * https://arxiv.org/pdf/1502.03167.pdf (paper) **TODO** Read the paper. It is simple and easy to understand, and it is good practice in reading papers.
* https://www.tensorflow.org/api_docs/python/tf/keras/layers/BatchNormalization TensorFlow doc


refs:
* https://www.deeplearningbook.org/contents/optimization.html
* https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/
* https://arxiv.org/pdf/1502.03167.pdf
* https://arxiv.org/pdf/1702.03275.pdf
* https://www.youtube.com/watch?v=nUUqwaxLnWs
* https://arxiv.org/pdf/1805.11604.pdf
* https://towardsdatascience.com/understanding-batch-normalization-with-examples-in-numpy-and-tensorflow-with-interactive-code-7f59bb126642 <= very good and simple.
* https://towardsdatascience.com/lets-code-a-neural-network-in-plain-numpy-ae7e74410795    
* https://machinelearningmastery.com/implement-backpropagation-algorithm-scratch-python/

**It is being asked more and more often in job interviews**

Batch normalization is a technique for improving the performance and stability of neural networks by normalizing the inputs to every layer so that, over each batch, they have a mean of zero and a standard deviation of one.


Don’t Use With Dropout:

Batch normalization offers some regularization effect, reducing generalization error, perhaps no longer requiring the use of dropout for regularization.


* Inputs are the values of $x$ over a batch: $B = \{x_1, x_2, \ldots, x_i, \ldots, x_m\}$
    * where $m$ is the batch size
* Output: $y_i = BN_{\gamma,\beta}(x_i)$
* Learning parameters: $\gamma$ and $\beta$
* Normalization:

$$
\begin{aligned}
\mu_B &= \frac{1}{m} \sum_{i=1}^m x_i \\
\sigma_B^2 &= \frac{1}{m} \sum_{i=1}^m (x_i - \mu_B)^2 \\
z_i &= \frac{x_i - \mu_B}{\sqrt{\sigma^2_B + \epsilon}} \\
y_i &= BN_{\gamma,\beta}(x_i) \equiv \gamma z_i + \beta
\end{aligned}
$$


In [14]:
# One batch: 10 scalar values drawn uniformly from [0, 5).
X = np.random.uniform(low=0, high=5.0, size=10)
X

# Summary statistics of the raw (not yet normalized) batch.
print(f"shape: {X.shape}; mean: {X.mean():.2f}; std: {X.std():.2f}")


array([2.01322734, 4.25871895, 4.02472907, 1.67608851, 2.38439136,
       3.27488305, 4.73704778, 0.24529551, 4.33834569, 1.60061159])

shape: (10,); mean: 2.86; std: 1.41


In [25]:
# Batch-norm parameters: with gamma = 1 and beta = 0 the output is just the
# normalized batch z_i from the equations above.
gamma = 1.0
beta = 0.0
# No stability term: the NumPy formula below divides by sqrt(var + 0), so a
# zero-variance batch would divide by zero.
epsilon = 0.0

# Because we did not train the layer, we pass the mean and the variance of the batch itself
Y = tf.nn.batch_normalization(X,
                    mean = X.mean(axis=0),        # batch mean
                    variance = X.var(axis=0),     # batch variance
                    offset = beta,scale = gamma,  # beta and gamma, see equations
                    variance_epsilon = epsilon)   # epsilon, see equations

Y.numpy()

# Same computation in plain NumPy, for comparison with the TensorFlow result

Z = (X - X.mean(axis=0))/np.sqrt(X.var(axis=0) + epsilon)
# NOTE: Y is rebound here from the TensorFlow tensor to a NumPy array.
Y = gamma * Z + beta
Y

# Expected: zero mean and unit variance
print(f"shape: {Y.shape}; mean: {Y.mean():.2f}; std: {Y.std():.2f}")

array([-0.59603551,  0.99330345,  0.82768749, -0.83465937, -0.33332892,
        0.29695316,  1.33186033, -1.84736191,  1.04966255, -0.88808126])

array([-0.59603551,  0.99330345,  0.82768749, -0.83465937, -0.33332892,
        0.29695316,  1.33186033, -1.84736191,  1.04966255, -0.88808126])

shape: (10,); mean: -0.00; std: 1.00
