### Activation Functions:
- sigmoid (done)
- tanh 
- relu 
<br/> 
Helpful resources:
- http://cs231n.github.io/optimization-2/
- https://towardsdatascience.com/activation-functions-neural-networks-1cbd9f8d91d6

![Table of activation functions and their derivatives](https://github.com/joannejc/MachineLearning_Numpy/blob/master/resources/fn%20table.png)


In [1]:
import numpy as np

class Sigmoid(object):
    ''' Element-wise logistic sigmoid activation: f(x) = 1 / (1 + exp(-x)).

        forward() caches the local derivative f * (1 - f) so that backward()
        can apply the chain rule without recomputing the activation.
    '''
    def __init__(self):
        # Local derivative df/dx from the most recent forward pass
        # (None until forward() has been called).
        self.grad = None

    def forward(self, x):
        ''' Return sigmoid(x) element-wise and cache its derivative in self.grad.

            x: scalar or array-like of pre-activations.
            Returns values between 0 and 1, with the same shape as x.
        '''
        # Must be exp(-x): 1 / (1 + exp(x)) is sigmoid(-x), whose slope is
        # -f * (1 - f) and would contradict the gradient cached below.
        f = 1.0 / (1.0 + np.exp(np.negative(x)))
        self.grad = f * (1.0 - f)
        return f

    def backward(self, dx):
        ''' dx is downstream gradient, need to compute dx * grad to return to upstream.
            Assume dx and stored grad are of the same dimension.
            forward() must have been called first so that self.grad is populated.
        '''
        return self.grad * dx
        


In [None]:
## Test: run the Sigmoid forward pass on a fixed random (5, 3) input.
sig = Sigmoid()
np.random.seed(100)  # fixed seed so the sample input is reproducible
x = np.random.randn(5,3)
f = sig.forward(x)  # forward() also stores the local derivative in sig.grad

# Uncomment to inspect the input, the output, the cached gradient and a backward pass:
#x, f, sig.grad, sig.backward(x*x)
    

### Linear Layer:

In [32]:
# Create linear layer

class LinearLayer(object):
    ''' Fully connected (affine) layer: y = x.w + b for a batch of row vectors.

        x is (batch, input_dim), w is (input_dim, output_dim) and
        b is (1, output_dim), so y is (batch, output_dim).
        The last input and the parameter gradients are kept on the instance.
    '''
    def __init__(self, input_dim, output_dim, weight_init = 'randn'):
        # NOTE: weight_init is accepted but not consulted yet; the scheme
        # below (small gaussian weights, zero bias) is the only one available.
        raw_weights = np.random.randn(input_dim, output_dim)
        raw_bias = np.random.randn(1, output_dim)

        # Gradients are filled in by backward(); the input by forward().
        self.dw = None
        self.db = None
        self.x = None

        self.w = raw_weights * 1e-2
        self.b = raw_bias * 0

    def forward(self, x):
        ''' Remember x for the backward pass and return the affine transform of x.
        '''
        self.x = x
        return np.matmul(x, self.w) + self.b

    def backward(self, dy):
        ''' Given the gradient dy flowing back into the output, store the
            parameter gradients in self.dw / self.db and return the gradient
            with respect to the input that forward() last saw.
        '''
        cached_input = self.x
        # Bias is broadcast over the batch, so its gradient sums over axis 0.
        self.db = np.sum(dy, axis = 0, keepdims = True)
        self.dw = np.matmul(cached_input.T, dy)
        return np.matmul(dy, self.w.T)
    

In [25]:
# Smoke test: push one random batch through a freshly initialised LinearLayer.
input_dim = 5
output_dim = 3
x = np.random.randn(10, 5) # 10 examples with 5 variables each
layer = LinearLayer(input_dim, output_dim)

# (10, 5) input times (5, 3) weights -> (10, 3) output.
y = layer.forward(x)

(10, 5) (5, 3) (1, 3) (10, 3)


In [29]:
# Check shapes: redo the linear-layer algebra by hand on all-ones arrays.
x = np.ones([10,5])
W = np.ones([5,3])
b = np.ones([1,3])  # defined for reference only; not added to y below
y = np.matmul(x,W)
dy = y*2  # stand-in downstream gradient with the same shape as y

# Weight gradient: (5, 10) times (10, 3) -> (5, 3), the same shape as W.
dW = np.matmul(x.T,dy)
dW.shape

# Bias gradient: left-multiplying by a row of ones sums dy over the batch,
# which should agree with np.sum(..., axis=0, keepdims=True) shown next to it.
db = np.matmul(np.ones([1,10]), dy)
db.shape
np.sum(dy, axis = 0, keepdims= True), db

# Reuses the `layer` object created in an earlier cell.
y = layer.forward(x)
y
#layer.backward(dy)

(10, 5) (5, 3) (1, 3) (10, 3)


array([[-0.0008409 ,  0.02361694, -0.0287744 ],
       [-0.0008409 ,  0.02361694, -0.0287744 ],
       [-0.0008409 ,  0.02361694, -0.0287744 ],
       [-0.0008409 ,  0.02361694, -0.0287744 ],
       [-0.0008409 ,  0.02361694, -0.0287744 ],
       [-0.0008409 ,  0.02361694, -0.0287744 ],
       [-0.0008409 ,  0.02361694, -0.0287744 ],
       [-0.0008409 ,  0.02361694, -0.0287744 ],
       [-0.0008409 ,  0.02361694, -0.0287744 ],
       [-0.0008409 ,  0.02361694, -0.0287744 ]])

### Loss Function:

In [48]:
def MSE(y_pred, y):
    ''' Mean squared error between y_pred and y, plus its gradient w.r.t. y_pred.

        Assume y_pred and y hold the same number of elements and can be batched:
        both are reshaped to (batch, -1) with batch = y_pred.shape[0], so a
        (batch,) target lines up with a (batch, 1) prediction instead of
        broadcasting to (batch, batch).

        Returns (mse, dLoss):
            mse   - scalar mean of the squared differences over all elements
            dLoss - array of shape (batch, -1) holding d(mse)/d(y_pred)
    '''
    b_size = y_pred.shape[0]
    diff = y_pred.reshape(b_size, -1) - y.reshape(b_size, -1)
    mse = np.power(diff, 2).mean()

    # mse averages over every element, so each element's derivative carries
    # a 1/N factor; without it the gradient is N times too large for the
    # loss that is actually returned.
    dLoss = 2 * diff / diff.size # dLoss/dy_pred

    return mse, dLoss



In [49]:
## Test: call MSE on random (10, 1) predictions and targets.
# Predictions are drawn with rand (uniform), targets with randn (normal).
mse, dLoss = MSE(np.random.rand(10,1), np.random.randn(10,1))
mse, dLoss
# Sanity check that shape[0] of a 1-D array of length 10 is 10.
np.random.randn(10).shape[0]

10

### Model:
Test model1 with 1 hidden layer:

In [52]:
# Sample Data: 100 examples with 5 features each.
x = np.random.randn(100,5)
# example: y = x1*x2*x3^3 + x4/x5 - x5
# y is 1-D with shape (100,); the x4/x5 term can become very large when x5 is close to 0.
y = np.asarray(x[:,0]*x[:,1]*np.power(x[:,2],3) + x[:,3]/x[:,4] - x[:,4])


# Create the model. For now it is just a list of modules applied in order;
# it is meant to become a class later.

model = []

# Add the modules to the model:
model.append(LinearLayer(5,10)) # add input to hidden layer
model.append(Sigmoid()) # add the nonlinear activation
model.append(LinearLayer(10,1)) # output layer

# Forward pass:
print('before shape:',x.shape)

# x is overwritten with each module's output, so after this loop it holds
# the predictions and the original inputs are no longer available.
for mod in model:
    x = mod.forward(x)
    
print('after forward pass shape:',x.shape)


# Loss:
# x is (100, 1) while y is (100,); MSE reshapes both to (batch, -1) before subtracting.
print(x.shape, y.shape)
mse, dLoss = MSE(x, y)
dx = dLoss


# Backward pass: feed the loss gradient through the modules in reverse order.
print('before shape:',x.shape)

for mod in reversed(model):
    dx = mod.backward(dx)
    
# NOTE(review): this runs after the backward loop, yet it is labelled
# "forward" and prints x.shape (the predictions) rather than dx.shape.
# Presumably dx.shape with a "backward" label was intended - confirm.
print('after forward pass shape:',x.shape)


before shape: (100, 5)
after forward pass shape: (100, 1)
(100, 1) (100,)
before shape: (100, 1)
after forward pass shape: (100, 1)


In [51]:
# Recompute the loss on the current x and y and inspect the gradient's shape.
mse, dLoss = MSE(x, y)
dLoss.shape

(100, 1)

In [None]:
############
# Scratch cell: wire the layers together by hand instead of via the model list.
# Create hidden layer 1:
# NOTE(review): input_dim1 is computed but never used; the sizes below are hard-coded.
input_dim1 = x.shape[1] #5
layer1 = LinearLayer(5, 10)

# hidden layer 2:
# NOTE(review): layer2 is created but never applied in this cell.
layer2 = LinearLayer(10, 1)

# Create activation fn:
sig = Sigmoid()

# Model:
# NOTE(review): the sigmoid is applied to the raw input before layer1 here,
# unlike the list-based model (linear -> sigmoid -> linear) - confirm the intended order.
x1 = sig.forward(x)
y1 = layer1.forward(x1)
# NOTE(review): y_pred is not assigned anywhere in this cell; unless an
# earlier cell defines it this line raises NameError. Possibly y1 was meant - confirm.
y_pred.shape
#mse, dLoss = MSE(y_pred, y)

In [None]:
def sigmoid(X, W):
    '''
    Computes f = sigmoid(W.dot(X)) and the gradients of the dot product's operands.
        Args:
            X: inputs
            W: weights
            X, W should be numpy arrays whose shapes are compatible for
            W.dot(X): two 1-D vectors of equal length, or matrices with
            W of shape (m, k) and X of shape (k, n).
        Returns:
            f:  sigmoid(W.dot(X)), i.e. 1 / (1 + exp(-W.dot(X)))
            dX: gradient of sum(f) with respect to X (same shape as X)
            dW: gradient of sum(f) with respect to W (same shape as W)
    '''

    # Forward pass:
    D = W.dot(X)
    # Must be exp(-D): 1 / (1 + exp(D)) is sigmoid(-D), and the (1 - f) * f
    # derivative used below would then have the wrong sign.
    f = 1.0 / (1.0 + np.exp(-D))

    # Backpropagation (upstream gradient taken to be all ones):
    dD = np.asarray((1 - f) * f) # gradient of dot variable (D); asarray keeps .dot available when f is a scalar
    dX = W.T.dot(dD) # backprop into X
    dW = dD.dot(X.T) # backprop into W

    return f, dX, dW

In [None]:
# Test:
# matrix-matrix test: W is (5, 10) and X is (10, 3), so W.dot(X) is (5, 3).
np.random.seed(100)  # fixed seed so the random operands are reproducible
W = np.random.randn(5,10)
X = np.random.randn(10,3)

F, dX, dW = sigmoid(X, W)

# vector-vector test: the dot product is 2*(-1) + (-3)*(-2) = 4.
# Note this rebinds the notebook-level names x and w.
w = [2,-3]
x = [-1, -2]
x = np.asarray(x)
w = np.asarray(w)
f, dx, dw = sigmoid(x,w)

# Show the scalar result next to the matrix result.
f, F

In [None]:
def tanh(X, W):
    