# Part 2: Automatic differentiation

In [1]:
import numpy as np

### Question 1: implement the relu function and its VJP in the format above. Using the finite difference equation (slide 13), make sure that the VJP is correct numerically.

In [2]:
def relu(x):
  """
  Rectified linear unit together with its vector-Jacobian product (VJP).

  Args:
    x: an array

  Returns:
    - ReLU(x), i.e. the element-wise maximum of x and 0
    - a function vjp(u) returning a 1-tuple holding the VJP with respect to x
  """
  def vjp(u):
    # The Jacobian of ReLU is diagonal: 1 where x > 0 and 0 elsewhere,
    # so the VJP reduces to an element-wise product with that 0/1 mask.
    positive = x > 0
    return (np.multiply(u, positive * 1),)

  return np.maximum(x, 0), vjp

Now we create a helper to numerically check that the function defined above (and other ones later) is correct.

In [11]:
def test_vjp(f, x, u, eps=1e-3):
  """
  Args:
    f: a function returning a tuple of size 2: array and vjp
    x: an array of size n 
    eps: numerical value (very small)
    u: an array of size m 

  Returns:
    numerical_vjp
  """
  
  def e(i): # to define each direction in the space
    basis_vector = np.zeros(len(x))
    basis_vector[i] = 1
    return basis_vector
  
  Jacobian = np.zeros((len(f(x)[0]),len(x)))
  for i in range (len(x)):
    Jacobian[:,i] = (f(x + eps * e(i))[0] - f(x)[0]) / eps #finite difference
  
  return np.dot(Jacobian.T,u)



We then test our implemented function

In [9]:
# Inputs for the check: x and u must have the same size because ReLU acts
# element-wise, so its input and output spaces coincide.
x = np.linspace(start = -2, stop = 2, num = 40)
u = np.linspace(start = -5, stop = 2, num = 40)

implemented_vjp = relu(x)[1](u)[0]
relu_numerical_vjp = test_vjp(relu, x, u, eps=1e-3)

# Fail loudly if the analytic and the finite-difference VJP disagree,
# instead of relying on a visual comparison of the two printouts.
np.testing.assert_allclose(implemented_vjp, relu_numerical_vjp, rtol=1e-6, atol=1e-8)

print("The numeric method gives for the VJP:")
print(relu_numerical_vjp)


The numeric method gives for the VJP:
[ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.         -1.41025641 -1.23076923 -1.05128205 -0.87179487
 -0.69230769 -0.51282051 -0.33333333 -0.15384615  0.02564103  0.20512821
  0.38461538  0.56410256  0.74358974  0.92307692  1.1025641   1.28205128
  1.46153846  1.64102564  1.82051282  2.        ]


In [10]:
# Show the analytic VJP under its heading (heading and array on separate lines).
print("The implemented method gives for the VJP:", implemented_vjp, sep="\n")

The implemented method gives for the VJP:
[-0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -1.41025641 -1.23076923 -1.05128205 -0.87179487
 -0.69230769 -0.51282051 -0.33333333 -0.15384615  0.02564103  0.20512821
  0.38461538  0.56410256  0.74358974  0.92307692  1.1025641   1.28205128
  1.46153846  1.64102564  1.82051282  2.        ]


The two methods give the same results indeed.

### Question 2: reusing dot and relu, implement a 2-layer MLP with a relu activation


In [12]:
#we import dot from the google doc
def dot(W, x):
  """
  Matrix-vector product together with its vector-Jacobian products (VJPs).

  Args:
    W: a matrix of shape (n1,n)
    x: an array of shape n

  Returns:
    - value of the function dot(W,x)
    - a function vjp(u) returning the pair (VJP wrt W, VJP wrt x)
  """
  def vjp(u):
    wrt_W = np.outer(u, x)   # same shape as W
    wrt_x = np.dot(W.T, u)   # same shape as x
    return wrt_W, wrt_x

  return np.dot(W, x), vjp

In [13]:
def mlp2(x, W1, W2):
    """
    Two-layer MLP with a ReLU activation: dot(W2, relu(dot(W1, x))).

    Args:
        x: an array of shape n
        W1: a matrix of shape (n1,n)
        W2: a matrix of shape (n2,n1), e.g. (1,n1) for a single output

    Returns:
        - value of the function mlp2(x, W1, W2)
        - function vjp(u) returning the triple
          (VJP wrt x, VJP wrt W1, VJP wrt W2)
    """
    # Forward pass: keep the VJP closure of every layer so the backward pass
    # can reuse it instead of evaluating the layers a second time.
    x1, dot1_vjp = dot(W1, x)      # 1st linear layer
    x2, relu_vjp = relu(x1)        # ReLU activation
    value, dot2_vjp = dot(W2, x2)  # 2nd linear layer

    def vjp(u):
        # Backward pass: apply the stored VJPs in reverse order of the forward pass.
        vjp_wrt_W2, vjp_wrt_x2 = dot2_vjp(u)
        vjp_wrt_x1 = relu_vjp(vjp_wrt_x2)[0]  # relu's vjp returns a 1-tuple
        vjp_wrt_W1, vjp_wrt_x = dot1_vjp(vjp_wrt_x1)

        return vjp_wrt_x, vjp_wrt_W1, vjp_wrt_W2

    return value, vjp

### Question 3: implement the squared loss VJP

In [14]:
def squared_loss(y_pred, y):
    """
    Squared loss 0.5 * sum((y_pred - y)^2) together with its VJPs.

    Args:
        y_pred: the prediction (a scalar or an array)
        y: the target it is compared against

    Returns:
        - value of the function squared_loss(y_pred, y), as an array of size 1
        - function vjp(u) returning the pair (VJP wrt y_pred, VJP wrt y)
    """
    diff = y_pred - y

    def vjp(u):
        # The gradient is diff with respect to y_pred and -diff with respect to y.
        scaled = np.multiply(diff, u)
        return scaled, -scaled

    half_sum_of_squares = 0.5 * np.sum(diff ** 2)
    # The rest of the code requires every output to be an array.
    return np.array([half_sum_of_squares]), vjp

### Question 4: implement the loss by composing mlp2 and squared_loss


In [15]:
def loss(x, y, W1, W2):
    """
    Squared loss of the 2-layer MLP: squared_loss(mlp2(x, W1, W2), y).

    Args:
        x: an array of shape n
        y: the target, compared against the output of mlp2(x, W1, W2)
        W1: a matrix of shape (n1,n)
        W2: a matrix of shape (n2,n1), e.g. (1,n1) for a single output

    Returns:
        - value of the function loss(x, y, W1, W2)
        - function vjp(u) returning the 4-tuple
          (VJP wrt x, VJP wrt y, VJP wrt W1, VJP wrt W2)
    """
    # Forward pass: keep both VJP closures so the backward pass does not have
    # to evaluate the network and the loss a second time.
    y_pred, mlp2_vjp = mlp2(x, W1, W2)
    value, squared_loss_vjp = squared_loss(y_pred, y)

    def vjp(u):
        # Backward pass: first through the loss, then through the network.
        vjp_wrt_y_pred, vjp_wrt_y = squared_loss_vjp(u)
        vjp_wrt_x, vjp_wrt_W1, vjp_wrt_W2 = mlp2_vjp(vjp_wrt_y_pred)

        return vjp_wrt_x, vjp_wrt_y, vjp_wrt_W1, vjp_wrt_W2

    return value, vjp

Now let's numerically test our architecture

In [20]:
#check the gradient for x coordinate 

def test_architecture_vjp(f,x,y,W1,W2,u, eps=1e-3): 

    """
    Args:
        f: a function returning a tuple of size 2: array and vjp
        x: an array of size n 
        eps: numerical value (very small)
        u: an array of size m 

    Returns:
        numerical_vjp

    """
    
    def e(i): # to define each direction in the space
        basis_vector = np.zeros(len(x))
        basis_vector[i] = 1
        return basis_vector
  
    Jacobian = np.zeros((len(f(x,y,W1,W2)[0]),len(x)))
    for i in range (len(x)):
        Jacobian[:,i] = (f(x + eps * e(i),y,W1,W2)[0] - f(x,y,W1,W2)[0]) / eps #finite difference
  
    return np.dot(Jacobian.T,u) 
    

In [21]:
# Inputs for checking the VJP of the full loss with respect to x.
m = 5 #size of the hidden layer (number of rows of W1)
k = 10 #size of the ouput vector for the MLP

# Seeded local generator: a fresh "Restart & Run All" reproduces the same
# numbers, and the global numpy random state is left untouched.
rng = np.random.default_rng(0)

x = np.linspace(start = -2, stop = 2, num = 40) #input vector
y = np.linspace(start = -2, stop = 2, num = k)
u = rng.random(1) #size 1 for vjp because f will output a scalar

W1 = rng.random((m, len(x)))
W2 = rng.random((k, m))

# NOTE: the variable name is kept from the ReLU check; here it holds the
# numerical VJP of the loss, not of ReLU.
relu_numerical_vjp = test_architecture_vjp(loss,x,y,W1,W2,u, eps=1e-3)
implemented_vjp = loss(x, y, W1, W2)[1](u)[0]

print("The numeric method gives for the VJP:")
print(relu_numerical_vjp)

The numeric method gives for the VJP:
[ 7.4375586   9.87695888  1.95932956 18.31859134 13.76357548 13.58615237
  5.81185748  8.99747236 13.11428322  5.4961817   3.79019975  8.97573862
 12.07180389  7.88578523 15.63086431 11.85023676 12.93270159 10.13338122
 14.74583103 12.95043222  9.4298893  12.08964973 10.6792685   2.82691115
 14.01778692  9.70597254 13.69946318 14.59873402  2.86478384  7.94638825
 12.4839964   7.11444291 14.73401608 10.34344895  4.92563418 11.89847156
  9.02085114 17.81972381  8.43384488 17.17667414]


In [22]:
# Show the analytic VJP under its heading (heading and array on separate lines).
print("The implemented method gives for the VJP:", implemented_vjp, sep="\n")

The implemented method gives for the VJP:
[ 7.43615383  9.87517025  1.95925226 18.31180555 13.75977471 13.58209663
  5.81103095  8.9959864  13.11090335  5.49560483  3.78988702  8.97386563
 12.06884202  7.88440531 15.62531862 11.84692928 12.92939966 10.13097778
 14.74086667 12.94685423  9.42789153 12.08615381 10.67695728  2.82674093
 14.0138359   9.70375019 13.6958312  14.59452164  2.86463335  7.94515883
 12.48060479  7.11309926 14.72931155 10.34145929  4.92518079 11.89567071
  9.01933432 17.81322035  8.43219516 17.17062616]


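The two methods give very close but not identical results. The gap comes from the forward finite difference, whose truncation error is of order eps; it does not vanish here because the loss is quadratic in the network output.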

### Question 5: implement an MLP with an arbitrary number of layers.


### Question 6: implement SGD to train your MLP on a dataset of your choice. Study the impact of depth (number of layers) and width (number of hidden units).



Let's apply our autodiff to a simple dataset, using the mlp2 network.

In [24]:
# Noisy samples of the line y = 3x + 2 on [-2, 2]; the noise is uniform in [0, 0.1).
x = np.linspace(-2, 2, 40)
noise = np.random.rand(40) / 10
y = 3 * x + 2 + noise