# v1

In [3]:
import torch

class MLP:
    def __init__(
        self,
        linear_1_in_features,
        linear_1_out_features,
        f_function,
        linear_2_in_features,
        linear_2_out_features,
        g_function
    ):
        """
        Args:
            linear_1_in_features: the in features of first linear layer
            linear_1_out_features: the out features of first linear layer
            linear_2_in_features: the in features of second linear layer
            linear_2_out_features: the out features of second linear layer
            f_function: string for the f function: relu | sigmoid | identity
            g_function: string for the g function: relu | sigmoid | identity
        """
        self.f_function = f_function
        self.g_function = g_function

        self.parameters = dict(
            W1 = torch.randn(linear_1_out_features, linear_1_in_features),
            b1 = torch.randn(linear_1_out_features),
            W2 = torch.randn(linear_2_out_features, linear_2_in_features),
            b2 = torch.randn(linear_2_out_features),
        )
        self.grads = dict(
            dJdW1 = torch.zeros(linear_1_out_features, linear_1_in_features),
            dJdb1 = torch.zeros(linear_1_out_features),
            dJdW2 = torch.zeros(linear_2_out_features, linear_2_in_features),
            dJdb2 = torch.zeros(linear_2_out_features),
        )
        

        # put all the cache value you need in self.cache
        self.cache = dict()

    def forward(self, x):
        """
        Args:
            x: tensor shape (batch_size, linear_1_in_features)
        """
       
        
        # TODO: Implement the forward function
        
        h1 = x@self.parameters['W1'].t() + self.parameters['b1']
        self.cache['h1'] = h1
        if self.f_function == 'relu':
            z1 = F.relu(h1)
            
        if self.f_function == 'sigmoid':
            z1 = F.sigmoid(h1)
            
        if self.f_function == 'identity':
            z1 = h1
            
        self.cache['z1'] = z1
        
        h2 = z1@self.parameters['W2'].t() + self.parameters['b2']
            
        if self.g_function == 'relu':
            z2 = F.relu(h2)
            
        if self.g_function == 'sigmoid':
            z2 = F.sigmoid(h2)
            
        if self.g_function == 'identity':
            z2 = h2
            
        return z2
        pass
    
    def backward(self, dJdy_hat):
        """
        Args:
            dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
        """
        # TODO: Implement the backward function
        #flow: x -> linear1(w,b,x) ->h1 -> activation1(h1) -> z1 -> linear2(w2,b2,z1)-> h2 -> activation2(h2) -> y_hat
        
        #g_function grads
        if self.g_function == 'relu':
            dAct2 = (h2>0).float() * dJdy_hat
        if self.g_function == 'sigmoid':
            #dsigmoid = sigmoid(1-sigmoid)
            dAct2 = 1/(1+exp(-h2))(1 - (1/(1+exp(-h2))))
        else:
            dAct2 = dJdy_hat
        
        #linear 2 grads
        
        #print(self.cache['z1'].size(),dAct2.size())

        #self.param transposed in forward pose so A.t().t() = A
        dZ1 = dAct2 @ self.parameters['W2']
        
        #print(self.cache['z1'].size(),dAct2.size())
        #print(self.cache['z1'].unsqueeze(-1).size(), 'act2unsq', dAct2.unsqueeze(1).size())
        dJdW2 = (self.cache['z1'].unsqueeze(-1) * dAct2.unsqueeze(1)).sum(0)
        dJdb2 = dAct2.sum(0)
    
        #f_function grads
        if self.f_function == 'relu':
            dAct1 = (self.cache['h1']>0).float() * dZ1
        if self.f_function == 'sigmoid':
            #dsigmoid = sigmoid(1-sigmoid)
            dAct1 = 1/(1+exp(-self.cache['h1']))(1 - (1/(1+exp(-self.cache['h1']))))
        else:
            dAct1 = dZ1
        
        #linear 1 grads
        
        #self.param transposed in forward pose so A.t().t() = A
        #dx = dAct2 @ self.parameters['W1']
        dJdW1 = (x.unsqueeze(-1) * dAct1.unsqueeze(1)).sum(0)
        dJdb1 = dAct1.sum(0)     

        pass

    
    def clear_grad_and_cache(self):
        for grad in self.grads:
            self.grads[grad].zero_()
        self.cache = dict()
        

In [2]:
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F

net = MLP(
    linear_1_in_features=2,
    linear_1_out_features=20,
    f_function='relu',
    linear_2_in_features=20,
    linear_2_out_features=5,
    g_function='identity'
)
x = torch.randn(10, 2)
y = torch.randn(10, 5)


#Forward: (10,2)@(2,20)@(20,5) -> (10,5)
#Backward: dJdy_hat:(10,5)

net.clear_grad_and_cache()
y_hat = net.forward(x)
J, dJdy_hat = mse_loss(y, y_hat)

net.backward(dJdy_hat)

# y_hat.shape

NameError: name 'mse_loss' is not defined

## loss v1

In [3]:
def mse_loss(y, y_hat):
    """
    Args:
        y: the label tensor (batch_size, linear_2_out_features)
        y_hat: the prediction tensor (batch_size, linear_2_out_features)

    Return:
        J: scalar of loss
        dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
    """
    # TODO: Implement the mse loss
    J = ((y - y_hat).pow(2)).mean()
    dJdy_hat = 2*(y - y_hat)
    pass



    return J, dJdy_hat



In [4]:
# compare the result with autograd
net_autograd = nn.Sequential(
    OrderedDict([
        ('linear1', nn.Linear(2, 20)),
        ('relu', nn.ReLU()),
        ('linear2', nn.Linear(20, 5)),
    ])
)
net_autograd.linear1.weight.data = net.parameters['W1']
net_autograd.linear1.bias.data = net.parameters['b1']
net_autograd.linear2.weight.data = net.parameters['W2']
net_autograd.linear2.bias.data = net.parameters['b2']

y_hat_autograd = net_autograd(x)

J_autograd = F.mse_loss(y_hat_autograd, y)

net_autograd.zero_grad()
J_autograd.backward()

print((net_autograd.linear1.weight.grad.data - net.grads['dJdW1']).norm() < 1e-3)
print((net_autograd.linear1.bias.grad.data - net.grads['dJdb1']).norm() < 1e-3)
print((net_autograd.linear2.weight.grad.data - net.grads['dJdW2']).norm() < 1e-3)
print((net_autograd.linear2.bias.grad.data - net.grads['dJdb2']).norm()< 1e-3)

tensor(False)
tensor(False)
tensor(False)
tensor(False)


In [None]:
def bce_loss(y, y_hat):
    """
    Args:
        y_hat: the prediction tensor
        y: the label tensor
        
    Return:
        loss: scalar of loss
        dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
    """
    # TODO: Implement the bce loss
    pass

    # return loss, dJdy_hat

Bigger question is why the course has decided to set up the net this way for the first exercise. Maybe this is more intuitive another way, or I've been spoiled by already somewhat knowing what route pytorch goes, but storing the weight matrices in a dict and then having activation functions callable by the test suite just seems to make an ugly mess out of things.

What if we try not write this weird dictionary version of the weights?

Okay, so having fiddled around a bit more, I'm beginning to understand why ```self.parameters``` as a dict works. For separate forward and backward passes, you want the parameters to be globally available to the MLP class

# V2

In [93]:
class MLP:
    def __init__(
        self,
        linear_1_in_features,
        linear_1_out_features,
        f_function,
        linear_2_in_features,
        linear_2_out_features,
        g_function
    ):
        """
        Args:
            linear_1_in_features: the in features of first linear layer
            linear_1_out_features: the out features of first linear layer
            linear_2_in_features: the in features of second linear layer
            linear_2_out_features: the out features of second linear layer
            f_function: string for the f function: relu | sigmoid | identity
            g_function: string for the g function: relu | sigmoid | identity
            
        """
        self.parameters = dict(
            W1 = torch.randn(linear_1_out_features,linear_1_in_features),
            b1 = torch.randn(linear_1_out_features),
            W2 = torch.randn(linear_2_out_features,linear_2_in_features),
            b2 = torch.randn(linear_2_out_features),
        )
        self.grads = dict(
            dJdW1 = torch.zeros(linear_1_out_features, linear_1_in_features),
            dJdb1 = torch.zeros(linear_1_out_features),
            dJdW2 = torch.zeros(linear_2_out_features, linear_2_in_features),
            dJdb2 = torch.zeros(linear_2_out_features),
        )
        
        self.f_function = f_function
        self.g_function = g_function
        
        # put all the cache value you need in self.cache
        self.cache = dict()
    
    def linear(self, inp, W, b):
        return inp@W.t() + b #torch convention does linear as W.t() - see defn of W above as W = (out, in)
    
    def mse(self, output, targ): 
        return (output.squeeze(-1) - targ).pow(2).mean()          
        
    def lin_grad(self, inp, out, w, b):
        # grad of matmul with respect to input
        #print('out.g', out.g.size(),'squeeeze', out.g.squeeze().size())
        
        inp.g = out.g.squeeze() @ w  #w.t().t()
        #print("input",inp.unsqueeze(-1).size(),"output", out.g.squeeze().unsqueeze(1).size()) #for discussion of matrix dimensions see here https://forums.fast.ai/t/lesson-8-2019-discussion-wiki/41323/642
        
        #in brief: input (10,20) is batch size 10 with 20 features computed by prev linear layer. 
        #Unsqueeze adds a dim to make it (10,20,1), and unsqueeze on out.g makes it (10,1,5)
        #reason for out.g.squeeze().unsqueeze is because out.g for linear 2 was unsqueezed in mse
        #multipyling the two matrices gives result of (10,20,5) i.e. 10 (20,5) weight matrices
        #sum over dim(0) here then aggregates the contributions of our gradients for each of the 10 examples
        
        w.g = (inp.unsqueeze(-1) * out.g.squeeze().unsqueeze(1)).sum(0) #sum(0) is to sum the gradients over the batch afaik
        b.g = out.g.sum(0) 
        #capture grad info for comparison to torch
        
        #print("dJdW1",self.grads['dJdW2'].size(), "w.g", w.g.t().size(), "w1", self.parameters['W1'].size(), "w", w.size())
        #print(str(w),\n,str(self.parameters['W2']))
        if str(w) == str(self.parameters['W2']):
            self.grads['dJdW2'] = w.g.t()
            print("before setting",self.grads['dJdb2'])
            new_bg = torch.zeros(b.size())
             
            for i in range(b.g.numel()):
                new_bg[i] = float(b.g[i])
            self.grads['dJdb2'] = new_bg
            print("after setting",self.grads['dJdb2'])
        if str(w) == str(self.parameters['W1']):
            self.grads['dJdW1'] = w.g.t()
            self.grads['dJdb1'] = b.g

    def relu_grad(self, inp, out):
        inp.g = (inp>0).float() * out.g

    def mse_grad(self, inp, targ): 
        # grad of loss with respect to output of previous layer
        inp.g = 2. * (inp.squeeze() - targ).unsqueeze(-1) / inp.numel()
        
        
    def forward(self, x, y):
        """
        Args:
            x: tensor shape (batch_size, linear_1_in_features)
        """
        self.cache['x'] = x
        h1 = self.linear(x,self.parameters['W1'],self.parameters['b1'])
        self.cache['h1'] = h1
        z1 = F.relu(h1)
        self.cache['z1'] = z1
        
#         print('z1size1', z1.size(),'z1cache',self.cache['z1'].size())
        
        h2 = self.linear(z1,self.parameters['W2'],self.parameters['b2'])
        self.cache['h2'] = h2
        y_hat = h2 #g_function is identity
        self.cache['y_hat'] = y_hat
        loss = self.mse(y_hat,y)
        
        #print("forward grads", self.grads['dJdb2'])
        
        return loss
        
        
        pass
    
    def backward(self, y):

        self.mse_grad(self.cache['y_hat'], y)
        # activation 2 is Identity
        self.lin_grad(self.cache['z1'], self.cache['h2'], self.parameters['W2'], self.parameters['b2']) #linear 2
        #print("backward grads", self.grads['dJdb2'])
        self.relu_grad(self.cache['h1'], self.cache['z1']) #act 1
        self.lin_grad(self.cache['x'], self.cache['h1'], self.parameters['W1'], self.parameters['b1'])   #linear 1

        pass

    
    def clear_grad_and_cache(self):
        for grad in self.grads:
            self.grads[grad].zero_()
        self.cache = dict()

In [45]:
def mse_loss(y, y_hat):
    """
    Args:
        y: the label tensor (batch_size, linear_2_out_features)
        y_hat: the prediction tensor (batch_size, linear_2_out_features)
    Return:
        loss: scalar of loss
        dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
    """

    # Implement the mse loss
    # batch_size = y.size(0)
    # linear_2_out_features = y.size(1)
    loss = sum(sum((y-y_hat).pow(2)))
    dJdy_hat = 2*(y_hat-y)
    # taking mean
    loss = loss/y.numel()
    dJdy_hat = dJdy_hat/y.numel()
    return loss, dJdy_hat


def mse(output, targ): 
    return (output.squeeze(-1) - targ).pow(2).mean() 


def mse_grad(inp, targ): 
    # grad of loss with respect to output of previous layer
    inp.g = 2 * (inp - targ) / inp.numel() #the fast.ai version dividing by inp.shape[0] here doesn't work because in this we have a batch size so can't divide by the number of entries in our input vector but most divide by number of entries in our input TENSOR! (hence need .numel())
    return inp.g

In [46]:
test = torch.randn(5,5)
targ = torch.randn(5,5)


print(targ.numel())
print(targ.size())

loss, dJdy_hat = mse_loss(targ,test)

print((loss - mse(test,targ)).norm() < 1e-3)

print((dJdy_hat - mse_grad(test,targ)).norm() < 1e-3)

25
torch.Size([5, 5])
tensor(True)
tensor(True)


## tests

In [94]:
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F

net = MLP(
    linear_1_in_features=2,
    linear_1_out_features=20,
    f_function='relu',
    linear_2_in_features=20,
    linear_2_out_features=5,
    g_function='identity'
)
x = torch.randn(10, 2) #means batch size of 10, input features 2
y = torch.randn(10, 5) #means 10 inputs classified between 5 outputs

net.clear_grad_and_cache()
y_hat = net.forward(x,y)

net.backward(y)


before setting tensor([0., 0., 0., 0., 0.])
after setting tensor([-0.7399, -1.1462, -3.6524,  5.6012,  1.4159])


In [95]:
# compare the result with autograd
net_autograd = nn.Sequential(
    OrderedDict([
        ('linear1', nn.Linear(2,20)),
        ('relu', nn.ReLU()),
        ('linear2', nn.Linear(20,5)),
    ])
)
net_autograd.linear1.weight.data = net.parameters['W1']
net_autograd.linear1.bias.data = net.parameters['b1']
net_autograd.linear2.weight.data = net.parameters['W2']
net_autograd.linear2.bias.data = net.parameters['b2']

y_hat_autograd = net_autograd(x)

J_autograd = F.mse_loss(y_hat_autograd, y)

net_autograd.zero_grad()
J_autograd.backward()


print((net_autograd.linear1.weight.grad.data - net.grads['dJdW1']).norm() < 1e-3)
print((net_autograd.linear1.bias.grad.data - net.grads['dJdb1']).norm() < 1e-3)
print((net_autograd.linear2.weight.grad.data - net.grads['dJdW2']).norm() < 1e-3)
print((net_autograd.linear2.bias.grad.data.t() - net.grads['dJdb2']).norm() < 1e-3)

tensor(True)
tensor(True)
tensor(True)
tensor(True)


In [96]:
print(net.grads['dJdb2'])
print(net_autograd.linear2.bias.grad.data)

tensor([-0.7399, -1.1462, -3.6524,  5.6012,  1.4159])
tensor([-0.7399, -1.1462, -3.6524,  5.6012,  1.4159])


currently unsure why I'm failing these tests.

Turns out it was literally 1 line in the code - normalising the mse_grad by ``.shape[0]`` rather than by ``numel()``.

The reason for my mistake was not thinking clearly about the batch size - I was implicitly dealing with a batch size of 1 but that doesn't make sense for this!

In [8]:
print(net.grads)

{'dJdW1': tensor([[  6.3688,  -0.5613],
        [ -4.2895,   2.2825],
        [  5.5466,  -2.0888],
        [ -1.5885,  -1.8586],
        [ -7.1742,   4.1545],
        [ -9.2972,   6.8226],
        [ -1.6694,   1.2718],
        [  7.2225,  -1.4345],
        [ -1.4697,  -1.3303],
        [  6.5135,  -3.5458],
        [ -2.8434,   0.3092],
        [ -8.2150,   4.7369],
        [-11.7968,   3.2561],
        [ 18.2542,  -2.8796],
        [-12.3716,   6.5908],
        [ 12.5386,  -4.8811],
        [ -8.1859,   4.6049],
        [ -7.3827,  -0.1397],
        [ -5.4608,   3.4130],
        [ 17.6165,  -3.1712]]), 'dJdb1': tensor([  5.8247,   4.5963,  -3.6311,  -8.3613,   5.2256,  17.8559,  -1.9851,
          7.3082, -12.4775,  -4.5635,   1.6294,  18.2070,  10.4480,  20.4293,
          7.5341,  17.7802,   4.4985,   9.2700,   1.7760,  22.0492]), 'dJdW2': tensor([[ 3.2155e+00,  4.5823e+00,  1.7658e+00,  3.2703e+00,  8.3859e+00,
          7.7257e+00,  3.2235e+00,  1.0451e+00,  6.5746e+00,  2.4196e+

In [9]:
print(net.grads['dJdW1'].norm())

tensor(43.5475)


In [10]:
print(net_autograd.linear1.weight.grad.data.norm())

tensor(8.7095)


# V3

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLP:
    def __init__(
        self,
        linear_1_in_features,
        linear_1_out_features,
        f_function,
        linear_2_in_features,
        linear_2_out_features,
        g_function
    ):
        """
        Args:
            linear_1_in_features: the in features of first linear layer
            linear_1_out_features: the out features of first linear layer
            linear_2_in_features: the in features of second linear layer
            linear_2_out_features: the out features of second linear layer
            f_function: string for the f function: relu | sigmoid | identity
            g_function: string for the g function: relu | sigmoid | identity
        """
        self.f_function = f_function
        self.g_function = g_function

        self.parameters = dict(
            W1 = torch.randn(linear_1_out_features, linear_1_in_features),
            b1 = torch.randn(linear_1_out_features),
            W2 = torch.randn(linear_2_out_features, linear_2_in_features),
            b2 = torch.randn(linear_2_out_features),
        )
        self.grads = dict(
            dJdW1 = torch.zeros(linear_1_out_features, linear_1_in_features),
            dJdb1 = torch.zeros(linear_1_out_features),
            dJdW2 = torch.zeros(linear_2_out_features, linear_2_in_features),
            dJdb2 = torch.zeros(linear_2_out_features),
        )

        # put all the cache value you need in self.cache
        self.cache = dict(
        )

    def forward(self, x):
        """
        Args:
            x: tensor shape (batch_size, linear_1_in_features)
        """
        # TODO: Implement the forward function
        w1 = self.parameters['W1']
        b1 = self.parameters['b1']
        w2 = self.parameters['W2']
        b2 = self.parameters['b2']
        
        self.z1 = w1.mm(x.t())
        for i in range(10):
          self.z1[:,i] = self.z1[:,i] + b1

        if self.f_function == 'identity':
          self.z2 = self.z1
        elif self.f_function == 'relu':
          self.z2 = F.relu_(self.z1)
        else:
          self.z2 = F.sigmoid(self.z1)
        
        self.z3 = w2.mm(self.z2)
        for i in range(10):
          self.z3[:,i] = self.z3[:,i] + b2
        if self.g_function == 'relu':
          self.y_hat = F.relu_(self.z3)
        elif self.g_function == 'identity':
          self.y_hat = self.z3
        elif self.g_function == 'sigmoid':
          self.y_hat = F.sigmoid(self.z3)
        return self.y_hat

        pass
    
    def backward(self, dJdy_hat):
        """
        Args:
            dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
        """
        # TODO: Implement the backward function
        if self.g_function == 'identity':
          dy_hatdz3 = 1
        elif self.g_function == 'sigmoid':
          a = 1 - F.sigmoid(self.z3)
          dy_hatdz3 = F.sigmoid(self.z3) * a
        elif self.g_function == 'relu':
          dy_hatdz3 = torch.sign(F.relu_(self.z3))
        
        self.grads['dJdW2'] = (dJdy_hat * dy_hatdz3).mm(self.z2.t())/dJdy_hat.size(0)/dJdy_hat.size(1)
        self.grads['dJdb2'] = (dJdy_hat * dy_hatdz3).sum(1)/dJdy_hat.size(0)/dJdy_hat.size(1)


        dz3dz2 = self.parameters['W2']


        if self.f_function == 'identity':
          dz2dz1 = 1
        elif self.f_function == 'sigmoid':
          b = 1 - F.sigmoid(self.z1)
          dz2dz1 = F.sigmoid(self.z1) * b
        elif self.f_function == 'relu':
          dz2dz1 = torch.sign(F.relu_(self.z1))
        
        self.grads['dJdW1'] = (dz3dz2.t().mm(dJdy_hat * dy_hatdz3) * dz2dz1).mm(x)/dJdy_hat.size(0)/dJdy_hat.size(1)
        self.grads['dJdb1'] = (dz3dz2.t().mm(dJdy_hat * dy_hatdz3) * dz2dz1).sum(1)/dJdy_hat.size(0)/dJdy_hat.size(1)


        pass

    
    def clear_grad_and_cache(self):
        for grad in self.grads:
            self.grads[grad].zero_()
        self.cache = dict()

def mse_loss(y, y_hat):
    """
    Args:
        y: the label tensor (batch_size, linear_2_out_features)
        y_hat: the prediction tensor (batch_size, linear_2_out_features)
    Return:
        J: scalar of loss
        dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
    """
    # TODO: Implement the mse loss
    loss_matrix = 0.5 * (y.t()-y_hat).pow(2).sum(1)
    dim = y.size(0)
    J = loss_matrix/dim
    dJdy_hat = y_hat - y.t()
    return J, dJdy_hat
    pass


In [4]:
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F

net = MLP(
    linear_1_in_features=2,
    linear_1_out_features=20,
    f_function='relu',
    linear_2_in_features=20,
    linear_2_out_features=5,
    g_function='identity'
)
x = torch.randn(10, 2)
y = torch.randn(10, 5)


#Forward: (10,2)@(2,20)@(20,5) -> (10,5)
#Backward: dJdy_hat:(10,5)

net.clear_grad_and_cache()
y_hat = net.forward(x)
J, dJdy_hat = mse_loss(y, y_hat)

net.backward(dJdy_hat)


# compare the result with autograd
net_autograd = nn.Sequential(
    OrderedDict([
        ('linear1', nn.Linear(2, 20)),
        ('relu', nn.ReLU()),
        ('linear2', nn.Linear(20, 5)),
    ])
)
net_autograd.linear1.weight.data = net.parameters['W1']
net_autograd.linear1.bias.data = net.parameters['b1']
net_autograd.linear2.weight.data = net.parameters['W2']
net_autograd.linear2.bias.data = net.parameters['b2']

y_hat_autograd = net_autograd(x)

J_autograd = F.mse_loss(y_hat_autograd, y)

net_autograd.zero_grad()
J_autograd.backward()

print((net_autograd.linear1.weight.grad.data - net.grads['dJdW1']).norm() < 1e-3)
print((net_autograd.linear1.bias.grad.data - net.grads['dJdb1']).norm() < 1e-3)
print((net_autograd.linear2.weight.grad.data - net.grads['dJdW2']).norm() < 1e-3)
print((net_autograd.linear2.bias.grad.data - net.grads['dJdb2']).norm()< 1e-3)

tensor(False)
tensor(False)
tensor(False)
tensor(False)


# V4

In [50]:
import torch

class MLP:
    def __init__(
        self,
        linear_1_in_features,
        linear_1_out_features,
        f_function,
        linear_2_in_features,
        linear_2_out_features,
        g_function
    ):
        """
        Args:
            linear_1_in_features: the in features of first linear layer
            linear_1_out_features: the out features of first linear layer
            linear_2_in_features: the in features of second linear layer
            linear_2_out_features: the out features of second linear layer
            f_function: string for the f function: relu | sigmoid | identity
            g_function: string for the g function: relu | sigmoid | identity
        """
        self.f_function = f_function
        self.g_function = g_function

        self.parameters = dict(
            W1 = torch.randn(linear_1_out_features, linear_1_in_features),
            b1 = torch.randn(linear_1_out_features),
            W2 = torch.randn(linear_2_out_features, linear_2_in_features),
            b2 = torch.randn(linear_2_out_features),
        )
        self.grads = dict(
            dJdW1 = torch.zeros(linear_1_out_features, linear_1_in_features),
            dJdb1 = torch.zeros(linear_1_out_features),
            dJdW2 = torch.zeros(linear_2_out_features, linear_2_in_features),
            dJdb2 = torch.zeros(linear_2_out_features),
        )

        # put all the cache value you need in self.cache
        self.cache = dict()


    def activation(self, z, type):
        """
        Args:
            z: tensor shape (batch_size, linear_out_features)
            type: string: relu | sigmoid | identity
        Return:
            z: tensor shape (batch_size, linear_out_features)
            dzdz: element-wisely gradient tensor (batch_size, linear_out_features)
        """
        if type == 'relu': # ReLU fct and its gradient
            dzdz = torch.zeros(z.size())
            dzdz[z>=0] = 1
            z[z<0]=0
        elif type == 'sigmoid': # sigmoid fct and its gradient
            # dzdz = torch.exp(-z) / (1 + torch.exp(-z)).pow(2)
            dzdz = 1 / (2 + torch.exp(-z) + torch.exp(z))
            z = 1 / (1 + torch.exp(-z))
        else: # identity fct and its gradient
            dzdz = torch.ones(z.size())
        return z, dzdz



    def forward(self, x):
        """
        Args:
            x: tensor shape (batch_size, linear_1_in_features)
        Return:
            y_hat: the prediction tensor (batch_size, linear_2_out_features)
        """
        # Implement the forward function, cache internal gradients
        self.cache['x'] = x
        # linear_1
        z = torch.mm(x, (self.parameters['W1']).t()) + self.parameters['b1'] # tensor (batch_size, linear_1_out_features)
        # f_function
        z, dzdz = self.activation(z, self.f_function) # z: changes, dzdz: gradient tensor (batch_size, linear_1_out_features)
        self.cache['dzdz']=dzdz
        self.cache['z']=z
        # linear_2
        y_hat = torch.mm(z, (self.parameters['W2']).t()) + self.parameters['b2'] # tensor (batch_size, linear_2_out_features)
        # g_function
        y_hat, dydz = self.activation(y_hat, self.g_function) # y_hat: changes, dydz: gradient tensor (batch_size, linear_1_out_features)
        self.cache['dydz']=dydz
        return y_hat


    def backward(self, dJdy_hat):
        """
        Args:
            dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
        """
        # TODO: Implement the backward function
        # layer 2
        dJdz = dJdy_hat*self.cache['dydz']
        # b2
        self.grads['dJdb2'] = sum(dJdz)
        # W2
        self.grads['dJdW2'] = torch.mm(dJdz.t(), self.cache['z'])
        # layer 1
        dJdx= torch.mm(dJdz,self.parameters['W2'])*self.cache['dzdz']
        # b1
        self.grads['dJdb1'] = sum(dJdx)
        # W1
        self.grads['dJdW1'] = torch.mm(dJdx.t(),self.cache['x'])



    def clear_grad_and_cache(self):
        for grad in self.grads:
            self.grads[grad].zero_()
        self.cache = dict()

def mse_loss(y, y_hat):
    """
    Args:
        y: the label tensor (batch_size, linear_2_out_features)
        y_hat: the prediction tensor (batch_size, linear_2_out_features)
    Return:
        loss: scalar of loss
        dJdy_hat: The gradient tensor of shape (batch_size, linear_2_out_features)
    """

    # Implement the mse loss
    # batch_size = y.size(0)
    # linear_2_out_features = y.size(1)
    loss = sum(sum((y_hat-y).pow(2)))
    dJdy_hat = 2*(y_hat-y)
    # taking mean
    loss = loss/y.numel()
    dJdy_hat = dJdy_hat/y.numel()
    return loss, dJdy_hat

In [51]:
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F

net = MLP(
    linear_1_in_features=2,
    linear_1_out_features=20,
    f_function='relu',
    linear_2_in_features=20,
    linear_2_out_features=5,
    g_function='identity'
)
x = torch.randn(10, 2)
y = torch.randn(10, 5)


#Forward: (10,2)@(2,20)@(20,5) -> (10,5)
#Backward: dJdy_hat:(10,5)

net.clear_grad_and_cache()
y_hat = net.forward(x)
J, dJdy_hat = mse_loss(y, y_hat)

net.backward(dJdy_hat)

In [52]:
# compare the result with autograd
net_autograd = nn.Sequential(
    OrderedDict([
        ('linear1', nn.Linear(2, 20)),
        ('relu', nn.ReLU()),
        ('linear2', nn.Linear(20, 5)),
    ])
)
net_autograd.linear1.weight.data = net.parameters['W1']
net_autograd.linear1.bias.data = net.parameters['b1']
net_autograd.linear2.weight.data = net.parameters['W2']
net_autograd.linear2.bias.data = net.parameters['b2']

y_hat_autograd = net_autograd(x)

J_autograd = F.mse_loss(y_hat_autograd, y)

net_autograd.zero_grad()
J_autograd.backward()

print((net_autograd.linear1.weight.grad.data - net.grads['dJdW1']).norm() < 1e-3)
print((net_autograd.linear1.bias.grad.data - net.grads['dJdb1']).norm() < 1e-3)
print((net_autograd.linear2.weight.grad.data - net.grads['dJdW2']).norm() < 1e-3)
print((net_autograd.linear2.bias.grad.data - net.grads['dJdb2']).norm()< 1e-3)

tensor(True)
tensor(True)
tensor(True)
tensor(True)
