In [1]:
# Import pytorch
import torch

## 1. Build the model
- Input (x1,x2): 2 nodes
- First hidden layer: 10 nodes, with weights (w) and bias (b), sigmoid activation function
- Second hidden layer: 10 nodes, with weights (w) and bias (b), sigmoid activation function
- Output (predict): 1 node

In [2]:
from torch import nn

# Building Neural Network using nn.Sequential
# Hyperparameters for our network
input_size = 2
hidden_sizes = [10,10]
output_size = 1

model = nn.Sequential(
    # Input with 2 nodes to first hidden layer with 10 nodes
    nn.Linear(input_size, hidden_sizes[0]), 
    # Pass through Sigmoid activation function
    nn.Sigmoid(),
    # First hidden layer with 10 nodes to second hidden layer with 10 nodes
    nn.Linear(hidden_sizes[0], hidden_sizes[1]),
    # Pass through Sigmoid activation function
    nn.Sigmoid(),
    # Second hidden layer with 10 nodes to output layer with 1 node
    nn.Linear(hidden_sizes[1], output_size),
)

print('Model:\n',model)

Model:
 Sequential(
  (0): Linear(in_features=2, out_features=10, bias=True)
  (1): Sigmoid()
  (2): Linear(in_features=10, out_features=10, bias=True)
  (3): Sigmoid()
  (4): Linear(in_features=10, out_features=1, bias=True)
)


## 2. Generate the random number x1, x2

In [3]:
import numpy as np
np.random.seed(4)
x1 = np.random.uniform()
x2 = np.random.uniform()
x = torch.tensor([[x1,x2]], requires_grad=True)
print("input(x1,x2):\n ",x)

input(x1,x2):
  tensor([[0.9670, 0.5472]], requires_grad=True)


## 3. Generate the label y_true

In [4]:
y_true = (x1*x1+x2*x2)/2
y_true = torch.tensor([[y_true]], requires_grad=True)
print('The value of y_true is:\n', y_true)

The value of y_true is:
 tensor([[0.6173]], requires_grad=True)


## 4. Build a loss function L = (y_predict - y_true)^2

In [5]:
def loss_fn(y_true, y_pred):
    return torch.sum((y_pred - y_true) ** 2)

## 5. Forward / Backward propagation

In [6]:
# 10 epochs
# learning rate = 0.1

optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

for i in range(10):
    # Zero out all the gradients at every epoch
    optimizer.zero_grad()
    # Forward propogation
    y_pred = model(x)
    # Show loss at each epoch
    loss = loss_fn(y_pred, y_true)
    print(f"Epoch = {i + 1} | Loss=%f" % (loss.item())) 
    # Backward propagation and update the gradients
    loss.backward()
    optimizer.step()

Epoch = 1 | Loss=0.411943
Epoch = 2 | Loss=0.048174
Epoch = 3 | Loss=0.005531
Epoch = 4 | Loss=0.000627
Epoch = 5 | Loss=0.000071
Epoch = 6 | Loss=0.000008
Epoch = 7 | Loss=0.000001
Epoch = 8 | Loss=0.000000
Epoch = 9 | Loss=0.000000
Epoch = 10 | Loss=0.000000


In [7]:
# y_pred is the same as y_true
print(y_pred)
print(y_true)

tensor([[0.6173]], grad_fn=<AddmmBackward>)
tensor([[0.6173]], requires_grad=True)


## 6a. Calculate the gradients of the loss wrt weights and bias

In [8]:
# Check the weights and biases of first hidden layer
print('Network Structure of first hidden layer:\n',model[0])
print('Weight gradient of network:\n',model[0].weight.grad)
print('Bias gradient of network:\n',model[0].bias.grad)

Network Structure of first hidden layer:
 Linear(in_features=2, out_features=10, bias=True)
Weight gradient of network:
 tensor([[ 1.1669e-09,  6.6035e-10],
        [ 7.4194e-08,  4.1985e-08],
        [ 1.7838e-07,  1.0095e-07],
        [-1.9635e-07, -1.1111e-07],
        [ 7.8348e-08,  4.4337e-08],
        [-5.3009e-07, -2.9997e-07],
        [ 5.3999e-07,  3.0558e-07],
        [-2.6930e-07, -1.5240e-07],
        [-1.5825e-07, -8.9551e-08],
        [-5.2903e-08, -2.9937e-08]])
Bias gradient of network:
 tensor([ 1.2067e-09,  7.6723e-08,  1.8446e-07, -2.0304e-07,  8.1020e-08,
        -5.4816e-07,  5.5840e-07, -2.7848e-07, -1.6364e-07, -5.4706e-08])


In [9]:
# Check the weights and biases of second hidden layer
print('Network Structure of second hidden layer:\n',model[2])
print('Weight gradient of network:\n',model[2].weight.grad)
print('Bias gradient of network:\n',model[2].bias.grad)

Network Structure of second hidden layer:
 Linear(in_features=10, out_features=10, bias=True)
Weight gradient of network:
 tensor([[ 2.5782e-07,  2.6286e-07,  4.1569e-07,  3.1589e-07,  2.2706e-07,
          3.0214e-07,  1.7527e-07,  1.9254e-07,  1.6258e-07,  1.5492e-07],
        [-1.3693e-06, -1.3961e-06, -2.2078e-06, -1.6777e-06, -1.2059e-06,
         -1.6047e-06, -9.3089e-07, -1.0226e-06, -8.6347e-07, -8.2282e-07],
        [-1.8169e-06, -1.8524e-06, -2.9294e-06, -2.2261e-06, -1.6001e-06,
         -2.1292e-06, -1.2352e-06, -1.3568e-06, -1.1457e-06, -1.0918e-06],
        [-2.4050e-06, -2.4521e-06, -3.8778e-06, -2.9468e-06, -2.1181e-06,
         -2.8185e-06, -1.6350e-06, -1.7961e-06, -1.5166e-06, -1.4452e-06],
        [-1.6985e-06, -1.7318e-06, -2.7386e-06, -2.0811e-06, -1.4959e-06,
         -1.9905e-06, -1.1547e-06, -1.2684e-06, -1.0711e-06, -1.0207e-06],
        [-1.5941e-06, -1.6252e-06, -2.5702e-06, -1.9531e-06, -1.4039e-06,
         -1.8681e-06, -1.0837e-06, -1.1904e-06, -1.0052e-0

In [10]:
# Check the weights and biases of output layer
print('Network Structure of output layer:\n',model[4])
print('Weight gradient of network:\n',model[4].weight.grad)
print('Bias gradient of network:\n',model[4].bias.grad)

Network Structure of output layer:
 Linear(in_features=10, out_features=1, bias=True)
Weight gradient of network:
 tensor([[-3.0957e-05, -3.4134e-05, -3.2646e-05, -3.6027e-05, -3.5243e-05,
         -4.4999e-05, -2.6554e-05, -2.8694e-05, -2.5792e-05, -3.9136e-05]])
Bias gradient of network:
 tensor([-7.1406e-05])


## 6b. Write to torch_autograd.dat

## 7a. Implement forward propagation and backpropagation algorithm from scratch

### Define loss function, sigmoid function, sigmoid derivative function

In [11]:
import numpy as np

# Loss function
def loss(y_true, y_pred):
    return np.power(y_pred - y_true, 2)

# Sigmoid Activation function for feed-forward
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Sigmoid Derivative function for back-propagation
def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

### x_train

In [12]:
np.random.seed(4)
x1 = np.random.uniform()
x2 = np.random.uniform()
x_train = np.array([[x1,x2]])
print("input(x1,x2):\n", x_train)

input(x1,x2):
 [[0.96702984 0.54723225]]


### y_train

In [13]:
y_train = np.array([[(x_train[0][0]*x_train[0][0] + x_train[0][1]*x_train[0][1])/2]])
print(y_train)

[[0.61730492]]


### Define forward propagation and backward propagation

In [14]:
# 10 epochs
# learning rate = 0.1

class FeedForward:
    def __init__(self, input_layer, hidden_layer_1, hidden_layer_2, output_layer):
        # Number of nodes for each layer
        self.input_layer = input_layer
        self.hidden_layer_1 = hidden_layer_1
        self.hidden_layer_2 = hidden_layer_2
        self.output_layer = output_layer

        # Since there are 3 layers (excluding the input layer), there are 6 unknown parameters for weights and biases
        # Initialization of parameters w1, w2, w3, b1, b2, b3
        self.w1 = np.random.rand(input_layer, hidden_layer_1)
        self.w2 = np.random.rand(hidden_layer_1, hidden_layer_2)
        self.w3 = np.random.rand(hidden_layer_2, output_layer)
        self.b1 = np.random.rand(hidden_layer_1)
        self.b2 = np.random.rand(hidden_layer_2)
        self.b3 = np.random.rand(output_layer)
        
    # Method to train model
    # Change the number of epochs 
    def fit(self, x_training, y_training, learning_rate=0.1, epochs=10): 
        # Epoch loop
        for epoch in range(epochs):
            error = 0
            # Training set loop
            # Training data is passed through the Sigmoid function to add non-linearity to the model
            for x, y_true in zip(x_training, y_training):
                # Forward propagation
                h1_ = np.dot(x, self.w1) + self.b1
                h1 = sigmoid(h1_)
             
                h2_ = np.dot(h1, self.w2) + self.b2
                h2 = sigmoid(h2_)
              
                y = np.dot(h2, self.w3) + self.b3

                # Error computation against the true value of y using sum of squared errors method
                error += loss(y_true, y)
      
                # Backward propagation
                derivative_w1 = np.zeros((self.input_layer, self.hidden_layer_1))
                derivative_w2 = np.zeros((self.hidden_layer_1, self.hidden_layer_2))
                derivative_w3 = np.zeros((self.hidden_layer_2, self.output_layer))
                derivative_b1 = np.zeros(self.hidden_layer_1)
                derivative_b2 = np.zeros(self.hidden_layer_2)
                derivative_b3 = np.zeros(self.output_layer)
            
                for i in range(self.output_layer):
                    derivative_b3[i] = (y[i] - y_true[i])
            
                for i in range(self.hidden_layer_2):
                    for j in range(self.output_layer):
                        derivative_w3[i][j] = derivative_b3[j] * h2[i]
                        
                for i in range(self.hidden_layer_2):
                    derivative_b2[i] = sum([derivative_b3[j] * self.w3[i][j] * sigmoid_derivative(h2_[i]) for j in range(self.output_layer)])
                
                for i in range(self.hidden_layer_1):
                    for j in range(self.hidden_layer_2):
                        derivative_w2[i][j] = derivative_b2[j] * h1[i]
                        
                for i in range(self.hidden_layer_1):
                    derivative_b1[i] = sum([derivative_b2[j] * self.w2[i][j] * sigmoid_derivative(h1_[i]) for j in range(self.hidden_layer_1)])
                    
                for i in range(self.input_layer):
                    for j in range(self.hidden_layer_1):
                        derivative_w1[i][j] = derivative_b1[j] * x[i]
                                            
                # Gradient Descent
                self.b1 -= learning_rate * derivative_b1
                self.w1 -= learning_rate * derivative_w1
                self.b2 -= learning_rate * derivative_b2
                self.w2 -= learning_rate * derivative_w2
                self.b3 -= learning_rate * derivative_b3
                self.w3 -= learning_rate * derivative_w3
                           
            # Show loss at each epoch
            error /= x_training.shape[0]
            print(f"Epoch = {epoch + 1} | Loss = {error}")
        
        print()
        print('Weight gradient of first hidden layer:\n', derivative_w1)
        print('Bias gradient of first hidden layer:\n', derivative_b1)
        print()
        print('Weight gradient of second hidden layer:\n', derivative_w2)
        print('Bias gradient of second hidden layer:\n', derivative_b2)
        print()
        print('Weight gradient of output layer:\n', derivative_w3)
        print('Bias gradient of output layer:\n', derivative_b3)
            
    # Method to do a single prediction based on x1, x2
    def predict(self, x_test):
        # Forward propagation
        # Input to first hidden layer
        h1_ = np.dot(x_test, self.w1) + self.b1
        # Sigmoid activation function
        h1 = sigmoid(h1_)
        # First hidden layer to second hidden layer
        h2_ = np.dot(h1, self.w2) + self.b2
        # Sigmoid activation function
        h2 = sigmoid(h2_)
        # Output layer
        y = np.dot(h2, self.w3) + self.b3

        return y

### Build and fit the model

In [15]:
# Build the model
feed_forward = FeedForward(2, 10, 10, 1)
# Fit the model
feed_forward.fit(x_train, y_train)

Epoch = 1 | Loss = [27.70769141]
Epoch = 2 | Loss = [0.16445874]
Epoch = 3 | Loss = [0.00095962]
Epoch = 4 | Loss = [5.59923213e-06]
Epoch = 5 | Loss = [3.2670576e-08]
Epoch = 6 | Loss = [1.90627393e-10]
Epoch = 7 | Loss = [1.11227919e-12]
Epoch = 8 | Loss = [6.48996442e-15]
Epoch = 9 | Loss = [3.78678678e-17]
Epoch = 10 | Loss = [2.20952638e-19]

Weight gradient of first hidden layer:
 [[-2.10219322e-13  1.27505382e-13  1.28203309e-13  1.97357924e-13
  -1.65282859e-13 -1.01060617e-13  3.30899500e-15 -2.17395276e-13
  -4.80810282e-13 -2.26756252e-13]
 [-1.18960954e-13  7.21539853e-14  7.25489350e-14  1.11682821e-13
  -9.35318714e-14 -5.71891645e-14  1.87252627e-15 -1.23021753e-13
  -2.72085598e-13 -1.28319033e-13]]
Bias gradient of first hidden layer:
 [-2.17386593e-13  1.31852582e-13  1.32574305e-13  2.04086695e-13
 -1.70918055e-13 -1.04506203e-13  3.42181272e-15 -2.24807205e-13
 -4.97203150e-13 -2.34487337e-13]

Weight gradient of second hidden layer:
 [[-9.53685898e-13 -1.89116603e-

In [16]:
# y_pred is the same as y_train
print('y_train is:', y_train)
print('y_predict is:', feed_forward.predict(x_train))

y_train is: [[0.61730492]]
y_predict is: [[0.61730492]]


## 7b. Write to my_autograd.dat