# Linear Layer notebook
A notebook to test the implementation of the linear layer

In [1]:
# Move the working directory up two levels (the recorded output shows e:\KTorch).
# Presumably this makes the local `nn` and `autograd` packages importable in the
# next cell -- confirm against the repository layout.
%cd ../..

e:\KTorch


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
from nn import Linear, ReLU, Sigmoid, Tanh, Softmax 
import numpy as np
from autograd import Tensor

### Test 1: 1-layer NN forward and backward propagation

In [3]:
# Reproducible random input: a (32, 16) float32 array drawn with a fixed seed.
np.random.seed(0)
x = np.random.standard_normal((32, 16)).astype(np.float32)
x.shape

(32, 16)

In [4]:
# KTorch forward pass: wrap the NumPy input in a Tensor and feed it through a
# single Linear(16, 32) layer.
x_tensor = Tensor(x)
linear = Linear(16, 32)
my_output = linear.forward(x_tensor)
# Display the output's shape and type as a quick sanity check.
my_output.shape, type(my_output)

((32, 32), autograd.engine.Tensor)

In [5]:
# Pull the layer's parameters out as NumPy arrays for the reference matrix
# multiplication, and show the shapes that take part in it.
weights, bias = linear.weight.numpy(), linear.bias.numpy()
(weights.shape, bias.shape, x.shape)

((16, 32), (32,), (32, 16))

In [6]:
# Reference forward pass in plain NumPy: x . W + b.
manual_output = x.dot(weights) + bias
manual_output.shape

(32, 32)

In [7]:
# Compare outputs with a floating-point tolerance instead of exact equality, and
# assert so that a mismatch stops "Run All" instead of only displaying False.
outputs_match = np.allclose(my_output.numpy(), manual_output, rtol=1e-5, atol=1e-6)
assert outputs_match, "KTorch forward output differs from the NumPy reference"
outputs_match

True

In [8]:
# Backprop ktorch: backward() is called on the non-scalar output without an
# explicit upstream gradient. The manual reference in the following cells uses
# np.ones_like(manual_output) as the upstream gradient, so an all-ones seed is
# presumably what backward() uses here -- confirm in autograd.
my_output.backward()

In [9]:
# Gradients that backward() stored on the layer's parameters.
w_grad, b_grad = linear.weight.grad, linear.bias.grad
(w_grad.shape, b_grad.shape)

((16, 32), (32,))

In [10]:
# Reference gradients for out = x . W + b with an all-ones upstream gradient:
# dW = x^T . upstream, db = column sums of upstream.
upstream_grad = np.ones_like(manual_output)
manual_w_grad = np.dot(x.T, upstream_grad)
manual_b_grad = upstream_grad.sum(axis=0)
(manual_w_grad.shape, manual_b_grad.shape)

((16, 32), (32,))

In [11]:
# Compare gradients with a floating-point tolerance instead of exact equality,
# and assert so that a mismatch stops "Run All" instead of only displaying False.
w_ok = np.allclose(w_grad, manual_w_grad, rtol=1e-5, atol=1e-6)
b_ok = np.allclose(b_grad, manual_b_grad, rtol=1e-5, atol=1e-6)
assert w_ok and b_ok, "Linear gradients differ from the NumPy reference"
w_ok, b_ok

(True, True)

### Test 2: 4 Layers NN

In [12]:
# Reproducible (32, 16) float32 input, also wrapped as a KTorch Tensor.
np.random.seed(0)
x = np.random.standard_normal((32, 16)).astype(np.float32)
x_tensor = Tensor(x)
x.shape

(32, 16)

In [13]:
# Build the four Linear layers (constructed in the same order as they are used).
linear1, linear2, linear3, linear4 = (
    Linear(16, 32),
    Linear(32, 64),
    Linear(64, 32),
    Linear(32, 16),
)

# Feed the input through every layer in order.
my_output = x_tensor
for layer in (linear1, linear2, linear3, linear4):
    my_output = layer.forward(my_output)

(my_output.shape, type(my_output))

((32, 16), autograd.engine.Tensor)

In [14]:
# Manual forward pass: pull every layer's parameters out as NumPy arrays.
weights1, bias1 = linear1.weight.numpy(), linear1.bias.numpy()
weights2, bias2 = linear2.weight.numpy(), linear2.bias.numpy()
weights3, bias3 = linear3.weight.numpy(), linear3.bias.numpy()
weights4, bias4 = linear4.weight.numpy(), linear4.bias.numpy()

# Chain the four affine maps; h1..h3 are kept for the manual backward pass.
h1 = np.dot(x, weights1) + bias1
h2 = np.dot(h1, weights2) + bias2
h3 = np.dot(h2, weights3) + bias3
manual_output = np.dot(h3, weights4) + bias4
manual_output.shape

(32, 16)

In [15]:
# Compare outputs with a floating-point tolerance instead of exact equality, and
# assert so that a mismatch stops "Run All" instead of only displaying False.
outputs_match = np.allclose(my_output.numpy(), manual_output, rtol=1e-5, atol=1e-6)
assert outputs_match, "KTorch 4-layer output differs from the NumPy reference"
outputs_match

True

In [16]:
# Backprop ktorch: backward() is called on the non-scalar output without an
# explicit upstream gradient. The manual reference in the following cells uses
# np.ones_like(manual_output) as the upstream gradient, so an all-ones seed is
# presumably what backward() uses here -- confirm in autograd.
my_output.backward()

In [17]:
# Collect the gradients that backward() stored on each layer's parameters.
w1_grad, b1_grad = linear1.weight.grad, linear1.bias.grad
w2_grad, b2_grad = linear2.weight.grad, linear2.bias.grad
w3_grad, b3_grad = linear3.weight.grad, linear3.bias.grad
w4_grad, b4_grad = linear4.weight.grad, linear4.bias.grad

# Every gradient should have the same shape as the parameter it belongs to.
(
    w1_grad.shape == weights1.shape,
    b1_grad.shape == bias1.shape,
    w2_grad.shape == weights2.shape,
    b2_grad.shape == bias2.shape,
    w3_grad.shape == weights3.shape,
    b3_grad.shape == bias3.shape,
    w4_grad.shape == weights4.shape,
    b4_grad.shape == bias4.shape,
)

(True, True, True, True, True, True, True, True)

In [18]:
# Compute gradients manually (layer 4): out = h3 . W4 + b4, seeded with an
# all-ones upstream gradient.
manual_output_grad = np.ones_like(manual_output)
manual_w4_grad = np.dot(h3.T, manual_output_grad)
manual_b4_grad = manual_output_grad.sum(axis=0)

# Compare with a tolerance instead of exact float equality, and assert so that a
# mismatch stops "Run All" instead of only displaying False.
w4_ok = np.allclose(w4_grad, manual_w4_grad, rtol=1e-5, atol=1e-6)
b4_ok = np.allclose(b4_grad, manual_b4_grad, rtol=1e-5, atol=1e-6)
assert w4_ok and b4_ok, "linear4 gradients differ from the NumPy reference"
w4_ok, b4_ok

(True, True)

In [19]:
# Compute gradients manually (layer 3): push the upstream gradient back through
# linear4 (multiply by W4^T), then apply the affine-layer formulas with input h2.
h3_grad = np.dot(manual_output_grad, weights4.T)
manual_w3_grad = np.dot(h2.T, h3_grad)
manual_b3_grad = h3_grad.sum(axis=0)

# Compare with a tolerance instead of exact float equality, and assert so that a
# mismatch stops "Run All" instead of only displaying False.
w3_ok = np.allclose(w3_grad, manual_w3_grad, rtol=1e-5, atol=1e-6)
b3_ok = np.allclose(b3_grad, manual_b3_grad, rtol=1e-5, atol=1e-6)
assert w3_ok and b3_ok, "linear3 gradients differ from the NumPy reference"
w3_ok, b3_ok

(True, True)

In [20]:
# Compute gradients manually (layer 2): push the gradient back through linear3
# (multiply by W3^T), then apply the affine-layer formulas with input h1.
h2_grad = np.dot(h3_grad, weights3.T)
manual_w2_grad = np.dot(h1.T, h2_grad)
manual_b2_grad = h2_grad.sum(axis=0)

# Compare with a tolerance instead of exact float equality, and assert so that a
# mismatch stops "Run All" instead of only displaying False.
w2_ok = np.allclose(w2_grad, manual_w2_grad, rtol=1e-5, atol=1e-6)
b2_ok = np.allclose(b2_grad, manual_b2_grad, rtol=1e-5, atol=1e-6)
assert w2_ok and b2_ok, "linear2 gradients differ from the NumPy reference"
w2_ok, b2_ok

(True, True)

In [21]:
# Compute gradients manually (layer 1): push the gradient back through linear2
# (multiply by W2^T), then apply the affine-layer formulas with input x.
h1_grad = np.dot(h2_grad, weights2.T)
manual_w1_grad = np.dot(x.T, h1_grad)
manual_b1_grad = h1_grad.sum(axis=0)

# Compare with a tolerance instead of exact float equality, and assert so that a
# mismatch stops "Run All" instead of only displaying False.
w1_ok = np.allclose(w1_grad, manual_w1_grad, rtol=1e-5, atol=1e-6)
b1_ok = np.allclose(b1_grad, manual_b1_grad, rtol=1e-5, atol=1e-6)
assert w1_ok and b1_ok, "linear1 gradients differ from the NumPy reference"
w1_ok, b1_ok

(True, True)

### Test 3: Activation functions (ReLU, Sigmoid, Tanh) in a network with 4 linear layers

In [22]:
# Reproducible (6, 16) float32 input, also wrapped as a KTorch Tensor.
np.random.seed(0)
x = np.random.standard_normal((6, 16)).astype(np.float32)
x_tensor = Tensor(x)
x.shape

(6, 16)

In [23]:
# Build the network: four Linear layers interleaved with ReLU, Sigmoid and Tanh.
linear1 = Linear(16, 32)
relu = ReLU()
linear2 = Linear(32, 64)
sigmoid = Sigmoid()
linear3 = Linear(64, 32)
tanh = Tanh()
linear4 = Linear(32, 16)

# Feed the input through every stage in order.
my_output = x_tensor
for layer in (linear1, relu, linear2, sigmoid, linear3, tanh, linear4):
    my_output = layer.forward(my_output)

(my_output.shape, type(my_output))

((6, 16), autograd.engine.Tensor)

In [24]:
from scipy.special import erf  # NOTE(review): not used in the visible cells

# Manual forward pass: pull every layer's parameters out as NumPy arrays.
weights1, bias1 = linear1.weight.numpy(), linear1.bias.numpy()
weights2, bias2 = linear2.weight.numpy(), linear2.bias.numpy()
weights3, bias3 = linear3.weight.numpy(), linear3.bias.numpy()
weights4, bias4 = linear4.weight.numpy(), linear4.bias.numpy()

# Mirror the KTorch network stage by stage in NumPy; h1..h6 are kept for the
# manual backward pass.
h1 = np.dot(x, weights1) + bias1   # linear1
h2 = np.maximum(h1, 0)             # ReLU
h3 = np.dot(h2, weights2) + bias2  # linear2
h4 = 1 / (1 + np.exp(-h3))         # Sigmoid
h5 = np.dot(h4, weights3) + bias3  # linear3
h6 = np.tanh(h5)                   # Tanh

# Print the shape of every intermediate activation, in forward order.
for hidden in (h1, h2, h3, h4, h5, h6):
    print(hidden.shape)

manual_output = np.dot(h6, weights4) + bias4  # linear4
manual_output.shape

(6, 32)
(6, 32)
(6, 64)
(6, 64)
(6, 32)
(6, 32)


(6, 16)

In [25]:
# Compare outputs with a floating-point tolerance instead of exact equality: the
# hand-written sigmoid/tanh need not be bit-identical to the library's versions.
# Assert so that a mismatch stops "Run All" instead of only displaying False.
outputs_match = np.allclose(my_output.numpy(), manual_output, rtol=1e-5, atol=1e-6)
assert outputs_match, "KTorch output with activations differs from the NumPy reference"
outputs_match

True

In [26]:
# Backprop ktorch: backward() is called on the non-scalar output without an
# explicit upstream gradient. The manual reference in the following cells uses
# np.ones_like(manual_output) as the upstream gradient, so an all-ones seed is
# presumably what backward() uses here -- confirm in autograd.
my_output.backward()

In [27]:
# Collect the gradients that backward() stored on each Linear layer's parameters.
w1_grad, b1_grad = linear1.weight.grad, linear1.bias.grad
w2_grad, b2_grad = linear2.weight.grad, linear2.bias.grad
w3_grad, b3_grad = linear3.weight.grad, linear3.bias.grad
w4_grad, b4_grad = linear4.weight.grad, linear4.bias.grad

# Every gradient should have the same shape as the parameter it belongs to.
(
    w1_grad.shape == weights1.shape,
    b1_grad.shape == bias1.shape,
    w2_grad.shape == weights2.shape,
    b2_grad.shape == bias2.shape,
    w3_grad.shape == weights3.shape,
    b3_grad.shape == bias3.shape,
    w4_grad.shape == weights4.shape,
    b4_grad.shape == bias4.shape,
)

(True, True, True, True, True, True, True, True)

In [28]:
# Compute gradients manually (output layer, linear4): out = h6 . W4 + b4,
# seeded with an all-ones upstream gradient.
manual_output_grad = np.ones_like(manual_output)
manual_w4_grad = np.dot(h6.T, manual_output_grad)
manual_b4_grad = manual_output_grad.sum(axis=0)

# Compare with a tolerance instead of exact float equality, and assert so that a
# mismatch stops "Run All" instead of only displaying False.
w4_ok = np.allclose(w4_grad, manual_w4_grad, rtol=1e-5, atol=1e-6)
b4_ok = np.allclose(b4_grad, manual_b4_grad, rtol=1e-5, atol=1e-6)
assert w4_ok and b4_ok, "linear4 gradients differ from the NumPy reference"
w4_ok, b4_ok

(True, True)

In [29]:
# Compute gradients manually (Tanh + linear3).
# Back through linear4 (multiply by W4^T), then through tanh: since
# h6 = tanh(h5), the local derivative is 1 - h6**2.
h6_grad = np.dot(manual_output_grad, weights4.T)
h5_grad = h6_grad * (1 - h6**2)
manual_w3_grad = np.dot(h4.T, h5_grad)
manual_b3_grad = h5_grad.sum(axis=0)

# Compare with a tolerance instead of exact float equality, and assert so that a
# mismatch stops "Run All" instead of only displaying False.
w3_ok = np.allclose(w3_grad, manual_w3_grad, rtol=1e-5, atol=1e-6)
b3_ok = np.allclose(b3_grad, manual_b3_grad, rtol=1e-5, atol=1e-6)
assert w3_ok and b3_ok, "linear3 gradients differ from the NumPy reference"
w3_ok, b3_ok

(True, True)

In [30]:
# Compute gradients manually (Sigmoid + linear2).
# Back through linear3 (multiply by W3^T), then through the sigmoid: since
# h4 = sigmoid(h3), the local derivative is h4 * (1 - h4).
h4_grad = np.dot(h5_grad, weights3.T)
h3_grad = h4_grad * h4 * (1 - h4)
manual_w2_grad = np.dot(h2.T, h3_grad)
manual_b2_grad = h3_grad.sum(axis=0)

# Compare with a tolerance instead of exact float equality, and assert so that a
# mismatch stops "Run All" instead of only displaying False.
w2_ok = np.allclose(w2_grad, manual_w2_grad, rtol=1e-5, atol=1e-6)
b2_ok = np.allclose(b2_grad, manual_b2_grad, rtol=1e-5, atol=1e-6)
assert w2_ok and b2_ok, "linear2 gradients differ from the NumPy reference"
w2_ok, b2_ok

(True, True)

In [31]:
# Compute gradients manually (ReLU + linear1).
# Back through linear2 (multiply by W2^T), then through the ReLU: since
# h2 = max(h1, 0), the gradient passes only where h2 > 0.
h2_grad = np.dot(h3_grad, weights2.T)
h1_grad = h2_grad * (h2 > 0)
manual_w1_grad = np.dot(x.T, h1_grad)
manual_b1_grad = h1_grad.sum(axis=0)

# Compare with a tolerance instead of exact float equality, and assert so that a
# mismatch stops "Run All" instead of only displaying False.
w1_ok = np.allclose(w1_grad, manual_w1_grad, rtol=1e-5, atol=1e-6)
b1_ok = np.allclose(b1_grad, manual_b1_grad, rtol=1e-5, atol=1e-6)
assert w1_ok and b1_ok, "linear1 gradients differ from the NumPy reference"
w1_ok, b1_ok

(True, True)