# Compare Brunoflow to PyTorch and validate that it works as expected

In [1]:
import numpy as np
import brunoflow as bf
from brunoflow import Node
import torch
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


### Simple scalar math operations

In [None]:
# Brunoflow: build z = x^2 + y^3 + a from scalar parameters and backpropagate from z.
x_bf = bf.Parameter(5, name="x")
y_bf = bf.Parameter(10, name="y")
a_bf = bf.Parameter(1, name="a")
z_bf = x_bf * x_bf + y_bf * y_bf * y_bf + a_bf
# z_bf.name = "z"
z_bf.backprop()
# Expected: z = 25 + 1000 + 1 = 1026, dz/dx = 2x = 10, dz/dy = 3y^2 = 300.
# (dz/da is not printed by either framework in this cell.)
print("BF results:", z_bf.val, x_bf.grad, y_bf.grad)

# PyTorch: the same expression on float32 leaf tensors that track gradients.
x_pt = torch.tensor(5, dtype=torch.float32, requires_grad=True)
y_pt = torch.tensor(10, dtype=torch.float32, requires_grad=True)
a_pt = torch.tensor(1, dtype=torch.float32, requires_grad=True)
z_pt = x_pt * x_pt + y_pt * y_pt * y_pt + a_pt
# z_pt is a scalar, so backward() needs no explicit `gradient` argument.
z_pt.backward()
print("PT results:", z_pt.item(), x_pt.grad, y_pt.grad)

BF results: 1026 10.0 300.0
PT results: 1026.0 tensor(10.) tensor(300.)


### Simple array math operations

#### Simplest case
$$z = x^2 + y^3$$

In [None]:
# Brunoflow: element-wise z = x^2 + y^3 on length-2 vectors, then backpropagate from z.
x_bf = bf.Parameter(np.array([1.0, 5.0]), name="x")
y_bf = bf.Parameter(np.array([3.0, 10.0]), name="y")
z_bf = x_bf * x_bf + y_bf * y_bf * y_bf
z_bf.backprop()
print("BF results:", z_bf.val, x_bf.grad, y_bf.grad)

# PyTorch: the same element-wise expression on float32 leaf tensors.
x_pt = torch.tensor([1, 5], dtype=torch.float32, requires_grad=True)
y_pt = torch.tensor([3, 10], dtype=torch.float32, requires_grad=True)
z_pt = x_pt * x_pt + y_pt * y_pt * y_pt

# `z_pt` is a vector, so `backward` needs a `gradient` argument with the same shape as `z_pt`.
# Inputs X and Y and the output Z all have shape (2,), so the full Jacobian dz/dx is (2, 2):
#   one entry for each (output component, input component) pair.
# `gradient` holds the weights used to sum over the output components (a vector-Jacobian product),
#   which leaves one gradient value per input component. All-ones weights add dz_1/dx and dz_2/dx
#   with equal weight.
# See https://stackoverflow.com/questions/43451125/pytorch-what-are-the-gradient-arguments for more info.
gradient_weights = torch.FloatTensor([1., 1])
z_pt.backward(gradient=gradient_weights)
print("PT results:", z_pt, x_pt.grad, y_pt.grad)

BF results: [  28. 1025.] [ 2. 10.] [ 27. 300.]
PT results: tensor([  28., 1025.], grad_fn=<AddBackward0>) tensor([ 2., 10.]) tensor([ 27., 300.])


#### Slightly more complicated array math
$$z_i = x_i^2 + y_i^3 + \sum_j x_j y_j$$

In [None]:
# Brunoflow: z = x^2 + y^3 + sum(x * y).
# Python's built-in `sum` folds the element-wise product x * y into a single value,
# which is then added to every component of z.
x_bf = bf.Parameter(np.array([1.0, 5.0]))
y_bf = bf.Parameter(np.array([3.0, 10.0]))
z_bf = x_bf * x_bf + y_bf * y_bf * y_bf + sum(x_bf * y_bf)
z_bf.backprop()
print("BF results:", z_bf.val, x_bf.grad, y_bf.grad)

# PyTorch: same expression; all-ones `gradient` weights sum the contributions of both
# output components, so the shared sum term contributes twice to each input gradient.
x_pt = torch.tensor([1, 5], dtype=torch.float32, requires_grad=True)
y_pt = torch.tensor([3, 10], dtype=torch.float32, requires_grad=True)
z_pt = x_pt * x_pt + y_pt * y_pt * y_pt + sum(x_pt * y_pt)
z_pt.backward(gradient=torch.FloatTensor([1., 1]))
print("PT results:", z_pt, x_pt.grad, y_pt.grad)

BF results: [  81. 1078.] [ 8. 30.] [ 29. 310.]
PT results: tensor([  81., 1078.], grad_fn=<AddBackward0>) tensor([ 8., 30.]) tensor([ 29., 310.])


### Linear Layers (with weights and biases all initialized to 1)

#### Brief tangent about using BF - you need to wrap everything in a Node or Parameter!

In [None]:
# Brunoflow WITHOUT np arrays wrapped in nodes:
# the layer still runs and backprop still works, but the raw ndarray input has no `.grad`.
ff1_bf = bf.net.LinearInitToOne(3, 1)
input_bf = np.expand_dims(np.array(range(0, 3)), axis=0)  # [[0, 1, 2]], shape (1, 3)
output_bf = ff1_bf(input_bf)
output_bf.backprop()
print("output_bf.grad", output_bf.grad)
try:
    print("input_bf.grad", input_bf.grad)
except AttributeError as e:
    print(f"computing input_bf.grad fails! with the following error: {e}")

# The graph holds the raw ndarray itself at this position rather than a Node.
assert(isinstance(output_bf.inputs[0].inputs[0], np.ndarray))

# Brunoflow WITH np arrays wrapped in nodes:
# the input is now part of the graph, so its gradient is tracked and can be read back.
ff1_bf = bf.net.LinearInitToOne(3, 1)
input_bf = Node(np.expand_dims(np.array(range(0, 3)), axis=0))
output_bf = ff1_bf(input_bf)
output_bf.backprop()
print("output_bf.grad", output_bf.grad)
print("input_bf.grad", input_bf.grad)
# The same graph position now holds a Node.
assert(isinstance(output_bf.inputs[0].inputs[0], Node))

output_bf.grad [[1.]]
computing input_bf.grad fails! with the following error: 'numpy.ndarray' object has no attribute 'grad'
output_bf.grad [[1.]]
input_bf.grad [[1. 1. 1.]]


  + np.matmul(__np_matrix_transpose(-A_factor * np.log(A_factor)), out_abs_val_grad),
  + np.matmul(__np_matrix_transpose(-A_factor * np.log(A_factor)), out_abs_val_grad),


#### Ok, now actually comparing BF to Pytorch for a linear network with a single output

In [None]:
# Brunoflow: 3 -> 1 linear layer created with LinearInitToOne, applied to input [[0, 1, 2]].
ff1_bf = bf.net.LinearInitToOne(3, 1)
input_bf = Node(np.expand_dims(np.array(range(0, 3)), axis=0))
output_bf = ff1_bf(input_bf)
output_bf.backprop()
print("BF results:")
print("output_bf.val", output_bf.val)
print("output_bf.grad", output_bf.grad)
print("input_bf.grad", input_bf.grad)
print()

# PyTorch: the same layer, with weight and bias overwritten by ones to mirror the BF layer.
ff1_pt = nn.Linear(3, 1)
with torch.no_grad():
    ff1_pt.weight = nn.Parameter(torch.ones_like(ff1_pt.weight))
    ff1_pt.bias = nn.Parameter(torch.ones_like(ff1_pt.bias))

input_pt = torch.tensor(np.expand_dims(np.array(range(0, 3)), axis=0), dtype=torch.float32, requires_grad=True)
output_pt = ff1_pt(input_pt)

# NOTE: the two bare expressions below are not displayed (they are not the last expression
# of the cell); they are kept as annotated reminders of which tensors are leaves.
input_pt.is_leaf # True, because we created it
output_pt.is_leaf # False, because it's computed by some operations involving other tensors

# Since output_pt is not a leaf node (i.e. it was not created directly by the user with requires_grad=True),
# PyTorch does not keep its .grad by default; retain_grad() asks it to keep that gradient.
output_pt.retain_grad()
# output_pt holds a single element, so backward() needs no explicit `gradient` argument.
output_pt.backward()
print("Pytorch results:")
print("output_pt value", output_pt)
print("output_pt.grad", output_pt.grad)
print("input_pt.grad", input_pt.grad)

BF results:
output_bf.val [[4.]]
output_bf.grad [[1.]]
input_bf.grad [[1. 1. 1.]]

Pytorch results:
output_pt value tensor([[4.]], grad_fn=<AddmmBackward0>)
output_pt.grad tensor([[1.]])
input_pt.grad tensor([[1., 1., 1.]])


#### BF vs PYT for linear network with multiple outputs

In [None]:
# Brunoflow: 3 -> 2 linear layer created with LinearInitToOne, applied to input [[0, 1, 2]].
ff1_bf = bf.net.LinearInitToOne(3, 2)
input_bf = Node(np.array([range(0, 3)])) # shape (1, 3)
output_bf = ff1_bf(input_bf)
output_bf.backprop()
print("BF results:")
print("output_bf.val", output_bf.val)
print("output_bf.grad", output_bf.grad)
print("input_bf.grad", input_bf.grad)
print()

# PyTorch: the same layer, with weight and bias overwritten by ones.
ff1_pt = nn.Linear(3, 2)
with torch.no_grad():
    ff1_pt.weight = nn.Parameter(torch.ones_like(ff1_pt.weight))
    ff1_pt.bias = nn.Parameter(torch.ones_like(ff1_pt.bias))

input_pt = torch.tensor(np.array([range(0, 3)]), dtype=torch.float32, requires_grad=True)
output_pt = ff1_pt(input_pt) # shape (1, 2)
output_pt.retain_grad()  # keep the gradient of this non-leaf tensor so it can be printed

# The output is not a scalar, so `backward` needs a `gradient` tensor with the output's shape, (1, 2).
# It weights each output component before the per-output gradients are summed (a vector-Jacobian product).
# With all weights equal to 1 and all layer weights equal to 1, each input receives 1 + 1 = 2,
# which is the value Brunoflow reports for input_bf.grad in the recorded output.
output_pt.backward(gradient=torch.FloatTensor([[1., 1.]]))
print("Pytorch results:")
print("output_pt value", output_pt)
print("output_pt.grad", output_pt.grad)
print("input_pt.grad", input_pt.grad)

BF results:
output_bf.val [[4. 4.]]
output_bf.grad [[1. 1.]]
input_bf.grad [[2. 2. 2.]]

Pytorch results:
output_pt value tensor([[4., 4.]], grad_fn=<AddmmBackward0>)
output_pt.grad tensor([[1., 1.]])
input_pt.grad tensor([[2., 2., 2.]])


#### BF vs PYT for linear network with multiple inputs and outputs (input shape (2, 3) -> output shape (2, 4))

In [None]:
# Brunoflow: 3 -> 4 linear layer created with LinearInitToOne, applied to a batch of two rows.
ff1_bf = bf.net.LinearInitToOne(3, 4)
input_bf = Node(np.array([range(0, 3), range(4,7)]))  # [[0, 1, 2], [4, 5, 6]], shape (2, 3)
output_bf = ff1_bf(input_bf)
output_bf.backprop()
print("BF results:")
print("output_bf.val", output_bf.val)
print("output_bf.grad", output_bf.grad)
print("input_bf.grad", input_bf.grad)
print()

# PyTorch: the same layer, with weight and bias overwritten by ones.
ff1_pt = nn.Linear(3, 4)
with torch.no_grad():
    ff1_pt.weight = nn.Parameter(torch.ones_like(ff1_pt.weight))
    ff1_pt.bias = nn.Parameter(torch.ones_like(ff1_pt.bias))

input_pt = torch.tensor(np.array([range(0, 3), range(4,7)]), dtype=torch.float32, requires_grad=True)
output_pt = ff1_pt(input_pt)  # shape (2, 4)
output_pt.retain_grad()  # keep the gradient of this non-leaf tensor so it can be printed
# All-ones weights with the output's shape (2, 4): each input element feeds four outputs
# through weights of 1, so every input gradient is 4.
output_pt.backward(gradient=torch.FloatTensor([[1., 1., 1., 1.], [1., 1., 1., 1.]]))
print("Pytorch results:")
print("output_pt value", output_pt)
print("output_pt.grad", output_pt.grad)
print("input_pt.grad", input_pt.grad)

BF results:
output_bf.val [[ 4.  4.  4.  4.]
 [16. 16. 16. 16.]]
output_bf.grad [[1. 1. 1. 1.]
 [1. 1. 1. 1.]]
input_bf.grad [[4. 4. 4.]
 [4. 4. 4.]]

Pytorch results:
output_pt value tensor([[ 4.,  4.,  4.,  4.],
        [16., 16., 16., 16.]], grad_fn=<AddmmBackward0>)
output_pt.grad tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.]])
input_pt.grad tensor([[4., 4., 4.],
        [4., 4., 4.]])


### Linear Layer with activations

In [None]:
# Smoke-test the Brunoflow activation functions on small raw arrays.
# Each call returns a node; printing it shows its name (the expression that built it), val and grad.
from brunoflow.func import softmax, log_softmax, relu
print(softmax(np.array([1, 1, 1]), axis=0))      # uniform input -> 1/3 each
print(log_softmax(np.array([1, 1, 1]), axis=0))  # log(1/3) each
print(relu(np.array([1, -1, 1])))                # the negative entry is clamped to 0

# Print PyTorch tensors with 8 decimal places in the cells that follow,
# so they can be compared digit-for-digit with the NumPy output.
torch.set_printoptions(precision=8)

node(name: (exp (- [1 1 1] (expand_dims (+ (log (sum (exp (- [1 1 1] (expand_dims (max [1 1 1] axis=0) axis=0))) axis=0)) (max [1 1 1] axis=0)) axis=0))), val: [0.33333333 0.33333333 0.33333333], grad: [0. 0. 0.])
node(name: (- [1 1 1] (expand_dims (+ (log (sum (exp (- [1 1 1] (expand_dims (max [1 1 1] axis=0) axis=0))) axis=0)) (max [1 1 1] axis=0)) axis=0)), val: [-1.09861229 -1.09861229 -1.09861229], grad: [0. 0. 0.])
node(name: (max [ 1 -1  1] 0), val: [1 0 1], grad: [0. 0. 0.])


#### Softmax - the gradients here are analytically zero, so BF's tiny nonzero values (~1e-19) are floating-point round-off and won't exactly match PyTorch's 0

In [None]:
# Shared
# input size = 3, output size = 2
output_layer_weights = np.array([0.5, 2.1], dtype=np.float64)
W = np.tile(output_layer_weights, (3, 1)) # to use tile, you give it the number of repetitions you want in each axis; result has shape (3, 2)
b = np.ones(shape=(2,), dtype=np.float64)
# input_arr = np.array([range(0, 3)])
input_arr = np.array([[1.,2.,1.]], dtype=np.float64)

# Brunoflow: linear layer with the fixed weights above, followed by softmax over the class axis.
ff1_bf = bf.net.Linear(3, 2)
ff1_bf.set_weights(W)
ff1_bf.set_bias(b)

input_bf = Node(input_arr) # shape (1, 3)
ff1_output_bf = ff1_bf(input_bf)  # expected [[3.0, 9.4]]: sum(input) = 4, so 4*0.5 + 1 and 4*2.1 + 1
softmax_bf = softmax(ff1_output_bf, axis=1)
softmax_bf.backprop()
print("BF results:")
print("softmax_bf.val", softmax_bf.val)
print("softmax_bf.grad", softmax_bf.grad)
print("ff1_output_bf.val", ff1_output_bf.val)
print("ff1_output_bf.grad", ff1_output_bf.grad)
print("input_bf.val", input_bf.val)
print("input_bf.grad", input_bf.grad)
print()

# PyTorch: same layer. nn.Linear stores its weight as (out_features, in_features),
# so the (3, 2) matrix W is transposed before being assigned.
ff1_pt = nn.Linear(3, 2)
with torch.no_grad():
    ff1_pt.weight = nn.Parameter(torch.tensor(W.transpose(), dtype=torch.float64))
    ff1_pt.bias = nn.Parameter(torch.tensor(b, dtype=torch.float64))

input_pt = torch.tensor(input_arr, dtype=torch.float64, requires_grad=True)
ff1_output_pt = ff1_pt(input_pt) # shape (1, 2)
ff1_output_pt.retain_grad()  # keep gradients of non-leaf tensors so they can be printed
softmax_pt = nn.functional.softmax(ff1_output_pt, dim=1)
softmax_pt.retain_grad()

# The output is not a scalar, so `backward` needs a `gradient` tensor with the output's shape, (1, 2).
# All-ones weights sum the gradients of both softmax outputs. Because softmax outputs always sum to 1,
# that summed gradient is analytically zero w.r.t. the logits and the input; any nonzero value printed
# is floating-point round-off.
softmax_pt.backward(gradient=torch.tensor([[1., 1.]], dtype=torch.float64))

print("Pytorch results:")
print("softmax_pt value", softmax_pt)
print("softmax_pt.grad", softmax_pt.grad)
print("ff1_output_pt value", ff1_output_pt)
print("ff1_output_pt.grad", ff1_output_pt.grad)
print("input_pt value", input_pt)
print("input_pt.grad", input_pt.grad)

BF results:
softmax_bf.val [[0.0016588 0.9983412]]
softmax_bf.grad [[1. 1.]]
ff1_output_bf.val [[3.  9.4]]
ff1_output_bf.grad [[-4.33680869e-19  0.00000000e+00]]
input_bf.val [[1. 2. 1.]]
input_bf.grad [[-2.16840434e-19 -2.16840434e-19 -2.16840434e-19]]

Pytorch results:
softmax_pt value tensor([[0.00165880, 0.99834120]], dtype=torch.float64,
       grad_fn=<SoftmaxBackward0>)
softmax_pt.grad tensor([[1., 1.]], dtype=torch.float64)
ff1_output_pt value tensor([[3.00000000, 9.40000000]], dtype=torch.float64,
       grad_fn=<AddmmBackward0>)
ff1_output_pt.grad tensor([[0., 0.]], dtype=torch.float64)
input_pt value tensor([[1., 2., 1.]], dtype=torch.float64, requires_grad=True)
input_pt.grad tensor([[0., 0., 0.]], dtype=torch.float64)


#### Log softmax - luckily, this is not nearly as unstable and you see the results match!

In [None]:
# Shared
# input size = 3, output size = 2
output_layer_weights = np.array([0.5, 2.3], dtype=np.float64)
W = np.tile(output_layer_weights, (3, 1)) # to use tile, you give it the number of repetitions you want in each axis; result has shape (3, 2)
b = np.ones(shape=(2,), dtype=np.float64)
# input_arr = np.array([range(0, 3)])
input_arr = np.array([[1.,2.,1.]], dtype=np.float64)

# Brunoflow: linear layer with the fixed weights above, followed by log-softmax over the class axis.
# NOTE: the variables and printed labels below still say "softmax" although they hold log-softmax results.
ff1_bf = bf.net.Linear(3, 2)
ff1_bf.set_weights(W)
ff1_bf.set_bias(b)

input_bf = Node(input_arr) # shape (1, 3)
ff1_output_bf = ff1_bf(input_bf)  # expected [[3.0, 10.2]]: sum(input) = 4, so 4*0.5 + 1 and 4*2.3 + 1
softmax_bf = log_softmax(ff1_output_bf, axis=1)
softmax_bf.backprop()
print("BF results:")
print("softmax_bf.val", softmax_bf.val)
print("softmax_bf.grad", softmax_bf.grad)
print("ff1_output_bf.val", ff1_output_bf.val)
print("ff1_output_bf.grad", ff1_output_bf.grad)
print("input_bf.val", input_bf.val)
print("input_bf.grad", input_bf.grad)
print()

# PyTorch: same layer. nn.Linear stores its weight as (out_features, in_features),
# so the (3, 2) matrix W is transposed before being assigned.
ff1_pt = nn.Linear(3, 2)
with torch.no_grad():
    ff1_pt.weight = nn.Parameter(torch.tensor(W.transpose(), dtype=torch.float64))
    ff1_pt.bias = nn.Parameter(torch.tensor(b, dtype=torch.float64))

input_pt = torch.tensor(input_arr, dtype=torch.float64, requires_grad=True)
ff1_output_pt = ff1_pt(input_pt) # shape (1, 2)
ff1_output_pt.retain_grad()  # keep gradients of non-leaf tensors so they can be printed
softmax_pt = nn.functional.log_softmax(ff1_output_pt, dim=1)
softmax_pt.retain_grad()

# The output is not a scalar, so `backward` needs a `gradient` tensor with the output's shape, (1, 2).
# All-ones weights sum the gradients of both log-softmax outputs; for two classes the summed gradient
# w.r.t. the logits is 1 - 2 * softmax(logits), which is clearly nonzero here, so both frameworks
# can be compared digit-for-digit.
softmax_pt.backward(gradient=torch.tensor([[1., 1.]], dtype=torch.float64))

print("Pytorch results:")
print("softmax_pt value", softmax_pt)
print("softmax_pt.grad", softmax_pt.grad)
print("ff1_output_pt value", ff1_output_pt)
print("ff1_output_pt.grad", ff1_output_pt.grad)
print("input_pt value", input_pt)
print("input_pt.grad", input_pt.grad)

BF results:
softmax_bf.val [[-7.20074631e+00 -7.46307252e-04]]
softmax_bf.grad [[1. 1.]]
ff1_output_bf.val [[ 3.  10.2]]
ff1_output_bf.grad [[ 0.99850794 -0.99850794]]
input_bf.val [[1. 2. 1.]]
input_bf.grad [[-1.7973143 -1.7973143 -1.7973143]]

Pytorch results:
softmax_pt value tensor([[-7.20074631e+00, -7.46307252e-04]], dtype=torch.float64,
       grad_fn=<LogSoftmaxBackward0>)
softmax_pt.grad tensor([[1., 1.]], dtype=torch.float64)
ff1_output_pt value tensor([[ 3.00000000, 10.20000000]], dtype=torch.float64,
       grad_fn=<AddmmBackward0>)
ff1_output_pt.grad tensor([[ 0.99850794, -0.99850794]], dtype=torch.float64)
input_pt value tensor([[1., 2., 1.]], dtype=torch.float64, requires_grad=True)
input_pt.grad tensor([[-1.79731430, -1.79731430, -1.79731430]], dtype=torch.float64)


#### ReLU

In [None]:
# Shared
# input size = 3, output size = 2
output_layer_weights = np.array([0.5, 2.3], dtype=np.float64)
W = np.tile(output_layer_weights, (3, 1)) # to use tile, you give it the number of repetitions you want in each axis; result has shape (3, 2)
b = np.ones(shape=(2,), dtype=np.float64)
# input_arr = np.array([range(0, 3)])
input_arr = np.array([[1.,2.,1.]], dtype=np.float64)

# Brunoflow: linear layer with the fixed weights above, followed by ReLU.
ff1_bf = bf.net.Linear(3, 2)
ff1_bf.set_weights(W)
ff1_bf.set_bias(b)

input_bf = Node(input_arr) # shape (1, 3)
ff1_output_bf = ff1_bf(input_bf)  # expected [[3.0, 10.2]]; both entries are positive, so ReLU passes them through
relu_bf = relu(ff1_output_bf)
relu_bf.backprop()
print("BF results:")
print("relu_bf.val", relu_bf.val)
print("relu_bf.grad", relu_bf.grad)
print("ff1_output_bf.val", ff1_output_bf.val)
print("ff1_output_bf.grad", ff1_output_bf.grad)
print("input_bf.val", input_bf.val)
print("input_bf.grad", input_bf.grad)
print()

# PyTorch: same layer. nn.Linear stores its weight as (out_features, in_features),
# so the (3, 2) matrix W is transposed before being assigned.
ff1_pt = nn.Linear(3, 2)
with torch.no_grad():
    ff1_pt.weight = nn.Parameter(torch.tensor(W.transpose(), dtype=torch.float64))
    ff1_pt.bias = nn.Parameter(torch.tensor(b, dtype=torch.float64))

input_pt = torch.tensor(input_arr, dtype=torch.float64, requires_grad=True)
ff1_output_pt = ff1_pt(input_pt) # shape (1, 2)
ff1_output_pt.retain_grad()  # keep gradients of non-leaf tensors so they can be printed
relu_pt = nn.functional.relu(ff1_output_pt)
relu_pt.retain_grad()

# The output is not a scalar, so `backward` needs a `gradient` tensor with the output's shape, (1, 2).
# All-ones weights sum the gradients of both outputs; with both pre-activations positive the ReLU
# gradient is 1, so each input receives 0.5 + 2.3 = 2.8.
relu_pt.backward(gradient=torch.tensor([[1., 1.]], dtype=torch.float64))

print("Pytorch results:")
print("relu_pt value", relu_pt)
print("relu_pt.grad", relu_pt.grad)
print("ff1_output_pt value", ff1_output_pt)
print("ff1_output_pt.grad", ff1_output_pt.grad)
print("input_pt value", input_pt)
print("input_pt.grad", input_pt.grad)

BF results:
relu_bf.val [[ 3.  10.2]]
relu_bf.grad [[1. 1.]]
ff1_output_bf.val [[ 3.  10.2]]
ff1_output_bf.grad [[1. 1.]]
input_bf.val [[1. 2. 1.]]
input_bf.grad [[2.8 2.8 2.8]]

Pytorch results:
relu_pt value tensor([[ 3.00000000, 10.20000000]], dtype=torch.float64,
       grad_fn=<ReluBackward0>)
relu_pt.grad tensor([[1., 1.]], dtype=torch.float64)
ff1_output_pt value tensor([[ 3.00000000, 10.20000000]], dtype=torch.float64,
       grad_fn=<AddmmBackward0>)
ff1_output_pt.grad tensor([[1., 1.]], dtype=torch.float64)
input_pt value tensor([[1., 2., 1.]], dtype=torch.float64, requires_grad=True)
input_pt.grad tensor([[2.80000000, 2.80000000, 2.80000000]], dtype=torch.float64)


  + (-np.abs(l_adj) * np.log(np.abs(l_adj)) * out_grad_and_entropy_dict["out_abs_val_grad"])
  + (-np.abs(l_adj) * np.log(np.abs(l_adj)) * out_grad_and_entropy_dict["out_abs_val_grad"])


### How about with some loss?

In [None]:
from brunoflow.opt import cross_entropy_loss

In [None]:
# Shared
# input size = 3, output size = 8 class logits (reduced to a scalar by the cross-entropy loss)
input_arr = np.array([[1.,2.,1.]], dtype=np.float64)
target = np.array([0])  # target class index for the single example

# Brunoflow: 3 -> 8 linear layer with its default initialisation, followed by cross-entropy loss.
ff1_bf = bf.net.Linear(3, 8)
# Capture BF's weights so the PyTorch layer below can be given identical parameters.
W_ff1 = ff1_bf.W.val
b_ff1 = ff1_bf.b.val

input_bf = Node(input_arr) # shape (1, 3)
ff1_output_bf = ff1_bf(input_bf)
loss_bf = cross_entropy_loss(ff1_output_bf, target=target)
loss_bf.backprop()

# PyTorch: same layer. nn.Linear stores its weight as (out_features, in_features),
# so BF's weight matrix is transposed before being assigned.
ff1_pt = nn.Linear(3, 8)

with torch.no_grad():
    ff1_pt.weight = nn.Parameter(torch.tensor(W_ff1.transpose(), dtype=torch.float64))
    ff1_pt.bias = nn.Parameter(torch.tensor(b_ff1, dtype=torch.float64))

input_pt = torch.tensor(input_arr, dtype=torch.float64, requires_grad=True)
ff1_output_pt = ff1_pt(input_pt) # shape (1, 8)
ff1_output_pt.retain_grad()  # keep gradients of non-leaf tensors so they can be printed
loss_pt = nn.functional.cross_entropy(ff1_output_pt, target=torch.tensor(target))
loss_pt.retain_grad()

# Since the loss is a scalar, `backward` does not need a `gradient` argument,
# but to be super explicit here it is as just...the scalar 1!
loss_pt.backward(gradient=torch.tensor(1.))

# Check near-equality of the forward values, layer by layer.
# Each assert carries a message so that the handler below reports *which* comparison failed
# (a bare assert raises an AssertionError whose text is empty).
try:
    assert torch.allclose(loss_pt, torch.tensor(loss_bf.val)), "loss values differ"
    assert torch.allclose(ff1_output_pt, torch.tensor(ff1_output_bf.val)), "ff1 output values differ"
    assert torch.allclose(input_pt, torch.tensor(input_bf.val)), "input values differ"
    print("All layers have close enough values between BF and pytorch!")
except AssertionError as e:
    print(f"Uhoh! torch and bf had a difference with following error - {e}")

    # Dump both sides in full so the mismatch can be inspected.
    print("BF results:")
    print("loss_bf", loss_bf)
    print("ff1_output_bf", ff1_output_bf)
    print("input_bf", input_bf)
    print()

    print("Pytorch results:")
    print("loss_pt value", loss_pt)
    print("loss_pt.grad", loss_pt.grad)
    print("ff1_output_pt value", ff1_output_pt)
    print("ff1_output_pt.grad", ff1_output_pt.grad)
    print("input_pt value", input_pt)
    print("input_pt.grad", input_pt.grad)

All layers have close enough values between BF and pytorch!


### Time for some MLPs!

In [None]:
# Shared
# input size = 3, hidden size = 10, output size = 2
input_arr = np.array([[1.,2.,1.]], dtype=np.float64)

# Brunoflow: Linear(3, 10) -> ReLU -> Linear(10, 2) -> log-softmax, with default initialisation.
ff1_bf = bf.net.Linear(3, 10)
ff2_bf = bf.net.Linear(10, 2)
# Capture BF's weights so the PyTorch layers below can be given identical parameters.
W_ff1 = ff1_bf.W.val
b_ff1 = ff1_bf.b.val
W_ff2 = ff2_bf.W.val
b_ff2 = ff2_bf.b.val

input_bf = Node(input_arr) # shape (1, 3)
ff1_output_bf = ff1_bf(input_bf)
relu_bf = relu(ff1_output_bf)
ff2_output_bf = ff2_bf(relu_bf)
log_softmax_bf = log_softmax(ff2_output_bf, axis=1)
log_softmax_bf.backprop()

# PyTorch: same network. nn.Linear stores its weight as (out_features, in_features),
# so BF's weight matrices are transposed before being assigned.
ff1_pt = nn.Linear(3, 10)
ff2_pt = nn.Linear(10, 2)

with torch.no_grad():
    ff1_pt.weight = nn.Parameter(torch.tensor(W_ff1.transpose(), dtype=torch.float64))
    ff1_pt.bias = nn.Parameter(torch.tensor(b_ff1, dtype=torch.float64))
    ff2_pt.weight = nn.Parameter(torch.tensor(W_ff2.transpose(), dtype=torch.float64))
    ff2_pt.bias = nn.Parameter(torch.tensor(b_ff2, dtype=torch.float64))

input_pt = torch.tensor(input_arr, dtype=torch.float64, requires_grad=True)
ff1_output_pt = ff1_pt(input_pt) # shape (1, 10)
ff1_output_pt.retain_grad()  # keep gradients of non-leaf tensors so they can be printed
relu_pt = nn.functional.relu(ff1_output_pt)
relu_pt.retain_grad()
ff2_output_pt = ff2_pt(relu_pt) # shape (1, 2)
ff2_output_pt.retain_grad()
log_softmax_pt = nn.functional.log_softmax(ff2_output_pt, dim=1)
log_softmax_pt.retain_grad()

# The output is not a scalar, so `backward` needs a `gradient` tensor with the output's shape, (1, 2).
# All-ones weights sum the gradients of both log-softmax outputs.
log_softmax_pt.backward(gradient=torch.tensor([[1., 1.]], dtype=torch.float64))

# Check near-equality of the forward values, layer by layer.
# Each assert carries a message so that the handler below reports *which* comparison failed
# (a bare assert raises an AssertionError whose text is empty).
try:
    assert torch.allclose(log_softmax_pt, torch.tensor(log_softmax_bf.val)), "log_softmax values differ"
    assert torch.allclose(ff2_output_pt, torch.tensor(ff2_output_bf.val)), "ff2 output values differ"
    assert torch.allclose(relu_pt, torch.tensor(relu_bf.val)), "relu values differ"
    assert torch.allclose(ff1_output_pt, torch.tensor(ff1_output_bf.val)), "ff1 output values differ"
    assert torch.allclose(input_pt, torch.tensor(input_bf.val)), "input values differ"
    print("All layers have close enough values between BF and pytorch!")
except AssertionError as e:
    print(f"Uhoh! torch and bf had a difference with following error - {e}")

    # Dump both sides in full so the mismatch can be inspected.
    print("BF results:")
    print("log_softmax_bf", log_softmax_bf)
    print("ff2_output_bf", ff2_output_bf)
    print("relu_bf", relu_bf)
    print("ff1_output_bf", ff1_output_bf)
    print("input_bf", input_bf)
    print()

    print("Pytorch results:")
    print("log_softmax_pt value", log_softmax_pt)
    print("log_softmax_pt.grad", log_softmax_pt.grad)
    print("ff2_output_pt value", ff2_output_pt)
    print("ff2_output_pt.grad", ff2_output_pt.grad)
    print("relu_pt value", relu_pt)
    print("relu_pt.grad", relu_pt.grad)
    print("ff1_output_pt value", ff1_output_pt)
    print("ff1_output_pt.grad", ff1_output_pt.grad)
    print("input_pt value", input_pt)
    print("input_pt.grad", input_pt.grad)

All layers have close enough values between BF and pytorch!


#### MLP with multiple inputs and loss! (putting it mostly all together)

In [None]:
# Shared
# input size = 3, hidden size = 10, output size = 2, batch_size = 2
input_arr = np.array([[1.,2.,1.], [2,4,6]], dtype=np.float64)
target = np.array([0, 1])  # one target class index per example in the batch

# Brunoflow: Linear(3, 10) -> ReLU -> Linear(10, 2) -> cross-entropy loss, with default initialisation.
ff1_bf = bf.net.Linear(3, 10)
ff2_bf = bf.net.Linear(10, 2)
# Capture BF's weights so the PyTorch layers below can be given identical parameters.
W_ff1 = ff1_bf.W.val
b_ff1 = ff1_bf.b.val
W_ff2 = ff2_bf.W.val
b_ff2 = ff2_bf.b.val

input_bf = Node(input_arr) # shape (2, 3)
ff1_output_bf = ff1_bf(input_bf)
relu_bf = relu(ff1_output_bf)
ff2_output_bf = ff2_bf(relu_bf)
loss_bf = cross_entropy_loss(ff2_output_bf, target=target)
loss_bf.backprop()

# PyTorch: same network. nn.Linear stores its weight as (out_features, in_features),
# so BF's weight matrices are transposed before being assigned.
ff1_pt = nn.Linear(3, 10)
ff2_pt = nn.Linear(10, 2)

with torch.no_grad():
    ff1_pt.weight = nn.Parameter(torch.tensor(W_ff1.transpose(), dtype=torch.float64))
    ff1_pt.bias = nn.Parameter(torch.tensor(b_ff1, dtype=torch.float64))
    ff2_pt.weight = nn.Parameter(torch.tensor(W_ff2.transpose(), dtype=torch.float64))
    ff2_pt.bias = nn.Parameter(torch.tensor(b_ff2, dtype=torch.float64))

input_pt = torch.tensor(input_arr, dtype=torch.float64, requires_grad=True)
ff1_output_pt = ff1_pt(input_pt) # shape (2, 10)
ff1_output_pt.retain_grad()  # keep gradients of non-leaf tensors so they can be printed
relu_pt = nn.functional.relu(ff1_output_pt)
relu_pt.retain_grad()
ff2_output_pt = ff2_pt(relu_pt) # shape (2, 2)
ff2_output_pt.retain_grad()
loss_pt = nn.functional.cross_entropy(ff2_output_pt, target=torch.tensor(target))
loss_pt.retain_grad()

# Since the loss is a scalar, `backward` does not need a `gradient` argument,
# but to be super explicit here it is as just...the scalar 1!
loss_pt.backward(gradient=torch.tensor(1.))

# Check near-equality of the forward values, layer by layer.
# Each assert carries a message so that the handler below reports *which* comparison failed
# (a bare assert raises an AssertionError whose text is empty).
try:
    assert torch.allclose(loss_pt, torch.tensor(loss_bf.val)), "loss values differ"
    assert torch.allclose(ff2_output_pt, torch.tensor(ff2_output_bf.val)), "ff2 output values differ"
    assert torch.allclose(relu_pt, torch.tensor(relu_bf.val)), "relu values differ"
    assert torch.allclose(ff1_output_pt, torch.tensor(ff1_output_bf.val)), "ff1 output values differ"
    assert torch.allclose(input_pt, torch.tensor(input_bf.val)), "input values differ"
    print("All layers have close enough values between BF and pytorch!")
except AssertionError as e:
    print(f"Uhoh! torch and bf had a difference with following error - {e}")

    # Dump both sides in full so the mismatch can be inspected.
    print("BF results:")
    print("loss_bf", loss_bf)
    print("ff2_output_bf", ff2_output_bf)
    print("relu_bf", relu_bf)
    print("ff1_output_bf", ff1_output_bf)
    print("input_bf", input_bf)
    print()

    print("Pytorch results:")
    print("loss_pt value", loss_pt)
    print("loss_pt.grad", loss_pt.grad)
    print("ff2_output_pt value", ff2_output_pt)
    print("ff2_output_pt.grad", ff2_output_pt.grad)
    print("relu_pt value", relu_pt)
    print("relu_pt.grad", relu_pt.grad)
    print("ff1_output_pt value", ff1_output_pt)
    print("ff1_output_pt.grad", ff1_output_pt.grad)
    print("input_pt value", input_pt)
    print("input_pt.grad", input_pt.grad)

All layers have close enough values between BF and pytorch!


In [None]:
# Marker that the comparison cells above ran to completion ("Fertig" is German for "done").
print("Fertig!")

Fertig!


In [30]:
# Compare the input gradient of a single output logit between PyTorch and a jax-backed Brunoflow node.
from jax import numpy as jnp

# PyTorch: fix the seed so the randomly initialised Linear layer is reproducible.
torch.manual_seed(0)
x = torch.tensor([[1., 2., 3.]], requires_grad=True)
target_class = 8  # index of the output logit to differentiate
linear_torch = torch.nn.Linear(3, 10)
out = linear_torch(x)
print("x torch grad before backward:", x.grad)
# Selecting one logit leaves a single-element tensor, so backward() needs no `gradient` argument.
out[:, target_class].backward()
print("x torch grad after backward:",x.grad)

# Brunoflow: same input and the same parameters, copied over from the PyTorch layer.
x_bf = bf.Node(jnp.array(x.detach().numpy()), name="x_bf")
linear_bf = bf.net.Linear(3, 10)
# NOTE(review): the weight is copied without a transpose and through `.weight`/`.bias`, unlike the
# earlier cells that read `.W`/`.b` and transpose for PyTorch - presumably this Brunoflow version
# stores the weight in PyTorch's (out_features, in_features) layout; confirm.
linear_bf.weight.val = jnp.array(linear_torch.weight.detach().numpy())
linear_bf.bias.val = jnp.array(linear_torch.bias.detach().numpy())
out_bf = linear_bf(x_bf)
print("x bf grad before backward:", x_bf.grad)
# Backpropagate from the same single logit as on the PyTorch side.
out_bf[0, target_class].backprop()
print(out_bf.shape)
print("x bf grad after backward:", x_bf.grad)

x torch grad before backward: None
x torch grad after backward: tensor([[-0.0931,  0.0611,  0.5228]])
x bf grad before backward: [[0. 0. 0.]]
(1, 10)
x bf grad after backward: [[-0.09305926  0.0610918   0.5227769 ]]


In [33]:
# Compare the input gradient summed over all output logits between PyTorch and a jax-backed Brunoflow node.
from jax import numpy as jnp

# PyTorch: fix the seed so the randomly initialised Linear layer is reproducible.
torch.manual_seed(0)
x = torch.tensor([[1., 2., 3.]], requires_grad=True)
target_class = 8  # not used in this cell; every logit contributes to the gradient below
linear_torch = torch.nn.Linear(3, 10)
out = linear_torch(x)
print("x torch grad before backward:", x.grad)
# All-ones weights with the output's shape: sums the gradients of every logit w.r.t. x.
out.backward(gradient=torch.ones_like(out))
print("x torch grad after backward:",x.grad)

# Brunoflow: same input and the same parameters, copied over from the PyTorch layer.
x_bf = bf.Node(jnp.array(x.detach().numpy()), name="x_bf")
linear_bf = bf.net.Linear(3, 10)
# NOTE(review): the weight is copied without a transpose and through `.weight`/`.bias`, unlike the
# earlier cells that read `.W`/`.b` and transpose for PyTorch - presumably this Brunoflow version
# stores the weight in PyTorch's (out_features, in_features) layout; confirm.
linear_bf.weight.val = jnp.array(linear_torch.weight.detach().numpy())
linear_bf.bias.val = jnp.array(linear_torch.bias.detach().numpy())
out_bf = linear_bf(x_bf)
print("x bf grad before backward:", x_bf.grad)
# Backpropagating from the whole (1, 10) output yields the same x gradient as the all-ones
# PyTorch call above, per the recorded output.
out_bf.backprop()
print(out_bf.shape)
print("x bf grad after backward:", x_bf.grad)

x torch grad before backward: None
x torch grad after backward: tensor([[-1.3587, -0.4561,  0.6417]])
x bf grad before backward: [[0. 0. 0.]]
(1, 10)
x bf grad after backward: [[-1.3587005 -0.4560883  0.6416915]]
