## Interface Decision

* Two classes: Tensor, DifferentiableOperator

In [62]:
import numpy as np

In [268]:
class Tensor:
    """Leaf value of the computation graph: a payload plus its gradient.

    ``data`` holds the value, ``grad`` holds whatever gradient was last
    handed to ``backward`` (``None`` until then).
    """

    # Class-level defaults, shadowed per instance once assigned.
    data = None
    grad = None
    requires_grad = True

    def __init__(self, data):
        """Wrap ``data`` as the tensor's value."""
        self.data = data

    def backward(self, grad):
        """Store ``grad`` as this tensor's gradient."""
        # TODO: accumulate into the existing gradient instead of replacing it.
        self.grad = grad

    def update(self, lr=0.1):
        """Take one gradient-descent step of size ``lr`` on ``data``."""
        step = lr * self.grad
        # Rebinds ``data`` to a new value rather than modifying it in place.
        self.data = self.data - step

In [64]:
class DifferentiableOperator:
    """Base interface for operators that can be run forward and differentiated.

    Both methods are no-op placeholders that return ``None``; concrete
    operators override them.
    """

    def forward(self, *args):
        """Compute the operator's result from its input tensors (placeholder)."""
        return None

    def backward(self, output):
        """Propagate the gradient stored on ``output`` to the inputs (placeholder)."""
        return None

In [65]:
class ResultTensor(Tensor):
    """Tensor produced by an operator; remembers how it was computed.

    ``fn`` is the operator instance that created this tensor and ``args`` is
    the tuple of input tensors that operator was applied to.
    """

    fn = None
    args = None

    def __init__(self, data, fn, *args):
        """Store the value together with its producing operator and inputs."""
        super().__init__(data)
        self.fn = fn
        self.args = args

    def backward(self, grad):
        """Record ``grad`` on this tensor, then let ``fn`` push it to the inputs."""
        super().backward(grad)
        # The operator reads ``self.grad`` and ``self.args`` to do its work.
        self.fn.backward(self)

In [66]:
class MatrixMultiply(DifferentiableOperator):
    """Matrix product of two tensors, ``m1 @ m2``."""

    def forward(self, m1, m2):
        """Return a ResultTensor holding ``np.matmul(m1.data, m2.data)``."""
        product = np.matmul(m1.data, m2.data)  # np.dot also works
        return ResultTensor(product, self, m1, m2)

    def backward(self, output):
        """Fill in the gradients of both factors from ``output.grad``.

        With ``C = A @ B`` and upstream gradient ``G`` (same shape as ``C``):
        ``dA = G @ B.T`` and ``dB = A.T @ G``.
        """
        left, right = output.args
        upstream = output.grad

        # Compute both gradients before propagating either of them.
        left_grad = np.matmul(upstream, right.data.T)
        right_grad = np.matmul(left.data.T, upstream)

        left.backward(left_grad)
        right.backward(right_grad)

In [67]:
# Smoke test for MatrixMultiply: (2, 3) @ (3, 2) -> (2, 2).
a = Tensor(np.random.rand(2, 3))
b = Tensor(np.random.rand(3, 2))
print(a.data, b.data, sep="\n")

c = MatrixMultiply().forward(a, b)
print(c.data)

# An all-ones upstream gradient is what d(sum(c))/dc would be.
c.backward(np.ones(c.data.shape))

[[0.23783741 0.43212538 0.28352959]
 [0.7753874  0.57462035 0.07725642]]
[[0.55873305 0.36287997]
 [0.1699361  0.4447101 ]
 [0.6790102  0.50643311]]
[[0.39884081 0.42206573]
 [0.5833412  0.57603724]]


In [74]:
# Display the gradient that the preceding c.backward(...) call stored on `a`.
a.grad

array([[0.92161303, 0.6146462 , 1.18544332],
       [0.92161303, 0.6146462 , 1.18544332]])

### Test whether it works

In [73]:
import torch
# Reference computation in PyTorch using the same values as `a` and `b`.
a_p = torch.tensor(a.data, requires_grad=True)
b_p = torch.tensor(b.data, requires_grad=True)

c_p = torch.mm(a_p, b_p)
# Summing the product makes the upstream gradient all ones.
loss = torch.sum(c_p)
loss.backward()

print(a_p.grad)

tensor([[0.9216, 0.6146, 1.1854],
        [0.9216, 0.6146, 1.1854]], dtype=torch.float64)


### Other functions
* Relu
* AddBias
* MSE
 * Make a single fused MSE op (Caffe style)
 * Compose Subtraction, Square and Mean ops (TensorFlow style)

* CE
* Softmax: s(1-s)

In [147]:
class Relu(DifferentiableOperator):
    """Element-wise rectifier: keeps positive entries, zeroes the rest."""

    def forward(self, x):
        """Return a ResultTensor with every non-positive entry of ``x.data`` set to 0."""
        is_positive = x.data > 0
        rectified = np.where(is_positive, x.data, 0)
        return ResultTensor(rectified, self, x)

    def backward(self, output):
        """Pass ``output.grad`` through where the input was positive, zero elsewhere."""
        (x,) = output.args
        # 1 where the input was strictly positive, 0 otherwise.
        gate = np.where(x.data > 0, 1, 0)
        x.backward(output.grad * gate)

In [95]:
class Subtraction(DifferentiableOperator):
    """Element-wise difference ``x1 - x2``."""

    def forward(self, x1, x2):
        """Return a ResultTensor holding ``x1.data - x2.data``."""
        difference = x1.data - x2.data
        return ResultTensor(difference, self, x1, x2)

    def backward(self, output):
        """Send ``output.grad`` to the first operand and its negation to the second."""
        minuend, subtrahend = output.args
        upstream = output.grad
        negated = -1 * upstream

        minuend.backward(upstream)
        subtrahend.backward(negated)

In [128]:
class Square(DifferentiableOperator):
    """Element-wise square ``x ** 2``."""

    def forward(self, x):
        """Return a ResultTensor holding ``x.data ** 2``."""
        squared = x.data ** 2
        return ResultTensor(squared, self, x)

    def backward(self, output):
        """Propagate ``2 * output.grad * x.data`` (chain rule with d(x^2)/dx = 2x)."""
        (x,) = output.args
        x.backward(2 * output.grad * x.data)

In [144]:
class Mean(DifferentiableOperator):
    """Mean over every element of the input."""

    def forward(self, x):
        """Return a ResultTensor holding ``np.mean(x.data)``."""
        average = np.mean(x.data)
        return ResultTensor(average, self, x)

    def backward(self, output):
        """Give each input element an equal ``1 / size`` share of ``output.grad``."""
        (x,) = output.args
        share = output.grad * 1 / x.data.size
        # Adding to a zero array spreads the share over the input's shape.
        x.backward(share + np.zeros_like(x.data))

In [130]:
class AddBias(DifferentiableOperator):
    """Broadcasting addition ``x + b``, e.g. one bias row added to every sample.

    Why the bias gradient is a sum: broadcasting reuses the same bias entry
    for several output entries, so that one bias entry influences each of
    them. By the chain rule its gradient is the sum of the upstream gradients
    of every output entry it contributed to.
    """

    @staticmethod
    def _sum_to_shape(grad, shape):
        """Sum ``grad`` over the axes broadcasting expanded, so it has ``shape``."""
        # Leading axes that the operand did not have at all.
        extra = grad.ndim - len(shape)
        if extra > 0:
            grad = grad.sum(axis=tuple(range(extra)))
        # Axes the operand had with length 1 that were stretched to be longer.
        stretched = tuple(
            axis for axis, size in enumerate(shape)
            if size == 1 and grad.shape[axis] != 1
        )
        if stretched:
            grad = grad.sum(axis=stretched, keepdims=True)
        return grad

    def forward(self, x, b):
        """Return a ResultTensor holding ``x.data + b.data``."""
        return ResultTensor(x.data + b.data, self, x, b)

    def backward(self, output):
        """Propagate ``output.grad`` to ``x`` and ``b``, each in its own shape."""
        x, b = output.args
        out_shape = np.shape(output.data)
        grad = np.asarray(output.grad)
        if grad.shape != out_shape:
            # Accept a scalar or otherwise broadcastable upstream gradient.
            grad = np.broadcast_to(grad, out_shape)

        grad_x = self._sum_to_shape(grad, np.shape(x.data))
        grad_b = self._sum_to_shape(grad, np.shape(b.data))

        x.backward(grad_x)
        b.backward(grad_b)

#### test relu

In [148]:
# Relu on a random (2, 3) input; negative entries should be clipped to zero.
a = Tensor(np.random.randn(2, 3))
print(a.data)

c = Relu().forward(a)
print(c.data)

# All-ones upstream gradient, so a.grad is just the 0/1 activity mask.
c.backward(np.ones(a.data.shape))
print(a.grad)

[[ 0.33786494  0.7045165  -1.27312199]
 [-0.76477894 -0.7626472   0.11342666]]
[[0.33786494 0.7045165  0.        ]
 [0.         0.         0.11342666]]
[[1. 1. 0.]
 [0. 0. 1.]]


In [149]:
import torch.nn.functional as F

# PyTorch reference for the Relu gradient on the same input values.
a_p = torch.tensor(a.data, requires_grad=True)

c_p = F.relu(a_p)
loss = torch.sum(c_p)
loss.backward()

print(a_p.grad)

tensor([[1., 1., 0.],
        [0., 0., 1.]], dtype=torch.float64)


#### test mean

In [145]:
# Mean over a random (2, 3) input.
a = Tensor(np.random.randn(2, 3))
print(a.data)

c = Mean().forward(a)
print(c.data)

# Seed with 1: every element should receive 1 / 6.
c.backward(1)
print(a.grad)

[[ 0.30427552 -0.66222056 -1.06750844]
 [ 0.04757668  0.64406412 -0.04436361]]
-0.12969604785466024
[[0.16666667 0.16666667 0.16666667]
 [0.16666667 0.16666667 0.16666667]]


In [139]:
import torch.nn.functional as F

# PyTorch reference for the Mean gradient on the same input values.
a_p = torch.tensor(a.data, requires_grad=True)

c_p = torch.mean(a_p)
# c_p is already a scalar; sum() keeps the same pattern as the other checks.
loss = torch.sum(c_p)
loss.backward()

print(a_p.grad)

tensor([[0.1667, 0.1667, 0.1667],
        [0.1667, 0.1667, 0.1667]], dtype=torch.float64)


#### test Subtraction 

In [167]:
# Subtraction followed by Mean, so the loss is a scalar.
a = Tensor(np.random.randn(2, 3))
b = Tensor(np.random.randn(2, 3))
print(a.data)

c = Subtraction().forward(a, b)
print(c.data)

loss = Mean().forward(c)
loss.backward(1)

# Gradients should be +1/6 for a and -1/6 for b.
print(a.grad, b.grad, sep="\n")

[[-0.44752451  0.99641571  0.0563563 ]
 [-0.92334486  0.41534708 -0.44654058]]
[[ 1.31328856  0.79497774  1.66899498]
 [-2.10848229  0.41344388  0.2265242 ]]
[[0.16666667 0.16666667 0.16666667]
 [0.16666667 0.16666667 0.16666667]]
[[-0.16666667 -0.16666667 -0.16666667]
 [-0.16666667 -0.16666667 -0.16666667]]


In [173]:
import torch.nn.functional as F

# PyTorch reference for mean(a - b) on the same input values.
a_p = torch.tensor(a.data, requires_grad=True)
b_p = torch.tensor(b.data, requires_grad=True)

c_p = torch.sub(a_p, b_p)
loss = torch.mean(c_p)
loss.backward()

print(a_p.grad, b_p.grad, sep="\n")

tensor([[0.1667, 0.1667, 0.1667],
        [0.1667, 0.1667, 0.1667]], dtype=torch.float64)
tensor([[-0.1667, -0.1667, -0.1667],
        [-0.1667, -0.1667, -0.1667]], dtype=torch.float64)


#### test square

In [169]:
# Square followed by Mean, so the loss is a scalar.
a = Tensor(np.random.randn(2, 3))
print(a.data)

c = Square().forward(a)
print(c.data)

loss = Mean().forward(c)
loss.backward(1)

# Expected gradient: 2 * a / 6.
print(a.grad)

[[-0.72880416  0.44638519 -0.96024022]
 [ 1.34800874  0.89310222  1.19075777]]
[[0.5311555  0.19925973 0.92206128]
 [1.81712758 0.79763158 1.41790407]]
[[-0.24293472  0.14879506 -0.32008007]
 [ 0.44933625  0.29770074  0.39691926]]


In [170]:
import torch.nn.functional as F

# PyTorch reference for mean(a ** 2) on the same input values.
a_p = torch.tensor(a.data, requires_grad=True)

c_p = torch.pow(a_p, 2)
loss = torch.mean(c_p)
loss.backward()

print(a_p.grad)

tensor([[-0.2429,  0.1488, -0.3201],
        [ 0.4493,  0.2977,  0.3969]], dtype=torch.float64)


#### test AddBias

In [174]:
# AddBias with a length-3 bias broadcast over both rows of a (2, 3) input.
a = Tensor(np.random.randn(2, 3))
b = Tensor(np.random.randn(3))
print(a.data)

c = AddBias().forward(a, b)
print(c.data)

loss = Mean().forward(c)
loss.backward(1)

# a gets 1/6 per element; each bias entry collects two rows' worth, i.e. 1/3.
print(a.grad, b.grad, sep="\n")

[[-1.39892804  0.11880573 -1.26419115]
 [-0.97360996 -1.15602519  0.71572582]]
[[-0.40932178  2.11966685 -1.12248335]
 [ 0.0159963   0.84483593  0.85743362]]
[[0.16666667 0.16666667 0.16666667]
 [0.16666667 0.16666667 0.16666667]]
[0.33333333 0.33333333 0.33333333]


In [175]:
import torch.nn.functional as F

# PyTorch reference for mean(a + b) with a broadcast bias.
a_p = torch.tensor(a.data, requires_grad=True)
b_p = torch.tensor(b.data, requires_grad=True)

c_p = torch.add(a_p, b_p)
loss = torch.mean(c_p)
loss.backward()

print(a_p.grad, b_p.grad, sep="\n")

tensor([[0.1667, 0.1667, 0.1667],
        [0.1667, 0.1667, 0.1667]], dtype=torch.float64)
tensor([0.3333, 0.3333, 0.3333], dtype=torch.float64)


### End2End Test
#### Pytorch Version

In [164]:
# XOR truth table: four 2-bit inputs and their targets.
X = torch.tensor([[0., 0],
                  [1, 0],
                  [0, 1],
                  [1, 1]])
y = torch.tensor([[0], [1], [1], [0]])

# Two-layer MLP with a ReLU between the linear layers.
hidden_size = 5
model = torch.nn.Sequential(
    torch.nn.Linear(X.shape[1], hidden_size),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_size, y.shape[1]),
)

sgd = torch.optim.SGD(model.parameters(), lr=0.1)

for epoch in range(1000):
    pred = model(X)
    # Mean squared error, written out by hand.
    loss = torch.mean((pred - y) ** 2)
    if not epoch % 100:
        print(loss)

    sgd.zero_grad()
    loss.backward()
    sgd.step()

tensor(0.8683, grad_fn=<MeanBackward0>)
tensor(0.1006, grad_fn=<MeanBackward0>)
tensor(0.0124, grad_fn=<MeanBackward0>)
tensor(0.0005, grad_fn=<MeanBackward0>)
tensor(1.7311e-05, grad_fn=<MeanBackward0>)
tensor(5.7242e-07, grad_fn=<MeanBackward0>)
tensor(2.0854e-08, grad_fn=<MeanBackward0>)
tensor(6.3484e-10, grad_fn=<MeanBackward0>)
tensor(2.0531e-11, grad_fn=<MeanBackward0>)
tensor(1.0394e-12, grad_fn=<MeanBackward0>)


In [176]:
# Demonstrates that XOR cannot be fit by a purely linear model:
# same network as before, but without the ReLU between the two layers.

X = torch.tensor([[0., 0],
                  [1, 0],
                  [0, 1],
                  [1, 1]])
y = torch.tensor([[0], [1], [1], [0]])

hidden_size = 5
model = torch.nn.Sequential(
    torch.nn.Linear(X.shape[1], hidden_size),
    torch.nn.Linear(hidden_size, y.shape[1]),
)

sgd = torch.optim.SGD(model.parameters(), lr=0.1)

for epoch in range(1000):
    pred = model(X)
    # Mean squared error, written out by hand.
    loss = torch.mean((pred - y) ** 2)
    if not epoch % 100:
        print(loss)

    sgd.zero_grad()
    loss.backward()
    sgd.step()

tensor(0.7439, grad_fn=<MeanBackward0>)
tensor(0.2501, grad_fn=<MeanBackward0>)
tensor(0.2500, grad_fn=<MeanBackward0>)
tensor(0.2500, grad_fn=<MeanBackward0>)
tensor(0.2500, grad_fn=<MeanBackward0>)
tensor(0.2500, grad_fn=<MeanBackward0>)
tensor(0.2500, grad_fn=<MeanBackward0>)
tensor(0.2500, grad_fn=<MeanBackward0>)
tensor(0.2500, grad_fn=<MeanBackward0>)
tensor(0.2500, grad_fn=<MeanBackward0>)


#### Backprop Lib Version

In [293]:
# XOR truth table wrapped in the library's own Tensor type.
X = Tensor(np.array([
    [0., 0],
    [1, 0],
    [0, 1],
    [1, 1],
]))
y = Tensor(np.array([
    [0],
    [1],
    [1],
    [0],
]))

In [292]:
class OldLinear(DifferentiableOperator):
    """Fully connected layer ``x @ w + b`` written as one fused operator.

    ``w`` has shape ``(in_size, h_size)`` and starts as all ones; ``b`` has
    shape ``(1, h_size)`` and starts as all zeros.
    """

    def __init__(self, in_size, h_size):
        """Create the layer's weight and bias tensors."""
        self.in_size = in_size
        self.h_size = h_size
        self.w = Tensor(np.ones((in_size, h_size)))
        self.b = Tensor(np.zeros((1, h_size)))

    def forward(self, x):
        """Return a ResultTensor holding ``x.data @ w.data + b.data``."""
        # This operator is recorded as the producer so that its own
        # ``backward`` handles x, w and b in a single step.
        data = np.matmul(x.data, self.w.data) + self.b.data
        return ResultTensor(data, self, x)

    def backward(self, out):
        """Propagate ``out.grad`` to the input, the weights and the bias.

        With ``Y = X @ W + b`` and upstream gradient ``G``:
        ``dX = G @ W.T``, ``dW = X.T @ G`` and ``db`` is ``G`` summed over rows.
        """
        x, = out.args

        # Matrix products, not element-wise ones: the shapes of G, W and X
        # differ, so ``G * W`` and ``G * X`` do not even broadcast in general.
        x_grad = np.matmul(out.grad, self.w.data.T)
        w_grad = np.matmul(x.data.T, out.grad)
        # keepdims so the gradient has the bias's own (1, h_size) shape.
        b_grad = np.sum(out.grad, axis=0, keepdims=True)

        x.backward(x_grad)
        self.w.backward(w_grad)
        self.b.backward(b_grad)

    def update(self, lr=0.1):
        """Apply one gradient-descent step of size ``lr`` to ``w`` and ``b``."""
        self.w.update(lr)
        self.b.update(lr)

In [303]:
class Linear:
    """Affine layer ``x @ w + b`` assembled from MatrixMultiply and AddBias."""

    def __init__(self, in_size, h_size):
        """Create a randomly initialised weight matrix and a zero bias row."""
        self.in_size, self.h_size = in_size, h_size

        # Weights: standard-normal noise scaled by 0.5 and shifted by 1.
        # Alternative kept for reference: np.random.randn(in_size, h_size) * 0.05
        noise = np.random.randn(in_size, h_size)
        self.w = Tensor(noise * 0.5 + np.ones((in_size, h_size)))
        self.b = Tensor(np.zeros((1, h_size)))

    def forward(self, x):
        """Return the ResultTensor for ``x @ w + b``."""
        product = MatrixMultiply().forward(x, self.w)
        return AddBias().forward(product, self.b)

    def update(self, lr=0.1):
        """Apply one gradient-descent step of size ``lr`` to ``w`` and ``b``."""
        for param in (self.w, self.b):
            param.update(lr)

In [304]:
# Train a two-layer ReLU network on XOR with the hand-written backprop ops.
lr = 0.1
layers = [
    Linear(X.data.shape[1], hidden_size),
    Linear(hidden_size, y.data.shape[1]),
]

for epoch in range(10000):
    # Forward pass: Linear -> Relu -> Linear.
    out = layers[0].forward(X)
    out = Relu().forward(out)
    out = layers[1].forward(out)

    # Mean squared error composed from primitive ops.
    sub = Subtraction().forward(out, y)
    sqr = Square().forward(sub)
    loss = Mean().forward(sqr)
    if epoch % 100 == 0:
        print(f"epoch {epoch}: {loss.data}")

    # Seed the backward pass with d(loss)/d(loss) = 1.
    loss.backward(1)

    # Pass lr explicitly so the setting above controls the step size.
    for layer in layers:
        layer.update(lr)

epoch 0: 26.076136196674582
epoch 100: 0.018423169530866517
epoch 200: 0.00021248478110615876
epoch 300: 1.717420943951188e-06
epoch 400: 1.343363600656358e-08
epoch 500: 1.0477276053473084e-10
epoch 600: 8.177878683891807e-13
epoch 700: 6.3804096884287626e-15
epoch 800: 4.980032386055118e-17
epoch 900: 3.8862877649210135e-19
epoch 1000: 3.0321597203244807e-21
epoch 1100: 2.363886865090597e-23
epoch 1200: 1.843026295901889e-25
epoch 1300: 1.4324603052678796e-27
epoch 1400: 1.3935430362370784e-29
epoch 1500: 1.6589914659983336e-30
epoch 1600: 1.1683785692944992e-30
epoch 1700: 1.1648025252004656e-30
epoch 1800: 1.164802430367919e-30
epoch 1900: 1.1648024303654011e-30
epoch 2000: 1.1648024303654011e-30
epoch 2100: 1.1648024303654011e-30
epoch 2200: 1.1648024303654011e-30
epoch 2300: 1.1648024303654011e-30
epoch 2400: 1.1648024303654011e-30
epoch 2500: 1.1648024303654011e-30
epoch 2600: 1.1648024303654011e-30
epoch 2700: 1.1648024303654011e-30
epoch 2800: 1.1648024303654011e-30
epoch 2900