In [1]:
import numpy as np

In [2]:
class Module:
    """Base class for anything that owns trainable parameters.

    Subclasses override `parameters()` to expose their parameters;
    `zero_grad()` then resets the `grad` attribute of each of them.
    """

    def zero_grad(self):
        """Reset the gradient of every parameter to zeros.

        The new gradient has the shape of the parameter's `data` attribute
        so that later in-place accumulation (`grad += ...`) keeps working.
        Parameters without a `data` attribute get a zero array of shape (1,).
        """
        for p in self.parameters():
            data = getattr(p, "data", None)
            if data is None:
                p.grad = np.zeros([1,])
            else:
                p.grad = np.zeros(np.shape(data))

    def parameters(self):
        """Return the list of parameters owned by this module (none by default)."""
        return []

### Total Derivative and the Multivariable Chain Rule

In [21]:
class Tensor(Module):
    """A NumPy array that remembers how it was computed.

    Every arithmetic operation returns a new Tensor whose `_prev` holds the
    operands and whose `_backward` closure adds the operation's contribution
    to the operands' `grad`. Calling `backward()` on a result walks that graph
    in reverse topological order, applying the chain rule.

    Attributes:
        data: the value, stored as a floating-point (or complex) ndarray.
        grad: accumulated partial(Err, self); same shape as `data`.
    """

    def __init__(self, data, children=()):
        """Wrap `data` (ndarray, nested list or scalar) as a graph node.

        Args:
            data: the value to wrap. An inexact-dtype ndarray is stored as-is;
                anything else is converted to a float64 ndarray.
            children: the Tensors this value was computed from.
        """
        data = np.asarray(data)
        # Integer/bool data cannot hold fractional gradients and makes
        # `x ** -1` (used by division) raise, so promote it to float.
        if not np.issubdtype(data.dtype, np.inexact):
            data = data.astype(np.float64)
        self.data = data
        self.grad = np.zeros_like(self.data)
        self._backward = lambda: None
        self._prev = set(children)

    def zero_grad(self):
        """Reset the accumulated gradient to zeros of the same shape as `data`."""
        self.grad = np.zeros_like(self.data)

    @staticmethod
    def _unbroadcast(grad, shape):
        """Sum `grad` over the axes NumPy broadcasting added or stretched.

        When an operand of shape `shape` was broadcast to produce the output,
        its gradient is the output gradient summed over every broadcast axis.
        Assumes `grad` has at least as many dimensions as `shape`.
        """
        grad = np.asarray(grad)
        extra = grad.ndim - len(shape)
        if extra > 0:
            # Leading axes that broadcasting prepended to the operand.
            grad = grad.sum(axis=tuple(range(extra)))
        stretched = tuple(
            axis for axis, size in enumerate(shape)
            if size == 1 and grad.shape[axis] != 1
        )
        if stretched:
            # Size-1 axes of the operand that were stretched to match.
            grad = grad.sum(axis=stretched, keepdims=True)
        return grad.reshape(shape)

    # Vector Addition:
    # out = self + other
    # partial(Err, self) = partial(Err, out) * partial(out, self)
    # partial(Err, self) = out.grad * 1
    def __add__(self, other):
        """Element-wise (broadcasting) addition."""
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(np.add(self.data, other.data), children=(self, other))

        def _backward():
            self.grad += Tensor._unbroadcast(out.grad, self.data.shape)
            other.grad += Tensor._unbroadcast(out.grad, other.data.shape)
        # Must be `_backward`: assigning `backward` would shadow the method.
        out._backward = _backward

        return out

    # Pairwise Multiplication:
    # out = self * other
    # partial(Err, self) = partial(Err, out) * partial(out, self)
    # partial(Err, self) = out.grad * other
    def __mul__(self, other):
        """Element-wise (broadcasting) multiplication."""
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(np.multiply(self.data, other.data), children=(self, other))

        def _backward():
            self.grad += Tensor._unbroadcast(
                np.multiply(other.data, out.grad), self.data.shape)
            other.grad += Tensor._unbroadcast(
                np.multiply(self.data, out.grad), other.data.shape)
        out._backward = _backward

        return out

    # Scalar Product or Matrix Multiplication
    # c = a @ b
    # partial(Err, a[i,j]) = partial(Err, c[i,1]) * partial(c[i,1], a[i,j]) + ... + partial(Err, c[i,n]) * partial(c[i,n], a[i,j])
    # partial(Err, a[i,j]) = c.grad[i,1] * b[j,1] + ... + c.grad[i,n] * b[j,n]
    # i.e. a.grad = c.grad @ b^T and, symmetrically, b.grad = a^T @ c.grad
    def matmul(self, other):
        """Matrix product following `np.matmul` semantics.

        Both operands must have at least one dimension (`np.matmul` rejects
        scalars). 1-D operands are treated as a row vector (left) or a column
        vector (right) when computing gradients.
        """
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(np.matmul(self.data, other.data), children=(self, other))

        def _backward():
            a, b, g = self.data, other.data, out.grad
            # Restore the axes np.matmul drops for 1-D operands so the
            # matrix formulas below apply uniformly.
            a2 = a[np.newaxis, :] if a.ndim == 1 else a
            b2 = b[:, np.newaxis] if b.ndim == 1 else b
            if b.ndim == 1:
                g = np.expand_dims(g, -1)
            if a.ndim == 1:
                g = np.expand_dims(g, -2)
            grad_a = np.matmul(g, np.swapaxes(b2, -1, -2))
            grad_b = np.matmul(np.swapaxes(a2, -1, -2), g)
            # Accumulate (+=) so a Tensor used several times sums all its
            # contributions; reduce any broadcast batch axes first.
            self.grad += Tensor._unbroadcast(grad_a, a2.shape).reshape(a.shape)
            other.grad += Tensor._unbroadcast(grad_b, b2.shape).reshape(b.shape)
        out._backward = _backward

        return out

    def __matmul__(self, other): # self @ other
        return self.matmul(other)

    def __rmatmul__(self, other): # other @ self
        return Tensor(other).matmul(self)

    # Power:
    # out = self**other
    # partial(Err, self) = partial(Err, out) * partial(out, self)
    # partial(Err, self) = out.grad * other * self**(other - 1)
    def __pow__(self, other):
        """Element-wise power with a constant int/float exponent."""
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        # `(self,)` with the trailing comma: `(self)` is just `self`.
        out = Tensor(np.power(self.data, other), children=(self,))

        def _backward():
            self.grad += np.multiply(other * np.power(self.data, other-1), out.grad)
        out._backward = _backward

        return out

    # out = max(0, self)
    # partial(Err, self) = partial(Err, out) * partial(out, self)
    # partial(Err, self) = out.grad * (1 where self > 0, else 0)
    def relu(self):
        """Element-wise rectified linear unit, max(0, self)."""
        out = Tensor(np.maximum(0, self.data), children=(self,))

        def _backward():
            # The derivative is the 0/1 mask of positive entries,
            # not the output value itself.
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out

    def backward(self):
        """Back-propagate from this Tensor through the whole graph.

        Seeds this Tensor's gradient with ones and then runs every node's
        `_backward` in reverse topological order. Gradients accumulate, so
        call `zero_grad()` on the leaves between independent passes.
        """
        # topological order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        # Seed with an array (not the int 1) so shapes line up downstream.
        self.grad = np.ones_like(self.data)
        for v in reversed(topo):
            v._backward()

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return (-self) + other

    def __rmul__(self, other): # other * self
        # Pairwise multiplication is commutative. Dispatch through
        # Tensor.__mul__; `other * self` would call __rmul__ again forever.
        return self * other

    def __truediv__(self, other): # self / other
        other = other if isinstance(other, Tensor) else Tensor(other)
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return (self**-1) * other

In [25]:
# Scratch checks of the NumPy primitives used by the Tensor class.
# The bare expressions are evaluated and discarded; only print() emits output.
[*(1, 2, (3, 4, 3))]  # unpacking a tuple does not flatten nested tuples
np.zeros(5)
np.dot([1, 2, 3], [3, 4, 5])
2 * np.array([1, 2, 3])
np.power([1, 2, 3], 2) * 2

a = np.arange(-3, 6)
np.maximum(0, a)
np.multiply([1, 2, 3], [4, 5, 6])

# Transposing a 1-D array leaves its shape unchanged; a 3x3 view does flip.
b = a.T
print(
    b,
    b.shape,
    a,
    a.shape,
    a.reshape(3, 3),
    a.reshape(3, 3).T,
    a.reshape(3, 3) * a.reshape(3, 3).T,
    a.T.shape,
    a.shape,
    sep="\n",
)

# A negative exponent needs float data.
c = np.arange(1.0, 9.0)
print(c ** -1)

[-3 -2 -1  0  1  2  3  4  5]
(9,)
[-3 -2 -1  0  1  2  3  4  5]
(9,)
[[-3 -2 -1]
 [ 0  1  2]
 [ 3  4  5]]
[[-3  0  3]
 [-2  1  4]
 [-1  2  5]]
[[ 9  0 -3]
 [ 0  1  8]
 [-3  8 25]]
(9,)
(9,)
[1.         0.5        0.33333333 0.25       0.2        0.16666667
 0.14285714 0.125     ]


In [31]:
# Scratch checks of reshape, diagonal and matmul on small integer arrays.
a = np.arange(-3, 13)

# Square view: the diagonal has four entries.
b = np.reshape(a, (4, 4))
print(b, np.diagonal(b), sep="\n")

# Non-square view: the diagonal is limited by the shorter axis.
c = np.reshape(a, (2, 8))
print(c, np.diagonal(c), sep="\n")

# matmul of two 1-D arrays is their inner product; 2-D gives a matrix product.
d = np.array([1])
print(d.reshape(-1, 1), d @ d, a @ a, b @ b, sep="\n")

[[-3 -2 -1  0]
 [ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
[-3  2  7 12]
[[-3 -2 -1  0  1  2  3  4]
 [ 5  6  7  8  9 10 11 12]]
[-3  6]
[[1]]
1
664
[[  2  -4 -10 -16]
 [ 50  60  70  80]
 [ 98 124 150 176]
 [146 188 230 272]]
