`Value` is the class whose objects are the nodes of the computation graph (a DAG) that is built during the forward pass and traversed in reverse during backpropagation.

In [1]:
import numpy as np
import math

In [1]:
class Value:
    """Scalar node of a dynamically built computation graph (a DAG).

    Every arithmetic operation on ``Value`` objects returns a new ``Value``
    that remembers its operands and carries a closure which pushes its own
    gradient back to those operands. Calling :meth:`backward` on the final
    node runs backpropagation over the whole graph.

    Attributes:
        data: The scalar held by this node.
        grad: Derivative of the node ``backward()`` was called on with
            respect to this node. Starts at 0.0 and is accumulated into.
        label: Optional human-readable name for the node.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        """Create a node.

        Args:
            data: Scalar value of the node.
            _children: Operand nodes this node was computed from (empty for
                leaf nodes). Stored as a set.
            _op: Short name of the operation that produced this node.
            label: Optional human-readable name.
        """
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._op = _op
        self.label = label
        # Leaf nodes keep this no-op; every operation below installs its own
        # closure on the node it returns.
        self._backward = lambda: None

    def __repr__(self):
        return f"Value(data:{self.data})"

    def __add__(self, other):
        """Return ``self + other``; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # Use += so that a node used more than once (e.g. b = a + a, or
            # d = a + b together with c = a * b) sums the contributions of
            # all its uses instead of overwriting them.
            self.grad += 1.0 * out.grad
            other.grad += out.grad

        out._backward = _backward
        return out

    def __radd__(self, other):
        """Fallback for ``number + Value``."""
        return self + other

    def __mul__(self, other):
        """Return ``self * other``; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward
        return out

    def __rmul__(self, other):
        """Fallback for ``number * Value``."""
        return self * other

    def __neg__(self):
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        """Fallback for ``number - Value``, i.e. ``other - self``."""
        # Subtraction is not commutative: negate self, then add other.
        return (-self) + other

    def __pow__(self, other):
        """Return ``self ** other`` for a constant int/float exponent."""
        assert isinstance(other, (int, float)), "exponent must be an int or float"
        out = Value(pow(self.data, other), (self,), f"**{other}")

        def _backward():
            self.grad += out.grad * other * (self.data ** (other - 1))

        out._backward = _backward
        return out

    def __truediv__(self, other):
        """Return ``self / other`` expressed as ``self * other**-1``."""
        return self * (other ** -1)

    def __rtruediv__(self, other):
        """Fallback for ``number / Value``, i.e. ``other / self``."""
        # __truediv__ only handles Value / x, so the reflected form has to be
        # defined separately; the product itself is resolved by __rmul__.
        return other * (self ** -1)

    def tanh(self):
        """Return the hyperbolic tangent of this node."""
        # math.tanh saturates to +/-1 for large |x|, whereas the formula
        # (e^{2x} - 1) / (e^{2x} + 1) raises OverflowError for large x.
        out = Value(math.tanh(self.data), (self,), _op='tanh')

        def _backward():
            self.grad += (1 - (out.data ** 2)) * out.grad

        out._backward = _backward
        return out

    def exp(self):
        """Return ``e ** self``."""
        out = Value(math.exp(self.data), (self,), 'exp')

        def _backward():
            self.grad += out.data * out.grad

        out._backward = _backward
        return out

    def relu(self):
        """Return ``max(0, self)``; the gradient is 0 wherever the output is 0."""
        out = Value(0 if self.data <= 0 else self.data, (self,), 'relu')

        def _backward():
            self.grad += out.grad * (0 if out.data == 0 else 1)

        out._backward = _backward
        return out

    def sigmoid(self):
        """Return the logistic sigmoid ``1 / (1 + e^{-self})``."""
        x = self.data
        # Only ever exponentiate a non-positive number so that math.exp
        # cannot overflow for inputs of large magnitude.
        if x >= 0:
            res = 1 / (1 + math.exp(-x))
        else:
            e = math.exp(x)
            res = e / (1 + e)
        out = Value(res, (self,), 'sigmoid')

        def _backward():
            self.grad += out.grad * (out.data * (1 - out.data))

        out._backward = _backward
        return out

    def backward(self):
        """Backpropagate from this node through every node it depends on.

        Sets ``self.grad`` to 1.0 and then calls each node's ``_backward``
        in reverse topological order. Gradients are accumulated with ``+=``,
        so reset ``grad`` on nodes that are reused across calls (for example
        model parameters) before calling this again.
        """
        # Post-order depth-first traversal with an explicit stack, so that
        # deep graphs (long chains of operations) do not hit Python's
        # recursion limit.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # All operands of this node are already in topo.
                topo.append(node)
                stack.pop()

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()


In [17]:
import random
class Neuron:
    """A single unit computing ``tanh(w . x + b)``."""

    def __init__(self, n_in):
        """Create ``n_in`` weights and one bias, each drawn with random.uniform(-1, 1)."""
        weights = []
        for _ in range(n_in):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        """Apply the neuron to the input sequence ``x`` and return the activation."""
        # Start from the bias and add one weighted input at a time.
        pre_activation = self.b
        for weight, feature in zip(self.w, x):
            pre_activation = pre_activation + weight * feature
        return pre_activation.tanh()

    def parameters(self):
        """Return the weights followed by the bias as one list."""
        return [*self.w, self.b]

class Layer:
    """A group of neurons that all receive the same input."""

    def __init__(self, nin, nout):
        """Build ``nout`` neurons (units), each taking ``nin`` inputs."""
        units = []
        for _ in range(nout):
            units.append(Neuron(nin))
        self.neurons = units

    def __call__(self, x):
        """Apply every neuron to ``x``.

        Returns the bare output when the layer has exactly one neuron,
        otherwise the list of outputs.
        """
        outs = []
        for neuron in self.neurons:
            outs.append(neuron(x))
        if len(outs) == 1:
            return outs[0]
        return outs

    def parameters(self):
        """Return the parameters of all neurons as one flat list."""
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params
class MLP:
    """Multi-layer perceptron: a chain of layers applied one after another."""

    def __init__(self, nin, nouts):
        """Build one layer per entry of ``nouts``.

        The first layer takes ``nin`` inputs; every later layer takes as many
        inputs as the previous layer has units.
        """
        sizes = [nin, *nouts]
        self.layers = [
            Layer(fan_in, fan_out) for fan_in, fan_out in zip(sizes, sizes[1:])
        ]

    def __call__(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return activation

    def parameters(self):
        """Return the parameters of all layers as one flat list."""
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params

    def zero_grad(self):
        """Reset ``grad`` to 0 on every parameter."""
        for param in self.parameters():
            param.grad = 0

In [18]:
# Network with 3 inputs, two layers of 4 units each, and a final layer with a single unit.
mlp = MLP(3,[4,4,1])

In [19]:
# Toy training set: four inputs with three features each.
xs = [
  [2.0, 3.0, -1.0],
  [3.0, -1.0, 0.5],
  [0.5, 1.0, 1.0],
  [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets, one per row of xs

In [20]:
for k in range(20):
    # Forward pass: one prediction per training example, then the summed
    # squared error against the targets.
    ypred = list(map(mlp, xs))
    loss = sum((yout - ygt) ** 2 for yout, ygt in zip(ypred, ys))

    # Backward pass: clear the old gradients first, because backward()
    # accumulates into grad rather than overwriting it.
    for p in mlp.parameters():
        p.grad = 0.0
    loss.backward()

    # Gradient-descent update with a fixed step size of 0.01.
    for p in mlp.parameters():
        p.data -= 0.01 * p.grad

    print(k, loss.data)

0 5.994820397534473
1 5.6422608157698795
2 5.397586529702362
3 5.218915550245745
4 5.063944903977825
5 4.892966008489986
6 4.655943320248902
7 4.260485171391139
8 3.5664810048888365
9 2.7600172681386166
10 2.336741929958314
11 2.099976712737065
12 1.9194364149351675
13 1.7673984878504367
14 1.6329625196546438
15 1.510130045850243
16 1.3952551949545005
17 1.2860926223810505
18 1.1813304131171396
19 1.0803484187829493


In [24]:
import torch
def test_sanity_check():
    """Compare Value against PyTorch on a small relu/add/mul expression.

    The same expression is built once with Value and once with a
    double-precision torch tensor; the output value and the gradient with
    respect to x must match exactly.
    """

    # Expression built with Value.
    x = Value(-4.0)
    z = 2 * x + 2 + x
    q = z.relu() + z * x
    h = (z * z).relu()
    y = h + q + q * x
    y.backward()
    xmg, ymg = x, y

    # Same expression built with PyTorch as the reference.
    x = torch.Tensor([-4.0]).double()
    x.requires_grad = True
    z = 2 * x + 2 + x
    q = z.relu() + z * x
    h = (z * z).relu()
    y = h + q + q * x
    y.backward()
    xpt, ypt = x, y

    # forward pass went well
    assert ymg.data == ypt.data.item()
    # backward pass went well
    assert xmg.grad == xpt.grad.item()

def test_more_ops():
    """Compare Value against PyTorch on a larger expression.

    Exercises addition, multiplication, powers, negation, subtraction,
    division, relu and the reflected operators (number on the left-hand
    side). The output value and the gradients with respect to a and b must
    agree with PyTorch to within 1e-6.
    """

    # Expression built with Value. Each `+=` rebinds the name to a new node,
    # so the graph has the same shape as the explicit `c = c + ...` form
    # used for PyTorch below.
    a = Value(-4.0)
    b = Value(2.0)
    c = a + b
    d = a * b + b**3
    c += c + 1
    c += 1 + c + (-a)
    d += d * 2 + (b + a).relu()
    d += 3 * d + (b - a).relu()
    e = c - d
    f = e**2
    g = f / 2.0
    g += 10.0 / f
    g.backward()
    amg, bmg, gmg = a, b, g

    # Same expression built with PyTorch as the reference.
    a = torch.Tensor([-4.0]).double()
    b = torch.Tensor([2.0]).double()
    a.requires_grad = True
    b.requires_grad = True
    c = a + b
    d = a * b + b**3
    c = c + c + 1
    c = c + 1 + c + (-a)
    d = d + d * 2 + (b + a).relu()
    d = d + 3 * d + (b - a).relu()
    e = c - d
    f = e**2
    g = f / 2.0
    g = g + 10.0 / f
    g.backward()
    apt, bpt, gpt = a, b, g

    # Absolute tolerance for the comparisons below.
    tol = 1e-6
    # forward pass went well
    assert abs(gmg.data - gpt.data.item()) < tol 
    # backward pass went well
    assert abs(amg.grad - apt.grad.item()) < tol
    assert abs(bmg.grad - bpt.grad.item()) < tol

In [25]:
# Run the exact-match comparison against PyTorch; a mismatch raises AssertionError.
test_sanity_check()

In [30]:
# Run the tolerance-based comparison against PyTorch; a mismatch raises AssertionError.
test_more_ops()

24.70408163265306   24.70408163265306


# Q&A
- Will the DAG still be valid if the data of a node changes? No, but it doesn't matter: once we update the weights we do a forward pass again, which creates new nodes and therefore a new DAG.
- How far to move a parameter in order to reduce the loss (see the loss-vs-parameter parabola) is determined by the chosen optimizer algorithm together with the learning rate.
- L2 regularization prevents the model parameters from taking extreme absolute values. If a parameter has a large positive value, the gradient of its penalty term is large and positive too, so the update decreases the parameter. If a parameter has a large negative value, that gradient is large and negative, so the update increases the parameter. This helps prevent the model from overfitting to the training data, because overfitting tends to happen when the parameters take extreme values in order to fit biased training data.