In [29]:
# https://www.wrighters.io/using-autoreload-to-speed-up-ipython-and-jupyter-work/
%load_ext autoreload
%autoreload 2
# micograd_from_scratch_mlp_1, but by applying the modularized Classes from micrograd.nn
from micrograd.tracegraph import draw_dot
from micrograd.topo import build, findLeafNodes
from micrograd.nn import Neuron, Layer, MLP
from micrograd.engine import Value
from io import StringIO

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Dataset with 4 Input values, assigned each to each of the 3 Neurons of Layer 0.
# The rows in the first column are the Input values assigned to each Neuron 0, 1, 2 and 3 of Layer 0.
# The rows in the second column are the Input values assigned to each Neuron 0, 1, 2 and 3 of Layer 0.
# The rows in the third column are the Input values assigned to each Neuron 0, 1, 2 and 3 of Layer 0.
xs = [
    [2.0, 3.0, -1.0], # Example #0 
    [3.0, -1.0, 0.5], # Example #1
    [0.5, 1.0, 1.0],  # Example #2
    [1.0, 1.0, -1.0], # Example #3
]
# Desired targets for each Example #; a simple binary classifier; Also called the g(round) t(ruths)
# Values labeled gt0, gt1, gt2 and gt3 are the ground truths for each Example
ygts = [Value(1.0, _label='gt0'), Value(-1.0, _label='gt1'), Value(-1.0, _label='gt2'), Value(1.0, _label='gt3')]
# A MLP neuronal network with 3 Input value and, 3 Layers by 4x4x1 Neurons
mlp_nn1 = MLP(3, [4, 4, 1])
# WANT
# The current prediction for each Example
#   The MLP to output  1.0 given Example #0
#   The MLP to output -1.0 given Example #1
#   The MLP to output -1.0 given Example #2
#   The MLP to output  1.0 given Example #3
yspred = [(mlp_nn1(x)) for x in xs]
# Values labeled L2 | N0 | o are the ys predictions for each Example
yspred

[Value(data=0.8024582077087925, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.5413612523015063, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.35568112667755925, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.7577300512482477, grad=0, op=tanh, label=L2|N0|o)]

In [3]:
# WANT
# How do we tune the weights to better predict the desired targets?
# Calculate a single number that measures the total performance of the neural net.
# This single number is called the Loss. 
# So first, we implement the Loss function with a mean squared error Loss. The Loss is the difference between the 
# prediction and the gound truth of y. Squared, to always get a positive number.
losss = [(ypred - ygt)**2 for ygt, ypred in zip(ygts, yspred)]
# label the loss by index for each loss
for idx, loss in enumerate(losss): loss._label = f'loss{idx}'
# The overall loss is
losssum = sum(losss)
losssum._label = 'losssum'
print(f'loss for all ground truths: {losssum}')

loss for all ground truths: Value(data=4.311383315096003, grad=0, op=+, label=losssum)


In [4]:
# draw_dot of losssum before calling backward -> see doc/micograd_from_scratch_mlp_3_withlosssum_beforebackward.svg
# draw_dot(losssum)

In [5]:
losssum.backward()

In [6]:
# draw_dot of losssum after calling backward -> see doc/micograd_from_scratch_mlp_3_withlosssum_afterbackward.svg
# draw_dot(losssum)

In [43]:
# If we look at some weight - here the 1st Weight of the 1st Neuron of the 1st Layer - ...
print(f'If we look at some weight - here the 1st Weight of the 1st Neuron of the 1st Layer -...\n{mlp_nn1.layers[0].neurons[0].w[0]}')
# ... escpecially at its's data ...
print(f'... escpecially at its\'s data ...\n{mlp_nn1.layers[0].neurons[0].w[0].data}')
# ... and at its's grad
print(f'... and at it\'s grad ...\n{mlp_nn1.layers[0].neurons[0].w[0].grad}')
print(f'... we see that the grad\'s influence is ? positive or ? negative')
#
print("")
print("If the Grad of this Weight of this particular Neuron of this particular Layer is positive, the influence of the Weight to ")
print("the Loss is also positive. So decreasing the Weight of this particular Neuron would make the loss go down.")
print("")
print("If the Grad of this Weight of this particular Neuron of this particular Layer is negative, the influence of the Weight to")
print("the Loss is also negative. So increasing the Weight of this particular Neuron would make the loss go down.")

If we look at some weight - here the 1st Weight of the 1st Neuron of the 1st Layer -...
Value(data=-0.959487738115776, grad=-0.24558073915563292, op=prim, label=L0|N0|w0)
... escpecially at its's data ...
-0.959487738115776
... and at it's grad ...
-0.24558073915563292
... we see that the grad's influence is ? positive or ? negative

If the Grad of this Weight of this particular Neuron of this particular Layer is positive, the influence of the Weight to 
the Loss is also positive. So decreasing the Weight of this particular Neuron would make the loss go down.

If the Grad of this Weight of this particular Neuron of this particular Layer is negative, the influence of the Weight to
the Loss is also negative. So increasing the Weight of this particular Neuron would make the loss go down.


In [34]:
# WANT
# A convenience methode, to gather all those parameters called Weights and Biases so we can change them.
# To print each parameter on a line, we use print but instead of to the console, we print to a variable 
output = StringIO()
print(*mlp_nn1.parameters(), sep="\n", file=output)
print(f'Parameters of the MLP:\n{output.getvalue()}')
print(f'Number of Parameters of the MLP: {len(mlp_nn1.parameters())}')

Parameters of the MLP:
Value(data=-0.959487738115776, grad=-0.24558073915563292, op=prim, label=L0|N0|w0)
Value(data=0.375987038868441, grad=-0.47746037458081275, op=prim, label=L0|N0|w1)
Value(data=-0.41034046594409146, grad=-0.4603471973615333, op=prim, label=L0|N0|w2)
Value(data=-0.5310559751256543, grad=-0.47132442367378147, op=prim, label=L0|N0|b)
Value(data=-0.2587547667948096, grad=-11.328698606799321, op=prim, label=L0|N1|w0)
Value(data=0.0403237811106445, grad=2.849281943171405, op=prim, label=L0|N1|w1)
Value(data=0.5833304446497909, grad=-3.553646227793297, op=prim, label=L0|N1|w2)
Value(data=0.4031334249509986, grad=-4.752577074474798, op=prim, label=L0|N1|b)
Value(data=-0.08045905136012599, grad=0.5985595195990767, op=prim, label=L0|N2|w0)
Value(data=0.8449829371178346, grad=-0.18795489475913127, op=prim, label=L0|N2|w1)
Value(data=0.08749341362998475, grad=0.10041199251109707, op=prim, label=L0|N2|w2)
Value(data=0.3596692071246599, grad=0.20629667644175093, op=prim, label=

In [47]:
print(f'In the current mlp the grad of N0 in L0 -\n{mlp_nn1.layers[0].neurons[0].w[0].grad}')
print(f'- is negative. Therefore the influence of the Weight to the Loss is also negative.')
print(f'So increasing the Weight of this particular Neuron would make the loss go down.')

In the current mlp the grad of N0 in L0 -
-0.24558073915563292
- is negative. Therefore the influence of the Weight to the Loss is also negative.
So increasing the Weight of this particular Neuron would make the loss go down.


In [53]:
# Now we go to change the parameter slightly according to the gradient information to minimize the Loss function.
# In a gradient descent scheme, we think of the Gradient Descent as a vector pointing in the direction of increased loss.
# As we want to decrease the loss, we must go to change the parameter in the opposite direction.
# So we increase the data of each parameter by multiplying with the negative 1% of it's grad
for p in mlp_nn1.parameters():
    p.data += -0.01 * p.grad

In [54]:
print(f'After increasing it, the data is\n{mlp_nn1.layers[0].neurons[0].w[0]}')

After increasing it, the data is
Value(data=-0.9570319307242197, grad=-0.24558073915563292, op=prim, label=L0|N0|w0)


In [55]:
print("Now we are making a next forward pass ...")
yspred_1 = [(mlp_nn1(x)) for x in xs]
# Values labeled L2 | N0 | o are the ys predictions for each Example
yspred_1

Now we are making a second forward pass ...


[Value(data=0.6864091663240514, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.33709395290648175, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.08774657037385546, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.634525942145128, grad=0, op=tanh, label=L2|N0|o)]

In [57]:
print("... and recalculate the loss")
losss_1 = [(ypred - ygt)**2 for ygt, ypred in zip(ygts, yspred_1)]
# label the loss by index for each loss
for idx, loss in enumerate(losss_1): loss._label = f'loss{idx}'
# The overall loss is
losssum_1 = sum(losss_1)
losssum_1._label = 'losssum'
print(f'loss for all ground truths: {losssum_1}')

... and recalculate the loss
loss for all ground truths: Value(data=1.8545475265637212, grad=0, op=+, label=losssum)


In [58]:
# Now we start the next iteration #2 to minimize the loss
losssum_1.backward()

In [59]:
print(f'After backward#2, the data is\n{mlp_nn1.layers[0].neurons[0].w[0]}')

After backward, the data is
Value(data=-0.9570319307242197, grad=-0.500757439537063, op=prim, label=L0|N0|w0)


In [60]:
# Now we go to change the parameter slightly according to the gradient information to minimize the Loss function.
# In a gradient descent scheme, we think of the Gradient Descent as a vector pointing in the direction of increased loss.
# As we want to decrease the loss, we must go to change the parameter in the opposite direction.
# So we increase the data of each parameter by multiplying with the negative 1% of it's grad
for p in mlp_nn1.parameters():
    p.data += -0.01 * p.grad

In [61]:
print("Now we are making a next forward pass ...")
yspred_2 = [(mlp_nn1(x)) for x in xs]
# Values labeled L2 | N0 | o are the ys predictions for each Example
yspred_2

Now we are making a next forward pass ...


[Value(data=0.41459615714647247, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.8762625661546329, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.38381458442773747, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.3673637661304555, grad=0, op=tanh, label=L2|N0|o)]

In [62]:
print("... and recalculate the loss")
losss_2 = [(ypred - ygt)**2 for ygt, ypred in zip(ygts, yspred_2)]
# label the loss by index for each loss
for idx, loss in enumerate(losss_2): loss._label = f'loss{idx}'
# The overall loss is
losssum_2 = sum(losss_2)
losssum_2._label = 'losssum'
print(f'loss for all ground truths: {losssum_2}')

... and recalculate the loss
loss for all ground truths: Value(data=1.137921682530917, grad=0, op=+, label=losssum)


In [63]:
# Now we start the next iteration #3 to minimize the loss
losssum_2.backward()

In [64]:
print(f'After backward#3, the data is\n{mlp_nn1.layers[0].neurons[0].w[0]}')

After backward#3, the data is
Value(data=-0.9520243563288491, grad=-0.8243632146237716, op=prim, label=L0|N0|w0)


In [65]:
# Now we go to change the parameter slightly according to the gradient information to minimize the Loss function.
# In a gradient descent scheme, we think of the Gradient Descent as a vector pointing in the direction of increased loss.
# As we want to decrease the loss, we must go to change the parameter in the opposite direction.
# So we increase the data of each parameter by multiplying with the negative 1% of it's grad
for p in mlp_nn1.parameters():
    p.data += -0.01 * p.grad

In [66]:
print("Now we are making a next forward pass ...")
yspred_3 = [(mlp_nn1(x)) for x in xs]
# Values labeled L2 | N0 | o are the ys predictions for each Example
yspred_3

Now we are making a next forward pass ...


[Value(data=0.41984219150271845, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.9478352811859603, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.7026437031622795, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.27078194954012846, grad=0, op=tanh, label=L2|N0|o)]

In [67]:
print("... and recalculate the loss")
losss_3 = [(ypred - ygt)**2 for ygt, ypred in zip(ygts, yspred_3)]
# label the loss by index for each loss
for idx, loss in enumerate(losss_3): loss._label = f'loss{idx}'
# The overall loss is
losssum_3 = sum(losss_3)
losssum_3._label = 'losssum'
print(f'loss for all ground truths: {losssum_3}')

... and recalculate the loss
loss for all ground truths: Value(data=0.9594839730348546, grad=0, op=+, label=losssum)


In [68]:
# Now we start the next iteration #4 to minimize the loss
losssum_3.backward()

In [70]:
print(f'After backward#4, the data is\n{mlp_nn1.layers[0].neurons[0].w[0]}')

After backward#4, the data is
Value(data=-0.9437807241826114, grad=-1.2634181690347575, op=prim, label=L0|N0|w0)


In [71]:
# Now we go to change the parameter slightly according to the gradient information to minimize the Loss function.
# In a gradient descent scheme, we think of the Gradient Descent as a vector pointing in the direction of increased loss.
# As we want to decrease the loss, we must go to change the parameter in the opposite direction.
# So we increase the data of each parameter by multiplying with the negative 1% of it's grad
for p in mlp_nn1.parameters():
    p.data += -0.01 * p.grad

In [72]:
print("Now we are making a next forward pass ...")
yspred_4 = [(mlp_nn1(x)) for x in xs]
# Values labeled L2 | N0 | o are the ys predictions for each Example
yspred_4

Now we are making a next forward pass ...


[Value(data=0.7237366492032441, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.967311577375384, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.8402956602263896, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.49660603112648943, grad=0, op=tanh, label=L2|N0|o)]

In [73]:
print("... and recalculate the loss")
losss_4 = [(ypred - ygt)**2 for ygt, ypred in zip(ygts, yspred_4)]
# label the loss by index for each loss
for idx, loss in enumerate(losss_4): loss._label = f'loss{idx}'
# The overall loss is
losssum_4 = sum(losss_4)
losssum_4._label = 'losssum'
print(f'loss for all ground truths: {losssum_4}')

... and recalculate the loss
loss for all ground truths: Value(data=0.3563009360078866, grad=0, op=+, label=losssum)


In [74]:
# Now we start the next iteration #5 to minimize the loss
losssum_4.backward()

In [75]:
print(f'After backward#5, the data is\n{mlp_nn1.layers[0].neurons[0].w[0]}')

After backward#5, the data is
Value(data=-0.9311465424922638, grad=-1.4899964043175222, op=prim, label=L0|N0|w0)


In [76]:
# Now we go to change the parameter slightly according to the gradient information to minimize the Loss function.
# In a gradient descent scheme, we think of the Gradient Descent as a vector pointing in the direction of increased loss.
# As we want to decrease the loss, we must go to change the parameter in the opposite direction.
# So we increase the data of each parameter by multiplying with the negative 1% of it's grad
for p in mlp_nn1.parameters():
    p.data += -0.01 * p.grad

In [77]:
print("Now we are making a next forward pass ...")
yspred_5 = [(mlp_nn1(x)) for x in xs]
# Values labeled L2 | N0 | o are the ys predictions for each Example
yspred_5

Now we are making a next forward pass ...


[Value(data=0.8605699469316133, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.9768581419767903, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.9038499001183877, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.7235049940253461, grad=0, op=tanh, label=L2|N0|o)]

In [78]:
print("... and recalculate the loss")
losss_5 = [(ypred - ygt)**2 for ygt, ypred in zip(ygts, yspred_5)]
# label the loss by index for each loss
for idx, loss in enumerate(losss_5): loss._label = f'loss{idx}'
# The overall loss is
losssum_5 = sum(losss_5)
losssum_5._label = 'losssum'
print(f'loss for all ground truths: {losssum_5}')

... and recalculate the loss
loss for all ground truths: Value(data=0.10567061532758747, grad=0, op=+, label=losssum)


In [79]:
# Now we start the next iteration #6 to minimize the loss
losssum_5.backward()

In [80]:
print(f'After backward#6, the data is\n{mlp_nn1.layers[0].neurons[0].w[0]}')

After backward#6, the data is
Value(data=-0.9162465784490886, grad=-1.566860467143402, op=prim, label=L0|N0|w0)


In [81]:
# Now we go to change the parameter slightly according to the gradient information to minimize the Loss function.
# In a gradient descent scheme, we think of the Gradient Descent as a vector pointing in the direction of increased loss.
# As we want to decrease the loss, we must go to change the parameter in the opposite direction.
# So we increase the data of each parameter by multiplying with the negative 1% of it's grad
for p in mlp_nn1.parameters():
    p.data += -0.01 * p.grad

In [82]:
print("Now we are making a next forward pass ...")
yspred_6 = [(mlp_nn1(x)) for x in xs]
# Values labeled L2 | N0 | o are the ys predictions for each Example
yspred_6

Now we are making a next forward pass ...


[Value(data=0.9105576870819314, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.9827749432042266, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.9353791751927711, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.845631470132911, grad=0, op=tanh, label=L2|N0|o)]

In [83]:
print("... and recalculate the loss")
losss_6 = [(ypred - ygt)**2 for ygt, ypred in zip(ygts, yspred_6)]
# label the loss by index for each loss
for idx, loss in enumerate(losss_6): loss._label = f'loss{idx}'
# The overall loss is
losssum_6 = sum(losss_6)
losssum_6._label = 'losssum'
print(f'loss for all ground truths: {losssum_6}')

... and recalculate the loss
loss for all ground truths: Value(data=0.036302123933844244, grad=0, op=+, label=losssum)


In [84]:
# Now we start the next iteration #7 to minimize the loss
losssum_6.backward()

In [85]:
print(f'After backward#7, the data is\n{mlp_nn1.layers[0].neurons[0].w[0]}')

After backward#7, the data is
Value(data=-0.9005779737776546, grad=-1.5926440655553684, op=prim, label=L0|N0|w0)


In [86]:
# Now we go to change the parameter slightly according to the gradient information to minimize the Loss function.
# In a gradient descent scheme, we think of the Gradient Descent as a vector pointing in the direction of increased loss.
# As we want to decrease the loss, we must go to change the parameter in the opposite direction.
# So we increase the data of each parameter by multiplying with the negative 1% of it's grad
for p in mlp_nn1.parameters():
    p.data += -0.01 * p.grad

In [87]:
print("Now we are making a next forward pass ...")
yspred_7 = [(mlp_nn1(x)) for x in xs]
# Values labeled L2 | N0 | o are the ys predictions for each Example
yspred_7

Now we are making a next forward pass ...


[Value(data=0.9356801507960734, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.9868019791950361, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.9511034875965612, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.9047017212725309, grad=0, op=tanh, label=L2|N0|o)]

In [88]:
print("... and recalculate the loss")
losss_7 = [(ypred - ygt)**2 for ygt, ypred in zip(ygts, yspred_7)]
# label the loss by index for each loss
for idx, loss in enumerate(losss_7): loss._label = f'loss{idx}'
# The overall loss is
losssum_7 = sum(losss_7)
losssum_7._label = 'losssum'
print(f'loss for all ground truths: {losssum_7}')

... and recalculate the loss
loss for all ground truths: Value(data=0.01578386160842216, grad=0, op=+, label=losssum)


In [89]:
# Now we start the next iteration #8 to minimize the loss
losssum_7.backward()

In [90]:
print(f'After backward#8, the data is\n{mlp_nn1.layers[0].neurons[0].w[0]}')

After backward#8, the data is
Value(data=-0.8846515331221009, grad=-1.602359010806833, op=prim, label=L0|N0|w0)


In [91]:
# Now we go to change the parameter slightly according to the gradient information to minimize the Loss function.
# In a gradient descent scheme, we think of the Gradient Descent as a vector pointing in the direction of increased loss.
# As we want to decrease the loss, we must go to change the parameter in the opposite direction.
# So we increase the data of each parameter by multiplying with the negative 1% of it's grad
for p in mlp_nn1.parameters():
    p.data += -0.01 * p.grad

In [92]:
print("Now we are making a next forward pass ...")
yspred_8 = [(mlp_nn1(x)) for x in xs]
# Values labeled L2 | N0 | o are the ys predictions for each Example
yspred_8

Now we are making a next forward pass ...


[Value(data=0.9515750792350357, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.9896797690734922, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.9581596984928638, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.9354891324195409, grad=0, op=tanh, label=L2|N0|o)]

In [93]:
print("... and recalculate the loss")
losss_8 = [(ypred - ygt)**2 for ygt, ypred in zip(ygts, yspred_8)]
# label the loss by index for each loss
for idx, loss in enumerate(losss_8): loss._label = f'loss{idx}'
# The overall loss is
losssum_8 = sum(losss_8)
losssum_8._label = 'losssum'
print(f'loss for all ground truths: {losssum_8}')

... and recalculate the loss
loss for all ground truths: Value(data=0.008363742983661115, grad=0, op=+, label=losssum)


In [94]:
# Now we start the next iteration #9 to minimize the loss
losssum_8.backward()

In [95]:
print(f'After backward#9, the data is\n{mlp_nn1.layers[0].neurons[0].w[0]}')

After backward#9, the data is
Value(data=-0.8686279430140326, grad=-1.606208092216953, op=prim, label=L0|N0|w0)


In [96]:
# Now we go to change the parameter slightly according to the gradient information to minimize the Loss function.
# In a gradient descent scheme, we think of the Gradient Descent as a vector pointing in the direction of increased loss.
# As we want to decrease the loss, we must go to change the parameter in the opposite direction.
# So we increase the data of each parameter by multiplying with the negative 1% of it's grad
for p in mlp_nn1.parameters():
    p.data += -0.01 * p.grad

In [97]:
print("Now we are making a next forward pass ...")
yspred_9 = [(mlp_nn1(x)) for x in xs]
# Values labeled L2 | N0 | o are the ys predictions for each Example
yspred_9

Now we are making a next forward pass ...


[Value(data=0.9627277755860726, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.9918035519357604, grad=0, op=tanh, label=L2|N0|o),
 Value(data=-0.9597340937135229, grad=0, op=tanh, label=L2|N0|o),
 Value(data=0.9534033599201349, grad=0, op=tanh, label=L2|N0|o)]

In [98]:
print("... and recalculate the loss")
losss_9 = [(ypred - ygt)**2 for ygt, ypred in zip(ygts, yspred_9)]
# label the loss by index for each loss
for idx, loss in enumerate(losss_9): loss._label = f'loss{idx}'
# The overall loss is
losssum_9 = sum(losss_9)
losssum_9._label = 'losssum'
print(f'loss for all ground truths: {losssum_9}')

... and recalculate the loss
loss for all ground truths: Value(data=0.005248990549435787, grad=0, op=+, label=losssum)
