In [146]:
import numpy as np

In [147]:
# Initial inputs
x0 = np.array([1, 2])
x1 = np.array([0.5, 3])

# Labels
y0 = 0.5
y1 = 1.25

# Initial weights
Wa = np.array([0.45, 0.25])
Wi = np.array([0.95, 0.8])
Wf = np.array([0.7, 0.45])
Wo = np.array([0.6, 0.4])
W = np.array([Wa, Wi, Wf, Wo])
WT = np.transpose(W)

# Initial weights associated with prev output (ht-1)
Ua = 0.15
Ui = 0.8
Uf = 0.1
Uo = 0.25
U = np.array([Ua, Ui, Uf, Uo])
UT = np.transpose(U)

# Biases
ba = 0.2
bi = 0.65
bf = 0.15
bo = 0.1
b = np.array([ba, bi, bf, bo])

# Assume initial output of previous state as 0
ht = 0
Ct = 0
print(WT, U)

[[0.45 0.95 0.7  0.6 ]
 [0.25 0.8  0.45 0.4 ]] [0.15 0.8  0.1  0.25]


In [148]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [149]:
# Forward pass t=0, part 1 (gates)
a0 = np.tanh(np.dot(Wa, x0) + np.dot(Ua, ht) + ba)
i0 = sigmoid(np.dot(Wi, x0) + np.dot(Ui, ht) + bi)
f0 = sigmoid(np.dot(Wf, x0) + np.dot(Uf, ht) + bf)
o0 = sigmoid(np.dot(Wo, x0) + np.dot(Uo, ht) + bo)
print("a0: ", a0, "\ni0: ", i0, "\nf0: ", f0, "\no0: ", o0)

a0:  0.8177540779702877 
i0:  0.9608342772032357 
f0:  0.8519528019683106 
o0:  0.8175744761936437


In [150]:
# Forward pass t=0, part 2 (state and output)
Ct0 = np.dot(a0, i0) + np.dot(f0, Ct)
ht0 = np.dot(np.tanh(Ct0), o0)
print("Ct0: ", Ct0, "\nht0: ", ht0)

Ct0:  0.7857261484365797 
ht0:  0.5363133978820118


In [151]:
# Forward pass t=1, part 1 (gates)
a1 = np.tanh(np.dot(Wa, x1) + np.dot(Ua, ht0) + ba)
i1 = sigmoid(np.dot(Wi, x1) + np.dot(Ui, ht0) + bi)
f1 = sigmoid(np.dot(Wf, x1) + np.dot(Uf, ht0) + bf)
o1 = sigmoid(np.dot(Wo, x1) + np.dot(Uo, ht0) + bo)
print("a1: ", a1, "\ni1: ", i1, "\nf1: ", f1, "\no1: ", o1)

a1:  0.8498040223194213 
i1:  0.9811839683254171 
f1:  0.8703019698552491 
o1:  0.8499333428022842


In [152]:
# Forward pass t=1, part 2 (state and output)
Ct1 = np.dot(a1, i1) + np.dot(f1, Ct0)
ht1 = np.dot(np.tanh(Ct1), o1)
print("Ct1: ", Ct1, "\nht1: ", ht1)

Ct1:  1.5176330976694041 
ht1:  0.7719811057588907


In [153]:
# Backward pass t=1
Loss0 = ((ht0 - y0)**2)/2
Loss1 = ((ht1 - y1)**2)/2
print("Loss 0:", Loss0, "Loss 1:", Loss1)
dL1dht1 = ht1 - y1
dht1do1 = np.tanh(Ct1)
do1dz1 = o1 * (1 - o1)
dz1dWo1 = x1
dLdWo1 = dL1dht1*dht1do1*do1dz1*dz1dWo1

dht1dCt1 = (1-(np.tanh(Ct1)**2))*o1

dstate = dL1dht1*dht1dCt1

dCt1da1 = i1
da1dz1 = 1-a1**2
da = dstate*i1*(1-a1**2)
dz1dWa1 = x1
dLdWa1 = dL1dht1*dht1dCt1*dCt1da1*da1dz1*dz1dWa1

dCt1di1 = a1 
di1dz1 = i1*(1 - i1)
dz1dWi = x1
dLdWi1 = dL1dht1*dht1dCt1*dCt1di1*di1dz1*dz1dWi

dCt1df1 = Ct0 
df1dz = f1*(1 - f1)
dzdWf1 = x1
dLdWf1 = dL1dht1*dht1dCt1*dCt1df1*df1dz*dzdWf1
print("dLdWo1: ", dLdWo1, "\ndLdWa1: ", dLdWa1, "\ndLdWi1: ", dLdWi1, "\ndLdWf1: ", dLdWf1)
dht1dCt1
dL1dht1
dstate
da

Loss 0: 0.0006593314328686503 Loss 1: 0.11425103162574643
dLdWo1:  [-0.02768892 -0.16613349] 
dLdWa1:  [-0.00969217 -0.05815304] 
dLdWi1:  [-0.00055781 -0.00334684] 
dLdWf1:  [-0.00315327 -0.01891963]


-0.01938434750774075

In [155]:
# Backward pass t=0
print("Loss 0:", Loss0, "Loss 1:", Loss1)
dL0dht0 = ht0 - y0
dht0do0 = np.tanh(Ct0)
do0dz0 = o0 * (1 - o0)
dz0dWo0 = x0
dLdWo0 = dL0dht0*dht0do0*do0dz0*dz0dWo0

dht0dCt0 = (1-np.tanh(Ct0)**2)*o0
dCt0da0 = i0
da0dz0 = 1-a0**2
dz0dWa0 = x0
dLdWa0 = dL0dht0*dht0dCt0*dCt0da0*da0dz0 #*dz0dWa0

dCt0di0 = a0 
di0dz0 = i0*(1 - i0)
dz0dWi0 = x0
dLdWi0 = dL0dht0*dht0dCt0*dCt0di0*di0dz0*dz0dWi0

dCt0df0 = Ct
df0dz = f0*(1 - f0)
dzdWf0 = x0
dLdWf0 = dL0dht0*dht0dCt0*dCt0df0*df0dz*dzdWf0
print("dLdWo0: ", dLdWo0, "\ndLdWa0: ", dLdWa0, "\ndLdWi0: ", dLdWi0, "\ndLdWf0: ", dLdWf0)

Loss 0: 0.0006593314328686503 Loss 1: 0.11425103162574643
dLdWo0:  [0.0035528  0.00710561] 
dLdWa0:  0.005383606814026229 
dLdWi0:  [0.00052049 0.00104097] 
dLdWf0:  [0. 0.]


In [156]:
dW = dLdWa0 + dLdWa1
dW

array([-0.00430857, -0.05276944])

In [157]:
Wa1 = Wa - 0.1*(dLdWa0 + dLdWa1)
Wa1

array([0.45043086, 0.25527694])

In [158]:
# Backward pass delta components
deltaT0 = delta t the output difference as computed by any subsequent layers
deltaOut0 = UT * dgates1 #delta out the output difference as computed by the next time-step LSTM 

# Let's find:
dout0 = deltaT0 + deltaOut0
dstate0 = dout0 * ot0 * (1 - np.tanh(Ct0)**2) + (dstate1 * f1) #equivalent to dL0/dCt0 + dL1/dCt1

# dgates are the deltas at the gate that will be used to calculate then the final derivative for a respective weight
# W, U and b
da0 = dCt0 * i0 * (1 - a0**2)
di0 = dCt0 * a0 * i0 * (1 - i0)
df0 = dCt0 * Ct-1 * f0 * (1 - f0)
do0 = dout0 * np.tanh(Ct0) * o0 * (1 - o0)
dx0 = WT * dgates0 #What is this?
deltaOut-1 = UT * dgates0

# Final updates to the internal parameters are computed as:
dgates = [da0, di0, df0, do0]
dW = dgates0*x0 + dgates1*x1 + ... + dgatesn*xn
dU = dgates1*out0 + dgates2*out1 + ... + dgates3*out2
db = dgates1 + ... + dgates2

SyntaxError: invalid syntax (<ipython-input-158-6a3b9c995dce>, line 2)

In [159]:
# Backward pass t=1 Deltas
Loss1 = ((ht1 - y1)**2)/2
delta1 = ht1 - y1 
deltaOut = 0 #because there are no future time-steps
dout1 = delta1 + deltaOut
dCt2 = 0
f2 = 0
dCt1 = dout1 * o1 * (1-np.tanh(Ct1)**2) + dCt2 * f2 # backprop into state
da1 = dCt1*i1*(1 - a1**2)  # backprop into gate a
di1 = dCt1*a1*i1*(1 - i1)  # backprop into gate i                        
df1 = dCt1*Ct0*f1*(1-f1)   # backprop into gate f
do1 = dout1*np.tanh(Ct1)*o1*(1 - o1)  # backprop into gate o
dgates1 = np.array([da1, di1, df1, do1]) # all gate deltas
dx1 = np.matmul(WT, dgates1)             # backprop into input
deltaOut0 = np.dot(UT, dgates1)          # backprop into prev h
print("Loss1: ", Loss1, "\ndelta1: ", delta1, "\ndeltaOut: ", deltaOut, "\ndout1: ", dout1, "\ndCt1: ", dCt1,
      "\nda1: ", da1, "\ndi1: ", di1, "\ndf1: ", df1, "\ndo1: ", do1, "\ndgates1: ", dgates1, "\ndx1: ", dx1,
      "\ndeltaOut0: ", deltaOut0)

Loss1:  0.11425103162574643 
delta1:  -0.4780188942411093 
deltaOut:  0 
dout1:  -0.4780188942411093 
dCt1:  -0.07110771475756385 
da1:  -0.01938434750774075 
di1:  -0.001115614070446023 
df1:  -0.006306541742007648 
do1:  -0.05537783112520711 
dgates1:  [-0.01938435 -0.00111561 -0.00630654 -0.05537783] 
dx1:  [-0.04742407 -0.03072765] 
deltaOut0:  -0.018275255338020473


In [160]:
# Backward pass t=0 Deltas
Loss0 = ((ht0 - y0)**2)/2
delta0 = ht0 - y0
#deltaOut0 = 0 #because there are no future time-steps
dout0 = delta0 + deltaOut0
dCt0 = dout0 * o0 * (1-np.tanh(Ct0)**2) + (dCt1 * f1)
da0 = dCt0*i0*(1 - (a0**2))
di0 = dCt0*a0*i0*(1 - i0)
Ct = 0
df0 = dCt0*Ct*f0*(1-f0)
do0 = dout0*np.tanh(Ct0)*o0*(1 - o0)
dgates0 = np.array([da0, di0, df0, do0])
dx0 = np.matmul(WT, dgates0)
deltaOut = np.dot(UT, dgates0)
print("Loss0: ", Loss0, "\ndelta0: ", delta0, "\ndeltaOut: ", deltaOut, "\ndout0: ", dout0, "\ndCt0: ", dCt0,
      "\nda0: ", da1, "\ndi0: ", di1, "\ndf0: ", df1, "\ndo0: ", do0, "\ndgates0: ", dgates0, "\ndx0: ", dx0,
      "\ndeltaOut: ", deltaOut)

Loss0:  0.0006593314328686503 
delta0:  0.03631339788201182 
deltaOut:  -0.003429111652144951 
dout0:  0.01803814254399135 
dCt0:  -0.05348368434062696 
da0:  -0.01938434750774075 
di0:  -0.001115614070446023 
df0:  -0.006306541742007648 
do0:  0.0017648023073026033 
dgates0:  [-0.01702404 -0.00164588 -0.          0.0017648 ] 
dx0:  [-0.00816553 -0.0048668 ] 
deltaOut:  -0.003429111652144951


In [161]:
dgates0.shape

(4,)

In [162]:
print(np.array(np.transpose(dgates0)).reshape(1, 4))
print(x0.reshape(1, 2).shape)
print(x0.shape)
print(np.transpose(x0).shape)

[[-0.01702404 -0.00164588 -0.          0.0017648 ]]
(1, 2)
(2,)
(2,)


In [163]:
# Changing weights
dW = np.outer(dgates0, x0) + np.outer(dgates1, (x1))
dW

array([[-0.02671622, -0.09220113],
       [-0.00220369, -0.00663861],
       [-0.00315327, -0.01891963],
       [-0.02592411, -0.16260389]])

In [164]:
dU = np.outer(dgates1, ht0)
dU1 = np.dot(dgates1, ht)
dU = dgates1*ht0
#print('{0:.16f}'.format(dU))
dU

array([-0.01039609, -0.00059832, -0.00338228, -0.02969987])

In [165]:
db = dgates0 + dgates1
db

array([-0.03640839, -0.0027615 , -0.00630654, -0.05361303])

In [169]:
# Updating all weights
lr = 0.1
W1 = W - (lr*dW)
print("W1: \n",W1)
U1 = U - (lr*dU)
print("\nU1: ", U1)
b1 = b - (lr*db)
print("\nb1: ", b1)

W1: 
 [[0.45267162 0.25922011]
 [0.95022037 0.80066386]
 [0.70031533 0.45189196]
 [0.60259241 0.41626039]]

U1:  [0.15103961 0.80005983 0.10033823 0.25296999]

b1:  [0.20364084 0.65027615 0.15063065 0.1053613 ]
