In [1]:
import numpy as np

In [22]:
class Tanh:
    """Hyperbolic-tangent activation with its element-wise derivative."""

    def forward(self, X_in):
        """Return ``tanh(X_in)`` element-wise."""
        activated = np.tanh(X_in)
        return activated

    def backward(self, X_in):
        """Return the local derivative dY/dX = 1 - tanh(X_in)^2.

        ``X_in`` is the input of the activation (the pre-activation value),
        not its output. Only the local derivative is returned; multiplying it
        by the upstream gradient dE/dY is left to the caller.
        """
        activated = np.tanh(X_in)
        return 1 - activated ** 2

In [56]:
class ReLu:
    """Rectified-linear activation with its element-wise derivative."""

    def forward(self, X_in):
        """Return ``max(X_in, 0)`` element-wise."""
        rectified = np.maximum(X_in, 0)
        return rectified

    def backward(self, X_in):
        """Return the local derivative dY/dX as a boolean mask.

        The mask is True where ``X_in`` is strictly positive and False
        elsewhere (including exactly 0).
        """
        return X_in > 0

In [144]:
class RnnLayer(object):
    """Single-layer vanilla RNN with hand-written backpropagation through time.

    Recurrence (``h_0 = 0``)::

        a_t = W_in x_t + W_hh h_{t-1} + b
        h_t = f(a_t)

    Attributes:
        input_weights:  W_in, shape (hidden_dim, input_dim).
        hidden_weights: W_hh, shape (hidden_dim, hidden_dim).
        bias:           b, shape (hidden_dim,). All zeros (and never given a
                        gradient) when ``use_bias`` is False.
        activation:     instance of the activation class; must provide
                        ``forward(a)`` and ``backward(a)`` where ``a`` is the
                        pre-activation value.
    """

    def __init__(self, input_dim, hidden_dim, seq_len, batch_size, use_bias=True, activation=Tanh):
        """Create the layer and draw all parameters from U(-1/sqrt(hidden_dim), 1/sqrt(hidden_dim)).

        Args:
            input_dim: size of each input vector x_t.
            hidden_dim: size of the hidden state h_t.
            seq_len: number of time steps processed by forward/backward.
            batch_size: number of sequences per batch.
            use_bias: when False the bias is fixed to zeros.
            activation: activation *class* (not instance); it is instantiated here.
        """
        sq = np.sqrt(1. / hidden_dim)
        self.use_bias = use_bias
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.activation = activation()
        self.input_weights = np.random.uniform(-sq, sq, (hidden_dim, input_dim))
        self.hidden_weights = np.random.uniform(-sq, sq, (hidden_dim, hidden_dim))

        if self.use_bias:
            self.bias = np.random.uniform(-sq, sq, hidden_dim)
        else:
            self.bias = np.zeros((hidden_dim))

    def _pre_activation(self, x_t, h_prev):
        """Return a_t = x_t W_in^T + h_prev W_hh^T + b for a whole batch, shape (batch, hidden_dim)."""
        input_part = np.dot(x_t, self.input_weights.T)
        hidden_part = np.dot(h_prev, self.hidden_weights.T)
        return input_part + hidden_part + self.bias

    def forward(self, X_in):
        """Run the recurrence over ``seq_len`` steps.

        Args:
            X_in: inputs, expected shape (batch_size, seq_len, input_dim).

        Returns:
            (H, h_last): ``H`` has shape (batch_size, seq_len + 1, hidden_dim)
            with ``H[:, 0, :]`` the all-zero initial state and ``H[:, t, :]``
            the state after step t; ``h_last`` is ``H[:, seq_len, :]``.
        """
        # TODO: should we validate that X_in really has seq_len time steps?
        # TODO: should we validate that X_in's first dimension equals batch_size?

        # Assumption: the initial hidden state (time step 0) is zero for every sequence.
        H = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))

        for i in range(self.seq_len):
            # Full matrix product with W_hh. The previous 'ij,jj->ij' einsum
            # repeated a subscript inside one operand and therefore used only
            # the diagonal of W_hh.
            pre_act = self._pre_activation(X_in[:, i, :], H[:, i, :])
            H[:, i + 1, :] = self.activation.forward(pre_act)

        return H, H[:, self.seq_len, :]

    def book_forward(self, X_in):
        """Same computation as ``forward`` written in the column-vector ("book") convention.

        Computes ``W x`` on transposed operands and transposes the result back
        inside the einsum, so the output matches ``forward``.
        """
        H = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))

        for i in range(self.seq_len):
            # W @ X^T is the transpose of what forward() computes; the output
            # subscripts 'ki' transpose it back, so the result is the same.
            input_part = np.einsum('ij,jk->ki', self.input_weights, X_in[:, i, :].T)
            hidden_part = np.einsum('ij,jk->ki', self.hidden_weights, H[:, i, :].T)

            H[:, i + 1, :] = self.activation.forward(input_part + hidden_part + self.bias)

        return H, H[:, self.seq_len, :]

    def backward(self, X, H, dEdY):
        """Vectorised backpropagation through time.

        Args:
            X: inputs given to ``forward``, shape (batch_size, seq_len, input_dim).
            H: hidden states returned by ``forward``,
               shape (batch_size, seq_len + 1, hidden_dim).
            dEdY: gradient of the loss w.r.t. the hidden state emitted at every
               step; ``dEdY[:, t - 1, :]`` belongs to ``H[:, t, :]``.

        Returns:
            (dEdW_in, dEdW_hh, dEdB_in), shaped like ``input_weights``,
            ``hidden_weights`` and ``bias``. ``dEdB_in`` stays zero when
            ``use_bias`` is False.
        """
        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)
        dEdB_in = np.zeros_like(self.bias)

        # H_grad[:, t, :] accumulates the total gradient dE/dh_t.
        H_grad = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))
        H_grad[:, self.seq_len, :] = dEdY[:, self.seq_len - 1, :]

        for i in range(self.seq_len, 0, -1):
            x_t = X[:, i - 1, :]
            h_prev = H[:, i - 1, :]

            # The activation derivative must be evaluated at the pre-activation
            # a_t, not at the already-activated state H[:, i, :].
            act_grad = self.activation.backward(self._pre_activation(x_t, h_prev))
            delta = H_grad[:, i, :] * act_grad  # dE/da_t, shape (batch, hidden)

            # Outer products delta_t x_t^T and delta_t h_{t-1}^T, summed over the batch.
            dEdW_in += np.einsum('bh,bi->hi', delta, x_t)
            dEdW_hh += np.einsum('bh,bk->hk', delta, h_prev)

            if self.use_bias:
                dEdB_in += delta.sum(axis=0)

            # dE/dh_{t-1} = W_hh^T delta_t (full contraction over the hidden index) ...
            H_grad[:, i - 1, :] = np.einsum('bh,hk->bk', delta, self.hidden_weights)
            if i > 1:
                # ... plus the direct loss contribution of step t-1's own output.
                H_grad[:, i - 1, :] += dEdY[:, i - 2, :]

        return dEdW_in, dEdW_hh, dEdB_in

    def backward_checker(self, X, H, dEdY):
        """Reference implementation of ``backward`` using explicit per-sample loops.

        Takes the same arguments and returns the same triple as ``backward``;
        intended for cross-checking the vectorised version. Also prints the
        current bias.
        """
        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)

        print(f'self.bias={self.bias}')

        dEdB_in = np.zeros_like(self.bias)

        H_grad = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))
        H_grad[:, self.seq_len, :] = dEdY[:, self.seq_len - 1, :]

        for i in range(self.seq_len, 0, -1):

            for k in range(self.batch_size):
                # Pre-activation of sample k at step i, in column-vector form.
                pre_act = (np.dot(self.input_weights, X[k, i - 1, :])
                           + np.dot(self.hidden_weights, H[k, i - 1, :])
                           + self.bias)
                act_grad = np.diag(self.activation.backward(pre_act))
                h_grad = H_grad[k, i, :].reshape(self.hidden_dim, 1)
                delta = np.dot(act_grad, h_grad)  # dE/da_t as a (hidden_dim, 1) column

                dEdW_in += np.dot(delta, X[k, i - 1, :].reshape(1, self.input_dim))
                dEdW_hh += np.dot(delta, H[k, i - 1, :].reshape(1, self.hidden_dim))

                if self.use_bias:
                    dEdB_in += delta[:, 0]

                # Propagate to the previous hidden state through the full W_hh.
                H_grad[k, i - 1, :] = np.dot(self.hidden_weights.T, delta)[:, 0]
                if i > 1:
                    H_grad[k, i - 1, :] += dEdY[k, i - 2, :]

        return dEdW_in, dEdW_hh, dEdB_in

In [146]:
# RNN forward check: build two layers and make them share the same weights.
# Biases are all zeros for both because use_bias=False.
rnn = RnnLayer(input_dim=4, hidden_dim=5, seq_len=3, batch_size=2, use_bias=False)
rnn1 = RnnLayer(input_dim=4, hidden_dim=5, seq_len=3, batch_size=2, use_bias=False)

# rnn1 now references rnn's weight arrays (same objects, not copies).
rnn1.input_weights = rnn.input_weights
rnn1.hidden_weights = rnn.hidden_weights

# Input of shape (batch=2, time steps=3, input_dim=4).
X_in = np.array([
    [[1, 2, 1, 3], [2, 2, 3, 1], [0, 2, 3, 1]],
    [[1, 3, 4, 3], [1, 2, 1, 1], [1, 0, 1, 2]],
])
H, last = rnn.forward(X_in)
H1, last1 = rnn1.forward(X_in)

In [147]:
# Upstream gradient for every emitted hidden state: (batch=2, time steps=3, hidden_dim=5).
dEdY = np.array([
    [
        [0.34545989, 0.07336296, -0.16346513, -0.06904482, 0.0458759],
        [0.37271336, 0.07915059, -0.17636096, -0.07449179, 0.04949507],
        [0.35166208, 0.07468007, -0.16639989, -0.07028441, 0.04669953],
    ],
    [
        [0.36616935, 0.07776088, -0.17326446, -0.07318388, 0.04862605],
        [0.33954613, 0.07210709, -0.16066685, -0.06786287, 0.04509058],
        [0.35872758, 0.07618053, -0.16974315, -0.07169654, 0.04763781],
    ],
])

# Loop-based reference gradients (rnn) vs. vectorised gradients (rnn1, same weights).
Win, Wh, Bin = rnn.backward_checker(X_in, H, dEdY)
Win1, Wh1, Bin1 = rnn1.backward(X_in, H1, dEdY)

self.bias=[0. 0. 0. 0. 0.]


In [148]:
# Show the recurrent-weight gradients from the vectorised (Wh1) and the
# loop-based reference (Wh) implementations.
print(f'Wh1={Wh1}')
print(f'Wh={Wh}')

# Do not rely on eyeballing the printout: fail loudly if the two
# implementations disagree on any gradient.
np.testing.assert_allclose(Win1, Win, rtol=1e-7, atol=1e-12)
np.testing.assert_allclose(Wh1, Wh, rtol=1e-7, atol=1e-12)
np.testing.assert_allclose(Bin1, Bin, rtol=1e-7, atol=1e-12)

Wh1=[[-0.09290856 -0.39063081  1.01463126  1.14421039 -0.36029288]
 [-0.01613264 -0.07558906  0.19758189  0.22148378 -0.06832331]
 [ 0.02526107  0.1142049  -0.3143839  -0.35383634  0.09568708]
 [ 0.01047346  0.0459219  -0.11493675 -0.13019808  0.0440292 ]
 [-0.00809848 -0.03703844  0.0914215   0.10319943 -0.03601933]]
Wh=[[-0.09290856 -0.39063081  1.01463126  1.14421039 -0.36029288]
 [-0.01613264 -0.07558906  0.19758189  0.22148378 -0.06832331]
 [ 0.02526107  0.1142049  -0.3143839  -0.35383634  0.09568708]
 [ 0.01047346  0.0459219  -0.11493675 -0.13019808  0.0440292 ]
 [-0.00809848 -0.03703844  0.0914215   0.10319943 -0.03601933]]


In [149]:
# Scratch cell: explore the einsum forms used for the weight gradients.
# Batch-summed variants (output has no 'b' axis), kept for reference:
#   dEdW_in += np.einsum('bh,bi->hi', H_grad[:,i,:], X[:,i-1,:])
#   dEdW_hh += np.einsum('bh,bk->hk', H_grad[:,i,:], H[:,i-1,:])

X = np.array([
    [[1, 3, 3, 2], [1, 3, 1, 3], [2, 2, 1, 1]],
    [[2, 3, 2, 1], [2, 3, 1, 3], [0, 2, 1, 2]],
    [[1, 1, 2, 0], [2, 2, 1, 2], [1, 2, 1, 2]],
])
print(f'X.shape={X.shape}')

H_grad = np.array([
    [[3, 3, 2], [3, 1, 3], [2, 1, 1], [2, 1, 1]],
    [[3, 2, 1], [3, 1, 3], [0, 1, 2], [2, 1, 1]],
    [[1, 2, 0], [2, 1, 2], [2, 1, 2], [2, 1, 1]],
])
print(f'H_grad.shape={H_grad.shape}')

# 'bhi' keeps one outer product per batch element; 'hi' sums them over the batch.
X_e1 = np.einsum('bh,bi->bhi', H_grad[:, 1, :], X[:, 0, :])
X_e2 = np.einsum('bh,bi->hi', H_grad[:, 1, :], X[:, 0, :])
print(f'X_e1={X_e1}')

act = Tanh()

# Element-wise tanh derivative evaluated at the toy gradient values.
back = act.backward(H_grad[:, 1, :])
print(f'back={back}')

X.shape=(3, 3, 4)
H_grad.shape=(3, 4, 3)
X_e1=[[[3 9 9 6]
  [1 3 3 2]
  [3 9 9 6]]

 [[6 9 6 3]
  [2 3 2 1]
  [6 9 6 3]]

 [[2 2 4 0]
  [1 1 2 0]
  [2 2 4 0]]]
back=[[0.00986604 0.41997434 0.00986604]
 [0.00986604 0.41997434 0.00986604]
 [0.07065082 0.41997434 0.07065082]]


In [150]:
# Scratch cell: scaling the rows of a matrix by a vector can be written either
# as a product with a diagonal matrix or as a broadcast with a column vector.

# `br` was never defined in the notebook, so this cell failed on a fresh kernel.
# It is `back` with a trailing singleton axis (batch, hidden, 1), so that
# br[0] is the column vector a.reshape(3, 1) used below.
br = back.reshape(back.shape[0], back.shape[1], 1)

n = X[0,:,:]
print(n)
a = back[0]
print(np.diag(a))
print(np.dot(np.diag(a),n))

print(a.reshape(3,1))
print(a.reshape(3,1) * n)
print(br[0,:,:])

print(br[0,:,:] * n)
br1 = br[0,:,:]*X[0,:,:]
print(br1)

[[1 3 3 2]
 [1 3 1 3]
 [2 2 1 1]]
[[0.00986604 0.         0.        ]
 [0.         0.41997434 0.        ]
 [0.         0.         0.00986604]]
[[0.00986604 0.02959811 0.02959811 0.01973207]
 [0.41997434 1.25992302 0.41997434 1.25992302]
 [0.01973207 0.01973207 0.00986604 0.00986604]]
[[0.00986604]
 [0.41997434]
 [0.00986604]]
[[0.00986604 0.02959811 0.02959811 0.01973207]
 [0.41997434 1.25992302 0.41997434 1.25992302]
 [0.01973207 0.01973207 0.00986604 0.00986604]]
[[0.00986604]
 [0.41997434]
 [0.00986604]]
[[0.00986604 0.02959811 0.02959811 0.01973207]
 [0.41997434 1.25992302 0.41997434 1.25992302]
 [0.01973207 0.01973207 0.00986604 0.00986604]]
[[0.00986604 0.02959811 0.02959811 0.01973207]
 [0.41997434 1.25992302 0.41997434 1.25992302]
 [0.01973207 0.01973207 0.00986604 0.00986604]]
