In [2]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np

In [28]:
class Softmax(object):
    @staticmethod
    def forward(x_in):
        exps = np.exp(x_in-np.max(x_in, axis=-1, keepdims=True))
        return exps / np.sum(exps, axis=-1, keepdims=True)

class Sigmoid(object):
    
    @staticmethod
    def forward(x_in):
        return 1./(1 + np.exp(-x_in))
    
    @staticmethod
    def backward(x_in):
        fw = Sigmoid().forward(x_in)
        return fw * (1 - fw)
    
    @staticmethod
    def backward_calculated(sigmoid_x):
        return sigmoid_x * (1 - sigmoid_x)
    
    
class Tanh(object):

    @staticmethod
    def forward(X_in):
        return np.tanh(X_in)

    @staticmethod
    def backward(X_in):
        # dEdX = dEdY * dYdX = dEdY * 1 - (tanh(X))^2
        return 1 - (np.tanh(X_in)) ** 2

    @staticmethod
    def backward_calculated(tanh_x_in):
        return 1 - tanh_x_in ** 2


class ReLu(object):

    @staticmethod
    def forward(x_in):
        return np.maximum(x_in, 0)

    @staticmethod
    def backward(x_in):
        return x_in > 0
    
    
class CrossEntropyLoss(object):
    def __init__(self):
        self.y_pred = None

    def forward(self, y, o):
        self.y_pred = Softmax.forward(o)
        return np.sum(-y * np.log(self.y_pred + 1e-15))/(y.shape[0])

    def backward(self, y):
        return (self.y_pred - y) / y.shape[0]

In [151]:
class RnnLayer(object):

    def __init__(self, input_dim, hidden_dim, seq_len, batch_size, use_bias=True, activation=Tanh):
        sq = np.sqrt(1. / hidden_dim)
        self.use_bias = use_bias
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.activation = activation()
        self.input_weights = np.random.uniform(-sq, sq, (hidden_dim, input_dim))
        self.hidden_weights = np.random.uniform(-sq, sq, (hidden_dim, hidden_dim))

        if self.use_bias:
            self.bias = np.random.uniform(-sq, sq, hidden_dim)
        else:
            self.bias = np.zeros(hidden_dim)

    def forward(self, x_in):
        # treba li dodati provjeru je li X_in stvarno ima sekvencu jednaku seq_len?
        # treba li dodati provjeru je li X_in prva koordinata jednaka batch_size

        # u ovom slucaju sam pretpostavio da je za sve inpute, pocetno stanje 0 u 0. vremenskom trenutku
        H = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))

        for i in range(self.seq_len):
            input_part = np.einsum('ij,jk->ik', x_in[:, i, :], self.input_weights.T)
            hidden_part = np.einsum('ij,jk->ik', H[:, i, :], self.hidden_weights.T)

            H[:, i + 1, :] = self.activation.forward(input_part + hidden_part + self.bias)

        return H, H[:, self.seq_len, :]

    def book_forward(self, x_in):

        H = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))

        for i in range(self.seq_len):
            # ovdje dobivam transponirano iz mog forwarda, ali sam u einsum zamijenio vrijednosti, tako da zapravo dobijem isto
            input_part = np.einsum('ij,jk->ki', self.input_weights, x_in[:, i, :].T)
            hidden_part = np.einsum('ij,jk->ik', self.hidden_weights, H[:, i, :].T)

            H[:, i + 1, :] = self.activation.forward(input_part + hidden_part + self.bias)

        return H, H[:, self.seq_len, :]

    def backward(self, x, h, dEdY):
        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)

        dEdB_in = np.zeros_like(self.bias)
        
        H_grad = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))
        H_grad[:, self.seq_len, :] = dEdY[:,self.seq_len - 1, :]
        

        for i in range(self.seq_len, 0, -1):
            
            activation_backward = self.activation.backward(h[:, i, :])
            back_reshaped = activation_backward.reshape(self.batch_size, self.hidden_dim, 1)
            
            dEdW_in += np.sum(back_reshaped * (np.einsum('bh,bi->bhi', H_grad[:, i, :], x[:, i - 1, :])), axis=0)
            dEdW_hh += np.sum(back_reshaped * (np.einsum('bh,bk->bhk', H_grad[:, i, :], h[:, i - 1, :])), axis=0)

            if self.use_bias:
                dEdB_in += np.sum(self.activation.backward(h[:, i, :]) * H_grad[:, i, :], axis=0)
            else:
                pass

            if i > 1:
                H_grad[:, i - 1, :] = np.einsum('bh,hk->bk', H_grad[:, i, :],
                                                self.hidden_weights) * activation_backward + dEdY[:, i - 2, :]
            else:
                H_grad[:, i - 1, :] = np.einsum('bh,hk->bk', H_grad[:, i, :],
                                                self.hidden_weights) * activation_backward

    
        return dEdW_in, dEdW_hh, dEdB_in, H_grad

    def backward_checker(self, X, H, dEdY):
        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)

        dEdB_in = np.zeros_like(self.bias)

        H_grad = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))
        H_grad[:, self.seq_len, :] = dEdY[:, self.seq_len - 1, :]
        
        for i in range(self.seq_len, 0, -1):

            for k in range(self.batch_size):
                act_grad = np.diag(self.activation.backward(H[k, i, :]))
                h_grad = H_grad[k, i, :].reshape(self.hidden_dim, 1)

                dEdW_in += np.dot(act_grad, np.dot(h_grad, X[k, i - 1, :].reshape(1, self.input_dim)))
                dEdW_hh += np.dot(act_grad, np.dot(h_grad, H[k, i - 1, :].reshape(1, self.hidden_dim)))

            if self.use_bias:
                dEdB_in += np.sum(self.activation.backward(H[:, i, :]) * H_grad[:, i, :], axis=(0))
            else:
                pass

            if i > 1:
                H_grad[:, i - 1, :] = np.dot(H_grad[:, i, :],self.hidden_weights) * self.activation.backward(H[:, i, :]) + dEdY[:,i - 2, :]
            else:
                H_grad[:, i - 1, :] = np.dot(H_grad[:, i, :],self.hidden_weights) * self.activation.backward(H[:, i, :])

        return dEdW_in, dEdW_hh, dEdB_in
    
    
class DenseLayer(object):

    def __init__(self, input_dim, output_dim, use_bias=True):
        sq = np.sqrt(1. / input_dim)
        self.use_bias = use_bias
        self.weights = np.random.uniform(-sq, sq, (output_dim, input_dim))
        if use_bias:
            self.bias = np.random.uniform(-sq, sq, output_dim)
        else:
            self.bias = np.zeros(output_dim)

    def forward(self, x_in):
        return np.tensordot(x_in, self.weights.T, axes=((-1), 0)) + self.bias

    def backward(self, de_dy, x_in):
        # de_dw = de_dy * dYdW = de_dy * X
        # dEdb = de_dy * dYdb = de_dy
        # dEdX = de_dy * dYdX = de_dy * W
        axis = tuple(range(len(x_in.shape) - 1))
        de_dw = np.tensordot(de_dy, x_in, axes=(axis, axis))
        de_db = np.sum(de_dy, axis=axis)
        de_dx = np.tensordot(de_dy, self.weights, axes=(-1, 0))

        return de_dx, de_dw, de_db

    def refresh(self, de_dw, de_db, learning_rate):
        self.weights = self.weights - learning_rate * de_dw
        if self.use_bias:
            self.bias = self.bias - learning_rate * de_db
            
            
class LSTMLayer(object):

    def __init__(self, input_dim, hidden_dim, use_bias=True):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.use_bias = use_bias

        sq = np.sqrt(1. / hidden_dim)
        # input weights (W_in_hi|W_fgt_hi|W_g_hi|W_out_hi)
        self.input_weights = np.random.uniform(-sq, sq, (4, hidden_dim, input_dim))
        # hidden weights (W_in_hh|W_fgt_hh|W_g_hh|W_out_hh)
        self.hidden_weights = np.random.uniform(-sq, sq, (4, hidden_dim, hidden_dim))

        self.tanh = Tanh
        self.sigmoid = Sigmoid

        self.gates = None
        self.H = None
        self.C = None

        if self.use_bias:
            # bias = (in_bias|fgt_bias|g_bias|out_bias)
            self.bias = np.random.uniform(-sq, sq, (4, hidden_dim))
        else:
            self.bias = np.zeros((4, hidden_dim))

    def forward(self, X_in, h_0=None, c_0=None):
        batch_size = X_in.shape[0]
        seq_len = X_in.shape[1]

        self.H = np.zeros((batch_size, seq_len + 1, self.hidden_dim))
        if h_0 is not None:
            self.H[:, 0, :] = h_0

        self.C = np.zeros((batch_size, seq_len + 1, self.hidden_dim))
        if c_0 is not None:
            self.C[:, 0, :] = c_0

        self.gates = np.zeros((4, batch_size, seq_len, self.hidden_dim))

        for i in range(seq_len):
            # input_gate
            self.gates[0, :, i, :] = self.sigmoid.forward(
                np.dot(X_in[:, i, :], self.input_weights[0, :, :].T) + np.dot(self.H[:, i, :], self.hidden_weights[0, :, :].T) + self.bias[0, :])
            # forget gate
            self.gates[1, :, i, :] = self.sigmoid.forward(
                np.dot(X_in[:, i, :], self.input_weights[1, :, :].T) + np.dot(self.H[:, i, :], self.hidden_weights[1, :, :].T) + self.bias[1, :])
            # c~ gate
            self.gates[2, :, i, :] = self.tanh.forward(
                np.dot(X_in[:, i, :], self.input_weights[2, :, :].T) + np.dot(self.H[:, i, :], self.hidden_weights[2, :, :].T) + self.bias[2, :])
            # output gate
            self.gates[3, :, i, :] = self.sigmoid.forward(
                np.dot(X_in[:, i, :], self.input_weights[3, :, :].T) + np.dot(self.H[:, i, :], self.hidden_weights[3, :, :].T) + self.bias[3, :])

            self.C[:, i + 1, :] = self.gates[1, :, i, :] * self.C[:, i, :] + self.gates[0, :, i, :] * self.gates[2, :, i, :]
            self.H[:, i + 1, :] = self.gates[3, :, i, :] * self.tanh.forward(self.C[:, i + 1, :])

        return self.H, self.H[:, seq_len, :], self.C[:, seq_len, :]

    def backward(self, X_in, dEdY):

        batch_size = X_in.shape[0]
        seq_len = X_in.shape[1]

        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)
        dEdB_in = np.zeros_like(self.bias)

        H_grad = np.zeros((batch_size, seq_len, self.hidden_dim))
        C_grad = np.zeros((batch_size, seq_len, self.hidden_dim))
        X_grad = np.zeros((batch_size, seq_len, self.input_dim))

        gates_grad = np.zeros((4, batch_size, seq_len, self.hidden_dim))

        for i in range(seq_len - 1, -1, -1):

            if i < seq_len - 1:
                H_grad[:, i, :] = np.matmul(gates_grad[:, :, i + 1, :], self.hidden_weights).sum(axis=0) + dEdY[:, i, :]
                C_grad[:, i, :] = H_grad[:, i, :] * self.gates[3, :, i, :] * self.tanh.backward(self.C[:, i + 1, :]) + C_grad[:, i + 1, :] * self.gates[1, :, i + 1, :]
            else:
                H_grad[:, i, :] = dEdY[:, i, :]
                C_grad[:, i, :] = H_grad[:, i, :] * self.gates[3, :, i, :] * self.tanh.backward(self.C[:, i + 1, :])

            gates_grad[0, :, i, :] = C_grad[:, i, :] * self.gates[2, :, i, :] * self.sigmoid.backward_calculated(self.gates[0, :, i, :])
            gates_grad[1, :, i, :] = C_grad[:, i, :] * self.C[:, i, :] * self.sigmoid.backward_calculated(self.gates[1, :, i, :])
            gates_grad[2, :, i, :] = C_grad[:, i, :] * self.gates[0, :, i, :] * self.tanh.backward_calculated(self.gates[2, :, i, :])
            gates_grad[3, :, i, :] = H_grad[:, i, :] * self.tanh.forward(self.C[:, i + 1, :]) * self.sigmoid.backward_calculated(self.gates[3, :, i, :])

            X_grad[:, i, :] = np.matmul(gates_grad[:, :, i, :], self.input_weights).sum(axis=0)
                
            dEdW_in[0, :, :] += np.einsum('bi,bo->bio', gates_grad[0, :, i, :], X_in[:, i, :]).sum(axis=0)
            dEdW_in[1, :, :] += np.einsum('bi,bo->bio', gates_grad[1, :, i, :], X_in[:, i, :]).sum(axis=0)
            dEdW_in[2, :, :] += np.einsum('bi,bo->bio', gates_grad[2, :, i, :], X_in[:, i, :]).sum(axis=0)
            dEdW_in[3, :, :] += np.einsum('bi,bo->bio', gates_grad[3, :, i, :], X_in[:, i, :]).sum(axis=0)

            if i < seq_len - 1:
                dEdW_hh[0, :, :] += np.einsum('bi,bo->bio', gates_grad[0, :, i + 1, :], self.H[:, i + 1, :]).sum(axis=0)
                dEdW_hh[1, :, :] += np.einsum('bi,bo->bio', gates_grad[1, :, i + 1, :], self.H[:, i + 1, :]).sum(axis=0)
                dEdW_hh[2, :, :] += np.einsum('bi,bo->bio', gates_grad[2, :, i + 1, :], self.H[:, i + 1, :]).sum(axis=0)
                dEdW_hh[3, :, :] += np.einsum('bi,bo->bio', gates_grad[3, :, i + 1, :], self.H[:, i + 1, :]).sum(axis=0)

            if self.use_bias:
                dEdB_in[0, :] += np.sum(gates_grad[0, :, i, :], axis=0)
                dEdB_in[1, :] += np.sum(gates_grad[1, :, i, :], axis=0)
                dEdB_in[2, :] += np.sum(gates_grad[2, :, i, :], axis=0)
                dEdB_in[3, :] += np.sum(gates_grad[3, :, i, :], axis=0)
        
        return dEdW_in, dEdW_hh, dEdB_in, X_grad
        

## Softmax, Cross Entropy Loss

In [4]:
N = 5
num_classes = 4

x = torch.randn(N, num_classes) # logits
y = torch.randint(num_classes, (N,)) # labels

x_ = x.numpy()
y_ = np.eye(num_classes)[y.numpy()] # one-hot encoding

In [5]:
softmax_out = F.softmax(x, dim=-1)
softmax_out

softmax = Softmax()
softmax_out_ = softmax.forward(x_)

print('Softmax check:', np.isclose(softmax_out, softmax_out_).all())

Softmax check: True


In [6]:
cel = nn.CrossEntropyLoss()
loss = cel(x, y).item()

cel_ = CrossEntropyLoss()
loss_ = cel_.forward(y_, x_)
loss_

print('Cross entropy loss check:', np.isclose(loss, loss_))

Cross entropy loss check: True


## Linear Layer

In [15]:
N = 5
num_classes = 4
seq_len = 3

x = torch.randn(N, seq_len, requires_grad=True)
y = torch.randint(num_classes, (N,))

x_ = x.detach().numpy()
y_ = np.eye(num_classes)[y.numpy()]

linear = nn.Linear(seq_len, num_classes)
linear_ = DenseLayer(seq_len, num_classes)

linear_.weights = linear.weight.detach().numpy()
linear_.bias = linear.bias.detach().numpy()

lin_out = linear(x)
lin_out_ = linear_.forward(x_)

print('Linear layer forward check: ', np.isclose(lin_out.detach().numpy(), lin_out_).all())

Linear layer forward check:  True


In [16]:
loss = cel(lin_out, y)
lin_out.retain_grad()
loss.retain_grad()
loss.backward()
print(loss)
print(f'[TORCH] dE/dy:\n{lin_out.grad}\n')
print(f'[TORCH] dE/dW:\n{linear.weight.grad}\n')
print(f'[TORCH] dE/dB:\n{linear.bias.grad}\n')
print(f'[TORCH] dE/dX:\n{x.grad}\n')

tensor(1.7812, grad_fn=<NllLossBackward>)
[TORCH] dE/dy:
tensor([[ 0.0154, -0.1254,  0.0595,  0.0505],
        [ 0.0260, -0.0758,  0.0206,  0.0292],
        [ 0.0358,  0.0895,  0.0610, -0.1863],
        [ 0.0995,  0.0885, -0.1909,  0.0029],
        [ 0.0300,  0.0869,  0.0454, -0.1623]])

[TORCH] dE/dW:
tensor([[ 0.0685,  0.1054, -0.2851],
        [ 0.4073,  0.0524, -0.2640],
        [-0.2708, -0.4270,  0.2497],
        [-0.2051,  0.2692,  0.2994]])

[TORCH] dE/dB:
tensor([ 0.2067,  0.0636, -0.0044, -0.2660])

[TORCH] dE/dX:
tensor([[ 0.0082, -0.0504,  0.0747],
        [ 0.0138, -0.0136,  0.0439],
        [ 0.0379,  0.0321, -0.1905],
        [ 0.0463,  0.1636, -0.0177],
        [ 0.0319,  0.0328, -0.1681]])



In [19]:
loss_ = cel_.forward(y_, lin_out_)

de_dy = cel_.backward(y_)
de_dx, de_dw, de_db = linear_.backward(de_dy, x_)

In [10]:
print(f'[CUSTOM] dE/dy:\n{de_dy}\n')
print(f'[CUSTOM] dE/dW:\n{de_dw}\n')
print(f'[CUSTOM] dE/dB:\n{de_db}\n')
print(f'[CUSTOM] dE/dX:\n{de_dx}\n')

print('Check dE/dy:', np.isclose(lin_out.grad, de_dy).all())
print('Check dE/dX:', np.isclose(x.grad, de_dx).all())
print('Check dE/dW:', np.isclose(linear.weight.grad, de_dw).all())
print('Check dE/dB:', np.isclose(linear.bias.grad, de_db).all())

[CUSTOM] dE/dy:
[[ 0.01289863 -0.12475173  0.07498912  0.03686398]
 [ 0.01184563 -0.15936113  0.11452467  0.03299084]
 [ 0.09872308  0.02475784  0.04569956 -0.16918049]
 [-0.1750032   0.03356197  0.10637606  0.03506516]
 [-0.18160434  0.03501907  0.11235602  0.03422925]]

[CUSTOM] dE/dW:
[[-0.02335379  0.34712666  0.28857261]
 [ 0.18554716  0.20977268  0.25030635]
 [ 0.01165509 -0.29133464 -0.25821118]
 [-0.17384847 -0.26556471 -0.28066779]]

[CUSTOM] dE/dB:
[-0.2331402  -0.19077397  0.45394544 -0.03003125]

[CUSTOM] dE/dX:
[[ 0.09699437 -0.03969966 -0.00161258]
 [ 0.12726112 -0.05942757 -0.00761959]
 [ 0.0481368   0.0253786   0.03034998]
 [-0.07159334 -0.10329892 -0.09504108]
 [-0.07380924 -0.1078932  -0.09902084]]

Check dE/dy: True
Check dE/dX: True
Check dE/dW: True
Check dE/dB: True


## RNN


In [11]:
#N = 5
#emb_dim = 6
#seq_len = 3
#hidden_dim = 8

N = 5
emb_dim = 6
seq_len = 3
hidden_dim = 5


x = torch.randn(N, seq_len, emb_dim, requires_grad=True)

x_ = x.detach().numpy()

rnn = nn.RNN(emb_dim, hidden_dim, bias=False, batch_first=True)
rnn_ = RnnLayer(emb_dim, hidden_dim, seq_len, N, use_bias=False)
rnn_.input_weights = rnn.weight_ih_l0.detach().numpy()
rnn_.hidden_weights = rnn.weight_hh_l0.detach().numpy()

rnn_out, h_n = rnn(x)
rnn_out_, h_n_ = rnn_.forward(x_)
rnn_out__ = rnn_out_[:, 1:, :] # remove zeros prepended to every hidden output

print('RNN layer forward check: ', np.isclose(rnn_out.detach().numpy(), rnn_out__).all())
print('RNN layer forward check last hidden: ', np.isclose(h_n.detach().numpy(), h_n_).all())

RNN layer forward check:  True
RNN layer forward check last hidden:  True


In [12]:
de_dy = torch.randn(N, seq_len, hidden_dim)
de_dy_ = de_dy.numpy()

rnn_out.retain_grad()
rnn_out.backward(de_dy)

print(f'[TORCH] dE/dWih:\n{rnn.weight_ih_l0.grad}\n')
print(f'[TORCH] dE/dWhh:\n{rnn.weight_hh_l0.grad}\n')
print(f'[TORCH] dE/dy:\n{de_dy}\n')
print(f'[TORCH] dE/dH:\n{rnn_out.grad}\n')

[TORCH] dE/dWih:
tensor([[-3.3167,  0.5416, -7.4245,  0.0461, -1.4704, -1.7172],
        [-0.7243,  1.1735,  2.7081,  2.2718, -2.2479, -1.1513],
        [ 0.6725,  3.8284, -5.6269,  1.1262, -1.7440,  1.2039],
        [-0.2735, -0.0848,  0.6839,  4.6405,  2.1151,  1.0014],
        [-1.8822, -2.2299,  2.0466, -2.6408,  2.7569,  7.1491]])

[TORCH] dE/dWhh:
tensor([[ 1.2223,  0.4866, -1.0587, -0.5920, -0.6407],
        [-1.0210,  0.4920,  0.9444,  0.0907,  1.6406],
        [ 0.9988, -0.2676, -0.1303, -1.7490, -0.9632],
        [ 0.5061, -0.0735,  0.3186, -1.2058, -0.2807],
        [-0.0880, -0.1201, -0.5381,  1.0406, -1.2745]])

[TORCH] dE/dy:
tensor([[[ 1.3564e+00,  4.0721e-02,  1.6143e+00,  4.7909e-01,  7.8997e-01],
         [-6.2923e-01,  1.7595e-01, -7.4402e-01, -7.8599e-01,  1.1243e-01],
         [ 7.2088e-01,  1.3641e+00, -1.3155e+00,  5.0142e-02, -1.3644e+00]],

        [[ 1.1343e+00,  8.4810e-01,  1.2058e+00, -6.4061e-01, -2.5855e+00],
         [-6.5947e-02,  1.6423e-01, -2.4149e-0

In [13]:
dEdW_in, dEdW_hh, _, H_grad = rnn_.backward(x_, rnn_out_, de_dy_)

print(f'[CUSTOM] dE/dWih:\n{dEdW_in}\n')
print(f'[CUSTOM] dE/dWhh:\n{dEdW_hh}\n')
print(f'[CUSTOM] dE/dH:\n{H_grad[:,1:,:]}\n')

print('RNN layer gradient check dEdW_in: ', np.isclose(rnn.weight_ih_l0.grad.numpy(), dEdW_in).all())
print('RNN layer gradient check dEdW_hh: ', np.isclose(rnn.weight_hh_l0.grad.numpy(), dEdW_hh).all())
print('RNN layer gradient check dEdH: ', np.isclose(rnn_out.grad.numpy(), H_grad[:,1:,:]).all())

[CUSTOM] dE/dWih:
[[-2.9129841   0.3366964  -7.298915    0.2762408  -1.308749   -2.1736734 ]
 [-1.0284758   0.95830005  2.6368146   1.8294667  -2.5619981  -0.7641725 ]
 [-0.73598444  4.11507    -5.719607    0.6079343  -1.8728046   1.4738001 ]
 [-0.34538245 -0.6158072   0.46779677  4.2226386   2.1171274   0.37274322]
 [-1.2222011  -2.0906603   3.08966    -2.145599    2.9344556   7.363285  ]]

[CUSTOM] dE/dWhh:
[[ 1.1868922   0.453833   -1.0819117  -0.5528156  -0.64420515]
 [-0.89827275  0.5567818   0.87454146  0.05949303  1.6204437 ]
 [ 0.74795824 -0.27476835  0.00224801 -1.4019312  -0.8334215 ]
 [ 0.4918106  -0.08144882  0.18285216 -1.0069394  -0.3097951 ]
 [-0.04485604 -0.03566557 -0.38454136  0.77045953 -1.1370219 ]]

[CUSTOM] dE/dH:
[[[ 0.92513262  0.08011615  1.68275788  0.54211286  0.73723214]
  [-1.26205693  1.07338584 -1.26625229 -0.36155829  0.11710336]
  [ 0.72087544  1.36405849 -1.31551814  0.05014242 -1.36439168]]

 [[ 1.25257488  0.80552966  1.14910393 -0.55060726 -2.470521

## LSTM

In [165]:
#N = 5
#emb_dim = 6
#seq_len = 3
#hidden_dim = 8

N = 4
emb_dim = 3
seq_len = 3
hidden_dim = 5

x = torch.randn(N, seq_len, emb_dim, requires_grad=True)
x_ = x.detach().numpy()

lstm = nn.LSTM(emb_dim, hidden_dim, bias=False, batch_first=True)
lstm_ = LSTMLayer(emb_dim, hidden_dim, use_bias=False)
wih = lstm.weight_ih_l0.detach().numpy()
whh = lstm.weight_hh_l0.detach().numpy()

lstm_.input_weights[0,:,:] = wih[0:hidden_dim, :]
lstm_.input_weights[1,:,:] = wih[hidden_dim: 2*hidden_dim, :]
lstm_.input_weights[2,:,:] = wih[2*hidden_dim: 3*hidden_dim, :]
lstm_.input_weights[3,:,:] = wih[3*hidden_dim: 4*hidden_dim, :]

lstm_.hidden_weights[0,:,:] = whh[0:hidden_dim, :]
lstm_.hidden_weights[1,:,:] = whh[hidden_dim: 2*hidden_dim, :]
lstm_.hidden_weights[2,:,:] = whh[2*hidden_dim: 3*hidden_dim, :]
lstm_.hidden_weights[3,:,:] = whh[3*hidden_dim: 4*hidden_dim, :]


lstm_out, h_n = lstm(x)
lstm_out_, h_n_, c_n_ = lstm_.forward(x_)
lstm_out__ = lstm_out_[:, 1:, :] # remove zeros prepended to every hidden output

print('LSTM layer forward check: ', np.isclose(lstm_out.detach().numpy(), lstm_out__, atol=1e-3).all())
print('LSTM layer forward check last hidden: ', np.isclose(h_n[0].detach().numpy(), h_n_, atol=1e-3).all())
print('LSTM layer forward check last c_n: ', np.isclose(h_n[1].detach().numpy(), c_n_, atol=1e-3).all())

LSTM layer forward check:  True
LSTM layer forward check last hidden:  True
LSTM layer forward check last c_n:  True


In [166]:
de_dy = torch.randn(N, seq_len, hidden_dim)
de_dy_ = de_dy.numpy()

x.retain_grad()
lstm_out.backward(de_dy)

dEdW_in, dEdW_hh, a, X_grad = lstm_.backward(x_, de_dy_)

In [167]:
print('LSTM layer gradient check dEdW_in: ', np.isclose(lstm.weight_ih_l0.grad.numpy(), dEdW_in.reshape(4*hidden_dim,emb_dim), atol=1e-3).all())
print('LSTM layer gradient check dEdW_hh: ', np.isclose(lstm.weight_hh_l0.grad.numpy(), dEdW_hh.reshape(4*hidden_dim,hidden_dim), atol=1e-3).all())
print('LSTM layer gradient check dEdX: ', np.isclose(x.grad.numpy(), X_grad, atol=1e-3).all())

LSTM layer gradient check dEdW_in:  True
LSTM layer gradient check dEdW_hh:  True
LSTM layer gradient check dEdX:  True
