In [1]:
import torch
from torch import nn
import numpy as np

In [2]:
# Bias-free linear layer mapping 3 input features to 4 output features.
m = nn.Linear(in_features=3, out_features=4, bias=False)

In [3]:
# Random standard-normal batch: 5 rows, 3 features each (matches the layer's input width).
inputt = torch.randn((5, 3))

In [4]:
# Print the layer weights, the input batch, and the layer output, one after another.
print(m.weight, inputt, m(inputt), sep='\n')

Parameter containing:
tensor([[ 0.0445,  0.4361, -0.0762],
        [-0.3798,  0.1714,  0.2562],
        [ 0.1661,  0.3013,  0.2487],
        [ 0.3682, -0.0062,  0.2577]], requires_grad=True)
tensor([[-0.7067, -0.3054,  0.5206],
        [ 1.0049,  1.0786,  0.8055],
        [ 0.2857, -0.0968, -1.2853],
        [ 0.5373,  0.2709, -0.5338],
        [ 1.0436,  0.1004,  0.1597]])
tensor([[-0.2043,  0.3494, -0.0799, -0.1241],
        [ 0.4536,  0.0096,  0.6922,  0.5709],
        [ 0.0685, -0.4543, -0.3013, -0.2255],
        [ 0.1827, -0.2944,  0.0381,  0.0586],
        [ 0.0780, -0.3382,  0.2433,  0.4248]], grad_fn=<MmBackward>)


In [5]:
class Softmax(object):
    """Softmax over the last axis, computed in a numerically stable way."""

    @staticmethod
    def forward(x_in):
        """Return softmax(x_in) along the last axis.

        The per-row maximum is subtracted before exponentiating so that the
        largest exponent is exp(0) = 1, which avoids overflow for large inputs.
        """
        shifted = x_in - np.max(x_in, axis=-1, keepdims=True)
        numerators = np.exp(shifted)
        denominators = np.sum(numerators, axis=-1, keepdims=True)
        return numerators / denominators


class Tanh(object):
    """Hyperbolic-tangent activation and its element-wise derivative."""

    @staticmethod
    def forward(x_in):
        """Return tanh(x_in) element-wise."""
        return np.tanh(x_in)

    @staticmethod
    def backward(x_in):
        """Return dY/dX = 1 - tanh(x_in)^2 evaluated at the input x_in.

        The caller is expected to multiply this by the upstream gradient:
        dE/dX = dE/dY * (1 - tanh(X)^2).
        """
        activated = np.tanh(x_in)
        return 1 - activated ** 2


class ReLu(object):
    """Rectified linear unit and its element-wise derivative mask."""

    @staticmethod
    def forward(x_in):
        """Return max(x_in, 0) element-wise."""
        clipped = np.maximum(x_in, 0)
        return clipped

    @staticmethod
    def backward(x_in):
        """Return a boolean mask that is True where x_in is strictly positive.

        Multiplying the mask by an upstream gradient passes it through where
        the input was positive and zeroes it elsewhere.
        """
        positive_mask = x_in > 0
        return positive_mask

In [6]:
class RnnLayer(object):
    """Single-layer recurrent network (Elman RNN) implemented with NumPy.

    The recurrence is ``h_t = act(x_t @ W_in.T + h_{t-1} @ W_hh.T + b)`` with a
    zero initial hidden state.

    State layout used throughout: ``H`` has shape
    ``(batch_size, seq_len + 1, hidden_dim)``; ``H[:, 0, :]`` is the initial
    (zero) state and ``H[:, t + 1, :]`` is the state produced after consuming
    ``x[:, t, :]``.
    """

    def __init__(self, input_dim, hidden_dim, seq_len, batch_size, use_bias=True, activation=Tanh):
        """Create the layer and draw its parameters from U(-1/sqrt(hidden_dim), 1/sqrt(hidden_dim)).

        :param input_dim: number of features per time step.
        :param hidden_dim: size of the hidden state.
        :param seq_len: number of time steps processed by forward/backward.
        :param batch_size: number of sequences per batch.
        :param use_bias: when False the bias is fixed at zero and gets no gradient.
        :param activation: class exposing ``forward(x)`` and ``backward(x)``,
            where ``backward`` returns the derivative evaluated at the
            activation's *input* ``x``.
        """
        sq = np.sqrt(1. / hidden_dim)
        self.use_bias = use_bias
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.activation = activation()
        self.input_weights = np.random.uniform(-sq, sq, (hidden_dim, input_dim))
        self.hidden_weights = np.random.uniform(-sq, sq, (hidden_dim, hidden_dim))

        if self.use_bias:
            self.bias = np.random.uniform(-sq, sq, hidden_dim)
        else:
            self.bias = np.zeros(hidden_dim)

    def _pre_activation(self, x, h, i):
        """Return z_i, the activation input that produces state ``h[:, i, :]``.

        z_i = x[:, i - 1, :] @ W_in.T + h[:, i - 1, :] @ W_hh.T + bias,
        with shape (batch, hidden_dim).
        """
        input_part = np.einsum('ij,jk->ik', x[:, i - 1, :], self.input_weights.T)
        hidden_part = np.einsum('ij,jk->ik', h[:, i - 1, :], self.hidden_weights.T)
        return input_part + hidden_part + self.bias

    def forward(self, x_in):
        """Run the recurrence over ``seq_len`` steps of ``x_in``.

        :param x_in: array indexed as (batch, time, feature).
        :return: ``(H, last)`` where ``H`` holds every state including the
            initial zero state at index 0, and ``last`` is ``H[:, seq_len, :]``.
        """
        # NOTE: x_in is not validated. Open questions from the original author:
        # should we check that x_in's sequence length equals seq_len, and that
        # its first dimension equals batch_size?

        # The initial state at time step 0 is assumed to be zero for every input.
        H = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))

        for i in range(self.seq_len):
            H[:, i + 1, :] = self.activation.forward(self._pre_activation(x_in, H, i + 1))

        return H, H[:, self.seq_len, :]

    def book_forward(self, x_in):
        """Same computation as ``forward`` but written in the textbook ``W @ x`` form.

        :param x_in: array indexed as (batch, time, feature).
        :return: ``(H, last)`` exactly as in ``forward``.
        """
        H = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))

        for i in range(self.seq_len):
            # W @ x.T is (hidden, batch), i.e. the transpose of what forward()
            # computes; the einsum output indices are swapped ('->ki') so both
            # terms come out as (batch, hidden). This must be done for BOTH
            # terms, otherwise the sum mixes (batch, hidden) with (hidden, batch).
            input_part = np.einsum('ij,jk->ki', self.input_weights, x_in[:, i, :].T)
            hidden_part = np.einsum('ij,jk->ki', self.hidden_weights, H[:, i, :].T)

            H[:, i + 1, :] = self.activation.forward(input_part + hidden_part + self.bias)

        return H, H[:, self.seq_len, :]

    def backward(self, x, h, dEdY):
        """Backpropagation through time (vectorised over the batch).

        :param x: inputs indexed as (batch, time, feature), as given to forward.
        :param h: all hidden states as returned by ``forward`` (first return
            value), shape (batch, seq_len + 1, hidden_dim).
        :param dEdY: gradient of the loss w.r.t. each step's output,
            shape (batch, seq_len, hidden_dim); ``dEdY[:, t, :]`` belongs to
            state ``h[:, t + 1, :]``.
        :return: ``(dEdW_in, dEdW_hh, dEdB_in)``; the bias gradient stays zero
            when ``use_bias`` is False.
        """
        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)

        dEdB_in = np.zeros_like(self.bias)

        # H_grad[:, i, :] accumulates the total gradient dE/dh_i.
        H_grad = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))
        H_grad[:, self.seq_len, :] = dEdY[:, self.seq_len - 1, :]

        for i in range(self.seq_len, 0, -1):
            # The activation derivative must be evaluated at the pre-activation
            # z_i, not at the already-activated state h_i.
            act_prime = self.activation.backward(self._pre_activation(x, h, i))
            # delta = dE/dz_i
            delta = H_grad[:, i, :] * act_prime

            dEdW_in += np.einsum('bh,bi->hi', delta, x[:, i - 1, :])
            dEdW_hh += np.einsum('bh,bk->hk', delta, h[:, i - 1, :])

            if self.use_bias:
                dEdB_in += np.sum(delta, axis=0)

            # Chain rule through z_i = ... + h_{i-1} @ W_hh.T: scale by the
            # activation derivative first, then multiply by W_hh.
            H_grad[:, i - 1, :] = np.einsum('bh,hk->bk', delta, self.hidden_weights)
            if i > 1:
                # h_{i-1} is also a direct output, so add its own loss gradient.
                H_grad[:, i - 1, :] += dEdY[:, i - 2, :]

        return dEdW_in, dEdW_hh, dEdB_in

    def backward_checker(self, X, H, dEdY):
        """Reference implementation of ``backward`` using per-sample matrix products.

        Takes the same arguments and returns the same triple as ``backward``;
        intended for cross-checking the vectorised version. Prints the current
        bias as a side effect.
        """
        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)

        print(f'self.bias={self.bias}')

        dEdB_in = np.zeros_like(self.bias)

        H_grad = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))
        H_grad[:, self.seq_len, :] = dEdY[:, self.seq_len - 1, :]

        for i in range(self.seq_len, 0, -1):
            # Derivative evaluated at the pre-activation z_i (see backward()).
            act_prime = self.activation.backward(self._pre_activation(X, H, i))
            prev_grad = np.zeros((self.batch_size, self.hidden_dim))

            for k in range(self.batch_size):
                act_grad = np.diag(act_prime[k])
                h_grad = H_grad[k, i, :].reshape(self.hidden_dim, 1)
                # Column vector dE/dz_i for sample k.
                delta = np.dot(act_grad, h_grad)

                dEdW_in += np.dot(delta, X[k, i - 1, :].reshape(1, self.input_dim))
                dEdW_hh += np.dot(delta, H[k, i - 1, :].reshape(1, self.hidden_dim))

                # dE/dh_{i-1} contribution through the recurrence: W_hh^T @ delta.
                prev_grad[k, :] = np.dot(self.hidden_weights.T, delta).ravel()

            if self.use_bias:
                dEdB_in += np.sum(act_prime * H_grad[:, i, :], axis=0)

            H_grad[:, i - 1, :] = prev_grad
            if i > 1:
                H_grad[:, i - 1, :] += dEdY[:, i - 2, :]

        return dEdW_in, dEdW_hh, dEdB_in

In [7]:
# NumPy RNN and reference torch RNN with the same sizes: 3 input features,
# 2 hidden units, no bias. The NumPy layer handles 3-step sequences in batches of 5.
rnnmine = RnnLayer(input_dim=3, hidden_dim=2, seq_len=3, batch_size=5, use_bias=False)
rnntorch = nn.RNN(input_size=3, hidden_size=2, bias=False, batch_first=True)

# Point the NumPy layer at torch's weights so both models use identical parameters.
rnnmine.input_weights = rnntorch.weight_ih_l0.detach().numpy()
rnnmine.hidden_weights = rnntorch.weight_hh_l0.detach().numpy()

# One random batch, laid out batch-first: (batch=5, seq=3, features=3).
x = torch.randn((5, 3, 3))
x_m = x.detach().numpy()
# Zero initial hidden state for torch: (num_layers=1, batch=5, hidden=2).
h0 = torch.zeros(1, 5, 2)

In [8]:
# Run both implementations on the same batch; the second return value of each
# (the final hidden state) is not needed here.
outputtorch, _ = rnntorch(x, hx=h0)
outmine, _ = rnnmine.forward(x_in=x_m)

In [9]:
# Drop index 0 along the time axis (the initial zero state) so the NumPy states
# line up with torch's per-step outputs, then print both for comparison.
om = torch.from_numpy(outmine[:, 1:])
print(om, outputtorch, sep='\n')

tensor([[[ 0.0141, -0.7907],
         [-0.7024, -0.2190],
         [ 0.8083,  0.0122]],

        [[-0.1536, -0.7138],
         [-0.1847,  0.6211],
         [-0.4310, -0.8088]],

        [[-0.2234,  0.1672],
         [-0.8791, -0.9268],
         [ 0.9770,  0.6179]],

        [[ 0.9083,  0.7321],
         [-0.9035, -0.9906],
         [ 0.2919, -0.8567]],

        [[ 0.8316,  0.6995],
         [-0.9881, -0.9910],
         [ 0.9049,  0.3673]]], dtype=torch.float64)
tensor([[[ 0.0141, -0.7907],
         [-0.7024, -0.2190],
         [ 0.8083,  0.0122]],

        [[-0.1536, -0.7138],
         [-0.1847,  0.6211],
         [-0.4310, -0.8088]],

        [[-0.2234,  0.1672],
         [-0.8791, -0.9268],
         [ 0.9770,  0.6179]],

        [[ 0.9083,  0.7321],
         [-0.9035, -0.9906],
         [ 0.2919, -0.8567]],

        [[ 0.8316,  0.6995],
         [-0.9881, -0.9910],
         [ 0.9049,  0.3673]]], grad_fn=<TransposeBackward1>)
