In [1]:
import torch
from torch import nn
import numpy as np

In [10]:
# Bias-free linear layer mapping 3 input features to 4 output features.
m = nn.Linear(3, 4, bias=False)

In [11]:
# Random batch of 5 samples with 3 features each (standard-normal entries).
inputt = torch.randn(5, 3)

In [12]:
# Show the layer's weight matrix (4 x 3 in the output below), the 5 x 3 input
# batch, and the resulting 5 x 4 output of the bias-free layer.
print(m.weight)
print(inputt)
print(m(inputt))

Parameter containing:
tensor([[ 0.4793,  0.2916, -0.5416],
        [-0.1635, -0.2718, -0.0553],
        [-0.3974, -0.4508,  0.3561],
        [ 0.3549, -0.2679, -0.0435]], requires_grad=True)
tensor([[-0.8689, -0.1350, -0.6070],
        [-0.1299, -0.5284, -0.6637],
        [-0.4502,  0.3287,  1.4166],
        [ 0.8028, -1.2446, -1.7689],
        [ 1.0773,  0.6362, -0.2447]])
tensor([[-0.1271,  0.2123,  0.1899, -0.2458],
        [ 0.1431,  0.2015,  0.0535,  0.1243],
        [-0.8871, -0.0940,  0.5352, -0.3094],
        [ 0.9798,  0.3048, -0.3879,  0.6952],
        [ 0.8344, -0.3355, -0.8021,  0.2225]], grad_fn=<MmBackward>)


In [20]:
class Softmax(object):
    """Softmax activation applied along the last axis."""

    @staticmethod
    def forward(x_in):
        """Return the softmax of ``x_in`` over its last axis.

        The maximum along the last axis is subtracted before exponentiating;
        this leaves the result unchanged mathematically but keeps ``exp``
        from overflowing for large inputs.
        """
        row_max = np.max(x_in, axis=-1, keepdims=True)
        unnormalised = np.exp(x_in - row_max)
        normaliser = np.sum(unnormalised, axis=-1, keepdims=True)
        return unnormalised / normaliser


class Tanh(object):
    """Hyperbolic-tangent activation with its element-wise derivative."""

    @staticmethod
    def forward(x_in):
        """Return ``tanh(x_in)`` element-wise."""
        activated = np.tanh(x_in)
        return activated

    @staticmethod
    def backward(x_in):
        """Return dY/dX = 1 - tanh(x_in)^2, evaluated at the input ``x_in``.

        Chain rule for the caller: dE/dX = dE/dY * (1 - tanh(X)^2).
        """
        activated = np.tanh(x_in)
        return 1 - activated * activated


class ReLu(object):
    """Rectified linear unit with its element-wise derivative mask."""

    @staticmethod
    def forward(x_in):
        """Return ``max(x_in, 0)`` element-wise."""
        rectified = np.maximum(0, x_in)
        return rectified

    @staticmethod
    def backward(x_in):
        """Return a boolean mask that is True where ``x_in`` is strictly positive.

        The mask acts as the derivative (1 where the unit is active, 0
        elsewhere) when multiplied with a numeric gradient.
        """
        active_mask = x_in > 0
        return active_mask

In [33]:
class RnnLayer(object):
    """Single-layer vanilla (Elman) RNN written in NumPy, batch-first layout.

    Recurrence applied at every time step ``t`` (``h_0`` is all zeros)::

        z_t = x_t @ W_in.T + h_{t-1} @ W_hh.T + bias
        h_t = activation.forward(z_t)

    Args:
        input_dim: size of each input vector.
        hidden_dim: size of the hidden state.
        seq_len: number of time steps processed per sequence.
        batch_size: number of sequences per batch.
        use_bias: when False the bias is a fixed zero vector and its gradient
            is reported as zeros.
        activation: class exposing ``forward(z)`` and ``backward(z)``, where
            ``backward`` returns the element-wise derivative evaluated at the
            pre-activation ``z``.
    """

    def __init__(self, input_dim, hidden_dim, seq_len, batch_size, use_bias=True, activation=Tanh):
        # All parameters are drawn from U(-sqrt(1/hidden_dim), sqrt(1/hidden_dim)).
        sq = np.sqrt(1. / hidden_dim)
        self.use_bias = use_bias
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.activation = activation()
        self.input_weights = np.random.uniform(-sq, sq, (hidden_dim, input_dim))
        self.hidden_weights = np.random.uniform(-sq, sq, (hidden_dim, hidden_dim))

        if self.use_bias:
            self.bias = np.random.uniform(-sq, sq, hidden_dim)
        else:
            self.bias = np.zeros(hidden_dim)

    def _pre_activation(self, x_t, h_prev):
        """Return z_t = x_t @ W_in.T + h_prev @ W_hh.T + bias, shape (batch, hidden_dim)."""
        input_part = np.einsum('ij,jk->ik', x_t, self.input_weights.T)
        hidden_part = np.einsum('ij,jk->ik', h_prev, self.hidden_weights.T)
        return input_part + hidden_part + self.bias

    def forward(self, x_in):
        """Run the recurrence over ``seq_len`` steps.

        Args:
            x_in: array indexed as (batch, time, input_dim).

        Returns:
            Tuple ``(H, h_last)``. ``H`` has shape
            (batch_size, seq_len + 1, hidden_dim); ``H[:, 0, :]`` is the zero
            initial state and ``H[:, t, :]`` is the hidden state after step t.
            ``h_last`` is ``H[:, seq_len, :]``.
        """
        # TODO: should we validate that x_in really has seq_len time steps?
        # TODO: should we validate that x_in's first dimension equals batch_size?

        # The initial hidden state (time step 0) is assumed to be zero for every input.
        H = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))

        for i in range(self.seq_len):
            H[:, i + 1, :] = self.activation.forward(self._pre_activation(x_in[:, i, :], H[:, i, :]))

        return H, H[:, self.seq_len, :]

    def book_forward(self, x_in):
        """Forward pass in the textbook ``W @ x`` column-vector formulation.

        Takes the same input and returns the same values as ``forward``.
        """
        H = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))

        for i in range(self.seq_len):
            # W @ x.T is (hidden, batch), i.e. the transpose of what `forward`
            # computes; the swapped einsum output indices ('ki') transpose it
            # back to (batch, hidden). Both terms must use 'ki', otherwise their
            # shapes disagree.
            input_part = np.einsum('ij,jk->ki', self.input_weights, x_in[:, i, :].T)
            hidden_part = np.einsum('ij,jk->ki', self.hidden_weights, H[:, i, :].T)

            H[:, i + 1, :] = self.activation.forward(input_part + hidden_part + self.bias)

        return H, H[:, self.seq_len, :]

    def backward(self, x, h, dEdY):
        """Backpropagation through time (vectorised over the batch).

        Args:
            x: inputs that were given to ``forward``, indexed (batch, time, input_dim).
            h: first return value of ``forward`` for ``x``
                (batch_size, seq_len + 1, hidden_dim).
            dEdY: direct gradient of the loss w.r.t. every hidden output;
                ``dEdY[:, t - 1, :]`` belongs to ``h[:, t, :]``.

        Returns:
            Tuple ``(dEdW_in, dEdW_hh, dEdB_in)`` with the shapes of
            ``input_weights``, ``hidden_weights`` and ``bias``. The bias
            gradient stays zero when ``use_bias`` is False.
        """
        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)

        dEdB_in = np.zeros_like(self.bias)

        # H_grad[:, t, :] holds the total gradient dE/dh_t.
        H_grad = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))
        H_grad[:, self.seq_len, :] = dEdY[:, self.seq_len - 1, :]

        for i in range(self.seq_len, 0, -1):
            # The activation derivative must be evaluated at the pre-activation
            # z_i, not at the already-activated h_i, so z_i is recomputed here.
            z = self._pre_activation(x[:, i - 1, :], h[:, i - 1, :])
            # delta = dE/dz_i = dE/dh_i * f'(z_i)
            delta = H_grad[:, i, :] * self.activation.backward(z)

            dEdW_in += np.einsum('bh,bi->hi', delta, x[:, i - 1, :])
            dEdW_hh += np.einsum('bh,bk->hk', delta, h[:, i - 1, :])

            if self.use_bias:
                dEdB_in += np.sum(delta, axis=0)

            # dE/dh_{i-1} = delta @ W_hh (+ the direct loss gradient on h_{i-1});
            # the derivative is applied before the projection through W_hh.
            H_grad[:, i - 1, :] = np.einsum('bh,hk->bk', delta, self.hidden_weights)
            if i > 1:
                H_grad[:, i - 1, :] += dEdY[:, i - 2, :]

        return dEdW_in, dEdW_hh, dEdB_in

    def backward_checker(self, X, H, dEdY):
        """Reference BPTT with an explicit per-sample loop, for checking ``backward``.

        Takes the same arguments and returns the same tuple as ``backward``.
        """
        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)

        dEdB_in = np.zeros_like(self.bias)

        H_grad = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))
        H_grad[:, self.seq_len, :] = dEdY[:, self.seq_len - 1, :]

        for i in range(self.seq_len, 0, -1):
            # Derivative of the activation at the pre-activation of step i.
            act_derivative = self.activation.backward(self._pre_activation(X[:, i - 1, :], H[:, i - 1, :]))

            for k in range(self.batch_size):
                act_grad = np.diag(act_derivative[k])
                h_grad = H_grad[k, i, :].reshape(self.hidden_dim, 1)
                # Column vector dE/dz_i for sample k.
                delta = np.dot(act_grad, h_grad)

                dEdW_in += np.dot(delta, X[k, i - 1, :].reshape(1, self.input_dim))
                dEdW_hh += np.dot(delta, H[k, i - 1, :].reshape(1, self.hidden_dim))

                if self.use_bias:
                    dEdB_in += delta.ravel()

                # Propagate to the previous hidden state: W_hh.T @ delta.
                H_grad[k, i - 1, :] = np.dot(self.hidden_weights.T, delta).ravel()
                if i > 1:
                    H_grad[k, i - 1, :] += dEdY[k, i - 2, :]

        return dEdW_in, dEdW_hh, dEdB_in

In [34]:
# NumPy RNN under test: input_dim=3, hidden_dim=2, seq_len=3, batch_size=5, no bias.
rnnmine = RnnLayer(3, 2, 3, 5, use_bias=False)
# PyTorch reference with the same sizes (input 3, hidden 2), bias-free and batch-first.
rnntorch = nn.RNN(3, 2, bias=False, batch_first=True)
# Reuse PyTorch's randomly initialised weights so both layers share identical parameters.
rnnmine.input_weights = rnntorch.weight_ih_l0.detach().numpy()
rnnmine.hidden_weights = rnntorch.weight_hh_l0.detach().numpy()

# One random batch indexed (batch=5, time=3, features=3), as a tensor and as a NumPy array.
x = torch.randn(5, 3, 3)
x_m = x.detach().numpy()
# Zero initial hidden state for PyTorch, matching the zero start state used by RnnLayer.forward.
h0 = torch.zeros((1,5,2))

In [35]:
# Print both views of the input to confirm they hold the same values.
print(x, x_m)

# Run the same batch through the PyTorch reference and the NumPy implementation.
outputtorch, _ = rnntorch(x, h0)
# outmine includes the zero initial state at time index 0 (seq_len + 1 entries on the time axis).
outmine, _ = rnnmine.forward(x_m)

tensor([[[ 0.1871, -1.4239,  0.3179],
         [-0.1685, -0.3000,  0.5422],
         [-0.9485, -0.4403,  0.4206]],

        [[ 1.1973,  0.1080,  0.4636],
         [ 0.4488, -0.6182, -1.0582],
         [ 0.5242,  0.0701,  0.0330]],

        [[-1.5357, -0.4836,  0.7346],
         [-0.0235,  0.2072,  0.5150],
         [ 0.2925,  1.5732, -0.2112]],

        [[-1.1987,  0.3369, -1.6119],
         [-0.2858,  1.4730, -0.2773],
         [-0.9517, -0.6675, -0.9721]],

        [[ 0.8934,  0.0963, -0.8608],
         [ 0.0765,  1.4835, -0.6139],
         [ 0.1046,  0.2719,  0.9029]]]) [[[ 0.1871342  -1.4239417   0.31785467]
  [-0.16854592 -0.30003515  0.54215044]
  [-0.9485377  -0.44030938  0.42060366]]

 [[ 1.1972725   0.10804261  0.4635505 ]
  [ 0.4488446  -0.61821663 -1.0582086 ]
  [ 0.52422094  0.07005763  0.03298781]]

 [[-1.535706   -0.48362377  0.73459995]
  [-0.02351362  0.20716505  0.5150445 ]
  [ 0.29249704  1.5731645  -0.21118361]]

 [[-1.1987295   0.3369196  -1.611908  ]
  [-0.2858458 

In [43]:
# Drop the leading zero initial state (time index 0) so the NumPy output lines
# up with PyTorch's per-step outputs.
om = torch.from_numpy(outmine[:,1:,:])
print(om)
print(outputtorch)

# Check the agreement programmatically instead of comparing the printouts by
# eye. The tolerance allows for PyTorch computing in float32 while the NumPy
# layer accumulates in float64.
np.testing.assert_allclose(om.numpy(), outputtorch.detach().numpy(), rtol=1e-5, atol=1e-5)

tensor([[[ 0.6995,  0.4176],
         [ 0.5998,  0.2281],
         [ 0.7656,  0.2997]],

        [[-0.3584, -0.0712],
         [-0.4717, -0.2834],
         [-0.3494, -0.0279]],

        [[ 0.8946,  0.6687],
         [ 0.3980,  0.1242],
         [-0.7973, -0.5998]],

        [[-0.4900, -0.5242],
         [-0.7639, -0.4490],
         [ 0.1207,  0.0573]],

        [[-0.7586, -0.5609],
         [-0.8823, -0.5290],
         [ 0.1609,  0.4728]]], dtype=torch.float64)
tensor([[[ 0.6995,  0.4176],
         [ 0.5998,  0.2281],
         [ 0.7656,  0.2997]],

        [[-0.3584, -0.0712],
         [-0.4717, -0.2834],
         [-0.3494, -0.0279]],

        [[ 0.8946,  0.6687],
         [ 0.3980,  0.1242],
         [-0.7973, -0.5998]],

        [[-0.4900, -0.5242],
         [-0.7639, -0.4490],
         [ 0.1207,  0.0573]],

        [[-0.7586, -0.5609],
         [-0.8823, -0.5290],
         [ 0.1609,  0.4728]]], grad_fn=<TransposeBackward1>)
