In [1]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np

In [2]:
class Softmax(object):
    @staticmethod
    def forward(x_in):
        exps = np.exp(x_in-np.max(x_in, axis=-1, keepdims=True))
        return exps / np.sum(exps, axis=-1, keepdims=True)

class Sigmoid(object):
    
    @staticmethod
    def forward(x_in):
        return 1./(1 + np.exp(-x_in))
    
    @staticmethod
    def backward(x_in):
        fw = Sigmoid().forward(x_in)
        return fw * (1 - fw)
    
    @staticmethod
    def backward_calculated(sigmoid_x):
        return sigmoid_x * (1 - sigmoid_x)
    
    
class Tanh(object):

    @staticmethod
    def forward(X_in):
        return np.tanh(X_in)

    @staticmethod
    def backward(X_in):
        # dEdX = dEdY * dYdX = dEdY * 1 - (tanh(X))^2
        return 1 - (np.tanh(X_in)) ** 2

    @staticmethod
    def backward_calculated(tanh_x_in):
        return 1 - tanh_x_in ** 2


class ReLu(object):

    @staticmethod
    def forward(x_in):
        return np.maximum(x_in, 0)

    @staticmethod
    def backward(x_in):
        return x_in > 0
    
    
class CrossEntropyLoss(object):
    def __init__(self):
        self.y_pred = None

    def forward(self, y, o):
        self.y_pred = Softmax.forward(o)
        return np.sum(-y * np.log(self.y_pred + 1e-15))/(y.shape[0])

    def backward(self, y):
        return (self.y_pred - y) / y.shape[0]

In [7]:
class RnnLayer(object):

    def __init__(self, input_dim, hidden_dim, seq_len, batch_size, use_bias=True, activation=Tanh):
        sq = np.sqrt(1. / hidden_dim)
        self.use_bias = use_bias
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.activation = activation()
        self.input_weights = np.random.uniform(-sq, sq, (hidden_dim, input_dim))
        self.hidden_weights = np.random.uniform(-sq, sq, (hidden_dim, hidden_dim))

        if self.use_bias:
            self.bias = np.random.uniform(-sq, sq, hidden_dim)
        else:
            self.bias = np.zeros(hidden_dim)

    def forward(self, x_in):
        # treba li dodati provjeru je li X_in stvarno ima sekvencu jednaku seq_len?
        # treba li dodati provjeru je li X_in prva koordinata jednaka batch_size

        # u ovom slucaju sam pretpostavio da je za sve inpute, pocetno stanje 0 u 0. vremenskom trenutku
        H = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))

        for i in range(self.seq_len):
            input_part = np.einsum('ij,jk->ik', x_in[:, i, :], self.input_weights.T)
            hidden_part = np.einsum('ij,jk->ik', H[:, i, :], self.hidden_weights.T)

            H[:, i + 1, :] = self.activation.forward(input_part + hidden_part + self.bias)

        return H, H[:, self.seq_len, :]

    def book_forward(self, x_in):

        H = np.zeros((self.batch_size, self.seq_len + 1, self.hidden_dim))

        for i in range(self.seq_len):
            # ovdje dobivam transponirano iz mog forwarda, ali sam u einsum zamijenio vrijednosti, tako da zapravo dobijem isto
            input_part = np.einsum('ij,jk->ki', self.input_weights, x_in[:, i, :].T)
            hidden_part = np.einsum('ij,jk->ik', self.hidden_weights, H[:, i, :].T)

            H[:, i + 1, :] = self.activation.forward(input_part + hidden_part + self.bias)

        return H, H[:, self.seq_len, :]

    def backward(self, x, h, dEdY):

        batch_size = x.shape[0]
        seq_len = x.shape[1]

        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)
        dEdB_in = np.zeros_like(self.bias)

        H_grad = np.zeros((batch_size, seq_len, self.hidden_dim))
        act = self.activation.backward_calculated(h)

        for i in range(seq_len - 1, -1, -1):
            if i < seq_len - 1:
                H_grad[:, i, :] = np.dot(H_grad[:, i + 1, :] * act[:, i+2, :], self.hidden_weights) + dEdY[:, i, :]
            else:
                H_grad[:, i, :] = dEdY[:, i, :]

            dEdW_in += np.sum(act[:, i+1, :].reshape(batch_size, self.hidden_dim, 1) * (np.einsum('bh,bi->bhi', H_grad[:, i, :], x[:, i, :])), axis=0)
            dEdW_hh += np.sum(act[:, i+1, :].reshape(batch_size, self.hidden_dim, 1) * (np.einsum('bh,bk->bhk', H_grad[:, i, :], h[:, i, :])), axis=0)

            if self.use_bias:
                dEdB_in += np.sum(act[:, i+1, :] * H_grad[:, i, :], axis=0)

        return dEdW_in, dEdW_hh, dEdB_in

    def backward_2nd(self, x, h, dEdY):
        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)

        dEdB_in = np.zeros_like(self.bias)

        batch_size = x.shape[0]
        seq_len = x.shape[1]

        H_grad = np.zeros((batch_size, seq_len + 1, self.hidden_dim))
        H_grad[:, seq_len, :] = dEdY[:, seq_len - 1, :]

        for i in range(seq_len, 0, -1):

            activation_backward = self.activation.backward_calculated(h[:, i, :])
            back_reshaped = activation_backward.reshape(batch_size, self.hidden_dim, 1)

            dEdW_in += np.sum(back_reshaped * (np.einsum('bh,bi->bhi', H_grad[:, i, :], x[:, i - 1, :])), axis=0)
            dEdW_hh += np.sum(back_reshaped * (np.einsum('bh,bk->bhk', H_grad[:, i, :], h[:, i - 1, :])), axis=0)

            if self.use_bias:
                dEdB_in += np.sum(activation_backward * H_grad[:, i, :], axis=0)
            else:
                pass
            b = np.dot(H_grad[:, i, :], self.hidden_weights)
            a = b * activation_backward

            if i > 1:
                H_grad[:, i - 1, :] = a + dEdY[:, i - 2, :]
            else:
                H_grad[:, i - 1, :] = np.dot(H_grad[:, i, :], self.hidden_weights) * activation_backward

            # if i > 1:
            #    H_grad[:, i - 1, :] = ((np.einsum('bh,hk->bk', H_grad[:, i, :], self.hidden_weights) * activation_backward) + dEdY[:, i - 2, :])
            # else:
            #    H_grad[:, i - 1, :] = np.einsum('bh,hk->bk', H_grad[:, i, :], self.hidden_weights) * activation_backward

        return dEdW_in, dEdW_hh, dEdB_in

    def backward_checker(self, X, H, dEdY):
        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)

        batch_size = X.shape[0]
        seq_len = X.shape[1]

        dEdB_in = np.zeros_like(self.bias)

        H_grad = np.zeros((batch_size, seq_len + 1, self.hidden_dim))
        H_grad[:, seq_len, :] = dEdY[:, seq_len - 1, :]

        for i in range(seq_len, 0, -1):

            for k in range(batch_size):
                act_grad = np.diag(self.activation.backward_calculated(H[k, i, :]))
                h_grad = H_grad[k, i, :].reshape(self.hidden_dim, 1)

                dEdW_in += np.dot(act_grad, np.dot(h_grad, X[k, i - 1, :].reshape(1, self.input_dim)))
                dEdW_hh += np.dot(act_grad, np.dot(h_grad, H[k, i - 1, :].reshape(1, self.hidden_dim)))

            if self.use_bias:
                dEdB_in += np.sum(self.activation.backward_calculated(H[:, i, :]) * H_grad[:, i, :], axis=0)
            else:
                pass

            if i > 1:
                H_grad[:, i - 1, :] = np.einsum('bh,hk->bk', H_grad[:, i, :],
                                                self.hidden_weights) * self.activation.backward_calculated(H[:, i, :]) + dEdY[:,
                                                                                                                         i - 2, :]
            else:
                H_grad[:, i - 1, :] = np.einsum('bh,hk->bk', H_grad[:, i, :],
                                                self.hidden_weights) * self.activation.backward_calculated(H[:, i, :])

        return dEdW_in, dEdW_hh, dEdB_in
    
    
class DenseLayer(object):

    def __init__(self, input_dim, output_dim, use_bias=True):
        sq = np.sqrt(1. / input_dim)
        self.use_bias = use_bias
        self.weights = np.random.uniform(-sq, sq, (output_dim, input_dim))
        if use_bias:
            self.bias = np.random.uniform(-sq, sq, output_dim)
        else:
            self.bias = np.zeros(output_dim)

    def forward(self, x_in):
        return np.tensordot(x_in, self.weights.T, axes=((-1), 0)) + self.bias

    def backward(self, de_dy, x_in):
        # de_dw = de_dy * dYdW = de_dy * X
        # dEdb = de_dy * dYdb = de_dy
        # dEdX = de_dy * dYdX = de_dy * W
        axis = tuple(range(len(x_in.shape) - 1))
        de_dw = np.tensordot(de_dy, x_in, axes=(axis, axis))
        de_db = np.sum(de_dy, axis=axis)
        de_dx = np.tensordot(de_dy, self.weights, axes=(-1, 0))

        return de_dx, de_dw, de_db

    def refresh(self, de_dw, de_db, learning_rate):
        self.weights = self.weights - learning_rate * de_dw
        if self.use_bias:
            self.bias = self.bias - learning_rate * de_db
            
            
class LSTMLayer(object):

    def __init__(self, input_dim, hidden_dim, use_bias=True):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.use_bias = use_bias

        sq = np.sqrt(1. / hidden_dim)
        # input weights (W_in_hi|W_fgt_hi|W_g_hi|W_out_hi)
        self.input_weights = np.random.uniform(-sq, sq, (4, hidden_dim, input_dim))
        # hidden weights (W_in_hh|W_fgt_hh|W_g_hh|W_out_hh)
        self.hidden_weights = np.random.uniform(-sq, sq, (4, hidden_dim, hidden_dim))

        self.tanh = Tanh
        self.sigmoid = Sigmoid

        self.gates = None
        self.H = None
        self.C = None

        if self.use_bias:
            # bias = (in_bias|fgt_bias|g_bias|out_bias)
            self.bias = np.random.uniform(-sq, sq, (4, hidden_dim))
        else:
            self.bias = np.zeros((4, hidden_dim))

    def forward(self, X_in, h_0=None, c_0=None):
        batch_size = X_in.shape[0]
        seq_len = X_in.shape[1]

        self.H = np.zeros((batch_size, seq_len + 1, self.hidden_dim))
        if h_0 is not None:
            self.H[:, 0, :] = h_0

        self.C = np.zeros((batch_size, seq_len + 1, self.hidden_dim))
        if c_0 is not None:
            self.C[:, 0, :] = c_0

        self.gates = np.zeros((4, batch_size, seq_len, self.hidden_dim))

        for i in range(seq_len):
            # input_gate
            self.gates[0, :, i, :] = self.sigmoid.forward(
                np.dot(X_in[:, i, :], self.input_weights[0, :, :].T) + np.dot(self.H[:, i, :], self.hidden_weights[0, :, :].T) + self.bias[0, :])
            # forget gate
            self.gates[1, :, i, :] = self.sigmoid.forward(
                np.dot(X_in[:, i, :], self.input_weights[1, :, :].T) + np.dot(self.H[:, i, :], self.hidden_weights[1, :, :].T) + self.bias[1, :])
            # c~ gate
            self.gates[2, :, i, :] = self.tanh.forward(
                np.dot(X_in[:, i, :], self.input_weights[2, :, :].T) + np.dot(self.H[:, i, :], self.hidden_weights[2, :, :].T) + self.bias[2, :])
            # output gate
            self.gates[3, :, i, :] = self.sigmoid.forward(
                np.dot(X_in[:, i, :], self.input_weights[3, :, :].T) + np.dot(self.H[:, i, :], self.hidden_weights[3, :, :].T) + self.bias[3, :])

            self.C[:, i + 1, :] = self.gates[1, :, i, :] * self.C[:, i, :] + self.gates[0, :, i, :] * self.gates[2, :, i, :]
            self.H[:, i + 1, :] = self.gates[3, :, i, :] * self.tanh.forward(self.C[:, i + 1, :])

        return self.H, self.H[:, seq_len, :], self.C[:, seq_len, :]

    def backward(self, X_in, dEdY):

        batch_size = X_in.shape[0]
        seq_len = X_in.shape[1]

        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)
        dEdB_in = np.zeros_like(self.bias)

        C_grad = np.zeros((batch_size, seq_len, self.hidden_dim))
        X_grad = np.zeros((batch_size, seq_len, self.input_dim))

        gates_grad = np.zeros((4, batch_size, seq_len, self.hidden_dim))

        for i in range(seq_len - 1, -1, -1):

            if i < seq_len - 1:
                H_grad = np.matmul(gates_grad[:, :, i + 1, :], self.hidden_weights).sum(axis=0) + dEdY[:, i, :]
                C_grad[:, i, :] = H_grad * self.gates[3, :, i, :] * self.tanh.backward(self.C[:, i + 1, :]) + C_grad[:, i + 1, :] * self.gates[1, :, i + 1, :]
            else:
                H_grad = dEdY[:, i, :]
                C_grad[:, i, :] = H_grad * self.gates[3, :, i, :] * self.tanh.backward(self.C[:, i + 1, :])

            gates_grad[0, :, i, :] = C_grad[:, i, :] * self.gates[2, :, i, :] * self.sigmoid.backward_calculated(self.gates[0, :, i, :])
            gates_grad[1, :, i, :] = C_grad[:, i, :] * self.C[:, i, :] * self.sigmoid.backward_calculated(self.gates[1, :, i, :])
            gates_grad[2, :, i, :] = C_grad[:, i, :] * self.gates[0, :, i, :] * self.tanh.backward_calculated(self.gates[2, :, i, :])
            gates_grad[3, :, i, :] = H_grad * self.tanh.forward(self.C[:, i + 1, :]) * self.sigmoid.backward_calculated(self.gates[3, :, i, :])

            X_grad[:, i, :] = np.matmul(gates_grad[:, :, i, :], self.input_weights).sum(axis=0)

            dEdW_in[0, :, :] += np.einsum('bi,bo->bio', gates_grad[0, :, i, :], X_in[:, i, :]).sum(axis=0)
            dEdW_in[1, :, :] += np.einsum('bi,bo->bio', gates_grad[1, :, i, :], X_in[:, i, :]).sum(axis=0)
            dEdW_in[2, :, :] += np.einsum('bi,bo->bio', gates_grad[2, :, i, :], X_in[:, i, :]).sum(axis=0)
            dEdW_in[3, :, :] += np.einsum('bi,bo->bio', gates_grad[3, :, i, :], X_in[:, i, :]).sum(axis=0)

            if i < seq_len - 1:
                dEdW_hh[0, :, :] += np.einsum('bi,bo->bio', gates_grad[0, :, i + 1, :], self.H[:, i + 1, :]).sum(axis=0)
                dEdW_hh[1, :, :] += np.einsum('bi,bo->bio', gates_grad[1, :, i + 1, :], self.H[:, i + 1, :]).sum(axis=0)
                dEdW_hh[2, :, :] += np.einsum('bi,bo->bio', gates_grad[2, :, i + 1, :], self.H[:, i + 1, :]).sum(axis=0)
                dEdW_hh[3, :, :] += np.einsum('bi,bo->bio', gates_grad[3, :, i + 1, :], self.H[:, i + 1, :]).sum(axis=0)

            if self.use_bias:
                dEdB_in[0, :] += np.sum(gates_grad[0, :, i, :], axis=0)
                dEdB_in[1, :] += np.sum(gates_grad[1, :, i, :], axis=0)
                dEdB_in[2, :] += np.sum(gates_grad[2, :, i, :], axis=0)
                dEdB_in[3, :] += np.sum(gates_grad[3, :, i, :], axis=0)

        return dEdW_in, dEdW_hh, dEdB_in, X_grad

    
    
    
class GRULayer(object):

    def __init__(self, input_dim, hidden_dim, use_bias=True):
        
        #r_t = sigmoid(W_r_hi.x_t + W_r_hh.h_(t-1) + b_r)
        #z_t = sigmoid(W_z_hi.x_t + W_z_hh.h_(t-1) + b_z)
        #c_t = tanh(W_n_hi.x_t + W_n_hh.h_(t-1) * r_t + b_c)
        #h_t = (1-z_t) * n_t + z_t * h_(t-1)
        
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.use_bias = use_bias

        sq = np.sqrt(1. / hidden_dim)
        # input weights [W_r_hi,W_z_hi,W_c_hi]
        self.input_weights = np.random.uniform(-sq, sq, (3, hidden_dim, input_dim))
        # hidden weights [W_r_hi,W_z_hi,W_c_hi]
        self.hidden_weights = np.random.uniform(-sq, sq, (3, hidden_dim, hidden_dim))

        self.tanh = Tanh
        self.sigmoid = Sigmoid

        self.gates = None
        self.H = None
        self.C = None

        if self.use_bias:
            # bias = [r_bias,z_bias,c_bias]
            self.bias = np.random.uniform(-sq, sq, (3, hidden_dim))
        else:
            self.bias = np.zeros((3, hidden_dim))

    def forward(self, X_in, h_0=None, c_0=None):
        batch_size = X_in.shape[0]
        seq_len = X_in.shape[1]

        self.H = np.zeros((batch_size, seq_len + 1, self.hidden_dim))
        if h_0 is not None:
            self.H[:, 0, :] = h_0

        self.gates = np.zeros((3, batch_size, seq_len, self.hidden_dim))

        for i in range(seq_len):
            # reset_gate
            self.gates[0, :, i, :] = self.sigmoid.forward(
                np.dot(X_in[:, i, :], self.input_weights[0, :, :].T) + np.dot(self.H[:, i, :], self.hidden_weights[0, :, :].T) + self.bias[0, :])
            # z_gate
            self.gates[1, :, i, :] = self.sigmoid.forward(
                np.dot(X_in[:, i, :], self.input_weights[1, :, :].T) + np.dot(self.H[:, i, :], self.hidden_weights[1, :, :].T) + self.bias[1, :])
            # update gate
            self.gates[2, :, i, :] = self.tanh.forward(
                np.dot(X_in[:, i, :], self.input_weights[2, :, :].T) + self.gates[0,:,i,:] * np.dot(self.H[:, i, :], self.hidden_weights[2, :, :].T) + self.bias[2, :])
            
            self.H[:, i + 1, :] = self.gates[1, :, i, :] * self.H[:, i, :] +  (1 - self.gates[1, :, i, :]) * self.gates[2,:,i,:]

        return self.H, self.H[:, seq_len, :]
    
    def backward(self, X_in, dEdY):
        batch_size = X_in.shape[0]
        seq_len = X_in.shape[1]

        dEdW_in = np.zeros_like(self.input_weights)
        dEdW_hh = np.zeros_like(self.hidden_weights)
        dEdB_in = np.zeros_like(self.bias)

        H_grad = np.zeros((batch_size, seq_len, self.hidden_dim))
        X_grad = np.zeros((batch_size, seq_len, self.input_dim))
        
        gates_grad = np.zeros((3, batch_size, seq_len, self.hidden_dim))
        
        for i in range(seq_len - 1, -1, -1):

            if i < seq_len - 1:
                H_grad[:, i, :] = np.matmul(gates_grad[:, :, i + 1, :], self.hidden_weights).sum(axis=0) + dEdY[:, i, :]
            else:
                H_grad[:, i, :] = dEdY[:, i, :]

            gates_grad[2, :, i, :] = (1 - self.gates[1, :, i, :]) * dEdY[:, i, :] * self.tanh.backward_calculated(self.gates[2, :, i, :])
            gates_grad[1, :, i, :] = ((self.H[:, i, :] - self.gates[2, :, i, :]) * dEdY[:, i, :]) * self.sigmoid.backward_calculated(self.gates[1, :, i, :])
            gates_grad[0, :, i, :] = (np.dot(gates_grad[2,:,i,:], self.hidden_weights[2,:,:].T) * self.H[:,i,:]) * self.sigmoid.backward_calculated(self.gates[0, :, i, :])
            
            X_grad[:, i, :] = np.dot(gates_grad[2,:,i,:], self.input_weights[2,:,:].T) + np.dot(gates_grad[1,:,i,:], self.input_weights[1,:,:].T) + np.dot(gates_grad[0,:,i,:], self.input_weights[0,:,:].T)
                
            h_t_T = self.H[:, i, :].T    
                
            dEdW_in[0, :, :] += np.dot(gates_grad[0, :, i, :].T, X_in[:, i, :])
            dEdW_in[1, :, :] += np.dot(gates_grad[1, :, i, :].T, X_in[:, i, :])
            dEdW_in[2, :, :] += np.dot(gates_grad[2, :, i, :].T, X_in[:, i, :])
            
            if i < seq_len - 1:
                dEdW_hh[0, :, :] += np.dot(h_t_T, gates_grad[0, :, i, :])
                dEdW_hh[1, :, :] += np.dot(h_t_T, gates_grad[1, :, i, :])
                dEdW_hh[2, :, :] += np.dot((self.H[:, i, :] * self.gates[0, :, i, :]).T, gates_grad[2, :, i, :])
                
            if self.use_bias:
                dEdB_in[0, :] += np.sum(gates_grad[0, :, i, :], axis=0)
                dEdB_in[1, :] += np.sum(gates_grad[1, :, i, :], axis=0)
                dEdB_in[2, :] += np.sum(gates_grad[2, :, i, :], axis=0)
                
        return dEdW_in, dEdW_hh, dEdB_in, X_grad
    
        

## Softmax, Cross Entropy Loss

In [8]:
N = 5
num_classes = 4

x = torch.randn(N, num_classes) # logits
y = torch.randint(num_classes, (N,)) # labels

x_ = x.numpy()
y_ = np.eye(num_classes)[y.numpy()] # one-hot encoding

In [9]:
softmax_out = F.softmax(x, dim=-1)
softmax_out

softmax = Softmax()
softmax_out_ = softmax.forward(x_)

print('Softmax check:', np.isclose(softmax_out, softmax_out_).all())

Softmax check: True


In [10]:
cel = nn.CrossEntropyLoss()
loss = cel(x, y).item()

cel_ = CrossEntropyLoss()
loss_ = cel_.forward(y_, x_)
loss_

print('Cross entropy loss check:', np.isclose(loss, loss_))

Cross entropy loss check: True


## Linear Layer

In [11]:
N = 5
num_classes = 4
seq_len = 3

x = torch.randn(N, seq_len, requires_grad=True)
y = torch.randint(num_classes, (N,))

x_ = x.detach().numpy()
y_ = np.eye(num_classes)[y.numpy()]

linear = nn.Linear(seq_len, num_classes)
linear_ = DenseLayer(seq_len, num_classes)

linear_.weights = linear.weight.detach().numpy()
linear_.bias = linear.bias.detach().numpy()

lin_out = linear(x)
lin_out_ = linear_.forward(x_)

print('Linear layer forward check: ', np.isclose(lin_out.detach().numpy(), lin_out_).all())

ValueError: einstein sum subscripts string contains too many subscripts for operand 0

In [16]:
loss = cel(lin_out, y)
lin_out.retain_grad()
loss.retain_grad()
loss.backward()
print(loss)
print(f'[TORCH] dE/dy:\n{lin_out.grad}\n')
print(f'[TORCH] dE/dW:\n{linear.weight.grad}\n')
print(f'[TORCH] dE/dB:\n{linear.bias.grad}\n')
print(f'[TORCH] dE/dX:\n{x.grad}\n')

tensor(1.7812, grad_fn=<NllLossBackward>)
[TORCH] dE/dy:
tensor([[ 0.0154, -0.1254,  0.0595,  0.0505],
        [ 0.0260, -0.0758,  0.0206,  0.0292],
        [ 0.0358,  0.0895,  0.0610, -0.1863],
        [ 0.0995,  0.0885, -0.1909,  0.0029],
        [ 0.0300,  0.0869,  0.0454, -0.1623]])

[TORCH] dE/dW:
tensor([[ 0.0685,  0.1054, -0.2851],
        [ 0.4073,  0.0524, -0.2640],
        [-0.2708, -0.4270,  0.2497],
        [-0.2051,  0.2692,  0.2994]])

[TORCH] dE/dB:
tensor([ 0.2067,  0.0636, -0.0044, -0.2660])

[TORCH] dE/dX:
tensor([[ 0.0082, -0.0504,  0.0747],
        [ 0.0138, -0.0136,  0.0439],
        [ 0.0379,  0.0321, -0.1905],
        [ 0.0463,  0.1636, -0.0177],
        [ 0.0319,  0.0328, -0.1681]])



In [19]:
loss_ = cel_.forward(y_, lin_out_)

de_dy = cel_.backward(y_)
de_dx, de_dw, de_db = linear_.backward(de_dy, x_)

In [44]:
print(f'[CUSTOM] dE/dy:\n{de_dy}\n')
print(f'[CUSTOM] dE/dW:\n{de_dw}\n')
print(f'[CUSTOM] dE/dB:\n{de_db}\n')
print(f'[CUSTOM] dE/dX:\n{de_dx}\n')

print('Check dE/dy:', np.isclose(lin_out.grad, de_dy).all())
print('Check dE/dX:', np.isclose(x.grad, de_dx).all())
print('Check dE/dW:', np.isclose(linear.weight.grad, de_dw).all())
print('Check dE/dB:', np.isclose(linear.bias.grad, de_db).all())

[CUSTOM] dE/dy:
tensor([[[-0.4858, -0.9538, -0.4078,  0.4785, -0.1805],
         [-1.4176,  1.4444, -1.2428, -0.0258, -0.1189],
         [-1.3722, -1.4655, -1.6678,  1.1944, -1.8137]],

        [[-0.6549,  0.6896, -1.5850,  0.1409,  0.7199],
         [ 1.3465, -1.1447,  0.3793,  0.3712,  1.4250],
         [ 0.0330,  1.5172,  0.5199,  1.6614, -1.3760]],

        [[ 0.3420, -0.6921,  1.2166, -1.3336,  0.8845],
         [-0.6354,  0.0516, -1.1423, -0.2269,  0.1890],
         [ 0.5430,  0.5265, -1.1912,  2.3043, -0.3202]],

        [[-0.2344, -0.2877,  1.8131, -0.6401, -1.1516],
         [ 0.4000,  1.3476, -0.4403, -0.9127, -0.5773],
         [-0.7366, -0.8067, -0.1965,  0.1194,  0.4144]]])



NameError: name 'de_dw' is not defined

## RNN


In [49]:
#N = 5
#emb_dim = 6
#seq_len = 3
#hidden_dim = 8

N = 5
emb_dim = 6
seq_len = 3
hidden_dim = 5

x = torch.randn(N, seq_len, emb_dim, requires_grad=True)
x_ = x.detach().numpy()

rnn = nn.RNN(emb_dim, hidden_dim, bias=False, batch_first=True)
rnn_ = RnnLayer(emb_dim, hidden_dim, seq_len, N, use_bias=False)
rnn_.input_weights = rnn.weight_ih_l0.detach().numpy()
rnn_.hidden_weights = rnn.weight_hh_l0.detach().numpy()

x.retain_grad()
rnn_out, h_n = rnn(x)
rnn_out_, h_n_ = rnn_.forward(x_)
rnn_out__ = rnn_out_[:, 1:, :] # remove zeros prepended to every hidden output

print('RNN layer forward check: ', np.isclose(rnn_out.detach().numpy(), rnn_out__, atol=1e-3).all())
print('RNN layer forward check last hidden: ', np.isclose(h_n.detach().numpy(), h_n_, atol=1e-3).all())

RNN layer forward check:  True
RNN layer forward check last hidden:  True


In [50]:
de_dy = torch.randn(N, seq_len, hidden_dim)
de_dy_ = de_dy.numpy()

rnn_out.retain_grad()
rnn_out.backward(de_dy)

print(f'[TORCH] dE/dWih:\n{rnn.weight_ih_l0.grad}\n')
print(f'[TORCH] dE/dWhh:\n{rnn.weight_hh_l0.grad}\n')
print(f'[TORCH] dE/dy:\n{de_dy}\n')
print(f'[TORCH] dE/dH:\n{rnn_out.grad}\n')
print(f'X_grad={x.grad}')

[TORCH] dE/dWih:
tensor([[ 0.2449, -1.7056,  2.5466,  1.6024, -2.7113,  1.4530],
        [-0.9463, -0.3401,  0.0679, -7.3683,  1.4519,  4.1646],
        [ 3.3150,  1.0614,  5.3475,  1.8725, -3.1538,  2.6769],
        [ 7.0559, -7.0339,  3.7580,  6.3774,  3.6920, 10.4161],
        [-5.8275,  0.6323, -1.1140, -0.4600,  1.7497, -3.0474]])

[TORCH] dE/dWhh:
tensor([[ 2.1474, -0.7241,  0.8992, -1.1755,  0.3038],
        [ 1.9606,  0.0551, -0.5860, -0.6625, -1.1284],
        [ 1.1062,  1.4638, -0.6872,  0.3111, -0.5890],
        [ 2.0179,  1.7246,  2.4917, -1.2085, -0.8041],
        [ 1.0057,  1.5027, -1.4213, -0.5773, -2.0118]])

[TORCH] dE/dy:
tensor([[[ 6.2996e-01, -1.2433e+00,  1.2866e+00, -5.8629e-02, -3.5287e-01],
         [ 1.0334e+00,  1.4286e-01, -1.5877e+00, -1.5272e+00,  6.0149e-01],
         [-1.1036e+00, -1.1458e+00, -4.2392e-01,  1.5527e+00,  7.6814e-01]],

        [[-1.3452e-04, -6.8384e-01, -1.0651e+00, -5.0410e-01,  2.1486e+00],
         [ 5.2262e-01,  2.7422e-01,  1.1414e-0

In [28]:
dEdW_in, dEdW_hh, _ = rnn_.backward(x_, rnn_out_, de_dy_)

print(f'[CUSTOM] dE/dWih:\n{dEdW_in}\n')
print(f'[CUSTOM] dE/dWhh:\n{dEdW_hh}\n')

print('RNN layer gradient check dEdW_in: ', np.isclose(rnn.weight_ih_l0.grad.numpy(), dEdW_in).all())
print('RNN layer gradient check dEdW_hh: ', np.isclose(rnn.weight_hh_l0.grad.numpy(), dEdW_hh).all())

[CUSTOM] dE/dWih:
[[-0.84065264 -7.068542    5.7418056   0.9002589  -5.5570254  -3.4874358 ]
 [-1.5509931   0.21796954 -1.5923568  -1.5642024   0.2130806  -0.2485686 ]
 [ 0.6651537  -0.68229634 -2.7304165   3.6788378  -0.33462602 -5.857197  ]
 [-1.3273039   1.052206   -0.39686674 -6.426206    1.8830379   4.0530033 ]
 [ 3.228954   -3.563257    2.8654382   1.1615889   0.6939566  -4.1528783 ]]

[CUSTOM] dE/dWhh:
[[-0.05816048  0.36913016  0.46005616  0.3695405   0.81958103]
 [-0.3268031   2.1463962   0.11377817  1.9113404  -0.9561599 ]
 [ 0.89730275 -1.4937084  -0.2324918   0.04177425  0.8274532 ]
 [ 2.3509252   0.17922425 -0.41860518 -0.6692463  -2.4644444 ]
 [ 1.0182368   0.737316    1.3249016   0.15629236  0.80138135]]

RNN layer gradient check dEdW_in:  True
RNN layer gradient check dEdW_hh:  True


## LSTM

In [13]:
#N = 5
#emb_dim = 6
#seq_len = 3
#hidden_dim = 8

N = 32
emb_dim = 300
seq_len = 32
hidden_dim = 200

x = torch.randn(N, seq_len, emb_dim, requires_grad=True)
x_ = x.detach().numpy()

lstm = nn.LSTM(emb_dim, hidden_dim, bias=False, batch_first=True)
lstm_ = LSTMLayer(emb_dim, hidden_dim, use_bias=False)
wih = lstm.weight_ih_l0.detach().numpy()
whh = lstm.weight_hh_l0.detach().numpy()

lstm_.input_weights[0,:,:] = wih[0:hidden_dim, :]
lstm_.input_weights[1,:,:] = wih[hidden_dim: 2*hidden_dim, :]
lstm_.input_weights[2,:,:] = wih[2*hidden_dim: 3*hidden_dim, :]
lstm_.input_weights[3,:,:] = wih[3*hidden_dim: 4*hidden_dim, :]

lstm_.hidden_weights[0,:,:] = whh[0:hidden_dim, :]
lstm_.hidden_weights[1,:,:] = whh[hidden_dim: 2*hidden_dim, :]
lstm_.hidden_weights[2,:,:] = whh[2*hidden_dim: 3*hidden_dim, :]
lstm_.hidden_weights[3,:,:] = whh[3*hidden_dim: 4*hidden_dim, :]


lstm_out, h_n = lstm(x)
lstm_out_, h_n_, c_n_ = lstm_.forward(x_)
lstm_out__ = lstm_out_[:, 1:, :] # remove zeros prepended to every hidden output

print('LSTM layer forward check: ', np.isclose(lstm_out.detach().numpy(), lstm_out__, atol=1e-3).all())
print('LSTM layer forward check last hidden: ', np.isclose(h_n[0].detach().numpy(), h_n_, atol=1e-3).all())
print('LSTM layer forward check last c_n: ', np.isclose(h_n[1].detach().numpy(), c_n_, atol=1e-3).all())

LSTM layer forward check:  True
LSTM layer forward check last hidden:  True
LSTM layer forward check last c_n:  True


In [14]:
de_dy = torch.randn(N, seq_len, hidden_dim)
de_dy_ = de_dy.numpy()
x.retain_grad()
lstm_out.backward(de_dy)

dEdW_in, dEdW_hh, a, X_grad = lstm_.backward(x_, de_dy_)

In [15]:
print('LSTM layer gradient check dEdW_in: ', np.isclose(lstm.weight_ih_l0.grad.numpy(), dEdW_in.reshape(4*hidden_dim,emb_dim), atol=1e-3).all())
print('LSTM layer gradient check dEdW_hh: ', np.isclose(lstm.weight_hh_l0.grad.numpy(), dEdW_hh.reshape(4*hidden_dim,hidden_dim), atol=1e-3).all())
print('LSTM layer gradient check dEdX: ', np.isclose(x.grad.numpy(), X_grad, atol=1e-3).all())

LSTM layer gradient check dEdW_in:  True
LSTM layer gradient check dEdW_hh:  True
LSTM layer gradient check dEdX:  True


## Pokusaj pravljenja torch modela rnn, fc

In [32]:
import torch

In [42]:
N = 5
emb_dim = 6
seq_len = 3
hidden_dim = 5

In [41]:
x = torch.randn(N, seq_len, emb_dim, requires_grad=True)

model = torch.nn.Sequential(
    torch.nn.RNN(emb_dim, hidden_dim, bias=False, batch_first=True),
    torch.nn.Linear(hidden_dim, 5, bias=False)
)

loss_fn = torch.nn.CrossEntropyLoss()

learning_rate = 1e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

## GRU

In [69]:
N = 5
emb_dim = 6
seq_len = 1
hidden_dim = 8

#N = 20
#emb_dim = 40
#seq_len = 32
#hidden_dim = 200

x = torch.randn(N, seq_len, emb_dim, requires_grad=True)
x_ = x.detach().numpy()

gru = nn.GRU(emb_dim, hidden_dim, bias=False, batch_first=True)
gru_ = GRULayer(emb_dim, hidden_dim, use_bias=False)
wih = gru.weight_ih_l0.detach().numpy()
whh = gru.weight_hh_l0.detach().numpy()

gru_.input_weights[0,:,:] = wih[0:hidden_dim, :]
gru_.input_weights[1,:,:] = wih[hidden_dim: 2*hidden_dim, :]
gru_.input_weights[2,:,:] = wih[2*hidden_dim: 3*hidden_dim, :]

gru_.hidden_weights[0,:,:] = whh[0:hidden_dim, :]
gru_.hidden_weights[1,:,:] = whh[hidden_dim: 2*hidden_dim, :]
gru_.hidden_weights[2,:,:] = whh[2*hidden_dim: 3*hidden_dim, :]

gru_out, h_n = gru(x)
gru_out_, h_n_ = gru_.forward(x_)
gru_out__ = gru_out_[:, 1:, :] # remove zeros prepended to every hidden output

print('GRU layer forward check: ', np.isclose(gru_out.detach().numpy(), gru_out__, atol=1e-3).all())
print('GRU layer forward check last hidden: ', np.isclose(h_n.detach().numpy(), h_n_, atol=1e-3).all())

GRU layer forward check:  True
GRU layer forward check last hidden:  True


In [70]:
de_dy = torch.randn(N, seq_len, hidden_dim)
de_dy_ = de_dy.numpy()

x.retain_grad()
gru_out.backward(de_dy)

In [71]:
dEdW_in, dEdW_hh, a, X_grad = gru_.backward(x_, de_dy_)

ValueError: shapes (5,8) and (6,8) not aligned: 8 (dim 1) != 6 (dim 0)

In [81]:
a = np.array([[[1,2],[2,2]],[[3,2],[1,2]],[[1,2],[2,2]]])
np.prod(np.array(a.shape[0:len(a.shape)-1]))

6

In [2]:
import numpy as np
a = np.array([[[1]]])
b = np.array([[2]])

c = [a, b]
c

[array([[[1]]]), array([[2]])]

In [7]:
a = np.array([[[1, 2],[2, 3],[3, 4],[4, 5]], [[1, 2],[2, 3],[3, 4],[4, 5]]])
#a = 2,4,2
b = np.array([[1, 2, 4],[2, 3, 5],[3, 4, 4],[4, 5, 5]])