# Test om koden er riktig implementert

Her er et forslag til testfunksjoner for å sjekke om koden er riktig implementert.
```assert variabel``` gir en feilmelding (```AssertionError```) med mindre ```variabel``` er sann, for eksempel ```variabel = True```. For eksempel vil ```assert a == b``` gi en feilmelding med mindre ```a``` og ```b``` er like.

In [1]:
# For example: this assert passes silently because `variable` is True.
# With `variable = False` it would raise AssertionError with the message below.
variable = True
assert variable, "You need to change 'variable' to True"

In [2]:
from layers import *
from neural_network import NeuralNetwork
from utils import onehot
import numpy as np
from optimizer import Adam

In [3]:
# We choose some arbitrary (small) values for the dimensions.
b = 2      # batch size: leading axis of every array checked below
n_max = 5  # maximum sequence length, passed to EmbedPosition
m = 2      # number of classes, i.e. the one-hot depth
n = 1      # sequence length of the input used in this cell

d = 7      # feature dimension of z0, z1 and z2 (see the shape checks)
k = 3      # second constructor argument of SelfAttention / TransformerBlock
p = 10     # second constructor argument of FeedForward

r = 3      # not used in this cell
L_1 = 1    # not used in this cell

# Create an arbitrary dataset of class indices in [0, m).
x = np.random.randint(0, m, (b, n))
y = np.random.randint(0, m, (b, n_max))  # not used in this cell

# Initialize the layers.
feed_forward = FeedForward(d, p)
self_attention = SelfAttention(d, k)
embed_pos = EmbedPosition(n_max, m, d)
un_embed = LinearLayer(d, m, False)
softmax = Softmax()
transformerblock = TransformerBlock(d, k, p)  # constructed only; not used below

# A manual forward pass, one layer at a time.
X = onehot(x, m)  # (b, m, n)
z0 = embed_pos.forward(X)
z1 = feed_forward.forward(z0)
z2 = self_attention.forward(z1)
z3 = un_embed.forward(z2)
Z = softmax.forward(z3)

# Check the shapes after every layer.
assert X.shape == (b,m,n), f"X.shape={X.shape}, expected {(b,m,n)}"
assert z0.shape == (b,d,n), f"z0.shape={z0.shape}, expected {(b,d,n)}"
assert z1.shape == (b,d,n), f"z1.shape={z1.shape}, expected {(b,d,n)}"
assert z2.shape == (b,d,n), f"z2.shape={z2.shape}, expected {(b,d,n)}"
assert z3.shape == (b,m,n), f"z3.shape={z3.shape}, expected {(b,m,n)}"
assert Z.shape == (b,m,n), f"Z.shape={Z.shape}, expected {(b,m,n)}"

# Is X one-hot? Its entries must then add up to b*n.
assert X.sum() == b*n, f"X.sum()={X.sum()}, expected {b*n}"

# The softmax output must sum to 1 along axis 1 and be non-negative.
# Z.sum(axis=1) has shape (b, n), so the "expected" array in the message
# is built with that shape.
assert np.allclose(Z.sum(axis=1), 1), f"Z.sum(axis=1)={Z.sum(axis=1)}, expected {np.ones((b, n))}"
assert np.abs(Z.sum() - b*n) < 1e-5, f"Z.sum()={Z.sum()}, expected {b*n}"
assert np.all(Z>=0), f"Z={Z}, expected all entries to be non-negative"




In [4]:
# Shape conventions:
#   y_pred: (batch, m, n)
#   y_true: (batch, n)
#   m = number of classes


# Test the forward pass on an input of the maximum length n_max.
x = np.random.randint(0, m, (b, n_max))
X = onehot(x, m)
print(X.shape)

# We test with a y that is shorter than the maximum length.
n_y = n_max - 1
y = np.random.randint(0, m, (b, n_y))

# Initialize a neural network based on the layers above.
network = NeuralNetwork(r, d, m, L_1, p, k)

# ...and a loss function.
loss = CrossEntropy()

# Do a forward pass.
Z = network.forward(X)
print(Z.shape)

# Compute the loss; Z and y.shape are printed first for inspection.
print(Z)
print('This is y:')
print(y.shape)
L = loss.forward(Z, y)

# Get the derivative of the loss wrt Z.
grad_Z = loss.backward()

# NOTE(review): the string literal below is disabled code, so the network's
# backward pass is never exercised by this cell.
'''
# Does not work on our network because we did not save our gradient in a array
#and perform a backward pass
_ = network.backward(grad_Z)
'''
# ...and do a gradient descent step (currently disabled as well).

#_ = network.step_gd(Adam())


(2, 2, 5)
(2, 2, 5)
[[[0.47041517 0.46532783 0.46937922 0.47122817 0.4826279 ]
  [0.52958483 0.53467216 0.53062078 0.52877183 0.5173721 ]]

 [[0.45490703 0.46548663 0.485104   0.45564406 0.46686848]
  [0.54509296 0.53451337 0.514896   0.54435594 0.53313152]]]
This is y:
(2, 4)


: 

In [None]:
"""
Here you may add additional tests to for example:

- Check if the ['d'] keys in the parameter dictionaries are not None, or receive something when running backward pass
- Check if the parameters change when you perform a gradient descent step
- Check if the loss decreases when you perform a gradient descent step

This is voluntary, but could be useful.
"""

import numpy as np
from tqdm import trange
from typing import Any
import layers as l
from utils import onehot
from training import make_model, training
from data_generators import get_train_test_sorting


def get_test_data():
   """Generate a small sorting dataset and return its training batches.

   Returns:
      A list of ``(x_batch, y_batch)`` pairs, built by zipping the
      ``'x_train'`` and ``'y_train'`` entries returned by
      ``get_train_test_sorting``.
   """
   sorting_data = get_train_test_sorting(
      length=5,
      num_ints=2,
      samples_per_batch=250,
      n_batches_train=10,
      n_batches_test=2,
   )
   inputs, targets = sorting_data['x_train'], sorting_data['y_train']
   return list(zip(inputs, targets))


def test_forward_shape():
   """Check that a forward pass on the first training batch has shape (250, 2, 9)."""
   network = make_model()

   # Each training-set entry is an (inputs, targets) pair; only inputs are needed.
   first_inputs, _ = get_test_data()[0]

   prediction = network.forward(onehot(first_inputs, 2))
   expected_shape = (250, 2, 9)
   assert prediction.shape == expected_shape


def test_backward():
   """Smoke-test one forward/backward pass through the model.

   Feeds the first training batch through the model, evaluates the
   cross-entropy loss against that batch's own targets and back-propagates
   the loss gradient. Nothing is asserted: the test passes when no
   exception is raised.
   """
   model = make_model()

   # Every entry of the training set is an (inputs, targets) pair.
   inputs, targets = get_test_data()[0]
   y_hat = model.forward(onehot(inputs, 2))

   loss_function = l.CrossEntropy()
   # The labels must belong to the batch that was fed forward. With
   # train_set[i] == (x_i, y_i) that is element [0][1]; element [1][0] would
   # be the *inputs* of the second batch.
   loss_function.forward(y_hat, y_true=targets)
   grad_loss = loss_function.backward()

   model.backward(grad_loss)
  

def test_adam():
   """Overfit the model on a single training batch with the Adam optimizer.

   Runs 1000 optimisation steps on the first training batch and prints the
   accuracy and the mean loss after every step, so convergence can be
   inspected by eye. NumPy floating-point errors are raised as exceptions
   while the test runs; the previous error settings are restored afterwards.
   """
   # Remember the old settings so this test does not leak global NumPy state.
   previous_settings = np.seterr(all='raise')
   try:
      # Initialize model and optimizer
      model = make_model()
      optimizer = l.Adam()

      # Overfit on a single batch: every entry of the training set is an
      # (inputs, targets) pair and only the first one is used.
      batch_inputs, targets = get_test_data()[0]

      loss_function = l.CrossEntropy()

      m = 2
      batch_x = onehot(batch_inputs, m)

      # Number of positions that actually have a target.
      n_targets = np.shape(targets)[-1]

      for _ in range(1000):
         y_hat = model.forward(batch_x)
         # y_hat: (b, m, n) -> predicted class index per position
         y_hat_indices = np.argmax(y_hat, axis=1)

         # Compare only the trailing positions that have a target; this is a
         # no-op when prediction and target lengths already agree.
         # NOTE(review): assumes the targets line up with the END of the
         # predicted sequence — confirm against CrossEntropy.forward.
         correct = y_hat_indices[..., -n_targets:] == targets
         accuracy = np.mean(correct)

         # y_true is not one-hot encoded, but instead class indices. It must
         # be the targets of the batch that was fed forward (train_set[0][1]),
         # not the inputs of another batch.
         loss_value = loss_function.forward(y_hat, y_true=targets).mean()

         # dLdY: (b, m, n)
         grad_loss = loss_function.backward()

         model.backward(grad_loss)
         model.step_gd(optimizer)

         print(f'{accuracy=:.5f}, {loss_value=:.5f}')
   finally:
      np.seterr(**previous_settings)


def module_backward_works(input, out_shape: tuple, module, delta: float = 1e-6,
                          rtol: float = 1e-4, atol: float = 1e-5):
   """Check a module's backward pass against a finite-difference estimate.

   Uses the scalar ``L = sum(module.forward(x))``, whose gradient with respect
   to the module output is an array of ones. The directional derivative of
   ``L`` along a random direction ``v`` is estimated numerically as
   ``(L(x + delta * v) - L(x)) / delta`` and compared with the analytic value
   ``sum(module.backward(ones) * v)``.

   Args:
      input: Array-like input handed to ``module.forward``.
      out_shape: Shape of ``module.forward(input)``; used to build the
         upstream gradient of ones.
      module: Object with ``forward(x)`` and ``backward(grad)`` methods, where
         ``backward`` returns the gradient with respect to the input.
      delta: Step size of the finite-difference perturbation.
      rtol: Relative tolerance of the comparison.
      atol: Absolute tolerance of the comparison.

   Raises:
      AssertionError: If the analytic and numerical derivatives disagree.
   """
   # dL/dY for L = sum(Y) is an array of ones.
   grad_output = np.ones(out_shape)

   # Forward first, then backward, so the module sees the unperturbed input
   # when it computes its gradient.
   forward_result = module.forward(input)
   dL_dx = module.backward(grad_output)

   # Perturb the input by a small step along a random direction.
   direction = np.random.standard_normal(np.shape(input))
   delta_input = delta * direction
   perturbed = input + delta_input
   forward_perturbed = module.forward(perturbed)

   numeric = (forward_perturbed - forward_result).sum() / delta
   analytic = np.sum(dL_dx * direction)

   assert np.isclose(numeric, analytic, rtol=rtol, atol=atol), (
      f"backward pass disagrees with finite differences: "
      f"numeric={numeric}, analytic={analytic}"
   )


def test_backward_correct():
   """Run ``module_backward_works`` on a bias-free ``LinearLayer`` with random input."""
   batch_size, in_dims, out_dims, seq_len = 10, 2, 3, 5

   layer = l.LinearLayer(in_dims, out_dims, has_bias=False)
   sample = np.random.randn(batch_size, in_dims, seq_len)
   expected_out_shape = (batch_size, out_dims, seq_len)
   module_backward_works(sample, expected_out_shape, layer)

if __name__ == '__main__':
   # Only the Adam overfitting run is enabled; uncomment a line to run that test.
   # test_forward_shape()
   # test_backward()
   test_adam()
   # test_backward_correct()

In [None]:
# Check that the loss is non-negative and that its gradient has the shape of Z.
# Relies on L, grad_Z, Z, y, m and loss having been defined by an earlier cell.
assert L >= 0, f"L={L}, expected L>=0"
assert grad_Z.shape == Z.shape, f"grad_Z.shape={grad_Z.shape}, expected {Z.shape}"

# Check that using the one-hot encoding of y as the prediction gives (near) zero loss.
Y = onehot(y, m)
L = loss.forward(Y, y)
assert L < 1e-5, f"L={L}, expected L<1e-5"


In [2]:
# For example: this assert passes silently because `variable` is True.
# With `variable = False` it would raise AssertionError with the message below.
variable = True
assert variable, "You need to change 'variable' to True"

In [4]:
# We choose some arbitrary (small) values for the dimensions.
b = 2      # batch size: leading axis of every array checked below
n_max = 5  # maximum sequence length, passed to EmbedPosition
m = 2      # number of classes, i.e. the one-hot depth
n = 1      # sequence length of the input used in this cell

d = 7      # feature dimension of z0, z1 and z2 (see the shape checks)
k = 3      # second constructor argument of SelfAttention / TransformerBlock
p = 10     # second constructor argument of FeedForward

r = 3      # not used in this cell
L_1 = 1    # not used in this cell

# Create an arbitrary dataset of class indices in [0, m).
x = np.random.randint(0, m, (b, n))
y = np.random.randint(0, m, (b, n_max))  # not used in this cell

# Initialize the layers.
feed_forward = FeedForward(d, p)
self_attention = SelfAttention(d, k)
embed_pos = EmbedPosition(n_max, m, d)
un_embed = LinearLayer(d, m, False)
softmax = Softmax()
transformerblock = TransformerBlock(d, k, p)  # constructed only; not used below

# A manual forward pass, one layer at a time.
X = onehot(x, m)  # (b, m, n)
z0 = embed_pos.forward(X)
z1 = feed_forward.forward(z0)
z2 = self_attention.forward(z1)
z3 = un_embed.forward(z2)
Z = softmax.forward(z3)

# Check the shapes after every layer.
assert X.shape == (b,m,n), f"X.shape={X.shape}, expected {(b,m,n)}"
assert z0.shape == (b,d,n), f"z0.shape={z0.shape}, expected {(b,d,n)}"
assert z1.shape == (b,d,n), f"z1.shape={z1.shape}, expected {(b,d,n)}"
assert z2.shape == (b,d,n), f"z2.shape={z2.shape}, expected {(b,d,n)}"
assert z3.shape == (b,m,n), f"z3.shape={z3.shape}, expected {(b,m,n)}"
assert Z.shape == (b,m,n), f"Z.shape={Z.shape}, expected {(b,m,n)}"

# Is X one-hot? Its entries must then add up to b*n.
assert X.sum() == b*n, f"X.sum()={X.sum()}, expected {b*n}"

# The softmax output must sum to 1 along axis 1 and be non-negative.
# Z.sum(axis=1) has shape (b, n), so the "expected" array in the message
# is built with that shape.
assert np.allclose(Z.sum(axis=1), 1), f"Z.sum(axis=1)={Z.sum(axis=1)}, expected {np.ones((b, n))}"
assert np.abs(Z.sum() - b*n) < 1e-5, f"Z.sum()={Z.sum()}, expected {b*n}"
assert np.all(Z>=0), f"Z={Z}, expected all entries to be non-negative"


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [None]:
"""
Here you may add additional tests to for example:

- Check if the ['d'] keys in the parameter dictionaries are not None, or receive something when running backward pass
- Check if the parameters change when you perform a gradient descent step
- Check if the loss decreases when you perform a gradient descent step

This is voluntary, but could be useful.
"""

import numpy as np
from tqdm import trange
from typing import Any
import layers as l
from utils import onehot
from training import make_model, training
from data_generators import get_train_test_sorting


def get_test_data():
   """Generate a small sorting dataset and return its training batches.

   Returns:
      A list of ``(x_batch, y_batch)`` pairs, built by zipping the
      ``'x_train'`` and ``'y_train'`` entries returned by
      ``get_train_test_sorting``.
   """
   sorting_data = get_train_test_sorting(
      length=5,
      num_ints=2,
      samples_per_batch=250,
      n_batches_train=10,
      n_batches_test=2,
   )
   inputs, targets = sorting_data['x_train'], sorting_data['y_train']
   return list(zip(inputs, targets))


def test_forward_shape():
   """Check that a forward pass on the first training batch has shape (250, 2, 9)."""
   network = make_model()

   # Each training-set entry is an (inputs, targets) pair; only inputs are needed.
   first_inputs, _ = get_test_data()[0]

   prediction = network.forward(onehot(first_inputs, 2))
   expected_shape = (250, 2, 9)
   assert prediction.shape == expected_shape


def test_backward():
   """Smoke-test one forward/backward pass through the model.

   Feeds the first training batch through the model, evaluates the
   cross-entropy loss against that batch's own targets and back-propagates
   the loss gradient. Nothing is asserted: the test passes when no
   exception is raised.
   """
   model = make_model()

   # Every entry of the training set is an (inputs, targets) pair.
   inputs, targets = get_test_data()[0]
   y_hat = model.forward(onehot(inputs, 2))

   loss_function = l.CrossEntropy()
   # The labels must belong to the batch that was fed forward. With
   # train_set[i] == (x_i, y_i) that is element [0][1]; element [1][0] would
   # be the *inputs* of the second batch.
   loss_function.forward(y_hat, y_true=targets)
   grad_loss = loss_function.backward()

   model.backward(grad_loss)
  

def test_adam():
   """Overfit the model on a single training batch with the Adam optimizer.

   Runs 1000 optimisation steps on the first training batch and prints the
   accuracy and the mean loss after every step, so convergence can be
   inspected by eye. NumPy floating-point errors are raised as exceptions
   while the test runs; the previous error settings are restored afterwards.
   """
   # Remember the old settings so this test does not leak global NumPy state.
   previous_settings = np.seterr(all='raise')
   try:
      # Initialize model and optimizer
      model = make_model()
      optimizer = l.Adam()

      # Overfit on a single batch: every entry of the training set is an
      # (inputs, targets) pair and only the first one is used.
      batch_inputs, targets = get_test_data()[0]

      loss_function = l.CrossEntropy()

      m = 2
      batch_x = onehot(batch_inputs, m)

      # Number of positions that actually have a target.
      n_targets = np.shape(targets)[-1]

      for _ in range(1000):
         y_hat = model.forward(batch_x)
         # y_hat: (b, m, n) -> predicted class index per position
         y_hat_indices = np.argmax(y_hat, axis=1)

         # Compare only the trailing positions that have a target; this is a
         # no-op when prediction and target lengths already agree.
         # NOTE(review): assumes the targets line up with the END of the
         # predicted sequence — confirm against CrossEntropy.forward.
         correct = y_hat_indices[..., -n_targets:] == targets
         accuracy = np.mean(correct)

         # y_true is not one-hot encoded, but instead class indices. It must
         # be the targets of the batch that was fed forward (train_set[0][1]),
         # not the inputs of another batch.
         loss_value = loss_function.forward(y_hat, y_true=targets).mean()

         # dLdY: (b, m, n)
         grad_loss = loss_function.backward()

         model.backward(grad_loss)
         model.step_gd(optimizer)

         print(f'{accuracy=:.5f}, {loss_value=:.5f}')
   finally:
      np.seterr(**previous_settings)


def module_backward_works(input, out_shape: tuple, module, delta: float = 1e-6,
                          rtol: float = 1e-4, atol: float = 1e-5):
   """Check a module's backward pass against a finite-difference estimate.

   Uses the scalar ``L = sum(module.forward(x))``, whose gradient with respect
   to the module output is an array of ones. The directional derivative of
   ``L`` along a random direction ``v`` is estimated numerically as
   ``(L(x + delta * v) - L(x)) / delta`` and compared with the analytic value
   ``sum(module.backward(ones) * v)``.

   Args:
      input: Array-like input handed to ``module.forward``.
      out_shape: Shape of ``module.forward(input)``; used to build the
         upstream gradient of ones.
      module: Object with ``forward(x)`` and ``backward(grad)`` methods, where
         ``backward`` returns the gradient with respect to the input.
      delta: Step size of the finite-difference perturbation.
      rtol: Relative tolerance of the comparison.
      atol: Absolute tolerance of the comparison.

   Raises:
      AssertionError: If the analytic and numerical derivatives disagree.
   """
   # dL/dY for L = sum(Y) is an array of ones.
   grad_output = np.ones(out_shape)

   # Forward first, then backward, so the module sees the unperturbed input
   # when it computes its gradient.
   forward_result = module.forward(input)
   dL_dx = module.backward(grad_output)

   # Perturb the input by a small step along a random direction.
   direction = np.random.standard_normal(np.shape(input))
   delta_input = delta * direction
   perturbed = input + delta_input
   forward_perturbed = module.forward(perturbed)

   numeric = (forward_perturbed - forward_result).sum() / delta
   analytic = np.sum(dL_dx * direction)

   assert np.isclose(numeric, analytic, rtol=rtol, atol=atol), (
      f"backward pass disagrees with finite differences: "
      f"numeric={numeric}, analytic={analytic}"
   )


def test_backward_correct():
   """Run ``module_backward_works`` on a bias-free ``LinearLayer`` with random input."""
   batch_size, in_dims, out_dims, seq_len = 10, 2, 3, 5

   layer = l.LinearLayer(in_dims, out_dims, has_bias=False)
   sample = np.random.randn(batch_size, in_dims, seq_len)
   expected_out_shape = (batch_size, out_dims, seq_len)
   module_backward_works(sample, expected_out_shape, layer)

if __name__ == '__main__':
   # Only the Adam overfitting run is enabled; uncomment a line to run that test.
   # test_forward_shape()
   # test_backward()
   test_adam()
   # test_backward_correct()

"\nHere you may add additional tests to for example:\n\n- Check if the ['d'] keys in the parameter dictionaries are not None, or receive something when running backward pass\n- Check if the parameters change when you perform a gradient descent step\n- Check if the loss decreases when you perform a gradient descent step\n\nThis is voluntary, but could be useful.\n"