In [116]:
import os
import re

import numpy as np
from datasets import load_dataset

import Embeddings.positional_embedding as pe
import feed_forward as ff
import Attention.attention_block as ab
import Attention.attention_head as ah
import transformer_block as tb
import layer_norm as ln
import base_transformer as bt

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [137]:
# test embedding
# Shared random fixture: 9 rows x 1024 features drawn from N(0, 1).
# A fixed seed makes every "Restart & Run All" produce the same values, so
# results in the cells below are comparable between runs.
test_embedding = np.random.default_rng(seed=0).normal(0, 1, size=(9, 1024))

In [135]:
# Positional embeddings: build a table with both constructor arguments set to
# 1000 and report the standard deviation over all of its entries.
test_pos_embed = pe.positional_embedding(1000, 1000)
np.std(test_pos_embed.embeddings)

0.020021235620774045

In [None]:
# Feed-forward layer: construct a single 512 -> 512 ReLU layer.
# Only construction is exercised here; no forward pass is run.
test_layer = ff.neuron_layer(
    input_shape=512,
    output_shape=512,
    activation='relu',
    batch_size=8,
    clip_val=1e-4,
    learning_rate=1e-3,
)

In [None]:
# attention head
test_head = ah.attention_head(512, 64)
# test_embedding is 1024 features wide (sized for the full model), while this
# head is built with 512 -- presumably its input width; TODO confirm against
# attention_head. Feed only the first 512 features so the cell runs on a fresh
# kernel. The slice is a no-op if the fixture is already 512 wide.
test_head.masked_attention_score(test_embedding[:, :512]).shape

In [None]:
# attention block
test_block = ab.attention_block(num_heads=8, block_shape=512)
# The block is configured for 512-wide inputs but test_embedding is 1024 wide,
# so pass only the first 512 features. The slice is a no-op if the fixture is
# already 512 wide.
test_block.multi_head_attention(test_embedding[:, :512]).shape

(9, 512)

In [None]:
# layer norm
test_layer_norm = ln.layer_norm(512)
# The norm is built for 512 features while test_embedding has 1024, so
# normalise only the first 512 columns. The slice is a no-op if the fixture is
# already 512 wide.
test_layer_norm.layer_norm(test_embedding[:, :512]).shape

(9, 512)

In [None]:
# transformer block
test_transformer_block = tb.transformer_block(
      num_heads=8, block_shape=512, activation='relu'
    , batch_size=8, clip_val=.0001, learning_rate=.001)
# block_shape is 512 but test_embedding is 1024 wide, so run the forward pass
# on the first 512 features only. The slice is a no-op if the fixture is
# already 512 wide.
test_transformer_block.forward_pass(test_embedding[:, :512]).shape

In [114]:
# Full transformer: a 1024-wide ReLU input layer, three identical hidden
# blocks (width 512, ReLU, 8 heads each) and a 1024-wide softmax output layer.
test_transformer = bt.transformer(
    input_layer_shape=1024,
    input_layer_activation='relu',
    hidden_layer_shapes=[512] * 3,
    hidden_layer_activations=['relu'] * 3,
    hidden_layer_num_heads=[8] * 3,
    output_shape=1024,
    output_layer_activation='softmax',
)

# Print the input shape followed by the shape the forward pass returns.
print(test_embedding.shape)
print(test_transformer.forward_pass(test_embedding).shape)

(9, 1024)
(9, 1024)


In [60]:
# constants
START_TOKEN = '<START>'  # sentinel placed at the front of every tokenized review
END_TOKEN = '<END>'      # sentinel placed at the end of every tokenized review
NUM_SAMPLES = 10000      # number of training reviews kept by read_corpus
imdbDataset = load_dataset("stanfordnlp/imdb")

# The GloVe file lives at a machine-specific location. Set the
# GLOVE_EMBEDDINGS_PATH environment variable to point somewhere else; when it
# is unset the original path is used, so existing setups keep working.
embeddingsFilepath = os.environ.get(
    'GLOVE_EMBEDDINGS_PATH',
    '/Users/josep/Desktop/Self/Learning/NLP/RNN/data/glove.6B.300d.txt',
)

# helper functions
def read_corpus(dataset, num_samples=None, start_token=None, end_token=None):
    """Tokenize the leading training texts of ``dataset``.

    Each text is split on any whitespace, every piece is lower-cased and
    stripped of non-word characters, and pieces that end up empty (for example
    a lone "-" or "...") are dropped so they do not enter the vocabulary as an
    empty-string token. The surviving tokens are wrapped in the start and end
    sentinels.

    Args:
        dataset: mapping such that ``dataset["train"]["text"]`` is a sliceable
            sequence of strings.
        num_samples: how many texts to read from the front of the training
            split; defaults to the module-level ``NUM_SAMPLES``.
        start_token: token prepended to each text; defaults to ``START_TOKEN``.
        end_token: token appended to each text; defaults to ``END_TOKEN``.

    Returns:
        A list with one list of string tokens per text.
    """
    # Resolve defaults lazily so the module constants are only needed when the
    # caller does not supply explicit values.
    if num_samples is None:
        num_samples = NUM_SAMPLES
    if start_token is None:
        start_token = START_TOKEN
    if end_token is None:
        end_token = END_TOKEN

    texts = dataset["train"]["text"][:num_samples]
    corpus = []
    for text in texts:
        # split() with no argument splits on runs of any whitespace, so
        # repeated spaces, tabs and newlines never yield empty pieces.
        cleaned = (re.sub(r'[^\w]', '', piece.lower()) for piece in text.split())
        corpus.append([start_token] + [tok for tok in cleaned if tok] + [end_token])
    return corpus


def embedding_for_vocab(filepath, words, dimensions):
    """Build an embedding matrix for a vocabulary from a text embedding file.

    The file is read line by line; each non-blank line is expected to hold a
    word followed by whitespace-separated numeric components. Lines whose word
    is a key of ``words`` fill the matching row of the result; every other row
    stays all-zero.

    Args:
        filepath: path of the UTF-8 embedding file.
        words: dict mapping each vocabulary word to its row index; indices
            must be smaller than ``len(words)``.
        dimensions: number of leading vector components to keep per word.

    Returns:
        A float array of shape ``(len(words), dimensions)``.

    Raises:
        ValueError: if a vocabulary word's line carries fewer than
            ``dimensions`` components, or a component is not numeric.
    """
    embeddings = np.zeros((len(words), dimensions))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file): nothing
                # to unpack, so skip it instead of raising.
                continue
            word, vector = fields[0], fields[1:]
            if word not in words:
                continue
            if len(vector) < dimensions:
                raise ValueError(
                    f"embedding for {word!r} has {len(vector)} components, "
                    f"expected at least {dimensions}"
                )
            # Convert explicitly rather than relying on an implicit
            # string-to-float cast during assignment.
            embeddings[words[word]] = np.asarray(vector[:dimensions], dtype=float)
    return embeddings

# Tokenize the leading training reviews.
imdbCorpus = read_corpus(imdbDataset)

# Unique tokens across the corpus. Sorting (rather than taking the set's
# iteration order) keeps word ids identical across kernel restarts, because
# the iteration order of a set of strings depends on the per-process hash seed.
corpusWords = sorted({token for review in imdbCorpus for token in review})

# Real tokens get ids 1..len(corpusWords); id 0 is reserved for padding.
word2ind = {word: idx for idx, word in enumerate(corpusWords, start=1)}
word2ind['<PAD>'] = 0

# One 300-dimensional row per vocabulary entry; words missing from the
# embedding file keep an all-zero row.
embeddings = embedding_for_vocab(embeddingsFilepath, word2ind, 300)

In [14]:
# Display the activation stored on the model's input layer.
test_transformer.input_layer.activation

'relu'