In [4]:
import numpy as np
from datasets import load_dataset
import re

import Embeddings.positional_embedding as pe
import feed_forward as ff
import Attention.attention_block as ab
import Attention.attention_head as ah
import transformer_block as tb
import layer_norm as ln
import base_transformer as bt

%load_ext autoreload
%autoreload 2

### Component parts of the transformer model

In [144]:
# test embedding
test_embedding = np.random.normal(0, 1, size=(8,9,1024))

In [135]:
# positional embeddings
test_pos_embed = pe.positional_embedding(1000, 1000)
test_pos_embed.embeddings.std()

0.020021235620774045

In [None]:
# feed forward layer
test_layer = ff.neuron_layer(
      input_shape=512, output_shape=512
    , activation='relu', batch_size=8
    , clip_val=.0001, learning_rate=.001)

In [None]:
# attention head
test_head = ah.attention_head(512, 64)
test_head.masked_attention_score(test_embedding).shape

In [None]:
# attention block
test_block = ab.attention_block(num_heads=8, block_shape=512)
test_block.multi_head_attention(test_embedding).shape

(9, 512)

In [None]:
# layer norm
test_layer_norm = ln.layer_norm(512)
test_layer_norm.layer_norm(test_embedding).shape

(9, 512)

In [None]:
# transformer block
test_transformer_block = tb.transformer_block(
      num_heads=8, block_shape=512, activation='relu'
    , batch_size=8, clip_val=.0001, learning_rate=.001)
test_transformer_block.forward_pass(test_embedding).shape

NameError: name 'tb' is not defined

### Transformer model

In [5]:
# test embedding
test_embedding = np.random.normal(0, 1, size=(4,9,1024))

In [6]:
# constants
START_TOKEN = '<START>'
END_TOKEN = '<END>'
NUM_SAMPLES = 10000
imdbDataset = load_dataset("stanfordnlp/imdb")

embeddingsFilepath = '/Users/josep/Desktop/Self/Learning/NLP/RNN/data/glove.6B.300d.txt'

# helper functions
def read_corpus(dataset):
    files = dataset["train"]["text"][:NUM_SAMPLES]
    return [[START_TOKEN] + [re.sub(r'[^\w]', '', w.lower()) for w in f.split(" ")] + [END_TOKEN] for f in files]


def embedding_for_vocab(filepath, words, dimensions):
    vocab_size = len(words)
    embeddings = np.zeros((vocab_size, dimensions))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in words.keys():
                index = words[word]
                embeddings[index] = np.array(vector)[:dimensions]
    return embeddings

imdbCorpus = read_corpus(imdbDataset)

corpusWords = [y for x in imdbCorpus for y in x]
corpusWords = list(set(corpusWords))
word2ind={}
for i in range(len(corpusWords)+1):
    word2ind[corpusWords[i-1]] = i
word2ind['<PAD>'] = 0
embeddings = embedding_for_vocab(embeddingsFilepath, word2ind, 300)

In [7]:
# function to map words to indices
imdb_corpus_arrays = [np.array(text) for text in imdbCorpus]
def word_2_ind_func(x, word2ind):
    # for key, value in word2ind.items():
    #     print(value)
    #     texts[texts==key] = value
    return word2ind.get(x, x)
word_2_ind_func = np.vectorize(word_2_ind_func)#, otypes=[str])

# imdb_corpus_arrays_vectorized = word_2_ind_func(imdb_corpus_arrays)

In [8]:
# transformer model
test_transformer = bt.transformer(
      input_layer_shape=300, input_layer_activation='relu'
    , hidden_layer_shapes=[512,512,512], hidden_layer_activations=['relu', 'relu', 'relu']
    , hidden_layer_num_heads=[8, 8, 8]
    , output_shape=71083
    , output_layer_activation='softmax'
)
# test_transformer.input_layer.layer_weights.shape
# output = test_transformer.next_token_vocab_index(test_embedding)
# output

In [113]:
# batching, mapping
batch_size = 3
x_batches = []
Y_batches = []

for text_num in range(0, len(imdbCorpus), batch_size):

    batch = imdb_corpus_arrays[text_num : text_num+batch_size]
    
    # finding max length in batch
    max_text_len = 0
    for text in batch:
        if len(text) > max_text_len:
            max_text_len = len(text)
    
    # padding
    padded_batch = []
    for text in batch:
        delta_from_max = max_text_len - len(text)
        if delta_from_max > 0:
            pad_array = np.full(delta_from_max, fill_value='<PAD>')
            text = np.concatenate((text, pad_array))
        padded_batch.append(text)
    
    # making array and converting to indexed
    batch_array = np.vstack(padded_batch)
    batch_array_ind = word_2_ind_func(batch_array, word2ind)


    # turning into embeddings for input
    x_batch_embeddings = embeddings[batch_array_ind]
    x_batches.append(x_batch_embeddings)

    # shifting indices and padding for output. the are indices, NOT embeddings
    pad_vals = np.zeros((x_batch_embeddings.shape[0], 1))
    Y_batch_ind = np.append(np.delete(batch_array_ind, 0, 1), pad_vals, axis=1)
    Y_batches.append(Y_batch_ind)
    

In [None]:
# print(Y_batches[0][0])

[60697. 65591. 60697. 38825. 52811. 57249. 63507. 58178.  9209. 44644.
 32832. 18114. 41956.  1978. 13343. 41438. 58114. 46867. 58114. 56049.
  1581.  4015. 38872. 33186. 60697. 19216. 20245. 13343.   685.  1581.
 58114. 56049. 62077.  3599.  3272.  2931. 21786. 58114.  3153. 17438.
 20555. 29032. 61431. 42833. 40704. 52417. 68107. 26832. 32832. 66607.
 56330. 29120. 60697. 50815. 10534. 20555. 46078. 61431. 70682. 62096.
 51851. 41956. 19709. 15346. 63402. 63026. 68107. 37033. 58760. 62383.
 59686. 39060. 51663. 68572. 50440. 20555. 30159. 38661. 22838. 10147.
 47749. 18118. 38872. 63363. 22838. 50440. 20555. 28533. 37131. 57683.
 20555. 26457.  2555.  4918. 32832. 55535.  6366. 17304. 41956. 29546.
 61117. 35724. 47749. 19014. 44743. 44140. 59344. 30323. 41956. 43399.
  6315.  4852. 12662. 44140. 38872. 41956. 28143. 48020. 38872. 67824.
 60069. 39425.  4852. 46215. 36200. 32832. 57651. 47749. 39499.  5707.
  6366. 44354. 22838. 60049. 38229. 27717. 37131. 62383. 34415. 23700.
  4852

In [112]:
for batch_num in range(1):
    print(batch_num)
    x_batch = x_batches[batch_num]
    Y_batch = Y_batches[batch_num]
    batch_output = test_transformer.forward_pass(x_batch, train=True)
    logits = batch_output[0]
    loss = batch_output[1]
    print(np.argmax(logits, axis=1))
    print(Y_batch)
    # print(.shape)

0
[[288 282 282 ... 288 288 284]
 [225 133 208 ... 289 288 289]
 [289 288 289 ...  70 288   4]]
[[6.0697e+04 6.5591e+04 6.0697e+04 3.8825e+04 5.2811e+04 5.7249e+04
  6.3507e+04 5.8178e+04 9.2090e+03 4.4644e+04 3.2832e+04 1.8114e+04
  4.1956e+04 1.9780e+03 1.3343e+04 4.1438e+04 5.8114e+04 4.6867e+04
  5.8114e+04 5.6049e+04 1.5810e+03 4.0150e+03 3.8872e+04 3.3186e+04
  6.0697e+04 1.9216e+04 2.0245e+04 1.3343e+04 6.8500e+02 1.5810e+03
  5.8114e+04 5.6049e+04 6.2077e+04 3.5990e+03 3.2720e+03 2.9310e+03
  2.1786e+04 5.8114e+04 3.1530e+03 1.7438e+04 2.0555e+04 2.9032e+04
  6.1431e+04 4.2833e+04 4.0704e+04 5.2417e+04 6.8107e+04 2.6832e+04
  3.2832e+04 6.6607e+04 5.6330e+04 2.9120e+04 6.0697e+04 5.0815e+04
  1.0534e+04 2.0555e+04 4.6078e+04 6.1431e+04 7.0682e+04 6.2096e+04
  5.1851e+04 4.1956e+04 1.9709e+04 1.5346e+04 6.3402e+04 6.3026e+04
  6.8107e+04 3.7033e+04 5.8760e+04 6.2383e+04 5.9686e+04 3.9060e+04
  5.1663e+04 6.8572e+04 5.0440e+04 2.0555e+04 3.0159e+04 3.8661e+04
  2.2838e+04 1.0147e

In [105]:
# print(np.sum(x_batches[1][0], axis=1)[:5])
# print(np.sum(Y_batches[1][0], axis=1)[:5])