In [1]:
from libs.TokenEmbedding import TokenEmbedding
from libs.CorpusDataset import CorpusDataset
from libs.MHA import MultiHeadAttention
import glob
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch


In [2]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
torch.set_default_device('cuda' if torch.cuda.is_available() else 'cpu')

# glob returns matches in a filesystem-dependent order; sorting makes the file
# order (and therefore the order the dataset sees them in) identical on every run.
text_files = sorted(glob.glob("data/text/*.txt"))
if not text_files:
    # The pattern is relative to the kernel's working directory; fail here with
    # a clear message instead of deep inside the dataset code.
    raise FileNotFoundError("No .txt files matched 'data/text/*.txt' (check the working directory).")

tokenizer = AutoTokenizer.from_pretrained("ikit-claw-nlp/toy-llm")

# window_size / step_length are passed straight to CorpusDataset.
# NOTE(review): presumably the window length in tokens and the stride between
# consecutive windows — confirm against libs/CorpusDataset.
corpus_dataset = CorpusDataset(text_files, tokenizer=tokenizer, window_size=1024, step_length=128)
data_loader = DataLoader(dataset=corpus_dataset, batch_size=8)

In [3]:
# Values that both layers must agree on, named once so they cannot drift apart.
SEQ_LENGTH = 1024
D_MODEL = 1024

# Token embedding layer built from the tokenizer.
embedding_layer = TokenEmbedding(
    tokenizer=tokenizer,
    seq_length=SEQ_LENGTH,
    d_model=D_MODEL,
)

# Masked multi-head attention layer.
# NOTE(review): n_heads * head_dim = 12 * 64 = 768, which differs from
# d_model = 1024 — confirm this is intentional for MultiHeadAttention.
mha = MultiHeadAttention(
    n_heads=12,
    head_dim=64,
    d_model=D_MODEL,
    seq_len=SEQ_LENGTH,
    dropout=0.5,
    use_mask=True,
)


In [4]:
# Run one forward pass on the first batch only, as a smoke test of the layers.
for x, y in data_loader:
    # Embed both tensors of the batch with the same embedding layer (x first, then y).
    input_token_embedding, output_token_embedding = embedding_layer(x), embedding_layer(y)

    # First argument comes from x; second and third both come from y.
    # NOTE(review): presumably (query, key, value) — confirm against
    # MultiHeadAttention.forward. If y holds the next-token targets for x,
    # using it as key/value exposes the labels to the attention layer; verify
    # this is intended rather than passing the x embedding three times.
    context_vec, attention_score = mha(
        input_token_embedding,
        output_token_embedding,
        output_token_embedding,
    )
    break  # a single batch is enough here

Loading the dataset data/text/article_1-1000.txt into memory...
Converting the dataset to token ids...
Conversion Complete. torch.Size([7540024]) Tokens in the corpus.


In [5]:
# Show the shapes of both outputs of the attention forward pass on one line.
print(f"{context_vec.shape} {attention_score.shape}")

torch.Size([8, 1024, 768]) torch.Size([8, 12, 1024, 1024])


In [9]:
# Display the full trailing 2-D slice of attention_score at index 0 on the
# first axis and index 3 on the second axis.
# NOTE(review): presumably batch element 0 and attention head 3, with the last
# two axes being (query position, key position) — confirm against
# MultiHeadAttention.
# NOTE(review): the layer was built with dropout=0.5; if it is still in
# training mode, some of the zeros shown may come from dropout rather than the
# mask — confirm, and switch to eval mode if the raw weights are wanted.
attention_score[0, 3, :, :]

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.5703, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.5531, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0013, 0.0017, 0.0000,  ..., 0.0026, 0.0000, 0.0000],
        [0.0027, 0.0015, 0.0008,  ..., 0.0045, 0.0000, 0.0030]],
       device='cuda:0', grad_fn=<SliceBackward0>)