In [None]:
#@title Imports
import os
import re


# SimpleTokenzierV1

In [None]:
# Optional directory listing, kept disabled:
# for item in os.listdir('.'): # '.' refers to the current directory
#     print(item)

# Display the current working directory (presumably to check where the relative
# path used when opening the text file will resolve -- confirm).
os.getcwd()

In [None]:
# Read the entire text file into memory as a single string.
with open('the-verdict.txt', mode='r', encoding='utf-8') as corpus_file:
  raw_txt = corpus_file.read()

# Number of characters that were loaded.
print(len(raw_txt))


In [None]:
# Whitespace handling is a trade-off:
#   - dropping whitespace tokens reduces memory and compute requirements;
#   - keeping them can matter for text whose structure is whitespace-sensitive,
#     such as Python indentation.
# Here whitespace-only pieces are dropped. The capturing group in the pattern makes
# re.split return the delimiters (punctuation, "--", whitespace) as pieces of their own.
preprossed = [
  stripped
  for stripped in (piece.strip() for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_txt))
  if stripped
]
print(preprossed[:10])
print(len(preprossed))

In [None]:
# Vocabulary: every unique token in sorted (code point) order, followed by the
# two special tokens defined below.

# Stand-in for any token that is not part of the vocabulary.
UNKNOWN_TOKEN = "<|unk|>"

# Marker placed between independent text sources so the model can tell
# where one source ends and the next begins.
END_OF_TEXT_TOKEN = "<|endoftext|>"

# Other tokenizers use further special tokens, for example:
#   [BOS] beginning of sequence
#   [EOS] end of sequence
#   [PAD] padding

sorted_unique_tokens = [*sorted(set(preprossed)), END_OF_TEXT_TOKEN, UNKNOWN_TOKEN]
print(len(sorted_unique_tokens))

In [None]:
# Token -> token id lookup; a token's id is its position in sorted_unique_tokens.
vocab = {token: token_id for token_id, token in enumerate(sorted_unique_tokens)}

# Disabled preview of the first entries:
# for i, item in enumerate(vocab.items()):
#   print(i, item)
#   if i > 20:
#     break

In [None]:
class SimpleTokenzierV1:
  """Word- and punctuation-level tokenizer backed by a fixed vocabulary.

  Text is split on punctuation, "--" and whitespace; whitespace-only pieces are
  dropped, and any piece missing from the vocabulary is replaced by the unknown
  token before lookup.

  Args:
    vocab: dict mapping token string -> integer token id.
    unknown_token: token substituted for pieces not present in ``vocab``.
      Kept as an argument (instead of reading a module-level constant) so the
      class works on its own.
  """

  # The capturing group makes re.split keep the delimiters as separate pieces.
  _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
  # Whitespace that " ".join() put in front of these punctuation marks.
  _SPACE_BEFORE_PUNCTUATION = re.compile(r'\s+([,.?!"()\'])')

  def __init__(self, vocab, unknown_token="<|unk|>"):
    self.token_to_id = vocab
    self.id_to_token = {token_id: token for token, token_id in vocab.items()}
    self.unknown_token = unknown_token

  def encode(self, text):
    """Convert ``text`` into a list of token ids.

    Raises:
      KeyError: if a piece is not in the vocabulary and the unknown token is
        not in the vocabulary either.
    """
    lookup = self.token_to_id
    pieces = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
    return [
      lookup[piece if piece in lookup else self.unknown_token]
      for piece in pieces
      if piece
    ]

  def decode(self, ids):
    """Convert token ids back into text.

    Tokens are joined with single spaces, then the space in front of
    , . ? ! " ( ) ' is removed. The original spacing is not recovered exactly.

    Raises:
      KeyError: if an id is not in the vocabulary.
    """
    text = " ".join(self.id_to_token[token_id] for token_id in ids)
    return self._SPACE_BEFORE_PUNCTUATION.sub(r'\1', text)

In [None]:
# Tokenizer instance that uses the token -> id mapping held in `vocab`.
tokenizer_v1 = SimpleTokenzierV1(vocab)

In [None]:
# Two independent sample texts, joined with the end-of-text marker between them.
text1 = "how are you, jessica!"
text2 = "do you like tea?"
test_ids = tokenizer_v1.encode(" <|endoftext|> ".join([text1, text2]))

# Round trip: the token ids first, then the text rebuilt from those ids.
print(test_ids)
print(tokenizer_v1.decode(test_ids))

# BPE Tokenizer

In [None]:
import tiktoken


# BPE note: the space immediately preceding a word and the word itself are
# encoded together as a single token.

# Same string as the END_OF_TEXT_TOKEN constant assigned in the vocabulary cell;
# it is assigned again here.
END_OF_TEXT_TOKEN = "<|endoftext|>"

tokenizer_bpe = tiktoken.get_encoding('gpt2') # download pre-trained vocabulary and merge rules

# Disabled example: encode texts joined by the end-of-text marker, then decode them again.
# texts = ["", "I'm", "I'm"]
# test_ids = tokenizer_bpe.encode(END_OF_TEXT_TOKEN.join(texts), allowed_special={END_OF_TEXT_TOKEN})
# print(test_ids)
# print(tokenizer_bpe.decode(test_ids))

# Dataset and DataLoader


In [None]:
from torch.utils.data import Dataset, DataLoader

class DatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-id tensors.

  The text is tokenized once. Each sample is a window of ``max_length`` token ids,
  and its target is the same window shifted one position to the right.

  Args:
    txt: text to tokenize.
    tokenizer: object whose ``encode(txt, allowed_special=...)`` returns token ids.
    max_length: number of token ids per window.
    stride: distance between the starts of consecutive windows.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    self.input_ids = []
    self.target_ids = []

    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    # Window starts stop early enough that the shifted target window is still full length.
    window_starts = range(0, len(token_ids) - max_length, stride)
    for start in window_starts:
      end = start + max_length
      # Each pair carries max_length training targets.
      self.input_ids.append(torch.tensor(token_ids[start:end]))
      self.target_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

  def __len__(self):
    """Number of (input, target) pairs."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the (input, target) tensor pair at position ``idx``."""
    return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=False, drop_last=True, num_workers=0):
  """Build a DataLoader over sliding-window samples of ``txt``, tokenized with GPT-2 BPE.

  Args:
    txt: text to tokenize and window.
    batch_size: number of (input, target) tensor pairs returned per iteration, i.e.
      how much data the model processes before a parameter update. Smaller batches
      need less memory but give noisier updates; larger batches give less noisy
      updates but take more time.
    max_length: context length (the sliding-window size).
    stride: step between consecutive windows. A stride smaller than max_length makes
      windows overlap (the original note associates overlap with overfitting); a
      larger stride also moves through the text faster.
    shuffle: forwarded to DataLoader.
    drop_last: drop the final batch when it is shorter than batch_size, to avoid
      a loss spike during training.
    num_workers: number of DataLoader worker processes used to load data in parallel.

  Returns:
    A DataLoader over a DatasetV1 built from ``txt``.
  """
  loader_options = dict(
    batch_size=batch_size,
    shuffle=shuffle,
    drop_last=drop_last,
    num_workers=num_workers,
  )
  windows = DatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
  return DataLoader(windows, **loader_options)


In [None]:
# dataloader = create_dataloader_v1(raw_txt)
# data_iter = iter(dataloader)
# first_batch = next(data_iter)
# print(first_batch)

# Token Embeddings and Position Embeddings

In [None]:
# import gensim.downloader as api

# # 300 dimension
# # huggingface.co/fse/word2vec-google-news-300
# # Download the vector
# word_vectors = api.load("word2vec-google-news-300")


In [None]:
# print(word_vectors['computer'])
# print(word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=10))
# print(word_vectors.similarity(['woman', 'man']))
# print(word_vectors.similarity(['tokyo', 'kyoto']))
# print(word_vectors.similarity(['fish', 'bicycle']))
# print(np.linalg.norm(word_vectors['women'] - word_vectors['man']))
# print(np.linalg.norm(word_vectors['snow'] - word_vectors['pixel']))

In [None]:
import torch

# Embedding layer configuration.
# The token ids looked up in the token embedding table come from the GPT-2 BPE
# tokenizer, whose vocabulary has 50257 entries (ids 0..50256; the last one is
# <|endoftext|>). The table needs one row per id: with fewer rows, looking up an id
# beyond the table raises IndexError.
vocab_size = 50257
embedding_size = 128
context_length = 256

# Fixed seed so the randomly initialized weights are reproducible.
torch.manual_seed(123)

# A simple lookup table that stores one embedding vector per token id.
# Initialized to random numbers.
embedding_layer = torch.nn.Embedding(vocab_size, embedding_size)

# Position embedding table: one vector per position in the context window.
pos_embedding_layer = torch.nn.Embedding(context_length, embedding_size)


# print(embedding_layer.weight)
# print(embedding_layer(torch.tensor([3])))

# input_ids = torch.tensor([2, 3, 5, 1])
# print(embedding_layer(input_ids))

In [None]:
# Take the first batch of (inputs, targets) token-id tensors.
# max_length is passed explicitly so the window size always matches the number of
# rows in the position embedding table, instead of relying on the function's
# default value happening to equal context_length.
dataloader = create_dataloader_v1(raw_txt, batch_size=4, max_length=context_length)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

token_embeddings = embedding_layer(inputs)  # batch_size x context_length x embedding_size
# print(token_embeddings.shape)
# print(inputs)
# print(torch.arange(0, context_length))
pos_embeddings = pos_embedding_layer(torch.arange(0, context_length))  # context_length x embedding_size

# PyTorch broadcasting adds the same position embeddings to every sequence in the batch.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings[0])

# Self Attention

In [None]:
qkv_length = 64

class SelfAttentionV1(torch.nn.Module):
  """Single-head scaled dot-product self-attention with explicit weight matrices.

  Args:
    embedding_size: size of the last dimension of the input embeddings.
    qkv_length: size of the query, key and value vectors.
  """

  def __init__(self, embedding_size, qkv_length):
    # Subclassing nn.Module registers the parameters and makes the instance callable.
    super().__init__()
    # Randomly initialized projections; requires_grad=False keeps them fixed.
    self.Wq = torch.nn.Parameter(torch.rand(embedding_size, qkv_length), requires_grad=False)
    self.Wk = torch.nn.Parameter(torch.rand(embedding_size, qkv_length), requires_grad=False)
    self.Wv = torch.nn.Parameter(torch.rand(embedding_size, qkv_length), requires_grad=False)

  def forward(self, input_embeddings):
    """Compute context vectors for ``input_embeddings``.

    Args:
      input_embeddings: tensor whose last dimension is ``embedding_size`` and whose
        second-to-last dimension indexes the sequence positions.

    Returns:
      Tensor with the same leading dimensions and a last dimension of ``qkv_length``.
    """
    Q = input_embeddings @ self.Wq
    K = input_embeddings @ self.Wk
    V = input_embeddings @ self.Wv

    # Diagnostic output kept from the original notebook cell.
    print(Q.shape, K.shape, V.shape)

    attention_scores = Q @ K.transpose(-1, -2)
    # Scale by the square root of this instance's key dimension rather than a
    # module-level global, which may differ from the constructor argument.
    attention_weights = torch.softmax(attention_scores / K.shape[-1] ** 0.5, dim=-1)
    context_vectors = attention_weights @ V

    print(context_vectors[0])
    return context_vectors


In [None]:
qkv_length = 64

# This definition reuses the name SelfAttentionV1 and therefore replaces the class
# defined in the previous cell once this cell has run.
class SelfAttentionV1(torch.nn.Module):
  """Single-head scaled dot-product self-attention using ``torch.nn.Linear`` projections.

  Args:
    embedding_size: size of the last dimension of the input embeddings.
    qkv_length: size of the query, key and value vectors.
    qkv_bias: whether the three projection layers include a bias term.
  """

  def __init__(self, embedding_size, qkv_length, qkv_bias=False):
    # Subclassing nn.Module registers the Linear layers' parameters and makes the
    # instance callable.
    super().__init__()
    self.Wq = torch.nn.Linear(embedding_size, qkv_length, bias=qkv_bias)
    self.Wk = torch.nn.Linear(embedding_size, qkv_length, bias=qkv_bias)
    self.Wv = torch.nn.Linear(embedding_size, qkv_length, bias=qkv_bias)

  def forward(self, input_embeddings):
    """Compute context vectors for ``input_embeddings``.

    Args:
      input_embeddings: tensor whose last dimension is ``embedding_size`` and whose
        second-to-last dimension indexes the sequence positions.

    Returns:
      Tensor with the same leading dimensions and a last dimension of ``qkv_length``.
    """
    Q = self.Wq(input_embeddings)
    K = self.Wk(input_embeddings)
    V = self.Wv(input_embeddings)

    # Diagnostic output kept from the original notebook cell.
    print(Q.shape, K.shape, V.shape)

    attention_scores = Q @ K.transpose(-1, -2)
    # Scale by the square root of this instance's key dimension rather than a
    # module-level global, which may differ from the constructor argument.
    attention_weights = torch.softmax(attention_scores / K.shape[-1] ** 0.5, dim=-1)
    context_vectors = attention_weights @ V

    print(context_vectors[0])
    return context_vectors
