In [None]:
#@title Imports
import os
import re

import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

#import gensim.downloader as api


# [X] SimpleTokenzierV1

In [None]:
# Debug helper (disabled): list the entries of the current directory.
# for item in os.listdir('.'): # '.' refers to the current directory
#     print(item)

# Display the current working directory; relative paths such as
# 'the-verdict.txt' (opened in the next cell) are resolved against it.
os.getcwd()

In [None]:
# Load the whole training text into memory as a single string.
with open('the-verdict.txt', encoding='utf-8') as f:
  raw_txt = f.read()

# Number of characters in the text.
print(len(raw_txt))


In [None]:
# Remove whitespace or not?
#   Removing whitespace reduces memory and computing requirements.
#   Whitespace can be useful for text that is sensitive to structure,
#   like Python indentation.
# Here whitespace is dropped: the text is split on punctuation, double dashes
# and whitespace (the capturing group keeps the delimiters as tokens), then
# every piece is stripped and empty pieces are discarded.
preprossed = [
    stripped
    for stripped in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_txt))
    if stripped
]
print(preprossed[:10])
print(len(preprossed))

In [None]:
# Vocabulary: all the unique tokens in alphabetical order, followed by the
# special tokens.

# Added between text sources.
# Allows the LLM to process and understand the data better.
END_OF_TEXT_TOKEN = "<|endoftext|>"

# Stands in for tokens that are not in the vocab.
UNKNOWN_TOKEN = "<|unk|>"

# The following special tokens are used by other types of tokenizers:
# [BOS]: beginning of sequence
# [EOS]: end of sequence
# [PAD]: padding

# The special tokens are appended after sorting, so they get the two largest ids.
sorted_unique_tokens = sorted(set(preprossed)) + [END_OF_TEXT_TOKEN, UNKNOWN_TOKEN]
print(len(sorted_unique_tokens))

In [None]:
# Map every token to its token id, i.e. its position in sorted_unique_tokens.
vocab = dict(zip(sorted_unique_tokens, range(len(sorted_unique_tokens))))

# for i, item in enumerate(vocab.items()):
#   print(i, item)
#   if i > 20:
#     break

In [None]:
class SimpleTokenzierV1:
  """Word/punctuation tokenizer backed by a fixed vocabulary.

  Text is split on punctuation, double dashes and whitespace. Tokens that are
  missing from the vocabulary are replaced by the unknown token.

  Args:
    vocab: dict mapping token string -> integer token id. It must contain
      `unknown_token`; otherwise encoding an out-of-vocabulary token raises
      KeyError.
    unknown_token: token substituted for out-of-vocabulary tokens. Defaults to
      "<|unk|>", so the class no longer depends on a module-level constant.
  """

  # Delimiters are captured so that punctuation becomes its own token.
  _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
  # Whitespace that " ".join() inserted in front of punctuation when decoding.
  _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?!"()\'])')

  def __init__(self, vocab, unknown_token="<|unk|>"):
    self.token_to_id = vocab
    self.id_to_token = {id:token for token,id in vocab.items()}
    self.unknown_token = unknown_token

  def encode(self, text):
    """Return the list of token ids for `text`.

    Raises:
      KeyError: if an out-of-vocabulary token is met and `unknown_token`
        itself is not part of the vocabulary.
    """
    stripped = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
    tokens = [t if t in self.token_to_id else self.unknown_token for t in stripped if t]
    return [self.token_to_id[t] for t in tokens]

  def decode(self, ids):
    """Return the text for `ids`: tokens joined by single spaces, with the
    space in front of punctuation removed.

    Raises:
      KeyError: if an id is not part of the vocabulary.
    """
    text = " ".join(self.id_to_token[id] for id in ids)
    return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [None]:
# Tokenizer instance backed by the `vocab` dict built in the cells above.
tokenizer_v1 = SimpleTokenzierV1(vocab)

In [None]:
# Round trip: encode two texts joined by the end-of-text marker, then decode
# the ids back. Words missing from the vocabulary come back as the unknown token.
text1 = "how are you, jessica!"
text2 = "do you like tea?"
test_ids = tokenizer_v1.encode(f"{text1} <|endoftext|> {text2}")
print(test_ids)
print(tokenizer_v1.decode(test_ids))

# [X] BPE Tokenizer

In [None]:
# Byte-pair encoding (BPE) with the GPT-2 vocabulary.
# The space immediately preceding a word and the word itself are encoded as a
# single token.

# Same value as the END_OF_TEXT_TOKEN defined in the vocabulary cell earlier;
# repeated here so that this section can be run without that cell.
END_OF_TEXT_TOKEN = "<|endoftext|>"

tokenizer_bpe = tiktoken.get_encoding('gpt2') # download pre-trained vocabulary and merge rules

# Disabled example: encode texts joined by the end-of-text token. The special
# token must be explicitly allowed via `allowed_special`.
# texts = ["", "I'm", "I'm"]
# test_ids = tokenizer_bpe.encode(END_OF_TEXT_TOKEN.join(texts), allowed_special={END_OF_TEXT_TOKEN})
# print(test_ids)
# print(tokenizer_bpe.decode(test_ids))

# Dataset and DataLoader


In [None]:
class DatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-id tensor pairs.

  The text is tokenized once. Every sample is a window of `max_length` token
  ids, and its target is the same window shifted one token to the right, so
  one pair contains `max_length` training targets.

  Args:
    txt: text to tokenize.
    tokenizer: object whose `encode(text, allowed_special=...)` returns a list
      of token ids.
    max_length: number of tokens per window.
    stride: distance between the starts of consecutive windows.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    # The upper bound leaves room for the extra token the shifted target needs.
    window_starts = range(0, len(token_ids) - max_length, stride)

    self.input_ids = [
        torch.tensor(token_ids[start:start + max_length])
        for start in window_starts
    ]
    self.target_ids = [
        torch.tensor(token_ids[start + 1:start + 1 + max_length])
        for start in window_starts
    ]

  def __len__(self):
    """Number of (input, target) pairs."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the (input_ids, target_ids) pair at position `idx`."""
    return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size, max_length, stride, shuffle=False, drop_last=True, num_workers=0):
  """Build a DataLoader of sliding-window (input, target) batches over `txt`.

  The text is tokenized with the GPT-2 BPE tokenizer and wrapped in a DatasetV1.

  Args:
    txt: raw text to tokenize.
    batch_size: number of tensor pairs each dataloader iteration returns, i.e.
      the amount of data the model processes before updating its parameters.
      Smaller batches need less memory but give noisier updates; larger
      batches give less noisy updates but take more time.
    max_length: the context length (the sliding-window size).
    stride: step between consecutive windows. Overlapping windows can
      encourage overfitting; a larger stride also goes through the text faster.
    shuffle: whether the DataLoader shuffles the samples.
    drop_last: drop the last batch if it is shorter than `batch_size`, to
      prevent loss spikes during training.
    num_workers: number of worker processes loading the data in parallel.

  Returns:
    A torch DataLoader yielding (input_ids, target_ids) batches.
  """
  return DataLoader(
      DatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride),
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )


In [None]:
with open('the-verdict.txt', encoding='utf-8') as f:
  raw_txt = f.read()

print(len(raw_txt))

# Non-overlapping windows (stride == max_length), two samples per batch.
dataloader = create_dataloader_v1(txt=raw_txt, batch_size=2, max_length=10, stride=10)
data_iter = iter(dataloader)
# First batch of (input, target) token-id tensors.
inputs, targets = next(data_iter)

print(inputs)

# [X] Token Embeddings and Position Embeddings

In [None]:
# # 300 dimension
# # huggingface.co/fse/word2vec-google-news-300
# # Download the vector
# word_vectors = api.load("word2vec-google-news-300")


In [None]:
# print(word_vectors['computer'])
# print(word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=10))
# print(word_vectors.similarity(['woman', 'man']))
# print(word_vectors.similarity(['tokyo', 'kyoto']))
# print(word_vectors.similarity(['fish', 'bicycle']))
# print(np.linalg.norm(word_vectors['women'] - word_vectors['man']))
# print(np.linalg.norm(word_vectors['snow'] - word_vectors['pixel']))

In [None]:

# Hyperparameters for the embedding and attention experiments below.
# vocab_size = 50000
# Width of each token / position embedding vector.
embedding_size = 8
# Sliding-window size passed as max_length (and stride) to the dataloader.
context_length = 4
# Number of samples per batch.
batch_size = 2
# Output width and head count, used later by the attention demos.
d_out = 15
num_heads = 3

# torch.manual_seed(123)

# # A simple lookup table that stores embedding of a fixed dictionary and size.
# # Initialized to random numbers.
# embedding_layer = torch.nn.Embedding(vocab_size, embedding_size)

# # Position embedding weight matrix
# pos_embedding_layer = torch.nn.Embedding(context_length, embedding_size)


# print(embedding_layer.weight)
# print(embedding_layer(torch.tensor([3])))

# input_ids = torch.tensor([2, 3, 5, 1])
# print(embedding_layer(input_ids))

In [None]:
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
  raw_txt = f.read()

print(len(raw_txt))

dataloader = create_dataloader_v1(raw_txt, batch_size=batch_size, max_length=context_length, stride=context_length)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

# The embedding layers are defined here because their only other definitions
# in the notebook are commented out; without them this cell fails with a
# NameError on a fresh kernel.
# vocab_size must cover every id produced by the GPT-2 BPE tokenizer; 50257 is
# the same value GPT_CONFIG_124M uses later in this notebook.
vocab_size = 50257
torch.manual_seed(123)

# Lookup tables initialized with random values: one row per token id ...
embedding_layer = torch.nn.Embedding(vocab_size, embedding_size)
# ... and one row per position inside the context window.
pos_embedding_layer = torch.nn.Embedding(context_length, embedding_size)

token_embeddings = embedding_layer(inputs) # batch_size x context_length x embedding_size
pos_embeddings = pos_embedding_layer(torch.arange(0, context_length)) # context_length x embedding_size
# Broadcasting adds the same position embeddings to every sample of the batch.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings)

# Self Attention

In [None]:
class SelfAttentionV1(torch.nn.Module):
  """Scaled dot-product self-attention with explicit weight matrices.

  Args:
    embedding_size: size of the last dimension of the input embeddings.
    d_out: size of the query/key/value vectors and of the context vectors.

  The three projection matrices are random and created with
  requires_grad=False, so they are not updated by training.
  """

  def __init__(self, embedding_size, d_out):
    super().__init__()
    # Stored so that forward() scales by this module's own d_out instead of
    # whatever global `d_out` happens to exist in the notebook.
    self.d_out = d_out
    self.Wq = torch.nn.Parameter(torch.rand(embedding_size, d_out), requires_grad=False)
    self.Wk = torch.nn.Parameter(torch.rand(embedding_size, d_out), requires_grad=False)
    self.Wv = torch.nn.Parameter(torch.rand(embedding_size, d_out), requires_grad=False)

  def forward(self, input_embeddings):
    """Return the context vectors for `input_embeddings`.

    The last input dimension must be `embedding_size`; the last output
    dimension is `d_out`. Shapes and the first context matrix are printed as
    debug output.
    """
    Q = input_embeddings @ self.Wq
    K = input_embeddings @ self.Wk
    V = input_embeddings @ self.Wv

    print(Q.shape, K.shape, V.shape)

    attention_scores = Q @ K.transpose(-1, -2)
    # Scale by sqrt(d_out) before the softmax over the key dimension.
    attention_weights = torch.softmax(attention_scores / self.d_out**0.5, dim = -1)
    context_vectors = attention_weights @ V

    print(context_vectors[0])
    return context_vectors


In [None]:
class SelfAttentionV2(torch.nn.Module):
  """Scaled dot-product self-attention using torch.nn.Linear projections.

  Args:
    embedding_size: size of the last dimension of the input embeddings.
    d_out: size of the query/key/value vectors and of the context vectors.
    qkv_bias: whether the query/key/value projections have a bias term.
  """

  def __init__(self, embedding_size, d_out, qkv_bias=False):
    super().__init__()
    # Stored so that forward() scales by this module's own d_out instead of
    # whatever global `d_out` happens to exist in the notebook.
    self.d_out = d_out
    self.Wq = torch.nn.Linear(embedding_size, d_out, bias=qkv_bias)
    self.Wk = torch.nn.Linear(embedding_size, d_out, bias=qkv_bias)
    self.Wv = torch.nn.Linear(embedding_size, d_out, bias=qkv_bias)

  def forward(self, input_embeddings):
    """Return the context vectors for `input_embeddings`.

    The last input dimension must be `embedding_size`; the last output
    dimension is `d_out`. Shapes and the first context matrix are printed as
    debug output.
    """
    Q = self.Wq(input_embeddings)
    K = self.Wk(input_embeddings)
    V = self.Wv(input_embeddings)

    print(Q.shape, K.shape, V.shape)

    attention_scores = Q @ K.transpose(-1, -2)
    # Scale by sqrt(d_out) before the softmax over the key dimension.
    attention_weights = torch.softmax(attention_scores / self.d_out**0.5, dim = -1)
    context_vectors = attention_weights @ V

    print(context_vectors[0])
    return context_vectors


In [None]:
class CausalAttentionV1(torch.nn.Module):
  """Single-head causal (masked) self-attention with dropout.

  Each position attends only to itself and to earlier positions.

  Args:
    embedding_size: size of the last dimension of the input embeddings.
    d_out: size of the query/key/value vectors and of the context vectors.
    context_length: maximum sequence length supported by the causal mask.
    drop_out: dropout probability applied to the attention weights.
    qkv_bias: whether the query/key/value projections have a bias term.
  """

  def __init__(self, embedding_size, d_out, context_length, drop_out=0.2, qkv_bias=False):
    super().__init__()
    self.d_out = d_out
    self.Wq = torch.nn.Linear(embedding_size, d_out, bias=qkv_bias)
    self.Wk = torch.nn.Linear(embedding_size, d_out, bias=qkv_bias)
    self.Wv = torch.nn.Linear(embedding_size, d_out, bias=qkv_bias)
    # With dropout rate p (training mode only):
    # * a fraction p of the elements in the matrix is switched to 0
    # * the remaining values are scaled by 1 / (1 - p), i.e. 1.25x for p = 0.2
    self.dropout = torch.nn.Dropout(drop_out)
    # Ones strictly above the diagonal mark the "future" key positions.
    # Registered as a buffer so that it follows the module across devices.
    self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

  def forward(self, input_embeddings):
    """Return causal context vectors for a 3-D batch of embeddings.

    Input is (batch, num_tokens, embedding_size) with num_tokens no larger
    than `context_length`; output is (batch, num_tokens, d_out). Shapes and
    the first context matrix are printed as debug output.
    """
    _, num_tokens, _ = input_embeddings.shape

    Q = self.Wq(input_embeddings)
    K = self.Wk(input_embeddings)
    V = self.Wv(input_embeddings)

    print(Q.shape, K.shape, V.shape)

    attention_scores = Q @ K.transpose(1, 2)

    # Mask future positions with -inf *before* the softmax so that they get
    # exactly zero weight. Masking after the softmax and renormalizing would
    # leak information, because the weights were already influenced by the
    # masked scores.
    # The mask is cropped because the current batch may be shorter than
    # context_length. Methods with a trailing underscore operate in place.
    attention_scores.masked_fill_(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)

    # Scale and softmax. Use this module's own d_out: the notebook-level
    # `d_out` can hold a different value than the one passed to __init__.
    attention_weights = torch.softmax(attention_scores / self.d_out**0.5, dim = -1)

    # Dropout prevents over-fitting and improves generalization performance.
    attention_weights = self.dropout(attention_weights)

    context_vectors = attention_weights @ V

    print(context_vectors[0])
    return context_vectors

In [None]:
# Settings shared by the attention demos that follow.
embedding_size, context_length, batch_size = 8, 10, 2
d_out, num_heads = 15, 3
qkv_size, seq_len = 5, 4
dropout_rate = 0.2

# Random stand-in for a batch of input embeddings.
input_embeddings = torch.rand((batch_size, seq_len, embedding_size))
ca = CausalAttentionV1(
    embedding_size,
    d_out=qkv_size,
    context_length=context_length,
    drop_out=dropout_rate,
)
context_vectors = ca(input_embeddings)

In [None]:
# Multi Head attention

class MultiHeadAttentionWrapper(torch.nn.Module):
  """Multi-head attention built from independent CausalAttentionV1 heads.

  Every head produces context vectors of width `d_out`; the head outputs are
  concatenated along the last dimension, so the output width is
  `num_heads * d_out`.
  """

  def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
    super().__init__()
    heads = []
    for _ in range(num_heads):
      heads.append(CausalAttentionV1(d_in, d_out, context_length, dropout, qkv_bias))
    self.heads = torch.nn.ModuleList(heads)

  def forward(self, input_embedding):
    """Run every head on the same input and concatenate the results."""
    # TODO: the heads run one after another; parallelize using weight splits.
    per_head_context = [head(input_embedding) for head in self.heads]
    return torch.cat(per_head_context, dim=-1)

In [None]:
multi_head_attention = MultiHeadAttentionWrapper(
    d_in=embedding_size,
    d_out=qkv_size,
    context_length=context_length,
    dropout=dropout_rate,
    num_heads=num_heads,
)
concat_context_vectors = multi_head_attention(input_embeddings)

# The last dimension is num_heads * qkv_size because the heads are concatenated.
print(concat_context_vectors.shape)
print(concat_context_vectors[0])

In [None]:
#@title Masked Multi Head Attention with Weight Split

class MultiHeadAttention(torch.nn.Module):
  """Causal multi-head self-attention using a single set of projections.

  Queries, keys and values are projected to `d_out` once and then split into
  `num_heads` heads of size `d_out // num_heads`, so all heads are computed
  with batched matrix multiplications.

  Args:
    embedding_size: size of the last dimension of the input embeddings.
    d_out: total output width (all heads combined).
    context_length: maximum sequence length supported by the causal mask.
    dropout: dropout probability applied to the attention weights.
    num_heads: number of attention heads; must divide `d_out`.
    qkv_bias: whether the query/key/value projections have a bias term.

  Raises:
    ValueError: if `d_out` is not divisible by `num_heads`.
  """

  def __init__(self, embedding_size, d_out, context_length, dropout, num_heads, qkv_bias=False):
    super().__init__()
    # Fail early with a clear message. Otherwise head_dim is silently
    # truncated and forward() fails later inside view() with a shape error.
    if d_out % num_heads != 0:
      raise ValueError(f"d_out ({d_out}) must be divisible by num_heads ({num_heads})")

    self.d_out = d_out
    self.num_heads = num_heads
    self.head_dim = d_out // num_heads

    # torch.nn.Linear is optimized for neural networks and behaves much
    # better for backward propagation than raw random matrices.
    self.Wq = torch.nn.Linear(embedding_size, d_out, bias=qkv_bias)
    self.Wk = torch.nn.Linear(embedding_size, d_out, bias=qkv_bias)
    self.Wv = torch.nn.Linear(embedding_size, d_out, bias=qkv_bias)

    # To combine head outputs
    self.out_proj = torch.nn.Linear(d_out, d_out)

    self.dropout = torch.nn.Dropout(dropout)
    # Ones strictly above the diagonal mark the "future" key positions.
    self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

  def forward(self, input_embeddings):
    """Return context vectors of shape (batch, num_tokens, d_out).

    Input is (batch, num_tokens, embedding_size) with num_tokens no larger
    than `context_length`.
    """
    batch_size, context_length, _ = input_embeddings.shape

    Q = self.Wq(input_embeddings)
    K = self.Wk(input_embeddings)
    V = self.Wv(input_embeddings)

    # Weight split: unroll the last dimension into (num_heads, head_dim).
    # (batch_size, context_length, d_out) -> (batch_size, context_length, num_heads, head_dim)
    Q = Q.view(batch_size, context_length, self.num_heads, self.head_dim)
    K = K.view(batch_size, context_length, self.num_heads, self.head_dim)
    V = V.view(batch_size, context_length, self.num_heads, self.head_dim)
    # Group the matrices by head:
    # (batch_size, context_length, num_heads, head_dim) ->
    # (batch_size, num_heads, context_length, head_dim)
    Q = Q.transpose(1, 2)
    K = K.transpose(1, 2)
    V = V.transpose(1, 2)

    # (batch_size, num_heads, context_length, context_length)
    attention_scores = Q @ K.transpose(2, 3)

    # Causal mask, cropped to the current sequence length and broadcast over
    # the batch and head dimensions.
    attention_scores.masked_fill_(
        self.mask.bool()[:context_length, :context_length], -torch.inf
    )

    # scale and softmax
    attention_weights = torch.softmax(attention_scores / self.head_dim**0.5, dim=-1)

    # drop out, to avoid overfitting
    attention_weights = self.dropout(attention_weights)

    # (batch_size, num_heads, context_length, head_dim)
    context_vectors = attention_weights @ V

    # (batch_size, context_length, num_heads, head_dim)
    context_vectors = context_vectors.transpose(1, 2)
    # Merge the heads back: (batch_size, context_length, d_out).
    # contiguous() is required because view() needs contiguous memory after
    # the transpose.
    context_vectors = context_vectors.contiguous().view(batch_size, context_length, self.d_out)

    context_vectors = self.out_proj(context_vectors)
    return context_vectors



In [None]:
mha = MultiHeadAttention(embedding_size, d_out, context_length, 0.1, num_heads)
context_vectors = mha(input_embeddings)

print(context_vectors.shape)
print(context_vectors)

# GPT-2

In [None]:
# Model hyperparameters read by GPTModel, TransformerBlock and FeedForward.
GPT_CONFIG_124M = {
    # Number of rows of the token embedding and width of the output logits.
    "vocab_size": 50257,
    # Number of position embeddings and size of the causal mask
    # (reduced here; the alternative value noted by the author is 1024).
    "context_length": 256, #1024,
    # Width of the token/position embeddings and of every transformer block.
    "emb_dim": 768,
    # Attention heads per transformer block.
    "n_heads": 12,
    # Number of stacked transformer blocks.
    "n_layers": 12,
    # Dropout probability used for embeddings, attention weights and shortcuts.
    "drop_rate": 0.1,
    # Whether the query/key/value projections have a bias term.
    "qkv_bias": False,
}

class GPTModel(torch.nn.Module):
  """GPT-style language model: embeddings, transformer blocks, output head.

  Args:
    cfg: dict with the keys 'vocab_size', 'context_length', 'emb_dim',
      'n_layers' and 'drop_rate', plus the keys read by TransformerBlock.
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim = cfg['emb_dim']
    vocab_size = cfg['vocab_size']

    self.tok_emb = torch.nn.Embedding(vocab_size, emb_dim)
    self.pos_emb = torch.nn.Embedding(cfg['context_length'], emb_dim)
    self.dropout_layer = torch.nn.Dropout(cfg['drop_rate'])

    blocks = [TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
    self.trf_blocks = torch.nn.Sequential(*blocks)
    self.final_norm = LayerNorm(emb_dim)
    # Projects every position back to vocabulary-sized logits (no bias).
    self.out_head = torch.nn.Linear(emb_dim, vocab_size, bias=False)

  def forward(self, in_idx):
    """Map token ids of shape (batch, seq_len) to logits of shape
    (batch, seq_len, vocab_size)."""
    _, seq_len = in_idx.shape
    # Positions are created on the same device as the token ids.
    positions = torch.arange(0, seq_len, device=in_idx.device)
    hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
    hidden = self.dropout_layer(hidden)
    hidden = self.trf_blocks(hidden)
    hidden = self.final_norm(hidden)
    return self.out_head(hidden)


class TransformerBlock(torch.nn.Module):
  """Pre-norm transformer block: attention and feed-forward sub-layers, each
  followed by dropout and wrapped in a shortcut (residual) connection.

  Args:
    cfg: dict with the keys 'emb_dim', 'context_length', 'drop_rate',
      'n_heads' and 'qkv_bias'.
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim = cfg['emb_dim']
    self.norm1 = LayerNorm(emb_dim)
    self.norm2 = LayerNorm(emb_dim)
    # Input and output width are both emb_dim, so the shortcut can be added.
    self.attention = MultiHeadAttention(
        embedding_size=emb_dim,
        d_out=emb_dim,
        context_length=cfg['context_length'],
        dropout=cfg['drop_rate'],
        num_heads=cfg['n_heads'],
        qkv_bias=cfg['qkv_bias'],
    )
    self.ffn = FeedForward(cfg)
    self.dropout = torch.nn.Dropout(cfg['drop_rate'])

  def forward(self, x):
    """Apply both sub-layers; the output has the same shape as `x`."""
    # Attention sub-layer with shortcut connection.
    attended = self.dropout(self.attention(self.norm1(x)))
    x = attended + x

    # Feed-forward sub-layer with shortcut connection.
    transformed = self.dropout(self.ffn(self.norm2(x)))
    return transformed + x

class LayerNorm(torch.nn.Module):
  """Layer normalization over the last dimension with a learnable scale and shift.

  Args:
    emb_dim: size of the last dimension of the inputs.
  """

  def __init__(self, emb_dim):
    super().__init__()
    # Small constant added to the variance to avoid a division by zero.
    self.eps = 1e-5
    # Trainable parameters: they let the model learn the scaling and shifting
    # that best suit the data. Initialized to the identity transform.
    self.scale = torch.nn.Parameter(torch.ones(emb_dim))
    self.shift = torch.nn.Parameter(torch.zeros(emb_dim))

  def forward(self, input_embeddings):
    """Normalize the last dimension to zero mean and unit variance, then
    apply the learned scale and shift."""
    centered = input_embeddings - input_embeddings.mean(dim=-1, keepdim=True)
    # unbiased=False: population variance, i.e. no Bessel's correction.
    variance = input_embeddings.var(dim=-1, keepdim=True, unbiased=False)
    normalized = centered / torch.sqrt(variance + self.eps)
    return self.scale * normalized + self.shift

class GELU(torch.nn.Module):
  """GELU activation, computed with the tanh approximation:

  0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
  """

  def __init__(self):
    super().__init__()

  def forward(self, input_embeddings):
    """Apply the activation element-wise; the shape is unchanged."""
    x = input_embeddings
    sqrt_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
    inner = sqrt_two_over_pi * (x + 0.044715 * torch.pow(x, 3))
    return 0.5 * x * (1 + torch.tanh(inner))

class FeedForward(torch.nn.Module):
  """Position-wise feed-forward network: expand to 4x the embedding width,
  apply GELU, and contract back to the embedding width.

  Args:
    cfg: dict with the key 'emb_dim'.
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim = cfg['emb_dim']
    hidden_dim = 4 * emb_dim
    self.layers = torch.nn.Sequential(
        torch.nn.Linear(emb_dim, hidden_dim),  # expansion
        GELU(),                                # activation
        torch.nn.Linear(hidden_dim, emb_dim),  # contraction
    )

  def forward(self, input_embeddings):
    """Return a tensor with the same last dimension as the input."""
    return self.layers(input_embeddings)


In [None]:
# Smoke test: run a forward pass through a freshly initialized model.
# NOTE: `inputs` is the token-id batch left over from an earlier dataloader
# cell, so this cell depends on that cell having been run.
gpt = GPTModel(GPT_CONFIG_124M)
outputs = gpt(inputs)
# Logits: (batch_size, seq_len, vocab_size).
print(outputs.shape)

In [None]:
# Test: a transformer block keeps the shape of its input.
trf = TransformerBlock(GPT_CONFIG_124M)
# The seed is set after the block is built, so it only fixes the example batch.
torch.manual_seed(123)
# Random stand-in for embedded input: (batch=2, seq_len=5, emb_dim).
batch_example = torch.rand(2, 5, GPT_CONFIG_124M['emb_dim'])
outputs = trf(batch_example)
print(outputs.shape)

In [None]:
# Test: number of parameters
# Total number of scalar values over all parameter tensors of the model.
total_params = sum(map(torch.Tensor.numel, gpt.parameters()))
print(total_params)

# Parameters of the output head alone.
output_layer_params = sum(map(torch.Tensor.numel, gpt.out_head.parameters()))
print(output_layer_params)

In [None]:
# Test: memory footprint of the parameters.
# 4 bytes per parameter — assumes 32-bit floats (the torch default); confirm
# if the model dtype is changed.
total_size_bytes = 4 * total_params
total_size_mb = total_size_bytes / 1024 ** 2
print(f'{total_size_mb:.2f} MB')

In [None]:
# Test: layer normalization example
torch.set_printoptions(sci_mode=False)
torch.manual_seed(123)
batch_example = torch.randn(2, 5)
layer = torch.nn.Sequential(torch.nn.Linear(5, 6), torch.nn.ReLU())
out = layer(batch_example)
print(out)

# Statistics of the raw layer output, per sample (last dimension).
mean = torch.mean(out, dim=-1, keepdim=True)
var = torch.var(out, dim=-1, keepdim=True, unbiased=False)
print(mean, "\n", var)

# Normalize by hand (no epsilon here, unlike the LayerNorm module).
out_norm = (out - mean) / var.sqrt()

# After normalization the mean should be ~0 and the variance ~1.
mean = torch.mean(out_norm, dim=-1, keepdim=True)
var = torch.var(out_norm, dim=-1, keepdim=True, unbiased=False)
print(mean, "\n", var)

In [None]:
# Test: Shortcut Connection and Vanishing Gradient
class ExampleDeepNeuralNetwork(torch.nn.Module):
  """Five Linear+GELU layers with optional shortcut connections.

  Args:
    layer_sizes: sequence of at least six sizes; layer i maps
      layer_sizes[i] -> layer_sizes[i + 1] for i in 0..4.
    use_shortcut: add a layer's input to its output whenever both have the
      same shape.
  """

  def __init__(self, layer_sizes, use_shortcut):
    super().__init__()
    self.use_shortcut = use_shortcut
    self.layers = torch.nn.ModuleList([
        torch.nn.Sequential(torch.nn.Linear(layer_sizes[i], layer_sizes[i + 1]), GELU())
        for i in range(5)
    ])

  def forward(self, input_embeddings):
    """Run the input through all layers and return the last layer's output."""
    x = input_embeddings
    for layer in self.layers:
      layer_output = layer(x)
      # A shortcut is only possible when input and output shapes match.
      add_shortcut = self.use_shortcut and x.shape == layer_output.shape
      x = x + layer_output if add_shortcut else layer_output
    return x


# Five layers: four of width 3 -> 3 and a final 3 -> 1 layer.
layer_size = [3, 3, 3, 3, 3, 1]
# A single sample with three features.
sample_input = torch.tensor([[1., 0., -1]])
# The seed is reset before each construction so that both models start from
# the same random initial weights and differ only in the shortcut setting.
torch.manual_seed(123)
model_without_shortcut = ExampleDeepNeuralNetwork(layer_size, False)
torch.manual_seed(123)
model_with_shortcut = ExampleDeepNeuralNetwork(layer_size, True)


def print_gradients(model, x):
  """Print the mean absolute gradient of every weight tensor of `model`.

  Runs one forward and backward pass with an MSE loss against a target of 0.

  Args:
    model: torch module mapping `x` to an output compatible with a (1, 1) target.
    x: input tensor passed to the model.

  Side effects:
    Resets the gradients of `model`, then populates them via backward().
  """
  # .grad accumulates across backward passes. Without this reset, a second
  # call on the same model would report the sum of both passes.
  model.zero_grad()

  output = model(x)
  target = torch.tensor([[0.]])

  loss_fn = torch.nn.MSELoss()
  loss = loss_fn(output, target)

  loss.backward()

  for name, param in model.named_parameters():
    if 'weight' in name:
      print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

# Compare per-layer gradient magnitudes of the two models on the same input.
print("No shortcut connection")
print_gradients(model_without_shortcut, sample_input)

print("\nWith shortcut connection")
print_gradients(model_with_shortcut, sample_input)

# Generate Text

In [None]:
def generate_text(model, input_idx, max_new_tokens, context_size):
  """Greedily extend `input_idx` by `max_new_tokens` tokens.

  Args:
    model: callable mapping token ids (batch_size, n_tokens) to logits
      (batch_size, n_tokens, vocab_size).
    input_idx: tensor of token ids, shape (batch_size, n_tokens).
    max_new_tokens: number of tokens to append.
    context_size: maximum number of trailing tokens fed to the model.

  Returns:
    Tensor of shape (batch_size, n_tokens + max_new_tokens).
  """
  generated = input_idx

  for _ in range(max_new_tokens):
    # Only the last context_size tokens fit into the model's context.
    window = generated[:, -context_size:]

    # No gradients are needed for inference.
    with torch.no_grad():
      all_logits = model(window)

    # Only the logits of the last position predict the next token.
    last_logits = all_logits[:, -1, :]

    # Probabilities over the vocabulary: (batch_size, vocab_size).
    probabilities = torch.softmax(last_logits, dim=-1)

    # Greedy choice: the most likely token, shape (batch_size, 1).
    next_token = probabilities.argmax(dim=-1, keepdim=True)

    # Append the prediction: (batch_size, n_tokens + 1).
    generated = torch.cat((generated, next_token), dim=1)

  return generated

In [None]:
import tiktoken

# Disable dropout since we are not training the model
gpt.eval()

tokenizer = tiktoken.get_encoding("gpt2")
with open('the-verdict.txt', encoding='utf-8') as f:
  raw_txt = f.read()

# Short prompts: 5 tokens per sample, two samples per batch.
dataloader = create_dataloader_v1(txt=raw_txt, batch_size=2, max_length=5, stride=10)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print('Input:')
print(inputs.shape)
print(tokenizer.decode(inputs[0].squeeze(0).tolist()))

# Append 10 greedily generated tokens to every prompt of the batch.
outputs = generate_text(
    model=gpt,
    input_idx=inputs,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M['context_length'],
)

print('\nOutput:')
print(outputs.shape)
print(tokenizer.decode(outputs[0].squeeze(0).tolist()))

# Training

In [None]:
# Cross Entropy and Perplexity

def cross_entropy_loss(logits, targets):
  """Mean cross-entropy between `logits` and the integer `targets`.

  The first two dimensions of `logits` and all dimensions of `targets` are
  merged so that every position is treated as an independent sample.
  """
  return torch.nn.functional.cross_entropy(
      torch.flatten(logits, 0, 1),
      torch.flatten(targets),
  )

def cross_entropy_loss_v1(logits, targets):
  """Cross-entropy computed step by step, for illustration.

  Takes the softmax probability assigned to every target token, prints the
  probabilities of each sample, and returns the negative mean of their logs.

  Args:
    logits: tensor of shape (batch_size, seq_len, vocab_size).
    targets: integer tensor of shape (batch_size, seq_len).
  """
  batch_size, seq_len, _ = logits.shape
  probas = torch.softmax(logits, dim=-1)
  positions = list(range(seq_len))

  target_probas = []
  for sample in range(batch_size):
    # Probability of the correct token at every position of this sample.
    sample_probas = probas[sample][positions, targets[sample]]
    print(sample_probas)
    target_probas.append(sample_probas)

  return -torch.cat(target_probas).log().mean()

# Disable dropout since we are not training the model
# gpt.eval()

# data_iter = iter(dataloader)
# inputs, targets = next(data_iter)
# print(inputs)
# print(targets)

# with torch.no_grad():
#   logits = gpt(inputs)

# loss = cross_entropy_loss(logits, targets)
# print("loss: ", loss)

# perplexity = torch.exp(loss)
# print("perplexity: ", perplexity)

In [None]:
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
  raw_txt = f.read()

print("Characters: ", len(raw_txt))
print("Tokens: ", len(tokenizer.encode(raw_txt)))

# Split by characters: the first 90% is used for training, the rest for
# validation.
train_ratio = 0.9
split_idx = int(train_ratio * len(raw_txt))
train_data = raw_txt[:split_idx]
val_data = raw_txt[split_idx:]

torch.manual_seed(123)

# Training: shuffled, and the last incomplete batch is dropped to avoid loss
# spikes caused by a smaller batch.
train_dataloader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M['context_length'],
    stride=GPT_CONFIG_124M['context_length'],
    drop_last=True,
    shuffle=True,
    num_workers=0)

# Validation: evaluation must be deterministic and use all held-out samples,
# so the loader neither shuffles nor drops the last (possibly smaller) batch.
# With such a small validation split, drop_last=True could discard a large
# share of the data or even all of it.
val_dataloader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M['context_length'],
    stride=GPT_CONFIG_124M['context_length'],
    drop_last=False,
    shuffle=False,
    num_workers=0)

# Sanity check: every batch is (batch_size, context_length); only the last
# validation batch may be smaller.
print('Train dataloader:')
for x, y in train_dataloader:
  print(x.shape, y.shape)
print('\nValidation dataloader:')
for x, y in val_dataloader:
  print(x.shape, y.shape)

# Fresh, reproducibly initialized model for training.
torch.manual_seed(123)
gpt = GPTModel(GPT_CONFIG_124M)

def calculate_loss_batch(input_batch, target_batch, model, device):
  """Cross-entropy loss of `model` on a single batch.

  Args:
    input_batch: token ids, shape (batch_size, seq_len).
    target_batch: target token ids, same shape as `input_batch`.
    model: callable returning logits of shape (batch_size, seq_len, vocab_size).
    device: device both batches are moved to before the forward pass.

  Returns:
    Scalar loss tensor averaged over all positions of the batch.
  """
  inputs_on_device = input_batch.to(device)
  targets_on_device = target_batch.to(device)
  logits = model(inputs_on_device)
  # Merge batch and sequence dimensions: every position is one sample.
  return torch.nn.functional.cross_entropy(
      torch.flatten(logits, 0, 1),
      torch.flatten(targets_on_device),
  )

def calculate_loss_loader(data_loader, model, device, num_batches=None):
  """Average per-batch loss of `model` over (a prefix of) `data_loader`.

  Args:
    data_loader: iterable of (input_batch, target_batch) pairs with a length.
    model: model passed on to calculate_loss_batch.
    device: device passed on to calculate_loss_batch.
    num_batches: number of batches to evaluate; None means all of them. The
      value is capped at len(data_loader).

  Returns:
    float('nan') for an empty loader, otherwise the sum of the batch losses
    divided by the number of evaluated batches.
  """
  available = len(data_loader)
  if available == 0:
    return float('nan')
  num_batches = available if num_batches is None else min(num_batches, available)

  total_loss = 0
  for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
    if batch_idx >= num_batches:
      break
    total_loss += calculate_loss_batch(input_batch, target_batch, model, device)
  return total_loss / num_batches


In [None]:
# Pick the best available device: CUDA first, then Apple MPS, then CPU.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

gpt.to(device)
torch.manual_seed(123)

# Losses of the untrained model. Gradient tracking is disabled for efficiency
# because we are not training yet.
with torch.no_grad():
  train_loss = calculate_loss_loader(train_dataloader, gpt, device)
  val_loss = calculate_loss_loader(val_dataloader, gpt, device)
print(f"Training loss: {train_loss}, Validation loss: {val_loss}")