In [1]:
# Read the entire training corpus into memory as a single string.
with open("./data/tinyshakespeare.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
print(len(text))

1115394


In [2]:
# We start with character-level generation (no sub-word tokens yet),
# so the vocabulary is simply the set of distinct characters in the corpus.
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars))
print(f"Vocab size: {vocab_size}")

# naive character-level tokenizer: a character's id is its index in `chars`
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(input: str) -> list:
    """Map every character of `input` to its integer id via `stoi`.

    Returns the ids as a list in the same order as the characters.
    Raises KeyError if a character is not a key of `stoi`.
    """
    return list(map(stoi.__getitem__, input))


def decode(input: list[int]) -> str:
    """Turn a list of integer ids back into a string by looking each one up in `itos`.

    Raises KeyError if an id is not a key of `itos`.
    """
    pieces = []
    for token_id in input:
        pieces.append(itos[token_id])
    return "".join(pieces)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab size: 65


In [3]:
# Encode the full corpus as one tensor of integer token ids.
import torch

data = torch.tensor(encode(text), dtype=torch.long)
print(f"{data.shape} {data.dtype}")

torch.Size([1115394]) torch.int64


In [4]:
# First 90% of the tokens are used for training, the remaining tail for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [5]:
# Training data is consumed in batches of fixed-length chunks.
# block size (context size) - the number of tokens in one input chunk
# one chunk packs several training examples; for the input chunk "abcd1234":
# - given "a" -> predict "b"
# - "ab" -> "c"
# - "abc" -> "d"
# etc
# so if X = "abcd1234", then Y = "bcd12345" (X shifted by one position;
# the trailing "5" is the character that follows the chunk in the data)
# batch size - how many such chunks are sampled per batch

torch.manual_seed(1337)
block_size, batch_size = 8, 4


def get_batch(data: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Samples `batch_size` random windows from `data` and returns them as (x, y).

    Each row of `x` holds `block_size` consecutive elements of `data`;
    the matching row of `y` holds the same window shifted one position
    to the right, i.e. the next-token targets for `x`.
    Uses the module-level `block_size` and `batch_size`.
    """
    # randint's upper bound is exclusive, so start + block_size + 1 <= len(data)
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()
    inputs, targets = [], []
    for start in starts:
        # one extra element so inputs and targets come from the same window
        window = data[start : start + block_size + 1]
        inputs.append(window[:-1])
        targets.append(window[1:])
    return torch.stack(inputs), torch.stack(targets)


# draw one sample batch from the training split and show inputs, then targets
x_train, y_train = get_batch(train_data)
print(x_train, y_train, sep="\n")

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [7]:
# Bigram language model: given one token, it predicts the next token
from bigram import BigramLM


model = BigramLM(vocab_size=vocab_size)
# single forward pass over the sample batch to inspect the logits shape and the initial loss
logits, loss = model(x_train, y_train)
print(logits.shape, loss, sep="\n")

# test run on untrained model
# starting point is a single character, whose token is equal to 0
# (token 0 is the first character of the sorted vocabulary)
idx = torch.zeros((1, 1), dtype=torch.long)
# NOTE(review): the second argument is presumably the number of new tokens to generate - confirm in bigram.py
idx = model.generate(idx, 100)
# Keep the sample under its own name: `text` holds the training corpus that the
# vocabulary and `data` are built from, so overwriting it here would make a
# re-run of those earlier cells operate on the generated sample instead.
generated_text = decode(idx[0].tolist())
print(generated_text)

torch.Size([32, 65])
tensor(4.7288, grad_fn=<NllLossBackward0>)

t:lwIJyIhwyVOhVQ:!KvGyNgNggOC
?Boz aUg3?SjDYyxp-w,KpwTsw:n-dH.CWNuAV?'VhrqFH$gOYuXsMdvypXVwtsw'vAn3y
