In [1]:
import torch
import tiktoken

In [2]:
passage = "The light shines in the darkness, and the darkness has not overcome it."

In [3]:
tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
encoded_text = tokenizer.encode(passage)
print(encoded_text) 

[464, 1657, 32481, 287, 262, 11854, 11, 290, 262, 11854, 468, 407, 10980, 340, 13]


In [5]:
VOCAB_SIZE = len(encoded_text)  # number of unique tokens ü™ô
OUTPUT_DIM = 5  # 5 is pretty small but we're just horsin' around. üê¥

embedding_layer = torch.nn.Embedding(VOCAB_SIZE, OUTPUT_DIM)
print(embedding_layer.weight)

Parameter containing:
tensor([[-0.4452,  0.9201,  1.0529, -0.4744,  0.3090],
        [ 0.5486, -1.4887, -0.5482, -0.3868, -0.0213],
        [-0.1522, -0.5897,  0.8528,  1.2863, -1.0256],
        [-1.2497, -0.0952,  1.1822, -0.9109, -0.6689],
        [-1.0377,  0.6671,  0.6595,  1.6328,  2.1541],
        [-1.5323,  1.8491,  1.2160, -0.5021, -0.6755],
        [-0.6509, -0.5629, -1.0669,  1.0510,  1.1282],
        [-0.5771, -1.2061, -1.6483, -0.9648, -0.5654],
        [-1.4177, -0.0035, -0.3458,  0.1130,  2.2384],
        [ 1.1675,  0.0677,  0.1884, -1.1070,  0.5996],
        [ 0.9250, -0.7536,  1.0432,  1.3017,  2.0841],
        [ 0.3467, -0.7117,  1.2106, -0.3477,  1.0422],
        [ 0.5768, -1.4526,  0.0191, -2.2551, -1.8860],
        [-1.2837, -0.3758, -0.1534,  0.1628, -0.7431],
        [ 1.1332, -0.9861,  2.0011,  2.1169, -0.3177]], requires_grad=True)


In [6]:
print(embedding_layer(torch.tensor([1])))  # "light"

tensor([[ 0.5486, -1.4887, -0.5482, -0.3868, -0.0213]],
       grad_fn=<EmbeddingBackward0>)


In [7]:
from torch.utils.data import Dataset, DataLoader


class LLMDataset(Dataset):
    def __init__(self, txt: str, tokenizer: tiktoken.Encoding, context_length: int = 1024, stride: int = 1):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - context_length, stride):
            input_chunk = token_ids[i:i + context_length]
            target_chunk = token_ids[i + 1: i + context_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]



In [10]:
def create_dataloader(
    txt: str,
    tokenizer: tiktoken.Encoding,
    batch_size: int = 4,
    context_length: int = 256,
    stride: int = 128,
    shuffle: bool = True,
    drop_last: bool = True,
    num_workers: int = 0,
):
    dataset = LLMDataset(txt, tokenizer, context_length, stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [12]:
CONTEXT_LENGTH = 1024
STRIDE = 4
BATCH_SIZE = 8

with open("docs/gospels.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# small passage & params for testing
# CONTEXT_LENGTH = 16
# raw_text = "In the beginning was the Word, and the Word was with God, and the Word was God. The same was in the beginning with God. All things were made by him; and without him was not any thing made that was made. In him was life; and the life was the light of men. And the light shineth in darkness; and the darkness comprehended it not."

tokenizer = tiktoken.get_encoding("gpt2")

dataset = LLMDataset(raw_text, tokenizer, context_length=CONTEXT_LENGTH, stride=STRIDE)

dataloader = create_dataloader(raw_text, tokenizer, batch_size=BATCH_SIZE, context_length=CONTEXT_LENGTH, stride=STRIDE)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\\n", targets)


Inputs:
 tensor([[48141,   786,   274,  ...,   477,  1243,   880],
        [  867,   517,  4762,  ...,  3326,   813,    11],
        [  290, 12472,   329,  ...,   290,   339,   473],
        ...,
        [  345,    13, 43970,  ..., 10643,  2788,   340],
        [  287,  4167,    11,  ...,  9400,    13,   843],
        [  374,  1385,   298,  ...,   326,   308,   265]])

Targets:\n tensor([[  786,   274,    11,  ...,  1243,   880,   339],
        [  517,  4762,   780,  ...,   813,    11,   314],
        [12472,   329,  9838,  ...,   339,   473,   342],
        ...,
        [   13, 43970,   287,  ...,  2788,   340,  2236],
        [ 4167,    11,  1864,  ...,    13,   843,   355],
        [ 1385,   298,    11,  ...,   308,   265,  1456]])


In [13]:
OUTPUT_DIM = 5

embedding_layer = torch.nn.Embedding(tokenizer.n_vocab, OUTPUT_DIM)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 1.4457e+00,  1.0147e+00,  8.5352e-01,  5.8288e-02, -3.6087e-01],
        [-2.9525e-01,  1.8130e+00, -3.1597e-02, -1.7437e-01, -8.0902e-01],
        [ 3.2828e+00,  2.6578e-01, -8.9746e-01, -4.2666e-01, -2.5710e-01],
        ...,
        [ 1.6382e+00, -1.1229e+00,  2.3758e-01, -1.0001e-01, -5.0283e-01],
        [-1.0231e+00, -7.1166e-01,  1.5435e+00,  3.1774e-01, -2.0479e-03],
        [-1.6672e-01, -8.5890e-01, -4.3347e-01, -6.5482e-01,  9.3029e-01]],
       requires_grad=True)
