## 1. Tokenizing text:

In [7]:
import os
import urllib.request


In [8]:
with open("the--verdict.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()

In [9]:
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [10]:
len(raw_text)

20479

In [11]:
import re

text = "Hello, world. This , is a test."
result = re.split(r'(\s)', text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This', ' ', ',', ' ', 'is', ' ', 'a', ' ', 'test.']


In [13]:
result = re.split(r"([,.]|\s)", text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ' ', '', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [14]:
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [17]:
# Split the raw text on punctuation marks, double dashes and whitespace. The
# capturing group keeps every delimiter in the result, so punctuation becomes
# a token of its own instead of being discarded.
# NOTE(review): '!' and '_' are not part of the delimiter set, so they stay
# attached to neighbouring words (the vocabulary printed below contains
# 'Gisburn!' and 'Dubarry_') -- confirm whether that is intended.
result = re.split(r'([,.:;?"()\']|--|\s)', raw_text)

# Strip each piece and drop the ones that are empty or whitespace-only.
result = [item.strip() for item in result if item.strip()]
preprocessed = result

In [18]:
len(preprocessed)

4630

## 2. Converting tokens into token IDs:

In [19]:
preprocessed[:10]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

In [20]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1157


In [23]:
vocab = {token:integer for integer, token in enumerate(all_words)}
vocab

{'"': 0,
 "'": 1,
 '(': 2,
 ')': 3,
 ',': 4,
 '--': 5,
 '.': 6,
 ':': 7,
 ';': 8,
 '?': 9,
 'A': 10,
 'Ah': 11,
 'Among': 12,
 'And': 13,
 'Are': 14,
 'Arrt': 15,
 'As': 16,
 'At': 17,
 'Be': 18,
 'Begin': 19,
 'Burlington': 20,
 'But': 21,
 'By': 22,
 'Carlo': 23,
 'Chicago': 24,
 'Claude': 25,
 'Come': 26,
 'Croft': 27,
 'Destroyed': 28,
 'Devonshire': 29,
 'Don': 30,
 'Dubarry_': 31,
 'Emperors': 32,
 'Florence': 33,
 'For': 34,
 'Gallery': 35,
 'Gideon': 36,
 'Gisburn': 37,
 'Gisburn!': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jack!': 58,
 'Jove': 59,
 'Jove!': 60,
 'Just': 61,
 'Lord': 62,
 'Made': 63,
 'Miss': 64,
 'Money': 65,
 'Monte': 66,
 'Moon-dancers': 67,
 'Mr': 68,
 'Mrs': 69,
 'My': 70,
 'Never': 71,
 'No': 72,
 'Now': 73,
 'Nutley': 74,
 'Of': 75,
 'Oh': 76,
 'O

In [24]:
class SimpleTokenizerV1:
    """Regex-based tokenizer backed by a fixed token->id vocabulary.

    Text is split on punctuation, double dashes and whitespace, and every
    resulting token is looked up in the vocabulary. Tokens that are missing
    from the vocabulary are not handled and raise ``KeyError``.
    """

    def __init__(self, vocab_size):
        """Store the vocabulary and build its inverse mapping.

        Args:
            vocab_size: Despite its name, this is the vocabulary itself: a
                mapping from token string to integer id. The parameter name
                is kept unchanged so existing calls keep working.
        """
        # Use the mapping that was passed in. Reading a module-level `vocab`
        # here would tie the tokenizer to whatever that global happens to be
        # when the constructor runs.
        self.str_int = vocab_size
        self.int_str = {i: s for s, i in vocab_size.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: If a token of ``text`` is not in the vocabulary.
        """
        # The capturing group keeps the delimiters, so punctuation is tokenized too.
        preprocessed = re.split(r'([,.:;?"()\']|--|\s)', text)

        # Drop empty and whitespace-only pieces.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Convert a list of token ids back into a string.

        Tokens are joined with single spaces; whitespace directly in front of
        a comma, period, question mark, quote, apostrophe or parenthesis is
        then removed.
        """
        text = " ".join([self.int_str[i] for i in ids])
        # replace the spaces before the specified punctuations
        text = re.sub(r'\s+([,.?"()\'])', r'\1', text)
        return text

In [25]:
tokenizer = SimpleTokenizerV1(vocab)

In [27]:
text = """ It's the last he painted, you know,
"""

ids = tokenizer.encode(text)
print(ids)

[56, 1, 870, 1010, 619, 549, 766, 4, 1153, 613, 4]


In [28]:
tokenizer.decode(ids)

"It' s the last he painted, you know,"

### Adding special context tokens:

In [30]:
text = "Hello, do ypu like tea. is this-- a test?"

# SimpleTokenizerV1 has no fallback for tokens that are missing from the
# vocabulary, so the lookup fails on the first unknown word. Catch the error
# and show it: the limitation is still demonstrated, but the cell no longer
# aborts a "Restart & Run All".
try:
    print(tokenizer.encode(text))
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'Hello'

In [32]:
# Unique corpus tokens in sorted order, followed by the two special tokens so
# that the special tokens receive the two highest ids.
all_tokens = [*sorted(set(preprocessed)), "<|endoftext|>", "<|junk|>"]

# Each token's id is its position in the list.
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [33]:
len(vocab.items())

1159

In [36]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(i, item)

0 ('younger', 1154)
1 ('your', 1155)
2 ('yourself', 1156)
3 ('<|endoftext|>', 1157)
4 ('<|junk|>', 1158)


In [41]:
class SimpleTokenizerV2:
    """Regex-based tokenizer that maps unknown tokens to ``"<|junk|>"``.

    Splitting works as in the first tokenizer version; the difference is that
    a token missing from the vocabulary is replaced by the ``"<|junk|>"``
    placeholder before the id lookup.
    """

    def __init__(self, vocab_size):
        """Store the vocabulary and build its inverse mapping.

        Args:
            vocab_size: Despite its name, this is the vocabulary itself: a
                mapping from token string to integer id. It has to contain
                the ``"<|junk|>"`` token for unknown words to be encodable.
                The parameter name is kept unchanged so existing calls keep
                working.
        """
        # Use the mapping that was passed in. Reading a module-level `vocab`
        # here would tie the tokenizer to whatever that global happens to be
        # when the constructor runs.
        self.str_to_int = vocab_size
        self.int_to_str = {i: s for s, i in vocab_size.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: If an unknown token is met and ``"<|junk|>"`` itself is
                not in the vocabulary.
        """
        # The capturing group keeps the delimiters, so punctuation is tokenized too.
        preprocessed = re.split(r'([,.:;?"()\']|--|\s)', text)
        # Drop empty and whitespace-only pieces.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        # Swap every out-of-vocabulary token for the placeholder.
        preprocessed = [item if item in self.str_to_int else "<|junk|>" for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Convert a list of token ids back into a string.

        Tokens are joined with single spaces; whitespace directly in front of
        a comma, period, question mark, quote, apostrophe or parenthesis is
        then removed.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # replace the spaces before the specified punctuations
        text = re.sub(r'\s+([,.?"()\'])', r'\1', text)
        return text

In [42]:
tokenizer = SimpleTokenizerV2(vocab)

In [43]:
tokenizer.encode(text)

[1158, 4, 373, 1158, 645, 997, 6, 602, 1021, 5, 134, 1158, 9]

In [44]:
tokenizer.decode(tokenizer.encode(text))

'<|junk|>, do <|junk|> like tea. is this -- a <|junk|>?'

## 3. Byte pair encoding:

In [45]:
import tiktoken

In [48]:
tokenizer = tiktoken.get_encoding("gpt2")

In [49]:
tokenizer.encode("Hello world")

[15496, 995]

In [50]:
tokenizer.decode(tokenizer.encode("Hello world"))

'Hello world'

In [51]:
text = ("Hello, do you like tea?, <|endoftext|>")

# Without `allowed_special`, the tokenizer rejects text that contains the
# special token and raises ValueError. Catch the error and show it: the
# behaviour is still demonstrated, but the cell no longer aborts a
# "Restart & Run All".
try:
    print(tokenizer.encode(text = text))
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


In [53]:
text = ("Hello, do you like tea?, <|endoftext|>, fhkjghakhgskd")

tokenizer.encode(text = text, allowed_special = {"<|endoftext|>"})

[15496,
 11,
 466,
 345,
 588,
 8887,
 21747,
 220,
 50256,
 11,
 277,
 71,
 42421,
 456,
 11322,
 70,
 8135,
 67]

## 4. Data sampling with a sliding window:

In [54]:
with open("the--verdict.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [56]:
enc_sample = enc_text[:50]
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x:{x}")
print(f"y:{y}")

x:[40, 367, 2885, 1464]
y:[367, 2885, 1464, 1807]


In [57]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context,  "------>", desired)

[40] ------> 367
[40, 367] ------> 2885
[40, 367, 2885] ------> 1464
[40, 367, 2885, 1464] ------> 1807


In [59]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context),  "------>", tokenizer.decode([desired]))

I ------>  H
I H ------> AD
I HAD ------>  always
I HAD always ------>  thought


In [60]:
import torch

In [61]:
torch.__version__

'2.8.0'

In [75]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once and cut into windows of ``max_length`` tokens
    whose start offsets are ``stride`` apart. Each target window is the
    matching input window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and materialize every window as a tensor.

        Args:
            txt: Text to tokenize.
            tokenizer: Object whose ``encode(txt, allowed_special=...)``
                returns a sequence of token ids.
            max_length: Number of tokens in each window.
            stride: Distance between the starts of consecutive windows.
        """
        # Encode the whole text in one go; "<|endoftext|>" may appear as a special token.
        token_ids = tokenizer.encode(txt, allowed_special = {"<|endoftext|>"})

        # Start offsets of the windows. Stopping before len - max_length
        # leaves room for the extra token each target window needs.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


In [76]:
def create_dataloader_v1(txt, batch_size = 4, max_length = 256, stride = 128, shuffle = True, drop_last = True, num_workers = 0):
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is tokenized with the tiktoken "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are handed to ``DataLoader``.

    Args:
        txt: Text to tokenize and sample from.
        batch_size: Passed to ``DataLoader``.
        max_length: Window length in tokens, passed to ``GPTDatasetV1``.
        stride: Offset between consecutive windows, passed to ``GPTDatasetV1``.
        shuffle: Passed to ``DataLoader``.
        drop_last: Passed to ``DataLoader``.
        num_workers: Passed to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [77]:
with open("the--verdict.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()

In [82]:
Dataloader = create_dataloader_v1(raw_text, batch_size = 1, max_length = 4, stride = 4, shuffle = False)

data_iter = iter(Dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [83]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[1807, 3619,  402,  271]]), tensor([[ 3619,   402,   271, 10899]])]


In [84]:
Dataloader = create_dataloader_v1(raw_text, batch_size = 8, max_length = 4, stride = 4, shuffle = False)

data_iter = iter(Dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


## 5. Creating token embeddings:

In [88]:
dir(tokenizer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_core_bpe',
 '_encode_bytes',
 '_encode_only_native_bpe',
 '_encode_single_piece',
 '_mergeable_ranks',
 '_pat_str',
 '_special_tokens',
 'decode',
 'decode_batch',
 'decode_bytes',
 'decode_bytes_batch',
 'decode_single_token_bytes',
 'decode_tokens_bytes',
 'decode_with_offsets',
 'encode',
 'encode_batch',
 'encode_ordinary',
 'encode_ordinary_batch',
 'encode_single_token',
 'encode_to_numpy',
 'encode_with_unstable',
 'eot_token',
 'is_special_token',
 'max_token_value',
 'n_vocab',
 'name',
 'special_tokens_set',
 'token_byte_values']

In [89]:
tokenizer.n_vocab

50257

In [85]:
input_ids = torch.tensor([2, 3, 5, 1])

In [90]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [92]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [93]:
embedding_layer(torch.tensor([3]))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

In [94]:
embedding_layer(input_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)

In [95]:
input_ids

tensor([2, 3, 5, 1])

## 6. Encoding word positions:

In [97]:
vocab_size = 50257  # equals tokenizer.n_vocab for the "gpt2" encoding, as displayed above
output_dim = 256

# The embedding weights are initialised randomly. Seed the generator first so
# that these weights, and every tensor derived from them in the cells below,
# are the same on each run.
torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [98]:
max_length = 4
Dataloader = create_dataloader_v1(
    raw_text, batch_size = 8, max_length = max_length, 
    stride = max_length, shuffle = False
)

data_iter = iter(Dataloader)
inputs, targets = next(data_iter)

In [99]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [101]:
token_embeddings = token_embedding_layer(inputs)
token_embeddings

tensor([[[ 0.4913,  1.1239,  1.4588,  ..., -0.3995, -1.8735, -0.1445],
         [ 0.4481,  0.2536, -0.2655,  ...,  0.4997, -1.1991, -1.1844],
         [-0.2507, -0.0546,  0.6687,  ...,  0.9618,  2.3737, -0.0528],
         [ 0.9457,  0.8657,  1.6191,  ..., -0.4544, -0.7460,  0.3483]],

        [[ 1.5460,  1.7368, -0.7848,  ..., -0.1004,  0.8584, -0.3421],
         [-1.8622, -0.1914, -0.3812,  ...,  1.1220, -0.3496,  0.6091],
         [ 1.9847, -0.6483, -0.1415,  ..., -0.3841, -0.9355,  1.4478],
         [ 0.9647,  1.2974, -1.6207,  ...,  1.1463,  1.5797,  0.3969]],

        [[-0.7713,  0.6572,  0.1663,  ..., -0.8044,  0.0542,  0.7426],
         [ 0.8046,  0.5047,  1.2922,  ...,  1.4648,  0.4097,  0.3205],
         [ 0.0795, -1.7636,  0.5750,  ...,  2.1823,  1.8231, -0.3635],
         [ 0.4267, -0.0647,  0.5686,  ..., -0.5209,  1.3065,  0.8473]],

        ...,

        [[-1.6156,  0.9610, -2.6437,  ..., -0.9645,  1.0888,  1.6383],
         [-0.3985, -0.9235, -1.3163,  ..., -1.1582, -1.13

In [104]:
token_embeddings.shape

torch.Size([8, 4, 256])

In [103]:
token_embeddings[[0], [0]]

tensor([[ 4.9130e-01,  1.1239e+00,  1.4588e+00, -3.6530e-01, -4.0372e-02,
          1.9042e-01,  6.8024e-01, -8.6577e-01,  2.7644e-01,  6.3706e-01,
         -1.5947e+00, -2.6210e-02, -1.4804e+00, -1.0301e+00, -1.2018e+00,
          1.2016e+00,  1.8119e+00,  1.9705e+00,  1.6373e-01, -5.1866e-01,
          5.2069e-01, -1.0986e+00,  4.8871e-01,  4.3105e-01,  1.2858e+00,
         -1.9670e+00,  9.2041e-02,  4.5699e-01, -1.4753e+00, -4.4934e-01,
         -8.3517e-01, -1.1789e+00, -6.0601e-01, -5.3533e-01, -2.9543e-01,
          5.1638e-01,  4.2293e-01,  2.7946e-01, -2.4484e+00,  3.0050e-01,
          3.5847e-01,  7.4261e-01,  2.1654e-01, -9.2025e-01,  2.2604e-01,
         -8.0408e-01,  6.7722e-01,  8.9504e-01, -1.3292e+00, -1.6186e+00,
         -1.6062e+00, -8.2329e-01,  1.1738e-01, -7.6778e-01,  9.9370e-01,
          5.0167e-02,  7.0607e-02, -7.8336e-01,  4.4577e-01, -3.5213e-01,
          7.3934e-01,  3.3461e-01, -5.5481e-01, -3.8830e-01,  9.9484e-01,
          5.0054e-02, -1.5199e+00,  2.

In [105]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)


In [106]:
torch.arange(max_length)

tensor([0, 1, 2, 3])

In [107]:
pos_embedding_layer.weight

Parameter containing:
tensor([[ 1.7375, -0.5620, -0.6303,  ..., -0.2277,  1.5748,  1.0345],
        [ 1.6423, -0.7201,  0.2062,  ...,  0.4118,  0.1498, -0.4628],
        [-0.4651, -0.7757,  0.5806,  ...,  1.4335, -0.4963,  0.8579],
        [-0.6754, -0.4628,  1.4323,  ...,  0.8139, -0.7088,  0.4827]],
       requires_grad=True)

In [108]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [109]:
token_embeddings.shape

torch.Size([8, 4, 256])

In [110]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
