In [27]:
# download the-verdict.txt from the LLMs-from-scratch GitHub repository (skipped if a local copy already exists)
import os
import urllib.request

# Name the target file once so the existence check and the download can never
# point at different paths.
file_path = "the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

# Only hit the network when there is no local copy yet.
if not os.path.exists(file_path):
  urllib.request.urlretrieve(url, file_path)

In [4]:
# Read the whole story into memory as a single string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    text = story_file.read()

# NOTE(review): the label below misspells "number"; it is left untouched so it
# keeps matching the recorded cell output.
print("Total numver of characters: ", len(text))
# Peek at the first 100 characters.
print(text[0:100])

Total numver of characters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [5]:
# preprocess the text
import re

# Split on punctuation, double dashes and whitespace. The capture group keeps
# the delimiters as tokens; empty and whitespace-only pieces are dropped.
preprocessed_text = [
    piece
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text)
    if piece.split()
]
print(len(preprocessed_text))
print(preprocessed_text[0:100])

4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--']


In [6]:
# build the vocabulary: the unique tokens in sorted order, with the two
# special tokens appended last
all_words = [*sorted(set(preprocessed_text)), '<|endoftext|>', "<|unk|>"]
vocab_size = len(all_words)
print(vocab_size)

1132


In [7]:
# Map every token to its position in all_words (token -> integer id).
vocab = {token: integer for integer, token in enumerate(all_words)}
# Preview the last five entries. The loop index was never used, so iterate
# over the items directly instead of wrapping them in enumerate().
for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [8]:
# implement a tokenizer class
class SimpleTokenizer:
  """Regex-based word/punctuation tokenizer backed by a fixed vocabulary.

  Args:
    vocab: Mapping from token string to integer id. It must contain the
      key "<|unk|>" if any text passed to ``encode`` can hold tokens that
      are missing from the mapping.
  """

  def __init__(self, vocab):
    self.str_to_int = vocab
    # Inverse lookup used by decode().
    self.int_to_str = dict((idx, token) for token, idx in vocab.items())

  def encode(self, text):
    """Split ``text`` into tokens and return their ids as a list.

    Tokens that are not in the vocabulary are mapped to the id of
    "<|unk|>".
    """
    ids = []
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
      # Skip the empty and whitespace-only fragments produced by the split.
      if not piece.strip():
        continue
      key = piece if piece in self.str_to_int else "<|unk|>"
      ids.append(self.str_to_int[key])
    return ids

  def decode(self, ids):
    """Turn a sequence of ids back into a single string.

    Tokens are joined with single spaces, then the whitespace in front of
    the listed punctuation characters is removed.
    """
    joined = " ".join(self.int_to_str[token_id] for token_id in ids)
    return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)


In [9]:
# test the SimpleTokenizer
tokenizer = SimpleTokenizer(vocab)
test_txt1 = "Hello, Would you like some tea?"
test_txt2 = "It's the last he painted, you know, Mrs. Gisburn said with pardonable pride."
# Glue the two samples together with the end-of-text marker in between.
test_txt = f"{test_txt1}<|endoftext|> {test_txt2}"
ids = tokenizer.encode(test_txt)
print(ids)
# Round-trip the ids back to text to inspect what the tokenizer preserved.
print(tokenizer.decode(ids))

[1131, 5, 1131, 1126, 628, 910, 975, 10, 1130, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793, 7]
<|unk|>, <|unk|> you like some tea? <|endoftext|> It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride.


In [10]:
# in gpt models byte pair encoding is used
# the library which is used for bpe is tiktoken

# install tiktoken
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 43.2 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [11]:
import tiktoken

# Load the encoding that tiktoken provides under the name "gpt2".
bpe_tokenizer = tiktoken.get_encoding("gpt2")
# Sample text with an embedded <|endoftext|> marker and a made-up word.
test_txt_3 = "Hello, how are you?<|endoftext|> It's the last he painted, you know, Mrs. Gisburn said with someunknownperson."
# allowed_special whitelists <|endoftext|>; in the recorded output it comes
# back as the single id 50256.
ids = bpe_tokenizer.encode(test_txt_3, allowed_special = {"<|endoftext|>"})
print(ids)
# Decoding the ids reproduces the input text, made-up word included
# (see the recorded output).
print(bpe_tokenizer.decode(ids))

[15496, 11, 703, 389, 345, 30, 50256, 632, 338, 262, 938, 339, 13055, 11, 345, 760, 11, 9074, 13, 402, 271, 10899, 531, 351, 617, 34680, 6259, 13]
Hello, how are you?<|endoftext|> It's the last he painted, you know, Mrs. Gisburn said with someunknownperson.


In [12]:
# tokenize the data in the file

# Read the story from disk again so `text` holds the full file contents.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    text = story_file.read()

# Encode the full text with the BPE tokenizer and report the token count.
encoded_text = bpe_tokenizer.encode(text)
print(len(encoded_text))

5145


In [13]:
# Skip the first 50 tokens for this demo. The slice is bound to a new name so
# that re-running the cell does not drop another 50 tokens from encoded_text.
encoded_sample = encoded_text[50:]
# Show how next-token (context -> target) pairs grow with the context length.
context_size = 5
for i in range(1, context_size + 1):
  context = encoded_sample[:i]   # the first i token ids
  target = encoded_sample[i]     # the token id that follows them
  print(f"{context} ---> {target}")

[290] ---> 4920
[290, 4920] ---> 2241
[290, 4920, 2241] ---> 287
[290, 4920, 2241, 287] ---> 257
[290, 4920, 2241, 287, 257] ---> 4489


In [14]:
# create a dataset for encoding and returning the input and output tensors
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-id tensors.

  The text is tokenized once; a window of ``max_length`` ids is then moved
  over the id sequence in steps of ``stride``. Each target window is its
  input window shifted one position to the right.

  Args:
    txt: Text to tokenize.
    tokenizer: Object whose ``encode(txt)`` returns a sequence of token ids.
    max_length: Number of token ids per window.
    stride: Step between the starts of consecutive windows.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    token_ids = tokenizer.encode(txt)
    # Stop early enough that every target window still has max_length ids.
    window_starts = range(0, len(token_ids) - max_length, stride)

    self.input_ids = [
        torch.tensor(token_ids[start : start + max_length])
        for start in window_starts
    ]
    self.output_ids = [
        torch.tensor(token_ids[start + 1 : start + max_length + 1])
        for start in window_starts
    ]

  def __len__(self):
    """Return the number of windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``(input, target)`` tensor pair stored at ``idx``."""
    return self.input_ids[idx], self.output_ids[idx]

In [15]:
# create a dataloader
def create_dataLoader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
  """Build a DataLoader over sliding token windows of ``txt``.

  The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
  GPTDatasetV1 using ``max_length`` and ``stride``; the remaining arguments
  are passed straight through to ``DataLoader``.

  Returns:
    The configured ``DataLoader``.
  """
  return DataLoader(
      GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride),
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )

In [16]:
# test the dataLoader

# load the file
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    text = story_file.read()

# batch_size=1 with stride=1: one window per batch, each window starting one
# token after the previous one.
dataLoader = create_dataLoader_v1(
    text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)
data_iter = iter(dataLoader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [17]:
# dataLoader with higher batch_size
# stride == max_length here, so consecutive input windows do not overlap
# (compare the rows of the recorded output).
dataLoader = create_dataLoader_v1(text, batch_size=8, max_length=4, stride=4, shuffle=False)
data_iter = iter(dataLoader)
first_batch = next(data_iter)
# first_batch is an [inputs, targets] pair; each target row is the matching
# input row shifted by one token.
print(first_batch)

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]


In [18]:
# creating token embeddings
# Toy sizes so the whole weight matrix can be printed: 6 token ids, 3 values each.
vocab_size = 6
output_dim = 3

# Seed the RNG so the randomly initialised weights printed below are reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# The weight matrix has one row per token id (6 x 3 in the recorded output).
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [21]:
# create a token_embedding_layer
# 50257 covers the ids produced by the "gpt2" encoding used earlier in this
# notebook (<|endoftext|> was encoded as 50256) — presumably the full GPT-2
# BPE vocabulary size; TODO confirm against bpe_tokenizer.n_vocab.
vocab_size = 50257
# Each token id is mapped to a 256-value vector.
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [23]:
# Draw one batch of 8 non-overlapping windows (stride == max_length) of 4 token ids each.
max_length = 4
dataLoader = create_dataLoader_v1(text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)
data_iter = iter(dataLoader)
# Each batch unpacks into an inputs tensor and a targets tensor.
inputs, targets = next(data_iter)
print(f'Inputs: {inputs} \n targets: {targets}')
# Recorded shape is [8, 4]: batch_size x max_length.
print(inputs.shape)

Inputs: tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]) 
 targets: tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
torch.Size([8, 4])


In [24]:
# Look up an embedding vector for every token id in the batch; the recorded
# shape is [8, 4, 256] (batch_size x max_length x output_dim).
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [25]:
# create positional encoding
# One embedding row per position in the window, so the table has
# context_length rows of output_dim values.
context_length = max_length
positional_encoding_layer =  torch.nn.Embedding(context_length, output_dim)
# Look up the rows for positions 0 .. context_length - 1; recorded shape is [4, 256].
positional_embeddings = positional_encoding_layer(torch.arange(context_length))
print(positional_embeddings.shape)

torch.Size([4, 256])


In [26]:
# final input embeddings
# The [4, 256] positional embeddings are broadcast over the batch dimension of
# the [8, 4, 256] token embeddings, so every sequence in the batch gets the
# same per-position offsets; the result keeps the shape [8, 4, 256].
input_embeddings = token_embeddings + positional_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
