In [16]:
# Read the whole text file into memory as one string.
with open("/content/the-verdict.txt", encoding="utf-8") as source_file:
    raw_text = source_file.read()

# Sanity check: total length and the first 99 characters.
print("Total number of characters:", len(raw_text))
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


## Goal: Tokenize all characters


In [17]:
import re
# Split a sample sentence on punctuation and whitespace. The pattern has no
# capture group, so the delimiters themselves are dropped from the result.
text = "This is test."
result = re.compile(r'[-,.:;?_!"()\[\]\s]').split(text)
result

['This', 'is', 'test', '']

### Should we remove whitespace?
1. Yes, if we want to reduce memory and computing requirements.
2. If the text is whitespace-sensitive, such as Python code where whitespace plays a major role in the syntax, it is better to keep it.

In [18]:
## drop the empty / whitespace-only strings left behind by the split
result = list(filter(str.strip, result))
result

['This', 'is', 'test']

In [19]:
## run the same split-and-filter over the whole text
preprocessed = [
    token
    for token in (piece.strip() for piece in re.split(r'[-,.:;?_!"()\[\]\s]', raw_text))
    if token
]
print(preprocessed[:30])


['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', 'though', 'a', 'good', 'fellow', 'enough', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', 'in', 'the', 'height', 'of']


In [20]:
## creating Token IDs
## unique words in sorted order, followed by two special tokens:
## "<|endoftext|>" marks the end of one source, "<|unk|>" stands in for unknown words
tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab_size = len(tokens)
vocab_size


1154

## Why do we need the vocabulary?
The vocabulary maps each token to a token ID. When we convert the output of an LLM back into text, we need this mapping to turn the token IDs into tokens again.  

In [21]:
import itertools

## map every token string to its integer position in `tokens`
vocab = dict(zip(tokens, range(len(tokens))))

## peek at the first 10 entries only
for key, value in itertools.islice(vocab.items(), 10):
    print(f'{key}: {value}')

': 0
'Are: 1
'It's: 2
'coming': 3
'done': 4
'subject: 5
'technique': 6
'way: 7
A: 8
Ah: 9


In [22]:
## reverse lookup: integer id -> token string
vocab_inverted = dict(zip(vocab.values(), vocab.keys()))
for key, value in itertools.islice(vocab_inverted.items(), 10):
    print(f'{key}: {value}')

0: '
1: 'Are
2: 'It's
3: 'coming'
4: 'done'
5: 'subject
6: 'technique'
7: 'way
8: A
9: Ah


In [23]:
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed ``{token: id}`` vocabulary.

    Text is split on punctuation and whitespace. The delimiters themselves are
    discarded by the split, so punctuation does not survive an encode/decode
    round trip. Words missing from the vocabulary are mapped to the
    ``"<|unk|>"`` entry, which the vocabulary is therefore expected to contain.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and build the reverse (id -> token) map."""
        self.str_to_int = vocab
        self.int_to_str = dict(zip(vocab.values(), vocab.keys()))

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Raises ``KeyError`` when an unknown word is met and ``"<|unk|>"`` is
        not present in the vocabulary.
        """
        ids = []
        for piece in re.split(r'[-,.:;?_!"()\[\]\s]', text):
            word = piece.strip()
            if not word:
                # empty strings appear wherever two delimiters are adjacent
                continue
            if word not in self.str_to_int:
                word = "<|unk|>"
            ids.append(self.str_to_int[word])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with single spaces and tidy the spacing.

        Raises ``KeyError`` for an id that is not in the vocabulary.
        """
        words = [self.int_to_str[token_id] for token_id in ids]
        joined = " ".join(words)
        # remove whitespace that directly precedes one of , . ? ! " ( ) '
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)


In [24]:
## build the word-level tokenizer from the vocabulary created above
tokenizer = SimpleTokenizer(vocab)

text= """"It's the last he painted, you know,kshitij"
      <|endoftext|> Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)## 1153 is the id of <|unk|>, which stands in for the out-of-vocabulary word kshitij

[59, 1005, 616, 541, 761, 1146, 610, 1153, 1152, 71, 34, 868, 1128, 770, 811]


In [25]:
## round trip: punctuation is lost by the split and the unknown word comes back as <|unk|>
tokenizer.decode(tokenizer.encode(text))

"It's the last he painted you know <|unk|> <|endoftext|> Mrs Gisburn said with pardonable pride"

## Additional special context tokens are
1. [BOS] [beginning of the sequence]
2. [EOS] [End of the sequence]
3. [PAD] [Padding]

## GPT uses BPE techniques
Byte-Pair Encoding (BPE) is a data compression and subword tokenization algorithm that creates a new vocabulary by iteratively merging the most frequent pairs of characters in a text. It starts with a vocabulary of individual characters and, through repeated merging, builds a fixed-size vocabulary of subword tokens, allowing models to handle both common words and rare or unseen words efficiently.  

We will be using tiktoken library for implementing BPE

In [26]:
!pip install tiktoken



In [27]:
# A bare `import importlib` does not load the `metadata` submodule, so
# `importlib.metadata` can raise AttributeError on a fresh kernel.
import importlib.metadata
import tiktoken
print("tiktoken version :", importlib.metadata.version("tiktoken") )

tiktoken version : 0.12.0


In [28]:
## NOTE: this rebinds `tokenizer` — from here on it is the tiktoken "gpt2" encoding, not the SimpleTokenizer instance
tokenizer = tiktoken.get_encoding("gpt2")

In [29]:
## sample containing the special token and a made-up word ("explaintoyou")
text = "Hello do you know about tokenizers? <|endoftext|> I will explaintoyou."
## the special token has to be whitelisted through allowed_special
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 466, 345, 760, 546, 11241, 11341, 30, 220, 50256, 314, 481, 1193, 2913, 726, 280, 13]


In [30]:
## decode the ids back into text
string = tokenizer.decode(integers)
string

'Hello do you know about tokenizers? <|endoftext|> I will explaintoyou.'

### Input-Target Pairs
We will implement a data loader that fetches the input-target pairs using a sliding-window approach. We will be using the same dataset and tokenizer.


In [31]:
## read the whole file into `text` (this rebinds the sample `text` used earlier)
with open("/content/the-verdict.txt", "r", encoding="utf-8") as handle:
  text = handle.read()


In [32]:
## tokenize the full text with the current `tokenizer` and report the number of tokens
encoded_text = tokenizer.encode(text)
print(len(encoded_text))

5145


In [33]:
## first five token ids
encoded_text[:5]

[40, 367, 2885, 1464, 1807]

In [34]:
## The simplest and most intuitive way to create input-target pairs for next-word prediction
## is to take x as a window of input tokens and y as the same window shifted right by one token.

## The context size determines how many tokens are included in the input.

context_size = 4
## y is x shifted one token to the right
x, y = encoded_text[:context_size], encoded_text[1:context_size + 1]
print(f"x: {x}")
print(f"y: {y}")

x: [40, 367, 2885, 1464]
y: [367, 2885, 1464, 1807]


In [35]:
## growing prefix as the context, the token right after it as the target
for i in range(1, context_size + 1):
  context, target = encoded_text[:i], encoded_text[i]
  print(f"x: {context}, --> y: {target}")


x: [40], --> y: 367
x: [40, 367], --> y: 2885
x: [40, 367, 2885], --> y: 1464
x: [40, 367, 2885, 1464], --> y: 1807


In [36]:
## same prefix/target pairs, decoded back to text
for i in range(1, context_size + 1):
  context, target = encoded_text[:i], encoded_text[i]
  print(f"x: {tokenizer.decode(context)},--> y: {tokenizer.decode([target])} ")

x: I,--> y:  H 
x: I H,--> y: AD 
x: I HAD,--> y:  always 
x: I HAD always,--> y:  thought 


## Implementing a Data Loader

For efficient data loading, PyTorch's built-in Dataset and DataLoader classes will be used.

1. Tokenize the entire text
2. use a sliding window to chunk the whole book into overlapping sequences of max_length
3. Return the total number of rows of dataset
4. Return a single row from the dataset



In [42]:
from torch.utils.data import Dataset, DataLoader
import torch

class GPTDataset(Dataset):
  """Sliding-window dataset of (input, target) token-id tensors.

  The text is tokenized once. Every sample is a window of ``max_length`` token
  ids, and its target is the same window shifted one position to the right.
  If the text has ``max_length`` tokens or fewer, the dataset is empty.

  Args:
    text: the raw text to tokenize.
    tokenizer: object whose ``encode(text, allowed_special=...)`` returns a
      list of integer token ids.
    max_length: number of tokens per window; must be positive.
    stride: offset between the starts of consecutive windows; must be
      positive. A stride smaller than ``max_length`` gives overlapping windows.

  Raises:
    ValueError: if ``max_length`` or ``stride`` is not positive.
  """

  def __init__(self, text, tokenizer, max_length, stride):
    # Without these checks a non-positive value either fails inside range()
    # with an unrelated message or silently builds an empty / zero-length
    # dataset.
    if max_length <= 0:
      raise ValueError(f"max_length must be a positive integer, got {max_length}")
    if stride <= 0:
      raise ValueError(f"stride must be a positive integer, got {stride}")

    self.input_ids = []
    self.target_ids = []

    ## tokenize the entire dataset
    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

    ## slide a window over the ids; the target is the window shifted by one.
    ## stopping before len(token_ids) - max_length keeps every target chunk full length
    for i in range(0, len(token_ids) - max_length, stride):
      input_chunk = token_ids[i:i + max_length]
      target_chunk = token_ids[i + 1:i + 1 + max_length]
      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    """Return the number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
    return self.input_ids[idx], self.target_ids[idx]

In [43]:
## The following function wraps GPTDataset in a PyTorch DataLoader so that inputs are loaded in batches.
## 1. initialize the tokenizer
## 2. create the dataset
## 3. drop_last=True drops the final batch when it is shorter than batch_size, to prevent loss spikes during training

def create_dataloader(text, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_worker=0):
  """Build a DataLoader of sliding-window (input, target) batches over ``text``.

  The text is tokenized with the tiktoken "gpt2" encoding and chunked by
  GPTDataset using ``max_length`` and ``stride``. ``batch_size``, ``shuffle``
  and ``drop_last`` are passed straight to the DataLoader, and ``num_worker``
  is passed as its ``num_workers`` argument.
  """
  bpe = tiktoken.get_encoding("gpt2")
  windows = GPTDataset(text, bpe, max_length, stride)
  return DataLoader(
      windows,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_worker,
  )







In [44]:
## convert dataloader into a python iterator to fetch next entry via pythons built-in  next() function

import torch
## batch of one window, 4 tokens long, advancing one token at a time, in file order
dataloader = create_dataloader(text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
