In [46]:
import urllib.request

url =  ("https://raw.githubusercontent.com/rasbt/"
 "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
 "the-verdict.txt")
file_path = "the-veredict.txt"
urllib.request.urlretrieve(url, file_path)

('the-veredict.txt', <http.client.HTTPMessage at 0x7a22fc9ac800>)

In [47]:
# create dataset:
import re
with open("the-veredict.txt", "r") as f:
    txt_ds = f.read()

# seperate ponctuation and others from words
txt_ds = re.split(r'([,.;:"!?\'()_]|--|\s)', txt_ds)    

In [48]:
#check lengthl
len(txt_ds)

9235

In [49]:
# git rid of whitespace
txt_ds = [item.strip() for item in txt_ds if item.strip()]

In [50]:
# check lengthl
len(txt_ds)

4690

In [51]:
# sort the vocabulary and validate unique words only
vocs = sorted(set(txt_ds))

In [52]:
# create an ID for each word
vocs = {i: v for v, i in enumerate(txt_ds)}

In [53]:
len(vocs)

1130

In [54]:
# Print first 50 element
for i, v in enumerate(vocs.items()):
    print(v)
    if i>20:
        break

('I', 4653)
('HAD', 1)
('always', 3820)
('thought', 4585)
('Jack', 2420)
('Gisburn', 2923)
('rather', 4398)
('a', 4642)
('cheap', 8)
('genius', 9)
('--', 4677)
('though', 164)
('good', 314)
('fellow', 2496)
('enough', 4625)
('so', 4496)
('it', 4665)
('was', 4482)
('no', 4682)
('great', 4252)
('surprise', 2734)
('to', 4626)


In [55]:
class Tokenizerv1:
    def __init__(self, vocs):
        self.str_to_int = vocs
        self.int_to_str = {k:v for v, k in vocs.items()}

    def encode(self, txt):
        pre = re.split(r'([,.;!?()_"\']|--|\s)', txt)
        pre = [item.strip() for item in pre if imtem.strip()]
        ids = [self.str_to_int[s] for s in pre]
        return ids
    def decode(self, ids):
        txt = " ".join([self.int_to_str[i] for i in ids])
        txt = re.sub(r'\s+([,.?!"()\'])', r'\1', txt)
        return txt

* In **Version2** of the tokenizer we will add 2 more features:
  - Replace unkown words that doesn't appear in the vocabulary with `<|unk|>` marker.
  - Add special token `<|EOD|>` at the end of each document.

In [56]:
class Tok2:
    def __init__(self, vocs):
        self.str_to_int= vocs
        self.int_to_str= {k:v for v, k in vocs.items()}
    def encode(self, txt):
        pre = re.split(r'([,.;:!?()_"\']\s)', txt)
        pre = [item.strip() for item in pre if item.strip()]
        pre = [item if item in self.str_to_int else '|<UK>|' for item in pre]
        ids = [self.str_to_int[s] for s in pre]
        return ids
    def decode(self, ids):
        txt = " ".join([self.int_to_str[i] for i in ids])
        txt = re.sub(r'\s+([,.?!"()\'])', r'\1', txt)
        return txt
        

In [57]:
txt1 = 'Hi my name is Smail'
txt2 = 'here starts the new doc'
text = ' |<EOT>| '.join((txt1, txt2))
text

'Hi my name is Smail |<EOT>| here starts the new doc'

In [58]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken


In [80]:
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i + max_length]
            target_chunk = token_ids[i+1: i+max_length+1]

                
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [81]:
def dataloaderv1(txt, batch_size=4, max_length=256, stride=128, shuffle= True,
                drop_last= True, num_workers=0):
    tokenizer = tiktoken.get_encoding('gpt2')
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
                            dataset,
                            batch_size=batch_size,
                            shuffle=shuffle,
                            drop_last= drop_last,
                            num_workers= num_workers)
    return dataloader

In [82]:
with open('the-veredict.txt', 'r', encoding= 'utf-8') as f:
    raw_txt = f.read()

In [83]:
dloader= dataloaderv1(raw_txt, batch_size=1, max_length=4, stride= 1, shuffle= False)
data_iter = iter(dloader)
a_batch= next(data_iter)

In [84]:
a_batch

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]

In [85]:
dataloaders = dataloaderv1(raw_txt, batch_size=8, max_length=4, stride=4, shuffle= False)
data_iter = iter(dataloaders)
inps, targs= next(data_iter)

In [86]:
inps, targs

(tensor([[   40,   367,  2885,  1464],
         [ 1807,  3619,   402,   271],
         [10899,  2138,   257,  7026],
         [15632,   438,  2016,   257],
         [  922,  5891,  1576,   438],
         [  568,   340,   373,   645],
         [ 1049,  5975,   284,   502],
         [  284,  3285,   326,    11]]),
 tensor([[  367,  2885,  1464,  1807],
         [ 3619,   402,   271, 10899],
         [ 2138,   257,  7026, 15632],
         [  438,  2016,   257,   922],
         [ 5891,  1576,   438,   568],
         [  340,   373,   645,  1049],
         [ 5975,   284,   502,   284],
         [ 3285,   326,    11,   287]]))