In [1]:
# Read the whole text file into memory. The encoding is passed explicitly so
# that decoding does not depend on the platform's default locale.
with open("the-verdict.txt", encoding="utf-8") as f:
    raw_text = f.read()

print("first 99 characters of raw_text:", raw_text[:99])
# print the number of characters in raw_text
print("Length of raw_text:", len(raw_text))

first 99 characters of raw_text: I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 
Length of raw_text: 20479


In [2]:
import tiktoken

class SimpleTokenizerV2:
    """Thin wrapper around tiktoken's "gpt2" encoding."""

    def __init__(self):
        # Load the encoding once; encode() and decode() both delegate to it.
        self.tokenizer = tiktoken.get_encoding("gpt2")

    def encode(self, text):
        """Return the token IDs for `text`, permitting the "<|endoftext|>" special token."""
        permitted_special = {"<|endoftext|>"}
        token_ids = self.tokenizer.encode(text, allowed_special=permitted_special)
        return token_ids

    def decode(self, tokens):
        """Return the text obtained by decoding `tokens` with the wrapped encoding."""
        decoded_text = self.tokenizer.decode(tokens)
        return decoded_text



tokenizer = SimpleTokenizerV2()

# Round-trip check: encode a sample sentence, then decode the IDs back to text.
test_sentence = "I've hello painting people don't sAy that stuff about me--they say it about Victor Grindle kiran, qwfqwfqwefvdvewe\""
test_ids = tokenizer.encode(test_sentence)

print("tokenizer.encode(test_sentence):", test_ids)
print("tokenizer.decode(tokenizer.encode(test_sentence)):", tokenizer.decode(test_ids))



tokenizer.encode(test_sentence): [40, 1053, 23748, 12036, 661, 836, 470, 264, 42012, 326, 3404, 546, 502, 438, 9930, 910, 340, 546, 12622, 41379, 293, 479, 343, 272, 11, 10662, 86, 69, 80, 86, 69, 80, 732, 69, 20306, 303, 732, 1]
tokenizer.decode(tokenizer.encode(test_sentence)): I've hello painting people don't sAy that stuff about me--they say it about Victor Grindle kiran, qwfqwfqwefvdvewe"


In [3]:
# Encode the whole of raw_text into token IDs
encoded_text = tokenizer.encode(raw_text)
token_count = len(encoded_text)
leading_tokens = encoded_text[:100]

# Report how many tokens were produced
print("Length of encoded_text:", token_count)

# Show the first 100 entries of encoded_text (these are token IDs, not characters)
print("First 100 characters of encoded_text:", leading_tokens)




Length of encoded_text: 5145
First 100 characters of encoded_text: [40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11, 290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423, 587, 10598, 393, 28537, 2014, 198, 198, 1, 464, 6001, 286, 465, 13476, 1, 438, 5562, 373, 644, 262, 1466, 1444, 340, 13, 314, 460, 3285, 9074, 13, 46606, 536]


In [4]:
# Build (context, target) pairs: the first i tokens form the context and
# token i is the one to be predicted.
context_size = 4

for i in range(1, context_size + 1):
    context, target = encoded_text[:i], encoded_text[i]
    print(f"Context: {context}, Target: {target}")
    # Same pair again, decoded back to text
    decoded_context = tokenizer.decode(context)
    decoded_target = tokenizer.decode([target])
    print(f"Decoded Context: {decoded_context}, Decoded Target: {decoded_target}")


Context: [40], Target: 367
Decoded Context: I, Decoded Target:  H
Context: [40, 367], Target: 2885
Decoded Context: I H, Decoded Target: AD
Context: [40, 367, 2885], Target: 1464
Decoded Context: I HAD, Decoded Target:  always
Context: [40, 367, 2885, 1464], Target: 1807
Decoded Context: I HAD always, Decoded Target:  thought


In [5]:
# Disabled: the same pairing over the entire text (prints one line per token position)
# for i in range(1, len(encoded_text)):
#     context = encoded_text[:i]
#     target = encoded_text[i]
#     print(f"Context: {context}, Target: {target}")



In [6]:
!pip install torch



In [None]:
# create a pytorch dataset and dataloader
import torch
from torch.utils.data import Dataset, DataLoader

class InputTargetDataset(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once in the constructor. Each sample is a window of
    `context_size` token IDs together with the same window shifted one
    position to the right, so that target[k] is the token following input[k].

    Args:
        text: The text to tokenize.
        tokenizer: Object exposing ``encode(text)`` that returns a sequence
            of token IDs supporting ``len()`` and slicing.
        context_size: Number of tokens in each input/target window; must be >= 1.
        stride_length: Number of tokens the window start advances between
            consecutive samples; must be >= 1.

    Raises:
        ValueError: If `context_size` or `stride_length` is less than 1.
    """

    def __init__(self, text, tokenizer, context_size, stride_length):
        # Reject values that would otherwise silently produce an empty dataset
        # or empty / malformed windows.
        if context_size < 1:
            raise ValueError(f"context_size must be >= 1, got {context_size}")
        if stride_length < 1:
            raise ValueError(f"stride_length must be >= 1, got {stride_length}")

        self.tokenizer = tokenizer
        self.context_size = context_size
        self.stride_length = stride_length

        self.input_ids = []
        self.target_ids = []

        tokens = self.tokenizer.encode(text)

        # Stop early enough that the shifted target window still fits inside
        # `tokens`; a text with at most `context_size` tokens yields no samples.
        for start in range(0, len(tokens) - self.context_size, self.stride_length):
            context = tokens[start:start + self.context_size]
            target = tokens[start + 1:start + self.context_size + 1]
            self.input_ids.append(torch.tensor(context))
            self.target_ids.append(torch.tensor(target))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the `idx`-th (input, target) pair; both are torch tensors."""
        return self.input_ids[idx], self.target_ids[idx]


In [12]:
# create a dataloader

def create_dataloader(text, batch_size=4, context_size=256, stride_length=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over the (input, target) windows of `text`.

    A fresh SimpleTokenizerV2 is created on every call and handed to
    InputTargetDataset together with `context_size` and `stride_length`;
    the remaining arguments are passed straight through to DataLoader.

    Returns:
        The DataLoader wrapping the newly built dataset.
    """
    windows = InputTargetDataset(
        text,
        SimpleTokenizerV2(),
        context_size,
        stride_length,
    )
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


# Small demo configuration: batches of two 4-token windows, stride equal to
# the window size (so inputs do not overlap), kept in file order.
dataloader = create_dataloader(
    raw_text,
    batch_size=2,
    context_size=4,
    stride_length=4,
    shuffle=False,
    drop_last=True,
    num_workers=0,
)

# Pull the first batch from the loader
data_iter = iter(dataloader)
first_batch = next(data_iter)
input_ids, target_ids = first_batch

print("input_ids:\n", input_ids)
print("\ntarget_ids:\n", target_ids)







input_ids:
 tensor([[  40,  367, 2885, 1464],
        [1807, 3619,  402,  271]])

target_ids:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899]])
