In [1]:
import os
from torchtext.vocab import build_vocab_from_iterator
from nltk.tokenize import word_tokenize
import re

In [2]:
# Resolve the sample text relative to the current working directory.
pwd = os.getcwd()
path = os.path.join(pwd, 'verdict.txt')

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
# NOTE(review): assumes verdict.txt is stored as UTF-8 - confirm.
with open(path, 'r', encoding='utf-8') as file:
    content = file.read()

# Sanity check: preview the first 100 characters and the total length.
print(content[:100])
print(len(content))


I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g
20479


In [3]:
# Split the raw text into tokens with NLTK's word_tokenize.
tokens = word_tokenize(content)

# Report the token count and preview the first hundred tokens.
print("tokens len =", len(tokens))
print("tokens", tokens[:100])

tokens len = 4544
tokens ['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '``', 'The', 'height', 'of', 'his', 'glory', "''", '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring']


In [4]:
# Build the vocabulary from the single token stream, registering the three
# special tokens through `specials`.
vocab = build_vocab_from_iterator(
    [tokens],
    specials=["<|unk|>", "<|endoftext|>", "<|pad|>"],
)
# Use the index of "<|unk|>" as the default index of the vocabulary.
vocab.set_default_index(vocab['<|unk|>'])

# Vocabulary size and the first ten entries of the index-to-string list.
print(f"len vocab =  {len(vocab)}")
print(vocab.get_itos()[:10])


len vocab =  1141
['<|unk|>', '<|endoftext|>', '<|pad|>', ',', '.', 'the', 'I', 'of', '--', 'to']


In [5]:
# Peek at the first (index, token) pair of the vocabulary.
# The loop variable is deliberately not named `id`: at notebook top level that
# name would rebind the built-in `id()` for every cell that runs afterwards.
for token_id, word in enumerate(vocab.get_itos()):
    print(word, ' id = ', token_id)
    break

<|unk|>  id =  0


In [6]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps text to vocabulary ids and back.

    `vocab` must support `vocab[token]` lookups and expose `get_itos()`,
    whose list position is used as the id of each token.
    """

    def __init__(self, vocab) -> None:
        self.str_to_int = vocab
        # Position in the itos list is the token id.
        self.int_to_str = dict(enumerate(vocab.get_itos()))

    def encode(self, content):
        """Tokenize `content` with word_tokenize and return the list of ids."""
        token_ids = []
        for token in word_tokenize(content):
            token_ids.append(self.str_to_int[token])
        return token_ids

    def decode(self, ids):
        """Return the tokens for `ids` joined by single spaces."""
        words = [self.int_to_str[token_id] for token_id in ids]
        return ' '.join(words)

In [7]:
# Round-trip a short sentence through the word-level tokenizer.
simpletokenizer = SimpleTokenizerV2(vocab)

# Text -> ids.
res = simpletokenizer.encode("something is very good")
print(f"encode =  {res}")

# Ids -> text (note: `res` is rebound from a list of ids to a string).
res = simpletokenizer.decode(res)
print(f"decode =  {res}")

encode =  [1007, 74, 1110, 344]
decode =  something is very good


# Byte-pair encoding (BPE) with the tiktoken library

In [8]:
import tiktoken

In [9]:
# Load tiktoken's "gpt2" encoding; used as the tokenizer in the cells below.
tokenizer = tiktoken.get_encoding('gpt2')

In [10]:
# NOTE(review): the two adjacent string literals are concatenated with no
# separator, so the text contains "terracesof" (visible in the decoded
# output). Add a trailing space to the first literal if that is unintended.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# "<|endoftext|>" is passed in allowed_special so the encoder accepts it in the input.
ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(ids)
# Decode the ids back to text to check the round trip.
print(tokenizer.decode(ids))

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


# Training data with sliding window sampling

In [11]:
# Preview the text that will be turned into training samples.
print(content[:100])
# Re-create the "gpt2" encoding (same call as in the BPE section above).
tokenizer = tiktoken.get_encoding('gpt2')

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [12]:
# Encode the whole text with the current tokenizer.
ids = tokenizer.encode(content)

# Total number of token ids and a preview of the first ten.
print(f"total len =  {len(ids)}")
print(f"ids =  {ids[:10]}")

total len =  5145
ids =  [40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]


In [13]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with `tokenizer.encode`. Every window of
    `max_len` ids becomes an input, and the same window shifted one position
    to the right becomes its target.

    Args:
        content: Text to encode.
        tokenizer: Object whose `encode(content)` returns a sequence of ints.
        max_len: Number of token ids per sample; must be >= 1.
        stride: Step between the starts of consecutive windows; must be >= 1.

    Raises:
        ValueError: If `max_len` or `stride` is smaller than 1.
    """

    def __init__(self, content, tokenizer, max_len, stride = 1):
        # Reject sizes that would otherwise silently produce empty samples
        # (max_len < 1) or an empty dataset (negative stride).
        if max_len < 1:
            raise ValueError(f"max_len must be >= 1, got {max_len}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(content)

        # Stop at len(token_ids) - max_len so the target window, which is
        # shifted by one, still fits. Texts with fewer than max_len + 1
        # tokens therefore yield no samples.
        for i in range(0, len(token_ids) - max_len, stride):
            input_chunk = token_ids[i: i + max_len]
            target_chunk = token_ids[i + 1: i + max_len + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self) -> int:
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, index) -> "tuple[torch.Tensor, torch.Tensor]":
        """Return the `(input_ids, target_ids)` pair stored at `index`."""
        return self.input_ids[index], self.target_ids[index]


# Example dataset over the full text: max_len=100, default stride of 1.
sample_ds = GPTDatasetV1(content, tokenizer, 100)

In [14]:
# Show the first input window and its target window from the example dataset.
print(f"input ids =  {sample_ds.input_ids[:1]}")
print(f"target ids =  {sample_ds.target_ids[:1]}")

input ids =  [tensor([   40,   367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,
          257,  7026, 15632,   438,  2016,   257,   922,  5891,  1576,   438,
          568,   340,   373,   645,  1049,  5975,   284,   502,   284,  3285,
          326,    11,   287,   262,  6001,   286,   465, 13476,    11,   339,
          550,  5710,   465, 12036,    11,  6405,   257,  5527, 27075,    11,
          290,  4920,  2241,   287,   257,  4489,    64,   319,   262, 34686,
        41976,    13,   357, 10915,   314,  2138,  1807,   340,   561,   423,
          587, 10598,   393, 28537,  2014,   198,   198,     1,   464,  6001,
          286,   465, 13476,     1,   438,  5562,   373,   644,   262,  1466,
         1444,   340,    13,   314,   460,  3285,  9074,    13, 46606,   536])]
target ids =  [tensor([  367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,   257,
         7026, 15632,   438,  2016,   257,   922,  5891,  1576,   438,   568,
          340,   373,   645,  104

### Create dataloader

In [15]:
def create_dataloader(txt, batch_size = 8, max_len = 256, stride = 128, shuffle = True, drop_last = True, num_workers = 0):
    """Build a DataLoader of sliding-window samples over `txt`.

    `txt` is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    GPTDatasetV1 built with `max_len` and `stride`. The remaining arguments
    are forwarded to DataLoader.

    Returns:
        A `(dataloader, tokenizer, dataset)` tuple.

    Raises:
        AssertionError: If `stride` is not within [1, max_len].
    """
    assert 1 <= stride <= max_len, "Stride must be between 1 and max_len"

    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_len, stride)

    # Collect the batching options in one place before handing them over.
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(dataset, **loader_options), tokenizer, dataset

In [16]:
# Small configuration for inspection: two samples per batch, four tokens per
# sample, windows advancing one token at a time, no shuffling.
dataloader, tokenizer, dataset = create_dataloader(content, batch_size = 2, max_len = 4, stride = 1, shuffle = False)

# Take only the first batch; x holds the inputs and y the targets.
x, y = next(iter(dataloader))

print('features = ')
print(x)
# Decode each input row back to text.
for row in x.tolist():
    print(tokenizer.decode(row))

print('labels = ')
print(y)
# Decode each target row; it is the matching input row shifted by one token.
for row in y.tolist():
    print(tokenizer.decode(row))

features = 
tensor([[  40,  367, 2885, 1464],
        [ 367, 2885, 1464, 1807]])
I HAD always
 HAD always thought
labels = 
tensor([[ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619]])
 HAD always thought
AD always thought Jack


# Embedding

In [33]:
# Sizes used throughout this cell.
context_len = 100
sample_vocab_size = 1000
embedding_size = 16

# Random token ids for a batch of 8 sequences. The upper bound is tied to
# sample_vocab_size so every id is a valid row of the embedding table below.
x = torch.randint(0, sample_vocab_size, (8, context_len)).long()
print('x = ', x.shape)

# Token embeddings: one vector of size embedding_size per token id.
embedding_layer = torch.nn.Embedding(sample_vocab_size, embedding_size)
emb_x = embedding_layer(x)
print('embedded x = ', emb_x.shape)

# Positional embeddings: one vector per position 0..context_len - 1.
pos_embedding_layer = torch.nn.Embedding(context_len, embedding_size)
pos_embeddings = pos_embedding_layer(torch.arange(context_len))
print(pos_embeddings.shape)

# Sum of token and positional embeddings; the (context_len, embedding_size)
# positional tensor broadcasts over the batch dimension.
b = emb_x + pos_embeddings
print('b shape = ', b.shape)

x =  torch.Size([8, 100])
embedded x =  torch.Size([8, 100, 16])
torch.Size([100, 16])
b shape =  torch.Size([8, 100, 16])
