In [2]:
import torch
import tiktoken

In [3]:
# creating dataset
from torch.utils.data import Dataset, DataLoader

class DatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors for next-token prediction.

    The text is encoded once with ``tokenizer.encode`` (allowing the
    ``<|endoftext|>`` special token). A window of ``max_length`` token ids is
    then moved over the encoded sequence in steps of ``stride``. For each
    window the target is the same window shifted one token to the right.

    Args:
        text: Raw text to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sliceable sequence of integer token ids.
        max_length: Number of token ids in every input/target window.
        stride: Distance, in tokens, between the starts of consecutive windows.

    Attributes:
        input_ids: List of 1-D tensors, one per window.
        target_ids: List of 1-D tensors, each the matching input shifted by one.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop early enough that the one-token-shifted target window still
        # fits inside the encoded sequence.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [4]:
def data_loader(text, batch_size, max_length,
                stride, shuffle, drop_last=True,
                num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    The text is encoded with tiktoken's ``'gpt2'`` encoding, chunked by
    ``DatasetV1`` and wrapped in a ``torch.utils.data.DataLoader``.

    Args:
        text: Raw text to encode.
        batch_size: Number of windows per batch.
        max_length: Number of token ids per window.
        stride: Distance, in tokens, between consecutive window starts.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader`` (defaults to ``True``).
        num_workers: Forwarded to ``DataLoader`` (defaults to ``0``).

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding('gpt2')
    windows = DatasetV1(text, gpt2_encoding, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [5]:
import PyPDF2

# Extract the text of every page of the handbook PDF and concatenate it,
# in page order, into one string.
with open("Employee-Handbook.pdf", "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # The join is evaluated inside the `with` block, while the file is still open.
    text = "".join(page.extract_text() for page in pdf_reader.pages)

# Number of extracted characters (not tokens).
print(len(text))

64127


In [6]:
# Smallest demo: batches of a single 4-token window, with the window advancing
# one token at a time and no shuffling.
dataloader = data_loader(text, batch_size=1, max_length=4, stride=1, shuffle=False)

# Keep the iterator in its own variable so the following cell can pull the next batch from it.
data_iter = iter(dataloader)
# A batch is an (inputs, targets) pair; the targets are the inputs shifted by one token.
first_batch = next(data_iter)
print(first_batch)

[tensor([[  220,   198, 36824,   220]]), tensor([[  198, 36824,   220, 30579]])]


In [7]:
# Pull the next batch from the same iterator; the loader was built with stride=1,
# so this window starts one token after the previous one.
second_batch = next(data_iter)
print(second_batch)

[tensor([[  198, 36824,   220, 30579]]), tensor([[36824,   220, 30579,   220]])]


In [8]:
# Rebuild the loader with 8 windows per batch and 5 tokens per window (still stride=1).
dataloader = data_loader(text, batch_size=8, max_length=5, stride=1, shuffle=False)

data = iter(dataloader)
# NOTE(review): despite the name, this is the first batch of the new loader,
# not a third batch of the previous one.
third_batch = next(data)
print(third_batch)

[tensor([[  220,   198, 36824,   220, 30579],
        [  198, 36824,   220, 30579,   220],
        [36824,   220, 30579,   220,   220],
        [  220, 30579,   220,   220,   198],
        [30579,   220,   220,   198, 14618],
        [  220,   220,   198, 14618,   220],
        [  220,   198, 14618,   220,   604],
        [  198, 14618,   220,   604,   220]]), tensor([[  198, 36824,   220, 30579,   220],
        [36824,   220, 30579,   220,   220],
        [  220, 30579,   220,   220,   198],
        [30579,   220,   220,   198, 14618],
        [  220,   220,   198, 14618,   220],
        [  220,   198, 14618,   220,   604],
        [  198, 14618,   220,   604,   220],
        [14618,   220,   604,   220,   198]])]


# Position Embedding

In [None]:
# Token-embedding layer: a lookup table with one vec_dim-sized vector per token id.
# vocab_size = 50257 is the size of the GPT-2 vocabulary; vec_dim = 256 is the embedding width.
# The layer is trainable - the embedding values are learned during training.
# NOTE(review): no random seed is set in the visible cells, so the initial
# weights presumably differ between runs - confirm whether that matters here.
vocab_size = 50257
vec_dim = 256

embeddings = torch.nn.Embedding(vocab_size,vec_dim)

In [21]:
# Non-overlapping windows: stride equals max_length, so consecutive windows
# share no input tokens.
max_length = 4
dataloader = data_loader(text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)

# NOTE(review): `first_batch` holds the iterator here, not a batch, and `input`
# shadows the Python builtin of the same name. Both names are kept because the
# following cells read `input`.
first_batch = iter(dataloader)
input, targets = next(first_batch)

In [22]:
# Inspect the token ids of the batch and confirm its shape is (batch_size, max_length).
print(input)
print(input.shape)

tensor([[  220,   198, 36824,   220],
        [30579,   220,   220,   198],
        [14618,   220,   604,   220],
        [  198, 20570,   220,   284],
        [  760,   220,   674,  1664],
        [  220,   604,   220,   198],
        [29733,   434,   220, 19165],
        [  220,   642,   220,   198]])
torch.Size([8, 4])


In [23]:
# Look up a vec_dim-sized vector for every token id in the batch; the result
# has the batch shape plus a trailing embedding axis.
final_embedding = embeddings(input)
print(final_embedding.shape)

torch.Size([8, 4, 256])


In [24]:
# Positional embedding: a trainable lookup table with one vec_dim-sized vector
# per position inside a window.
# The table size is tied to max_length (the window size used to build the
# batches) instead of repeating the literal 4, so that changing max_length
# cannot leave the table too small for the torch.arange(max_length) lookup.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length,vec_dim)

In [26]:
# Look up one positional vector for each position 0 .. max_length - 1.
# Every index must be smaller than the size of the positional table (context_length).
pos_embedding = pos_embedding_layer(torch.arange(max_length))
print(pos_embedding.shape)

torch.Size([4, 256])


In [27]:
# Add position information to the token embeddings. The (max_length, vec_dim)
# positional matrix is broadcast across the batch axis, so every sequence in
# the batch receives the same positional vectors.
input_embeddings = final_embedding + pos_embedding
print(input_embeddings.shape)

torch.Size([8, 4, 256])
