# Poetry Notebook

In this notebook we implement a GPT-style language model and use it to generate text in the style of Edgar Allan Poe's works.

In [1]:
# Install the PyTorch dependency into the current environment
!pip install torch

# Download the Poe text corpus from GitHub; wget saves it as poe_data.txt in the working directory
!wget https://raw.githubusercontent.com/kocenko/Poetry-Synthesis/dev/data/poe_data.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
--2023-05-16 16:49:03--  https://raw.githubusercontent.com/kocenko/Poetry-Synthesis/dev/data/poe_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1930488 (1.8M) [text/plain]
Saving to: ‘poe_data.txt’


2023-05-16 16:49:04 (252 MB/s) - ‘poe_data.txt’ saved [1930488/1930488]



In [2]:
# Essential imports
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

In [3]:
# Pick the compute device: use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

In [14]:
# Dataset class definition
### (Option) We can use different data to train it on
### (Option) What if the context affects not the following
###          but the one after the following token? (bigger offset)

class PoeDataset(Dataset):
    ''' Dataset of sliding (context, target) token windows cut from a text

    The whole text is tokenized once, the token sequence is split into a
    leading "train" part and a trailing "valid" part, and samples are taken
    from the part selected by `split`.
    '''

    valid_split_params = ["train", "valid"]

    def __init__(self, text: str, split: str, split_ratio: float, context_length: int, tokenizer, offset: int = 1):
        ''' Poe Dataset constructor

        Args:
            str:
                text: Raw text the dataset is built from
                split: Which part of the data this dataset holds, "train" or "valid"
            float:
                split_ratio: Value from (0, 1], the fraction of tokens that goes
                             to the training part; the rest is the validation part
            int:
                context_length: Number of tokens in each context window
                offset: Shift (in tokens) between a context window and its target window
            other:
                tokenizer: Object whose encode(text) returns a sequence of integer token ids

        Raises:
            AssertionError: If any of the arguments is outside its valid range
        '''

        assert split in PoeDataset.valid_split_params, f"{split} is the wrong split type"
        assert split_ratio <= 1 and split_ratio > 0, f"Split ratio value should be from range (0, 1]"
        assert len(text) > 0, f"Dataset file should not be empty"
        assert context_length < len(text), f"Context length should not be more than {len(text) - 1}"
        assert offset >= 0, f"Offset should not be negative"

        self.text = text
        self.offset = offset
        self.context_length = context_length
        self.tokenizer = tokenizer
        # `device` is the notebook-level global selected in the GPU check cell
        self.data = torch.tensor(self.tokenizer.encode(self.text), dtype=torch.int32, device=device)

        # The first `split_ratio` share of the tokens is training data, the tail is validation data
        split_idx = int(len(self.data) * split_ratio)
        if split == "train":
            self.data = self.data[:split_idx]
        else:
            self.data = self.data[split_idx:]

    def __len__(self):
        ''' Returns the size of the dataset

        Returns:
            Number of start positions for which both the context window and
            the target window (shifted by `offset`) fit inside the data;
            0 when the data is too short for even one sample
        '''
        # The last valid start index i satisfies i + context_length + offset == len(data)
        return max(0, len(self.data) - self.context_length - self.offset + 1)

    def __getitem__(self, index):
        ''' Returns an item of given index

        Params:
            index: Which item should be returned, from range [0, len(self))

        Returns:
            Tuple (x, y): x is the context window starting at `index`,
            y is the window of the same length shifted by `offset`

        Raises:
            AssertionError: If index is outside [0, len(self))
        '''
        # Index 0 is a valid sample; negative indices are not supported
        assert 0 <= index < len(self), f"Index {index} is out of range [0, {len(self)})"

        x = self.data[index: index + self.context_length]
        y = self.data[index + self.offset: index + self.context_length + self.offset]

        return x, y



In [15]:
# Defined tokenizer class

from typing import List


class Tokenizer:
    ''' Character-level tokenizer: each distinct character of the source text is one token '''

    def __init__(self, text: str):
        ''' Builds the vocabulary from the distinct characters of text

        Token ids are the positions of the characters in sorted order.
        '''
        assert len(text) > 0, "Text used for creating tokenizer cannot be empty"

        self.text = text
        self.symbols = sorted(set(text))
        self.vocab_size = len(self.symbols)
        # Two lookup tables over the same ordering: character -> id and id -> character
        self.stoi = dict(zip(self.symbols, range(self.vocab_size)))
        self.itos = dict(enumerate(self.symbols))

    def encode(self, text: str) -> List[int]:
        ''' Maps every character of text to its token id (KeyError for unknown characters) '''

        char_to_id = self.stoi
        token_ids = []
        for character in text:
            token_ids.append(char_to_id[character])
        return token_ids

    def decode(self, tokens: List[int]) -> str:
        ''' Maps token ids back to characters and concatenates them into one string '''

        id_to_char = self.itos
        return ''.join(id_to_char[token_id] for token_id in tokens)


In [16]:
# Setting up the dataset parameters
### (Option 1) We can use a different tokenizer, like SentencePiece
### (Option 2) We can build our own tokenizer, using the huggingface library

epochs = 1  # Just for now
split_ratio = 0.85  # Fraction of the tokens used for the training part; the rest is the validation part
context_length = 8  # Number of tokens in one context window
offset = 1  # I am wondering what would be the results for 2, for example
batch_size = 4  # Number of (context, target) pairs per batch
file_path = "poe_data.txt"  # File downloaded by the first cell

# Reading the whole file into memory, preparing the tokenizer
with open(file_path, 'r', encoding="utf-8") as f:
            text = f.read()

# Character-wise tokenizer whose vocabulary is built from the full text
tokenizer = Tokenizer(text)
# "vocab_size" is stored as a zero-argument callable here; the
# "Setting up net parameters" cell replaces it with the returned value
net_config = {
    "vocab_size": lambda: tokenizer.vocab_size
}

# Dataset and dataloader (training part only; incomplete final batch is dropped)
train_set = PoeDataset(text, 'train', split_ratio, context_length, tokenizer, offset=offset)
train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True, drop_last=True)
# Draw one shuffled batch of contexts (x) and targets (y) to use as sample model input
x, y = next(iter(train_dataloader))

In [17]:
# Simple Decoder Class definition
### (Option) Different split, test data?


class OnlyDecoder(nn.Module):
    ''' Minimal decoder: one embedding table that maps each token id to a row of logits

    The embedding dimension equals the vocabulary size, so the embedding of a
    token can be read directly as one score per vocabulary entry.
    '''

    def __init__(self, config: dict):
        ''' Simple decoder constructor

        Args:
            config: Dictionary with the key "vocab_size" holding the number
                    of distinct tokens (an int, not a callable)
        '''
        super().__init__()

        self.vocab_size = config["vocab_size"]
        # `device` is the notebook-level global selected in the GPU check cell
        self.embedding_table = nn.Embedding(self.vocab_size, self.vocab_size, device=device)

    def forward(self, token_idx: torch.Tensor, targets: torch.Tensor = None) -> torch.Tensor:
        ''' Looks up the logits for every input token

        Args:
            token_idx: Integer tensor of token ids
            targets: Optional tensor of target token ids; currently unused,
                     accepted so the call signature is ready for a loss term

        Returns:
            Tensor of logits with the shape of token_idx plus a trailing
            dimension of size vocab_size
        '''
        logits = self.embedding_table(token_idx)
        return logits

In [19]:
# Setting up net parameters
# Entries stored as zero-argument callables are replaced in place by the value
# they return; all other entries are already final and are left untouched.
# Only existing keys are reassigned, so the dict size does not change while iterating.
for key, value in net_config.items():
  if callable(value):
    net_config[key] = value()

# Testing Model: run one forward pass on the sample batch drawn from the dataloader
model = OnlyDecoder(net_config)
out = model(x, y)