# Generating Shakespeare poetry with an RNN

In [27]:
# !pip3 install torchtext
# !pip3 install pytorch-lightning

In [29]:
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from torchtext.vocab import vocab
from torchtext.transforms import VocabTransform
# We only need TF to download the file - a useful helper function is available in Keras
import tensorflow as tf
from collections import OrderedDict
import pytorch_lightning as pl

# import ipdb

# TF: https://www.tensorflow.org/text/tutorials/text_generation
# https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html
# https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html
# https://pytorch.org/docs/stable/quantization.html#torch.quantization.quantize_dynamic
# https://anie.me/On-Torchtext/

## Preparing the data

In [2]:
# Fetch the file with Shakespeare's poetry and keep the path of the local copy
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [3]:
# Display the local path returned by the download helper
path_to_file

'/home/krzysztof/.keras/datasets/shakespeare.txt'

In [4]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8.
# The context manager guarantees the file handle is closed after reading.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

## Unique tokens
We treat each individual character (letters, punctuation and whitespace alike) as an atomic token of the text.

In [5]:
# Ordered mapping: every distinct character of the corpus (sorted) -> count of 1.
# The counts are placeholders; only the key order matters for the vocabulary.
letters = OrderedDict.fromkeys(sorted(set(text)), 1)

In [6]:
# Build the torchtext vocabulary from the character mapping, then wrap it in a
# transform that converts tokens into their vocabulary indices.
vocabulary = vocab(ordered_dict=letters, min_freq=1)
vocab_transform = VocabTransform(vocab=vocabulary)

In [7]:
# vocabulary.vocab.itos_

## Splitting text by letters

In [8]:
# One list element per character of the corpus
splitted_text = [character for character in text]

In [9]:
# Run the character list through the vocabulary transform to encode the corpus
encoded_text = vocab_transform(splitted_text)

## Building a model

In [21]:
class TinyShakespeare(nn.Module):
    """Character-level next-token model: embedding -> LSTM -> linear -> softmax.

    Args:
        n_letters: Size of the vocabulary; used both as the number of
            embeddings and as the width of the output distribution.
        embedding_dim: Dimensionality of the token embeddings.
        hidden_size: Number of features in the LSTM hidden state.
    """

    def __init__(self, n_letters, embedding_dim = 15, hidden_size = 32):
        super().__init__()
        self.embed = nn.Embedding(n_letters, embedding_dim=embedding_dim)
        # batch_first=True: inputs arrive as (batch, seq_len), and forward()
        # slices dimension 1 as the time axis. With the nn.LSTM default
        # (batch_first=False) that slice would select the last *batch element*
        # instead of the last time step.
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.final_layer = nn.Linear(hidden_size, n_letters)
        self.softmax = nn.Softmax(-1)

    def forward(self, input):
        """Return a probability distribution over the next token.

        Args:
            input: Integer tensor of token indices with shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, n_letters); each row is a softmax
            distribution (sums to 1).
        """
        rnn_output, _ = self.rnn(self.embed(input))
        # Keep only the LSTM output at the final time step of every sequence
        last_vec = rnn_output[:, -1, :]
        final_output = self.final_layer(last_vec)
        return self.softmax(final_output)

In [22]:
# Size the model from the vocabulary instead of hard-coding the character count
tiny_shakespeare = TinyShakespeare(len(vocabulary))

In [23]:
# Three token indices arranged as a single row, i.e. a tensor of shape (1, 3)
sample_input = torch.tensor([[1, 10, 56]])

In [25]:
# Sanity check: run the freshly constructed model on the sample input
tiny_shakespeare(sample_input)

tensor([[0.0167, 0.0150, 0.0142, 0.0183, 0.0171, 0.0151, 0.0135, 0.0146, 0.0142,
         0.0142, 0.0163, 0.0134, 0.0168, 0.0139, 0.0173, 0.0161, 0.0145, 0.0143,
         0.0169, 0.0164, 0.0151, 0.0171, 0.0132, 0.0158, 0.0137, 0.0154, 0.0157,
         0.0171, 0.0135, 0.0143, 0.0170, 0.0147, 0.0133, 0.0143, 0.0177, 0.0150,
         0.0143, 0.0167, 0.0151, 0.0177, 0.0146, 0.0155, 0.0176, 0.0149, 0.0134,
         0.0151, 0.0156, 0.0126, 0.0152, 0.0167, 0.0132, 0.0176, 0.0161, 0.0162,
         0.0162, 0.0140, 0.0149, 0.0178, 0.0155, 0.0135, 0.0139, 0.0173, 0.0140,
         0.0175, 0.0155]], grad_fn=<SoftmaxBackward0>)

In [32]:
class PoetrySchool(pl.LightningModule):
    """Lightning wrapper that trains a next-token classifier.

    The wrapped ``model`` is expected to return class *probabilities* (as
    ``TinyShakespeare.forward`` does through its final softmax). The loss is
    therefore the negative log-likelihood of the log-probabilities; feeding
    probabilities to ``nn.CrossEntropyLoss`` would apply softmax a second time.

    Args:
        model: Module mapping a batch of inputs to per-class probabilities.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.loss_fun = nn.NLLLoss()

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one ``(inputs, targets)`` batch.

        Args:
            batch: Pair ``(x, y)`` of model inputs and target class indices.
            batch_idx: Index of the batch within the epoch (unused).

        Returns:
            Scalar loss tensor.
        """
        x, y = batch
        output = self.model(x)
        # Clamp before the log so a probability of exactly 0 cannot yield -inf
        log_probs = torch.log(output.clamp_min(1e-12))
        loss = self.loss_fun(log_probs, y)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (default settings)."""
        # Use the fully qualified name: `optim` is not imported in this notebook
        optimizer = torch.optim.Adam(self.parameters())
        return optimizer

In [33]:
# Wrap the character model in the Lightning training module
poetry_school = PoetrySchool(model=tiny_shakespeare)

In [None]:
class PoetryDataset(Dataset):
    
    def __init__(self):
        super().__init__()
        self