# Generating Shakespeare poetry with an RNN

In [134]:
# !pip3 install torchtext
# !pip3 install pytorch-lightning
# !pip3 install keras-tcn --no-dependencies 

In [137]:
import torch 
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

from torchtext.vocab import vocab
from torchtext.transforms import VocabTransform
# We only need TF to download the file - a useful helper function is available in Keras
import tensorflow as tf
from collections import OrderedDict
import pytorch_lightning as pl

import numpy as np

# import ipdb

# TF: https://www.tensorflow.org/text/tutorials/text_generation
# https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html
# https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html
# https://pytorch.org/docs/stable/quantization.html#torch.quantization.quantize_dynamic
# https://anie.me/On-Torchtext/

## Preparing the data

In [138]:
# Fetch the Shakespeare corpus; get_file returns the local path of the downloaded file
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [139]:
path_to_file

'/home/krzysztof/.keras/datasets/shakespeare.txt'

In [140]:
# Read the raw bytes and decode them as UTF-8. The context manager makes sure
# the file handle is closed instead of being left to the garbage collector.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

## Unique tokens
We assume the atomic token of the text is a letter.

In [141]:
# Every distinct character of the corpus, in sorted order, mapped to a dummy
# frequency of 1 (the ordered mapping is what the vocabulary factory consumes).
letters = OrderedDict((symbol, 1) for symbol in sorted(set(text)))

In [142]:
# Build a torchtext vocabulary from the ordered character -> frequency mapping
vocabulary = vocab(letters)
# Wrap the vocabulary in a transform that is later applied to the token list
vocab_transform = VocabTransform(vocabulary)

In [143]:
# vocabulary.vocab.itos_

## Splitting text by letters

In [144]:
# One list element per character of the corpus
splitted_text = [symbol for symbol in text]

In [145]:
# Apply the vocabulary transform to the whole character sequence
encoded_text = vocab_transform(splitted_text)

## Building a model

In [160]:
class TinyShakespeare(pl.LightningModule):
    """Character-level next-token model: embedding -> LSTM -> linear head.

    ``forward`` returns a probability distribution over the ``n_letters``
    tokens, computed from the LSTM output at the last position of the input
    sequence. Training works on the raw (pre-softmax) scores because
    ``nn.CrossEntropyLoss`` applies log-softmax itself; feeding it already
    normalised probabilities would apply softmax twice and flatten the
    gradients.

    Args:
        n_letters: Number of distinct tokens (size of the embedding table
            and of the output layer).
        embedding_dim: Size of each token embedding vector.
        hidden_size: Size of the LSTM hidden state.
    """

    def __init__(self, n_letters, embedding_dim = 15, hidden_size = 32):
        super().__init__()
        self.embed = nn.Embedding(n_letters, embedding_dim=embedding_dim)
        # batch_first=True -> inputs/outputs are laid out as (batch, sequence, features)
        self.rnn = nn.LSTM(input_size = embedding_dim, hidden_size = hidden_size, batch_first=True)
        self.final_layer = nn.Linear(hidden_size, n_letters)
        self.softmax = nn.Softmax(-1)
        self.loss_fun = nn.CrossEntropyLoss()

    def _logits(self, input):
        """Return unnormalised scores of shape (batch, n_letters) for the next token."""
        emb = self.embed(input)
        rnn_output, _ = self.rnn(emb)
        # Only the output at the final sequence position is used for prediction
        last_vec = rnn_output[:, -1, :]
        return self.final_layer(last_vec)

    def forward(self, input):
        """Return next-token probabilities for a batch of token-index sequences.

        Args:
            input: Integer tensor of token indices, laid out as (batch, sequence).

        Returns:
            Tensor of shape (batch, n_letters) whose rows sum to 1.
        """
        return self.softmax(self._logits(input))

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss for one ``(inputs, targets)`` batch.

        The loss is evaluated on logits, not on the softmax output of
        ``forward``, as required by ``nn.CrossEntropyLoss``.
        """
        x, y = batch
        loss = self.loss_fun(self._logits(x), y)
        return loss

    def configure_optimizers(self):
        """Use Adam with its default hyper-parameters over all model parameters."""
        optimizer = optim.Adam(self.parameters())
        return optimizer

In [161]:
# 65 output classes; this has to equal the length of the target vectors built by PoetryDataset
# (presumably the number of distinct characters in the corpus - TODO confirm against the vocabulary)
tiny_shakespeare = TinyShakespeare(65)

In [162]:
# A batch holding a single 3-token sequence, i.e. shape (1, 3).
# Note on the embedding layer: it appends one extra dimension at the end of the input shape.
sample_input = torch.tensor([[3, 10, 56]])

In [163]:
#tiny_shakespeare(sample_input).shape

In [164]:
class PoetryDataset(Dataset):
    """Sliding-window dataset over an encoded token sequence.

    Sample ``i`` starts at position ``i * jump``. Its input is the
    ``lookback`` tokens of that window and its target is a one-hot vector
    (length ``n_classes``) marking the token that directly follows the window.

    Args:
        data: Indexable sequence of integer token ids.
        lookback: Number of tokens in each input window.
        n_next: Number of tokens reserved after each window. Only the first
            of them is encoded in the target; the value also limits how far
            the last window may reach.
        jump: Distance between the starts of consecutive windows.
        n_classes: Length of the one-hot target vector. Defaults to 65, the
            value that was previously hard-coded.

    Raises:
        ValueError: If ``jump`` is smaller than 1.
    """

    def __init__(self, data, lookback, n_next, jump = 1, n_classes = 65):
        super().__init__()
        if jump < 1:
            raise ValueError("jump must be a positive integer")
        self.data = data
        self.lookback = lookback
        self.n_next = n_next
        self.jump = jump
        self.n_classes = n_classes
        # Number of window starts for which window + n_next tokens still fit.
        # (The former `len(data) // lookback+n_next` parsed as
        # `(len(data) // lookback) + n_next` and exposed only a fraction of them.)
        spare = len(data) - lookback - n_next
        self.length = spare // jump + 1 if spare >= 0 else 0
        # Count of distinct token ids actually present in `data`
        self.cardinality = np.unique(data).shape[0]

    def __len__(self):
        """Return the number of windows that can be served."""
        return self.length

    def __getitem__(self, idx):
        """Return ``(inp, target)`` for window number ``idx``.

        Returns:
            inp: ``torch.int`` tensor of shape ``(lookback,)``.
            target: Float tensor of shape ``(n_classes,)``, one-hot at the id
                of the token that follows the window.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        if idx < 0:
            idx += self.length
        if not 0 <= idx < self.length:
            raise IndexError("PoetryDataset index out of range")

        start = idx * self.jump
        stop = start + self.lookback

        inp = torch.tensor(self.data[start:stop], dtype = torch.int)

        target = torch.zeros(self.n_classes)
        target[int(self.data[stop])] = 1

        return inp, target

In [165]:
# Windows of 10 tokens as input, the token right after each window as target
pd = PoetryDataset(encoded_text, lookback=10, n_next = 1)
# Batches of 32 samples; no shuffling is requested, so samples come in dataset order
pdl = DataLoader(pd, batch_size = 32)

In [166]:
# First sample of the dataset: X is the input window, y the target vector
X, y = pd[0]

In [167]:
# Smoke test: add a batch dimension (10 = the lookback used above) and inspect the output shape
tiny_shakespeare(X.reshape(-1, 10)).shape

torch.Size([1, 65])

## Training

In [168]:
# 'auto' lets Lightning pick a GPU when one is available and fall back to the CPU
# otherwise, so the notebook also runs on machines without CUDA.
trainer = pl.Trainer(accelerator='auto', devices=1, max_epochs=10)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [169]:
# Train the model on the batches served by `pdl` (no validation dataloader is passed)
trainer.fit(model=tiny_shakespeare, train_dataloaders=pdl)

Missing logger folder: /home/krzysztof/Pulpit/nlp/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type             | Params
-------------------------------------------------
0 | embed       | Embedding        | 975   
1 | rnn         | LSTM             | 6.3 K 
2 | final_layer | Linear           | 2.1 K 
3 | softmax     | Softmax          | 0     
4 | loss_fun    | CrossEntropyLoss | 0     
-------------------------------------------------
9.4 K     Trainable params
0         Non-trainable params
9.4 K     Total params
0.038     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]