### Lab 8.3 Text generation

In this lab you will finish building your RNN text generator.  I found that this code actually runs pretty quickly on my MacBook without GPU acceleration.

In [106]:
# Global configuration shared by the cells below.
device = 'cpu'      # device used for the dataset tensors, the model and the metric
seq_len = 20        # number of characters in each input/target sequence
hidden_size = 100   # passed to CharacterRNN as its hidden size
batch_size = 32     # sequences per mini-batch in the DataLoader
lr = 3e-4           # learning rate handed to the Adam optimizer
epochs = 10         # number of epochs (the training cell assigns this again)

In [107]:
import numpy as np

from tqdm import tqdm, trange

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchmetrics

Here's the code to download and prepare the sonnet dataset.

In [108]:
!wget --no-clobber "https://www.dropbox.com/scl/fi/7r68l64ijemidyb9lf80q/sonnets.txt?rlkey=udb47coatr2zbrk31hsfbr22y&dl=1" -O sonnets.txt
text = (open("sonnets.txt").read())
text = text.lower().strip()

File ‘sonnets.txt’ already there; not retrieving.


In [109]:
# Preview the first 1000 characters of the normalized corpus.
print(text[:1000])

i

 from fairest creatures we desire increase,
 that thereby beauty's rose might never die,
 but as the riper should by time decease,
 his tender heir might bear his memory:
 but thou, contracted to thine own bright eyes,
 feed'st thy light's flame with self-substantial fuel,
 making a famine where abundance lies,
 thy self thy foe, to thy sweet self too cruel:
 thou that art now the world's fresh ornament,
 and only herald to the gaudy spring,
 within thine own bud buriest thy content,
 and tender churl mak'st waste in niggarding:
   pity the world, or else this glutton be,
   to eat the world's due, by the grave and thee.

 ii

 when forty winters shall besiege thy brow,
 and dig deep trenches in thy beauty's field,
 thy youth's proud livery so gazed on now,
 will be a tatter'd weed of small worth held:
 then being asked, where all thy beauty lies,
 where all the treasure of thy lusty days;
 to say, within thine own deep sunken eyes,
 were an all-eating shame, and thriftless praise.


Here's my solution for the `CharacterDataset` class.

Note that it returns an entire sequence of tokens for the target (unlike what we did on Monday where it only output a single token for the target.)

In [110]:
class CharacterDataset(Dataset):
  """
  Character-level language-modelling dataset.

  Item `idx` is a pair (input, target) of token tensors, each of length
  `seq_len`, where the target is the input shifted one character to the right.
  """

  def __init__(self,text,seq_len=100,device='cpu'):
    """
    Initialize a dataset using character tokenization.
    Arguments:
      text: a string containing the dataset
      seq_len: sequence length provided by __getitem__
      device: device for PyTorch tensors
    """
    self.text = text
    self.seq_len = seq_len
    # The vocabulary is every distinct character of the text, in sorted order;
    # a character's token id is its position in this string.
    self.vocabulary = ''.join(sorted(set(text)))
    self.index_to_char = dict(enumerate(self.vocabulary))
    self.char_to_index = {char:n for n, char in enumerate(self.vocabulary)}
    self.device = device

  def __len__(self):
    """ Return the number of (input, target) sequence pairs in the dataset. """
    # Each item consumes seq_len+1 characters, so valid start indices are
    # 0 .. len(text)-seq_len-1 inclusive, i.e. len(text)-seq_len items.
    # Clamp at zero so a text that is too short gives an empty dataset
    # instead of a negative length (which makes len() raise).
    return max(0,len(self.text)-self.seq_len)

  def __getitem__(self,idx):
    """
    Return the input and target sequences starting at given index.

    Raises IndexError when idx is outside [0, len(self)), which also lets a
    plain `for x, y in dataset` loop terminate.
    """
    if not 0 <= idx < len(self):
      raise IndexError(f'index {idx} is out of range for dataset of length {len(self)}')

    # One extra character is taken so the target can be the input shifted by one.
    text = self.text[idx:idx+self.seq_len+1]
    tokens = self.encode(text)

    return torch.tensor(tokens[:-1],device=self.device),torch.tensor(tokens[1:],device=self.device)

  def encode(self,text):
    """
    Encode a string to a list of integer tokens.

    Characters that are not in the vocabulary are mapped to None.
    """
    return [self.char_to_index.get(char) for char in text]

  def decode(self,tokens):
    """ Decode a list of token integers into a string. """
    return ''.join(map(self.index_to_char.get,tokens))

In [111]:
# Build the character-level dataset over the whole corpus.
ds = CharacterDataset(text,seq_len=seq_len,device=device)

In [112]:
# Sanity check: encode the first 100 characters into vocabulary indices.
ds.encode(text[:100])

[38,
 20,
 0,
 0,
 1,
 17,
 29,
 26,
 24,
 1,
 17,
 12,
 20,
 29,
 16,
 30,
 31,
 1,
 14,
 29,
 16,
 12,
 31,
 32,
 29,
 16,
 30,
 1,
 34,
 16,
 1,
 15,
 16,
 30,
 20,
 29,
 16,
 1,
 20,
 25,
 14,
 29,
 16,
 12,
 30,
 16,
 6,
 0,
 1,
 31,
 19,
 12,
 31,
 1,
 31,
 19,
 16,
 29,
 16,
 13,
 36,
 1,
 13,
 16,
 12,
 32,
 31,
 36,
 3,
 30,
 1,
 29,
 26,
 30,
 16,
 1,
 24,
 20,
 18,
 19,
 31,
 1,
 25,
 16,
 33,
 16,
 29,
 1,
 15,
 20,
 16,
 6,
 0,
 1,
 13,
 32,
 31,
 1,
 12,
 30]

In [113]:
# Sanity check: encoding then decoding should reproduce the original text.
print(ds.decode(ds.encode(text[:100])))

i

 from fairest creatures we desire increase,
 that thereby beauty's rose might never die,
 but as


In [114]:
# Fetch the first (input, target) pair and inspect the two tensor shapes.
x, y = ds[0]
x.shape, y.shape

(torch.Size([20]), torch.Size([20]))

In [115]:
# Mini-batch loader over the dataset; sample order is reshuffled on each pass.
dl = DataLoader(ds,shuffle=True,batch_size=batch_size)

Here's my solution for the recurrent neural network (RNN) implementation.

In [116]:
class CharacterRNN(nn.Module):
  """
  Simple recurrent network for next-character prediction.

  For each time step t the model computes
    h_t = SiLU(U(e_t) + W(h_{t-1})),   logits_t = V(h_t)
  where e_t is the embedding of the t-th input token and h_0 is all zeros.
  """

  def __init__(self,vocabulary_size,hidden_size):
    """
    Arguments:
      vocabulary_size: number of distinct tokens (embedding rows and output logits)
      hidden_size: size of both the token embedding and the hidden state
    """
    super().__init__()
    self.embedding = nn.Embedding(vocabulary_size,hidden_size)
    self.hidden_size = hidden_size
    self.U = nn.Linear(hidden_size,hidden_size)  # input-to-hidden
    self.W = nn.Linear(hidden_size,hidden_size)  # hidden-to-hidden
    self.act = nn.SiLU()
    self.V = nn.Linear(hidden_size,vocabulary_size)  # hidden-to-logits

  def forward(self,x):
    """
    Arguments:
      x: integer token tensor of shape [B,N]
    Returns:
      logits of shape [B,N,vocabulary_size], one prediction per input position
    """
    x = self.embedding(x)
    B,N = x.shape[:2]
    # new_zeros inherits dtype and device from the embedded input, so the model
    # keeps working after .double()/.half() or after being moved to another
    # device (a plain torch.zeros would always be float32).
    h = x.new_zeros(B,self.hidden_size)
    # The input projection does not depend on the recurrence, so it is applied
    # to all time steps at once outside the loop.
    Ux = self.U(x)
    y = []
    for i in range(N):
      h = self.act(Ux[:,i] + self.W(h))
      y.append(self.V(h))
    return torch.stack(y,dim=1)

In [117]:
# One output logit per vocabulary character; move the model to the chosen device.
model = CharacterRNN(len(ds.vocabulary),hidden_size).to(device)

In [118]:
# Pull a single mini-batch from the loader and inspect its shapes.
x_batch, y_batch = next(iter(dl))
x_batch.shape, y_batch.shape

(torch.Size([32, 20]), torch.Size([32, 20]))

In [119]:
# Run the batch through the model to check the shape of the output logits.
model(x_batch).shape

torch.Size([32, 20, 39])

Finally here is my code to train the model.

Note that I needed to use `.view()` to reshape the model output and target, because the loss and metric functions want the data to have shape [B,C] not [B,N,C].

In [120]:
# Adam optimizer over all model parameters with learning rate `lr`.
opt = torch.optim.Adam(model.parameters(),lr=lr)
# Cross-entropy between predicted logits and target token indices.
loss_fn = nn.CrossEntropyLoss()

# Multiclass accuracy with one class per vocabulary character.
metric = torchmetrics.classification.Accuracy(task="multiclass", num_classes=len(ds.vocabulary))
# Move the metric to `device`, the same device used for the model and data.
metric.to(device)

MulticlassAccuracy()

In [121]:
epochs = 10

for epoch in range(epochs):
  # ---- Training pass: one optimizer step per mini-batch ----
  model.train()
  for x_batch, y_batch in tqdm(dl):
    opt.zero_grad()

    # Forward pass: logits of shape [B,N,C]
    y_pred = model(x_batch)
    # Compute loss; batch and sequence dimensions are flattened together
    # because the loss expects [B*N,C] logits and [B*N] targets.
    loss = loss_fn(y_pred.view(-1,len(ds.vocabulary)),y_batch.view(-1))

    loss.backward()

    opt.step()

  # ---- Evaluation pass ----
  # Accuracy is measured over the same loader that was used for training.
  model.eval()

  metric.reset()
  # No gradients are needed for evaluation; disabling them avoids building
  # an autograd graph for every batch.
  with torch.no_grad():
    for x_batch, y_batch in tqdm(dl):
      y_pred = model(x_batch)
      # Accumulate per-character accuracy statistics for this batch.
      metric(y_pred.view(-1,len(ds.vocabulary)),y_batch.view(-1))

  acc = metric.compute().item()

  print(f'epoch {epoch}: {acc}')

 39%|███▉      | 1196/3060 [57:55<1:30:16,  2.91s/it]
100%|██████████| 3060/3060 [00:12<00:00, 240.84it/s]
100%|██████████| 3060/3060 [00:04<00:00, 627.31it/s]


epoch 0: 0.4673592150211334


100%|██████████| 3060/3060 [00:12<00:00, 249.40it/s]
100%|██████████| 3060/3060 [00:04<00:00, 680.17it/s]


epoch 1: 0.49048253893852234


100%|██████████| 3060/3060 [00:11<00:00, 255.97it/s]
100%|██████████| 3060/3060 [00:04<00:00, 688.90it/s]


epoch 2: 0.5028749108314514


100%|██████████| 3060/3060 [00:12<00:00, 240.21it/s]
100%|██████████| 3060/3060 [00:04<00:00, 671.68it/s]


epoch 3: 0.5104367733001709


100%|██████████| 3060/3060 [00:11<00:00, 263.00it/s]
100%|██████████| 3060/3060 [00:04<00:00, 699.77it/s]


epoch 4: 0.5144118070602417


100%|██████████| 3060/3060 [00:11<00:00, 275.82it/s]
100%|██████████| 3060/3060 [00:04<00:00, 695.58it/s]


epoch 5: 0.5187106132507324


100%|██████████| 3060/3060 [00:11<00:00, 258.54it/s]
100%|██████████| 3060/3060 [00:04<00:00, 701.30it/s]


epoch 6: 0.5227249264717102


100%|██████████| 3060/3060 [00:12<00:00, 238.01it/s]
100%|██████████| 3060/3060 [00:05<00:00, 605.70it/s]


epoch 7: 0.5248480439186096


100%|██████████| 3060/3060 [00:12<00:00, 249.98it/s]
100%|██████████| 3060/3060 [00:04<00:00, 645.64it/s]


epoch 8: 0.5277913808822632


100%|██████████| 3060/3060 [00:11<00:00, 256.77it/s]
100%|██████████| 3060/3060 [00:04<00:00, 659.41it/s]

epoch 9: 0.5285513401031494





### Exercises

1. Write a deterministic function to generate text given some starter text.  The function should iteratively add characters to the prompt using the trained model.  This version should be deterministic, in that it always takes the most likely next character according to the model.

Test the function by prompting it with the first 10 characters in the dataset.

In [122]:
def generate_text_deterministic(model,prompt,num_to_generate=1000):
    """
    Greedily extend `prompt` one character at a time using `model`.

    At every step the whole sequence so far is fed to the model and the
    character with the highest logit at the last position is appended.

    Arguments:
      model: trained character model returning logits of shape [B,N,C]
      prompt: starter text; every character must be in `ds.vocabulary`
      num_to_generate: number of characters to append to the prompt
    Returns:
      the prompt followed by the generated characters
    Raises:
      ValueError: if the prompt contains a character not in `ds.vocabulary`

    Uses the notebook-level `ds` (vocabulary) and `device`.
    """
    model.eval()

    # Encode the prompt once and keep extending the token list, instead of
    # re-encoding the whole growing string on every iteration.
    tokens = [ds.vocabulary.index(c) for c in prompt]
    generated = []

    with torch.no_grad(): # disable gradient computation
        for _ in range(num_to_generate):
            in_tensor = torch.tensor(tokens).unsqueeze(0).to(device)

            lg = model(in_tensor)
            last_lg = lg[0,-1,:]  # logits for the character after the sequence

            # get next character deterministically
            next_index = torch.argmax(last_lg).item()
            tokens.append(next_index)
            generated.append(ds.index_to_char[next_index])
    # Join once at the end rather than concatenating strings in the loop.
    return prompt + ''.join(generated)

# get starter text on first 10 characters in dataset
start_text = text[:10]
gen_out = generate_text_deterministic(model,start_text)
print(gen_out)

i

 from the world the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars the stars t

3. Write a stochastic version of the text generation function.  This one should use `torch.multinomial` to sample the next character.  Note that you will need to apply `torch.softmax` to convert the model output to probabilities.  (In my experience, if you don't do this you end up with a CUDA error and need to restart your kernel, so be careful!)

Test the function by prompting it with the first 10 characters in the dataset, and run the generation multiple times to verify the stochastic behavior.

In [123]:
def generate_text_stochastic(model,prompt,num_to_generate=1000):
    """
    Extend `prompt` one character at a time by sampling from `model`.

    At every step the whole sequence so far is fed to the model, the logits at
    the last position are turned into probabilities with softmax, and the next
    character is drawn from that distribution with `torch.multinomial`.

    Arguments:
      model: trained character model returning logits of shape [B,N,C]
      prompt: starter text; every character must be in `ds.vocabulary`
      num_to_generate: number of characters to append to the prompt
    Returns:
      the prompt followed by the sampled characters
    Raises:
      ValueError: if the prompt contains a character not in `ds.vocabulary`

    Uses the notebook-level `ds` (vocabulary) and `device`.
    """
    model.eval()

    # Encode the prompt once and keep extending the token list, instead of
    # re-encoding the whole growing string on every iteration.
    tokens = [ds.vocabulary.index(c) for c in prompt]
    generated = []

    with torch.no_grad(): # disable gradient computation
        for _ in range(num_to_generate):
            in_tensor = torch.tensor(tokens).unsqueeze(0).to(device)

            lg = model(in_tensor)
            last_lg = lg[0,-1,:]  # logits for the character after the sequence

            # get next character stochastically; multinomial needs
            # probabilities, so the raw logits go through softmax first
            probs = F.softmax(last_lg,dim=-1)
            next_index = torch.multinomial(probs,1).item()
            tokens.append(next_index)
            generated.append(ds.index_to_char[next_index])
    # Join once at the end rather than concatenating strings in the loop.
    return prompt + ''.join(generated)

# get starter text on first 10 characters in dataset
start_text = text[:10]
gen_out = generate_text_stochastic(model,start_text)
print(gen_out)

i

 from thee, as hast mine earrory-day?
 which proust new;
 thry jecoudd love i do beauty be acquainted creal once,
 and prestation'd,
 by be thaurs i nou firer be when i draw,
 thy glamt be
 arguldant is thy loves the his chimes,
 that befores trust, thou me infantion detery that was for as bisthings and intly thankle go owner i uprive your weaking awad not such cruety:
 be kind,
 sumindn;
 or and thy fighs,
 and therebrean the strong thou that you truth
 withon thing awked for love's window'd free;
 useld mone,
 and of deyle auturn thou dot the blood doth gracious,
 when i hold heach tell
 which thy life decerses of more no rilties new conter with spirl'e thy bared of she do i can not soke eyes,
 tank yournst wint should true most pling thou form this plevoe then fowll love and purbed sprince throw of a moo's to me gumn most thy shows,
 and thou am a fair, what grief;
 but in this,--deven of him while one dot otterest be fire,
 a falls summor comments,
 orde, but in the saully,
 or 