Generate char_set from the document

In [2]:
import numpy as np

## Reading and preprocessing text
with open("../../datasets/pg1268.txt", "r", encoding="utf8") as fp:
    text = fp.read()

start_idx = text.find("THE MYSTERIOUS ISLAND")
end_idx = text.find("End of the Project Gutenberg")
text = text[start_idx:end_idx]
char_set = set(text)

In [3]:
char_set

{'\n',
 ' ',
 '!',
 '$',
 '%',
 '&',
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '=',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '—',
 '‘',
 '’',
 '“',
 '”',
 '•',
 '™'}

In [4]:
print(f"Total set size: {len(char_set)}")
print(f"Total text length in chars: {len(text)}")

Total set size: 86
Total text length in chars: 1130779


int mapping

In [5]:
chars_sorted = sorted(char_set)
char2int = {ch: i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array([char2int[char] for char in text], dtype=np.int32)

print(f"Text encoded shape: {text_encoded.shape}")

Text encoded shape: (1130779,)


In [6]:
print(text[:15])
print(text_encoded[:15])

THE MYSTERIOUS 
[46 34 31  1 39 51 45 46 31 44 35 41 47 45  1]


Forming a dataset

In [7]:
import torch
from torch.utils.data import Dataset


seq_length = 40  # Hyperparameter
chunk_size = seq_length + 1

text_chunks = np.array(
    [text_encoded[i : i + chunk_size] for i in range(len(text_encoded) - chunk_size)]
)


class TextDataset(Dataset):
    def __init__(self, text_chunks):
        super().__init__()
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)

    def __getitem__(self, index):
        text_chunk = self.text_chunks[index]
        return text_chunk[:-1].long(), text_chunk[1:].long()


seq_dataset = TextDataset(torch.tensor(text_chunks))

Examples

In [8]:
for i, (seq, target) in enumerate(seq_dataset):
    print("Input (x): ", "".join(char_array[seq]))
    print("Target (y): ", "".join(char_array[target]))
    print()
    if i == 1:
        break

Input (x):  THE MYSTERIOUS ISLAND ***

THE MYSTERIOU
Target (y):  HE MYSTERIOUS ISLAND ***

THE MYSTERIOUS

Input (x):  HE MYSTERIOUS ISLAND ***

THE MYSTERIOUS
Target (y):  E MYSTERIOUS ISLAND ***

THE MYSTERIOUS 



Batching

In [9]:
from torch.utils.data import DataLoader

batch_size = 64
torch.manual_seed(1337)
seq_dataloader = DataLoader(
    dataset=seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True
)

Model

In [10]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.GRU(embed_dim,rnn_hidden_size,batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)
    def forward(self, x, hidden):
        out = self.embedding(x).unsqueeze(1)
        out, hidden = self.rnn(out, hidden)
        out = self.fc(out).reshape(out.size(0),-1)
        return out, hidden
    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden


        

In [11]:
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model

RNN(
  (embedding): Embedding(86, 256)
  (rnn): GRU(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=86, bias=True)
)

Setup training loop

In [12]:
device = torch.device("cuda:0")

In [None]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_epochs = 10000 

torch.manual_seed(1)
model = model.to(device)

for epoch in range(num_epochs):
    hidden = model.init_hidden(batch_size).to(device)
    seq_batch, target_batch = next(iter(seq_dataloader))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden = model(seq_batch[:, c], hidden) 
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 4.4793
Epoch 500 loss: 1.4327
Epoch 1000 loss: 1.3615
Epoch 1500 loss: 1.3752
Epoch 2000 loss: 1.3700
Epoch 2500 loss: 1.2882
Epoch 3000 loss: 1.3296


Evaluation phase: generating text passages

In [None]:
from torch.distributions.categorical import Categorical


logits = torch.tensor([[1.0, 1.0, 1.0]])

print('Probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))
 
print(samples.numpy())