Generate char_set from the document

In [1]:
import numpy as np

## Reading and preprocessing text
with open("../../datasets/pg1268.txt", "r", encoding="utf8") as fp:
    text = fp.read()

start_idx = text.find("THE MYSTERIOUS ISLAND")
end_idx = text.find("End of the Project Gutenberg")
text = text[start_idx:end_idx]
char_set = set(text)

In [2]:
char_set

{'\n',
 ' ',
 '!',
 '$',
 '%',
 '&',
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '=',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '—',
 '‘',
 '’',
 '“',
 '”',
 '•',
 '™'}

In [3]:
print(f"Total set size: {len(char_set)}")
print(f"Total text length in chars: {len(text)}")

Total set size: 86
Total text length in chars: 1130779


int mapping

In [4]:
chars_sorted = sorted(char_set)
char2int = {ch: i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array([char2int[char] for char in text], dtype=np.int32)

print(f"Text encoded shape: {text_encoded.shape}")

Text encoded shape: (1130779,)


In [5]:
print(text[:15])
print(text_encoded[:15])

THE MYSTERIOUS 
[46 34 31  1 39 51 45 46 31 44 35 41 47 45  1]


Forming a dataset

In [6]:
import torch
from torch.utils.data import Dataset


seq_length = 40  # Hyperparameter
chunk_size = seq_length + 1

text_chunks = np.array(
    [text_encoded[i : i + chunk_size] for i in range(len(text_encoded) - chunk_size)]
)


class TextDataset(Dataset):
    def __init__(self, text_chunks):
        super().__init__()
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)

    def __getitem__(self, index):
        text_chunk = self.text_chunks[index]
        return text_chunk[:-1].long(), text_chunk[1:].long()


seq_dataset = TextDataset(torch.tensor(text_chunks))

Examples

In [7]:
for i, (seq, target) in enumerate(seq_dataset):
    print("Input (x): ", "".join(char_array[seq]))
    print("Target (y): ", "".join(char_array[target]))
    print()
    if i == 1:
        break

Input (x):  THE MYSTERIOUS ISLAND ***

THE MYSTERIOU
Target (y):  HE MYSTERIOUS ISLAND ***

THE MYSTERIOUS

Input (x):  HE MYSTERIOUS ISLAND ***

THE MYSTERIOUS
Target (y):  E MYSTERIOUS ISLAND ***

THE MYSTERIOUS 



Batching

In [8]:
from torch.utils.data import DataLoader

batch_size = 64
torch.manual_seed(1337)
seq_dataloader = DataLoader(
    dataset=seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True
)

Model

In [9]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.GRU(embed_dim,rnn_hidden_size,batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)
    def forward(self, x, hidden):
        out = self.embedding(x).unsqueeze(1)
        out, hidden = self.rnn(out, hidden)
        out = self.fc(out).reshape(out.size(0),-1)
        return out, hidden
    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden


        

In [10]:
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model

RNN(
  (embedding): Embedding(86, 256)
  (rnn): GRU(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=86, bias=True)
)

Setup training loop

In [11]:
device = torch.device("cuda:0")

In [12]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

num_epochs = 10000 

torch.manual_seed(1)
model = model.to(device)

for epoch in range(num_epochs):
    hidden = model.init_hidden(batch_size).to(device)
    seq_batch, target_batch = next(iter(seq_dataloader))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden = model(seq_batch[:, c], hidden) 
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 4.4793
Epoch 500 loss: 1.4095
Epoch 1000 loss: 1.2884
Epoch 1500 loss: 1.2466
Epoch 2000 loss: 1.2233
Epoch 2500 loss: 1.1478
Epoch 3000 loss: 1.1591
Epoch 3500 loss: 1.1496
Epoch 4000 loss: 1.1488
Epoch 4500 loss: 1.1043
Epoch 5000 loss: 1.1178
Epoch 5500 loss: 1.0553
Epoch 6000 loss: 1.1036
Epoch 6500 loss: 1.0747
Epoch 7000 loss: 1.0201
Epoch 7500 loss: 1.0421
Epoch 8000 loss: 1.0280
Epoch 8500 loss: 1.0201
Epoch 9000 loss: 1.0335
Epoch 9500 loss: 1.0382


Evaluation phase: generating text passages

In [13]:
from torch.distributions.categorical import Categorical


logits = torch.tensor([[1.0, 1.0, 1.0]])

print('Probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))
 
print(samples.numpy())

Probabilities: [0.33333334 0.33333334 0.33333334]
[[0]
 [0]
 [2]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]]


In [14]:
def sample(model, starting_string, len_generated_text = 500, scale_factor=1.0):
    encoded_input = torch.tensor(np.array([char2int[s] for s in starting_string]))
    encoded_input = torch.reshape(encoded_input, (1,-1))
    
    generated_string = starting_string
    model.eval()
    
    hidden = model.init_hidden(1).to(device)
    hidden = hidden.to('cpu')
    for c in range(len(starting_string)-1):
        _, hidden = model(encoded_input[:,c].view(1), hidden)
    last_char = encoded_input[:,-1]
    
    for i in range(len_generated_text):
        logits, hidden = model(last_char.view(1), hidden)
        logits = torch.squeeze(logits,0)
        scaled_logits = logits*scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_string += str(char_array[last_char])
        
    return generated_string


model.to('cpu')
print(sample(model, starting_string='The island'))

The island, whose inhabisant was that current and thrown on
the tire. It was
this, the stranger did Herbert, “shall we do not suppose put,” remorse being could be seen, the inferior cone was
consider that can come, which had only to be mentiously. You are not return to the snow-possible, and to be absolutely surveying the work.

Indeed, the fire was lamented by water. But they seems to
help his shot spring. They were existed.

“It is useless, they were extended to pass winter would not one o’clock. He con


Predictability vs randomness

In [15]:
logits = torch.tensor([[1.0, 1.0, 3.0]])

print('Probabilities before scaling:        ', nn.functional.softmax(logits, dim=1).numpy()[0])

print('Probabilities after scaling with 0.5:', nn.functional.softmax(0.5*logits, dim=1).numpy()[0])

print('Probabilities after scaling with 0.1:', nn.functional.softmax(0.1*logits, dim=1).numpy()[0])

Probabilities before scaling:         [0.10650698 0.10650698 0.78698605]
Probabilities after scaling with 0.5: [0.21194156 0.21194156 0.57611686]
Probabilities after scaling with 0.1: [0.3104238  0.3104238  0.37915248]


In [16]:
print(sample(model, starting_string='The island', 
             scale_factor=2.0))

The island had never consented with importance, was about to be done but for the
midst of the island, the settlers had discovered the prisoner of the balloon, and the sailor was
already enough to consting him on the shore. They were not more than fine and clearly the case was not mistaken. The settlers had formed a symptoms of a steam which was wanting in the Southern Hemisphere. It was therefore completed, and the colony could not be decided that the wind dragged his companions, entered the deck of the w


In [17]:
print(sample(model, starting_string='The island', 
             scale_factor=0.5))

The island colleyed Cla;fuls elg
idoed,.”

“Do
than mat has
abyss’ assica slogrizot, cleeks. This ram,’s hazplaosate symadz; uthentes
werfel, drief,--taysion
was tweeh, Ayrton?”

Two obtaicou.

Cynvituuvers Juss,
cany anchporviby Pencroft. A obscure history-foot.
Serterceapers were now the jaguars, tweicle recolodimed his caprifths alx, tannying out; “an weot of the ovan
away theem, (ot was easy;
duderous, would bewwed them not?”
d. Juden Bolage;
camase. As SIBj6kent,” remolated;
wentorn’s rays, able, Des
