In [4]:
from pathlib import Path
import urllib.request

In [7]:
def download_shakespeare_text():
    """Return the Shakespeare corpus as a string, downloading it on first use.

    The text is cached at ``data/shakespeare/shakespeare.txt`` (relative to the
    current working directory); later calls read the cached copy instead of
    hitting the network again.

    Returns:
        str: The full contents of the cached file, decoded as UTF-8.
    """
    path = Path("data/shakespeare/shakespeare.txt")
    if not path.is_file():
        path.parent.mkdir(parents=True, exist_ok=True)
        url = "https://homl.info/shakespeare"
        # Download to a sibling temp file and rename only on success, so an
        # interrupted download never leaves a truncated file that later runs
        # would mistake for a valid cache.
        partial_path = path.with_name(path.name + ".part")
        try:
            urllib.request.urlretrieve(url, partial_path)
            partial_path.replace(path)
        finally:
            partial_path.unlink(missing_ok=True)
    # Decode explicitly: the platform default encoding is not UTF-8 everywhere.
    return path.read_text(encoding="utf-8")


# Load the corpus once; later cells lower-case, slice and encode this string.
shakespeare_text = download_shakespeare_text()

In [8]:
# Sanity check: show the first 80 characters of the loaded text.
print("Done", shakespeare_text[:80])

Done First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [13]:
# Vocabulary: every distinct character of the lower-cased corpus, sorted so
# that a character's position in the list is stable across runs.
vocab = sorted(set(shakespeare_text.lower()))
# Display the whole vocabulary as a single string.
"".join(vocab)


"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [14]:
# Two-way lookup tables between characters and integer ids; a character's id
# is its position in the sorted vocabulary.
id_to_char = dict(enumerate(vocab))
char_to_id = {symbol: position for position, symbol in id_to_char.items()}

In [20]:

import torch

def encode(t):
    """Convert text into a 1-D tensor of character ids.

    The text is lower-cased first, then each character is looked up in
    ``char_to_id``.

    Args:
        t (str): Text to encode.

    Returns:
        torch.Tensor: ``torch.long`` tensor with one id per character.

    Raises:
        KeyError: If a character (after lower-casing) is not in ``char_to_id``.
    """
    # Pin the dtype: without it an empty string yields a float32 tensor, which
    # cannot be used as an index tensor for nn.Embedding.
    return torch.tensor([char_to_id[c] for c in t.lower()], dtype=torch.long)

def decode(i):
    """Convert a sequence of character ids back into a string.

    Args:
        i: 1-D tensor of ids (as produced by ``encode``), or any iterable of
            ints / single-element tensors.

    Returns:
        str: The characters for the ids, concatenated in order.

    Raises:
        KeyError: If an id is not in ``id_to_char``.
    """
    # One bulk tolist() replaces a per-element .item() call; plain Python
    # sequences have no tolist() and are iterated as they are.
    ids = i.tolist() if hasattr(i, "tolist") else i
    # int() normalises both Python ints and single-element tensors to dict keys.
    return "".join(id_to_char[int(char_id)] for char_id in ids)

In [21]:
# Round-trip check: decoding the encoded ids should reproduce the input text.
encoded = encode("hello")
decoded = decode(encoded)
print(encoded, decoded)

    

tensor([20, 17, 24, 24, 27]) hello


In [32]:
from torch.utils.data import Dataset, DataLoader

class charDataset(Dataset):
    """Sliding-window dataset over an encoded text for next-character prediction.

    Item ``idx`` is a pair ``(window, target)`` where ``window`` holds
    ``window_length`` consecutive ids starting at ``idx`` and ``target`` is the
    same span shifted one position to the right.
    """

    def __init__(self, text, window_length):
        """Encode ``text`` once and remember the window size.

        Args:
            text (str): Raw text; it is passed through ``encode``.
            window_length (int): Number of ids in each window.
        """
        self.encode = encode(text)
        self.window_length = window_length

    def __len__(self):
        """Return the number of windows that still have a complete target."""
        # Clamp at zero: a text no longer than the window has no samples, and
        # len() raises ValueError on a negative result.
        return max(0, len(self.encode) - self.window_length)

    def __getitem__(self, idx):
        """Return the ``(window, target)`` pair starting at position ``idx``.

        Negative indices count from the end, as with built-in sequences.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if idx < 0:
            # Normalise first: slicing with a raw negative start would give
            # an empty or misaligned window instead of an error.
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError("Dataset Index out of range")
        end = idx + self.window_length
        window = self.encode[idx:end]
        target = self.encode[idx + 1:end + 1]
        return window, target

In [34]:
# Sampling configuration shared by all three splits.
window_length = 50
batch_size = 512

# Contiguous split of the corpus: the first 1,000,000 characters are for
# training, the next 60,000 for validation, and the remainder for testing.
train_end, valid_end = 1_000_000, 1_060_000
train_set = charDataset(shakespeare_text[:train_end], window_length)
valid_set = charDataset(shakespeare_text[train_end:valid_end], window_length)
test_set = charDataset(shakespeare_text[valid_end:], window_length)

# Only the training loader is asked to shuffle.
train_loader = DataLoader(train_set, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_set, batch_size=batch_size)
test_loader = DataLoader(test_set, batch_size=batch_size)

In [36]:
import torch.nn as nn
# Seed first so the embedding weights created below are reproducible.
torch.manual_seed(42)

# Demo: look up a 2x2 batch of category ids; each id becomes a 3-value vector.
embed = nn.Embedding(5, 3)  # 5 categories, each mapped to a 3-dimensional embedding
embed(torch.tensor([[3, 2], [0, 2]]))


tensor([[[ 0.2674,  0.5349,  0.8094],
         [ 2.2082, -0.6380,  0.4617]],

        [[ 0.3367,  0.1288,  0.2345],
         [ 2.2082, -0.6380,  0.4617]]], grad_fn=<EmbeddingBackward0>)

In [44]:
class Model(nn.Module):
    """Character-level language model: embedding -> stacked GRU -> linear head.

    Args:
        vocab_size (int): Number of distinct character ids.
        n_layers (int): Number of stacked GRU layers.
        embed_dim (int): Size of each character embedding.
        hidden_dim (int): Size of the GRU hidden state.
        dropout (float): Dropout applied between GRU layers.
    """

    def __init__(self, vocab_size, n_layers=2, embed_dim=10, hidden_dim=18, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        # nn.GRU applies dropout only between layers, so with a single layer
        # it has no effect and merely triggers a UserWarning; pass 0 then.
        gru_dropout = dropout if n_layers > 1 else 0.0
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=n_layers,
                          batch_first=True, dropout=gru_dropout)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Compute logits over the vocabulary for every position.

        Args:
            x (torch.Tensor): Integer ids of shape ``(batch, seq_len)``.

        Returns:
            torch.Tensor: Logits of shape ``(batch, vocab_size, seq_len)``.
        """
        embeddings = self.embed(x)
        hidden_states, _ = self.gru(embeddings)  # final hidden state is unused
        logits = self.out(hidden_states)
        # Swap the last two axes so the vocabulary axis sits at index 1
        # (presumably for nn.CrossEntropyLoss, which expects classes there).
        return logits.permute(0, 2, 1)


# Seed before construction so the initial weights are reproducible.
torch.manual_seed(42)
model = Model(len(vocab))
# train() returns the module itself, so the cell also displays its structure.
model.train()

        

Model(
  (embed): Embedding(39, 10)
  (gru): GRU(10, 18, num_layers=2, batch_first=True, dropout=0.1)
  (out): Linear(in_features=18, out_features=39, bias=True)
)

In [45]:
# NOTE(review): no training step is visible before this cell, so the
# prediction below appears to come from freshly initialised weights — confirm.
model.eval()  # don't forget to switch the model to evaluation mode!
text = "To be or not to b"
encoded_text = encode(text).unsqueeze(dim=0)  # add a leading batch dimension of size 1
with torch.no_grad():
    Y_logits = model(encoded_text)
    # Scores for the last position of the only sequence; argmax is the greedy pick.
    predicted_char_id = Y_logits[0, :, -1].argmax().item()
    predicted_char = id_to_char[predicted_char_id]
    print(predicted_char)

a


In [57]:
# Seed so the sampled indices shown below are reproducible.
torch.manual_seed(42)
probs = torch.tensor([[0.5, 0.4, 0.1]])
# Draw 10 category indices with replacement, weighted by probs.
samples = torch.multinomial(probs, num_samples=10, replacement=True)
samples

tensor([[0, 0, 0, 0, 1, 0, 2, 2, 0, 0]])

In [64]:
import torch.nn.functional as F

def next_char(model, text, temperature=1):
    """Sample the character that follows ``text`` according to ``model``.

    The model's train/eval mode is left untouched; switch it to eval mode
    beforehand if dropout should be inactive while sampling.

    Args:
        model: Callable mapping a ``(1, seq_len)`` id tensor to logits indexed
            as ``[batch, char_id, position]``.
        text (str): Context to continue; encoded with ``encode``.
        temperature (float): Divides the logits before the softmax. Values
            near zero make sampling nearly deterministic; exactly ``0`` picks
            the highest-scoring character (greedy decoding).

    Returns:
        str: A single character taken from ``id_to_char``.
    """
    encoded_text = encode(text).unsqueeze(dim=0)
    with torch.no_grad():
        Y_logits = model(encoded_text)
        last_logits = Y_logits[0, :, -1]
        if temperature == 0:
            # Dividing by zero would fill the logits with inf/nan and make
            # multinomial fail; the zero-temperature limit is the argmax.
            predicted_char_id = last_logits.argmax().item()
        else:
            Y_probas = F.softmax(last_logits / temperature, dim=-1)
            predicted_char_id = torch.multinomial(Y_probas, num_samples=1).item()
    return id_to_char[predicted_char_id]

In [65]:
def extend_text(model, text, n_chars=80, temperature=1):
    """Append ``n_chars`` sampled characters to ``text`` and return the result.

    Each new character is drawn by ``next_char`` from everything generated so
    far, so the context grows by one character per step.
    """
    generated = text
    for _step in range(n_chars):
        following = next_char(model, generated, temperature)
        generated = generated + following
    return generated



In [66]:
# A temperature this close to zero makes sampling almost always pick the
# highest-scoring character at each step.
print(extend_text(model, "To be or not to be", temperature=0.01))

To be or not to beaaaaaaaaaax aaaaaaaaaaaaaaaaax aaaaaaaaaaaaaaaaaaaaaaaaax saaaaaaaaaaaaaaaaax aa
