In [1]:
from pathlib import Path
import urllib.request


def download_dataset(path="data/shakespeare/shakespeare.txt", url="https://homl.info/shakespeare"):
    """Return the Shakespeare corpus as a string, downloading it on first use.

    Args:
        path: Where the corpus is cached on disk (str or Path).
        url: Where to fetch the corpus from when the cache file is missing.

    Returns:
        The full text of the cached file, decoded as UTF-8.
    """
    path = Path(path)
    if not path.is_file():
        path.parent.mkdir(parents=True, exist_ok=True)
        # Download to a side file and rename once complete, so an interrupted
        # download can never be mistaken for a finished one on the next run.
        partial = path.with_name(path.name + ".part")
        urllib.request.urlretrieve(url, partial)
        partial.replace(path)
    # Decode explicitly: the platform default encoding is not UTF-8 everywhere.
    return path.read_text(encoding="utf-8")

In [24]:
# Load the corpus (downloaded on first use) and preview its first 80 characters.
shakespeare_text = download_dataset()
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [32]:
# torch is used below but is otherwise only imported in a later cell;
# import it here so the notebook survives Restart & Run All.
import torch

# Pick the best available backend: CUDA first, then Apple MPS, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print("Device: ", device)

Device:  mps


In [None]:
# Character-level vocabulary: every distinct character of the lower-cased corpus, sorted.
# Each of these characters is later mapped to a unique integer id.
vocab = sorted({char for char in shakespeare_text.lower()})
print(*vocab, sep="")  # every vocabulary character back to back (whitespace characters included)


<class 'list'>
39
:

 !$&',-.3:;?abcdefghijklmnopqrstuvwxyz


In [18]:
# Assign a token id to each character: the id is the character's position in vocab.
id_to_char = dict(enumerate(vocab))  # key: token id, value: character
char_to_id = {char: index for index, char in id_to_char.items()}  # key: character, value: token id


In [19]:
# lets create functions to encode and decode texts

import torch

def encode_text(text: str) -> torch.Tensor:
    """Convert text to a 1-D tensor of token ids.

    The text is lower-cased and each character is looked up in ``char_to_id``.

    Raises:
        KeyError: if a (lower-cased) character is missing from ``char_to_id``.
    """
    # Pin the dtype: for an empty string torch.tensor([]) would otherwise be
    # inferred as float32, which is not a valid index tensor.
    return torch.tensor([char_to_id[char] for char in text.lower()], dtype=torch.long)

def decode_text(ids: torch.Tensor) -> str:
    """Convert token ids back to text using ``id_to_char``.

    Args:
        ids: A tensor of token ids (flattened in row-major order if it has more
            than one dimension), or any iterable of integer ids.

    Returns:
        The decoded characters joined into one string.
    """
    if isinstance(ids, torch.Tensor):
        # Iterating a tensor yields 0-d tensors, which hash by object identity and
        # therefore never match the integer keys of id_to_char; use Python ints.
        ids = ids.reshape(-1).tolist()
    return "".join(id_to_char[int(token_id)] for token_id in ids)

In [None]:
from torch.utils.data import DataLoader, Dataset

# defining our datasets by creating a custom dataset class
class CharDataset(Dataset):
    """Sliding-window next-character dataset over an encoded text.

    Item ``i`` is the pair ``(window, target)``: ``window`` holds the token ids at
    positions ``[i, i + window_length)`` and ``target`` is the same span shifted
    one position to the right.
    """

    def __init__(self, text: str, window_length: int):
        """Encode ``text`` with ``encode_text`` and remember the window size.

        Raises:
            ValueError: if ``window_length`` is smaller than 1.
        """
        if window_length < 1:
            raise ValueError("window_length must be at least 1")
        self.text = encode_text(text)
        self.window_length = window_length

    def __len__(self):
        """Number of (window, target) pairs; 0 when the text is too short."""
        # Clamp at zero: len() raises ValueError if __len__ returns a negative number.
        return max(0, len(self.text) - self.window_length)

    def __getitem__(self, index: int):
        """Return the ``(window, target)`` pair starting at ``index``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: if ``index`` is outside the dataset.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError("dataset index out of range")
        end = index + self.window_length
        window = self.text[index: end]
        target = self.text[index + 1: end + 1]
        return window, target

In [25]:
# Hyperparameters
window_length, batch_size = 50, 512

# Split the corpus by character offset: the first 1,000,000 characters for training,
# the next 60,000 for validation and the remainder for testing.
trainset = CharDataset(shakespeare_text[:1_000_000], window_length)
validset = CharDataset(shakespeare_text[1_000_000:1_060_000], window_length)
testset = CharDataset(shakespeare_text[1_060_000:], window_length)

# One loader per split; only the training loader shuffles.
trainloader = DataLoader(dataset=trainset, batch_size=batch_size, shuffle=True)
validloader = DataLoader(dataset=validset, batch_size=batch_size, shuffle=False)
testloader = DataLoader(dataset=testset, batch_size=batch_size, shuffle=False)

In [28]:
# Embedding demo: a table with 5 entries (one per character id), each a 3-dimensional vector.
import torch.nn as nn

torch.manual_seed(42)
embed = nn.Embedding(num_embeddings=5, embedding_dim=3)
# Look up a 2x2 batch of ids; the repeated id 2 yields the same vector both times.
embed(torch.tensor([[3, 2], [0, 2]]))

tensor([[[ 0.2674,  0.5349,  0.8094],
         [ 2.2082, -0.6380,  0.4617]],

        [[ 0.3367,  0.1288,  0.2345],
         [ 2.2082, -0.6380,  0.4617]]], grad_fn=<EmbeddingBackward0>)

In [33]:
#lets build our shakespeare char rnn model

class ShakespeareModel(nn.Module):
    """Character-level model: embedding -> stacked GRU -> linear layer over the vocabulary."""

    def __init__(self, vocab_size, n_layers=2, embed_dim=10, hidden_dim=128, dropout=0.1):
        """Build the three sub-modules.

        Args:
            vocab_size: Number of distinct token ids (embedding rows and output logits).
            n_layers: Number of stacked GRU layers.
            embed_dim: Size of each character embedding.
            hidden_dim: Size of the GRU hidden state.
            dropout: Dropout value passed to the GRU.
        """
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.output = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return logits for a batch of token ids, with the last two axes swapped.

        The linear layer puts the vocabulary on the last axis; the permute moves it
        to axis 1 and the sequence positions to axis 2.
        """
        hidden_seq, _final_states = self.gru(self.embed(x))
        logits = self.output(hidden_seq)
        return logits.permute(0, 2, 1)

# Seed right before construction so the initial weights are reproducible.
torch.manual_seed(42)
model = ShakespeareModel(vocab_size=len(vocab))
model = model.to(device)
        

In [38]:
# Loss function and optimizer used for training
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-2)

def train(model, loss, optimizer, train_loader, epochs):
    """Train ``model`` and print the mean batch loss after every epoch.

    Args:
        model: The module to optimise; batches are moved to the device of its
            first parameter (left where they are if it has no parameters).
        loss: Callable ``loss(predictions, targets)`` returning a scalar tensor.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_loader: Iterable yielding ``(inputs, targets)`` batches.
        epochs: Number of passes over ``train_loader``.
    """
    # Follow the model's own device instead of depending on a notebook-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        n_batches = 0
        for x, y in train_loader:
            if model_device is not None:
                x, y = x.to(model_device), y.to(model_device)
            # Clear gradients before the backward pass so anything left over from
            # an earlier (e.g. interrupted) run cannot leak into this update.
            optimizer.zero_grad()
            batch_loss = loss(model(x), y)
            batch_loss.backward()
            optimizer.step()
            total_loss += batch_loss.item()
            n_batches += 1

        # An empty loader gives no batches; report NaN rather than dividing by zero.
        mean_loss = total_loss / n_batches if n_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")


train(model, loss, optimizer, train_loader=trainloader, epochs=10)



Epoch 1/10, Loss: 3.0872
Epoch 2/10, Loss: 3.0409
Epoch 3/10, Loss: 2.9758
Epoch 4/10, Loss: 2.8775
Epoch 5/10, Loss: 2.7620
Epoch 6/10, Loss: 2.6717
Epoch 7/10, Loss: 2.6064
Epoch 8/10, Loss: 2.5508
Epoch 9/10, Loss: 2.5026
Epoch 10/10, Loss: 2.4601


In [None]:
# Predict one character: take the highest-scoring id at the last position of the prompt.
model.eval()
text = "To be or not to be that is the questio"
encoded = encode_text(text)[None].to(device)  # add a leading batch axis of size 1

with torch.no_grad():
    y_logits = model(encoded)
    pred_char_id = torch.argmax(y_logits[0, :, -1]).item()
    pred_char = id_to_char[pred_char_id]

print(f"{text}{pred_char}")




To be or not to be that is the question


In [None]:
# Extend the prompt by 10 characters, always appending the highest-scoring next character.
text = "To be or not to b"
for _step in range(10):
    encoded = encode_text(text)[None].to(device)  # batch of one sequence
    with torch.no_grad():
        y_logits = model(encoded)
        pred_char_id = torch.argmax(y_logits[0, :, -1]).item()
        pred_char = id_to_char[pred_char_id]
        text = text + pred_char
        print(text)

#this is what we call greedy decoding: we always take the most likely character at each step. it tends to get stuck repeating the same short sequence (here "the the ...")


To be or not to be
To be or not to be 
To be or not to be t
To be or not to be th
To be or not to be the
To be or not to be the 
To be or not to be the t
To be or not to be the th
To be or not to be the the
To be or not to be the the 


In [None]:
# Instead of always taking the most likely character, sample from the model's estimated
# probabilities using a multinomial distribution. A small example:
torch.manual_seed(42)
p = torch.tensor([[0.5, 0.4, 0.1]])  # probabilities of 3 characters: 50%, 40% and 10%
samples = p.multinomial(num_samples=10, replacement=True)  # draw 10 ids from that distribution
print(samples)

#we will also use temperature when sampling from the model
#temperature is a hyperparameter that controls the randomness of the sampling:
#the logits are divided by the temperature before the softmax
#if temperature is high, the model will be more random
#if temperature is low, the model will be more deterministic


tensor([[0, 0, 0, 0, 1, 0, 2, 2, 0, 0]])


In [58]:
import torch.nn.functional as F

def next_char(model, text, temperature=1.0):
    """Pick the character that follows ``text`` according to ``model``.

    Args:
        model: Module returning logits indexed as ``[batch, token id, position]``.
        text: Non-empty prompt; it is encoded with ``encode_text``.
        temperature: Divides the logits before the softmax. Higher values sample
            more randomly, lower values more deterministically; ``0`` selects the
            highest-scoring character outright (greedy decoding).

    Returns:
        The chosen character, looked up in ``id_to_char``.

    Raises:
        ValueError: if ``text`` is empty or ``temperature`` is negative.
    """
    if not text:
        raise ValueError("text must not be empty")
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    encoded = encode_text(text).unsqueeze(dim=0)
    # Follow the model's own device instead of depending on a notebook-level global.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        encoded = encoded.to(first_param.device)

    with torch.no_grad():
        y_logits = model(encoded)
        last_logits = y_logits[0, :, -1]  # scores for the position after the prompt
        if temperature == 0:
            # Dividing by zero would produce inf/nan probabilities; the limit of
            # temperature -> 0 is simply the arg-max.
            predicted_char_id = last_logits.argmax().item()
        else:
            y_prob = F.softmax(last_logits / temperature, dim=-1)
            predicted_char_id = torch.multinomial(y_prob, num_samples=1).item()
        return id_to_char[predicted_char_id]

def extend_text(model, text, num_chars=80, temperature=1.0):
    """Append ``num_chars`` characters to ``text``, one ``next_char`` call at a time.

    The model is switched to eval mode first; every call to ``next_char`` receives
    the text generated so far. Returns the extended string.
    """
    model.eval()
    generated = text
    for _step in range(num_chars):
        generated = generated + next_char(model, generated, temperature)
    return generated

print(extend_text(model, "to be or not to b", num_chars=80, temperature=0.7))


to be or not to by i lith
thar yome your mey thoo pis the couf,
ad ton was tith for pou couly
sad


In [None]:
#top-k sampling and beam search are alternatives to this kind of probability sampling