In [3]:
import pandas as pd

# Load the 'review' column of the IMDB dataset and turn HTML line breaks into
# real newlines. The vectorised `.str.replace(..., regex=False)` performs the
# same literal substitution as a per-row `str.replace`, but leaves missing
# values as NaN instead of raising AttributeError on them.
# NOTE(review): despite its name, `df` is a Series (one column), not a DataFrame.
df = pd.read_csv('projects/blog/4-reviews/IMDB Dataset.csv')['review'].str.replace('<br />', '\n', regex=False)
print(df.head())

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. \n\nThe filming...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object


In [1]:
import tokenizers
from tokenizers import Tokenizer

# Load the tokenizer definition stored as JSON alongside the Shakespeare corpus.
tokenizer = Tokenizer.from_file("projects/blog/4-all-shakespeare/tokenizer.json")
# Attach a byte-level decoder so that `tokenizer.decode(...)` (used in the
# generation cell further down) goes through ByteLevel decoding.
tokenizer.decoder = tokenizers.decoders.ByteLevel()

In [2]:
# Read the complete corpus into memory as a single string. The encoding is
# pinned so the result does not depend on the platform's default locale.
# NOTE(review): assumes shakespeare.txt is UTF-8 encoded — confirm.
with open("projects/blog/4-all-shakespeare/shakespeare.txt", "r", encoding="utf-8") as f:
    corpus = f.read()
print(f"Corpus has {len(corpus):,} characters")

Corpus has 5,378,662 characters


In [3]:
# Tokenize the whole corpus in one call and keep only the integer token ids.
encoded = tokenizer.encode(corpus).ids
print(f"Encoded corpus has {len(encoded):,} tokens")
# The raw text is not referenced by any later visible cell; drop the name so
# its memory can be reclaimed.
del corpus

Encoded corpus has 2,774,526 tokens


In [4]:
# Contiguous split of the token stream: the leading `train_split` fraction is
# used for training, everything after it for validation.
train_split = 0.8
train_size = int(train_split * len(encoded))
train_data, val_data = encoded[:train_size], encoded[train_size:]
# Both halves are independent copies (list slices), so the full list can go.
del encoded

print(f"Training data has {len(train_data):,} tokens")
print(f"Validation data has {len(val_data):,} tokens")

Training data has 2,219,620 tokens
Validation data has 554,906 tokens


In [5]:
# Pick the compute device: prefer Apple's Metal backend ("mps"), then CUDA,
# and fall back to the CPU so the notebook also runs without an accelerator.
# torch is imported here because this cell runs before the main import cell.
import torch

if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import matplotlib.pyplot as plt

In [None]:
# Cut the token streams into fixed-length rows of `seq_len` tokens; leftover
# tokens at the end that do not fill a whole row are dropped.
seq_len = 256


def _to_sequences(tokens, seq_len):
    """Reshape a flat list of token ids into a (num_sequences, seq_len) tensor.

    The usable length is computed explicitly instead of slicing with
    `tokens[:-(len(tokens) % seq_len)]`: when the length is an exact multiple
    of `seq_len` the remainder is 0 and `tokens[:-0]` would be an empty list,
    silently discarding the whole dataset.
    """
    usable = len(tokens) - len(tokens) % seq_len
    return torch.tensor(tokens[:usable], dtype=torch.long).view(usable // seq_len, seq_len)


train_dataset = _to_sequences(train_data, seq_len)
val_dataset = _to_sequences(val_data, seq_len)

# Only the training rows are shuffled; validation order stays fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)

In [None]:
# NOTE(review): a class named `Model` is defined in more than one cell of this
# notebook; whichever cell ran last is the one later cells use.
class Model(nn.Module):
    """Token-level language model: embedding -> multi-layer RNN -> linear head."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers):
        """Build the layers.

        Args:
            vocab_size: number of distinct token ids (embedding rows / output logits).
            embed_dim: size of each token embedding.
            hidden_dim: size of the RNN hidden state.
            num_layers: number of stacked RNN layers.
        """
        super(Model, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # RNN (batch_first=True: inputs are (batch, seq, feature))
        self.model = nn.RNN(embed_dim, hidden_dim, num_layers, batch_first=True)

        # Fully connected layer producing one logit per token id
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Run the model on a batch of token ids.

        Args:
            x: integer tensor of token ids, shape (batch, seq).
            hidden: optional initial hidden state of shape
                (num_layers, batch, hidden_dim); zeros are used when omitted.

        Returns:
            (logits, hidden): logits flattened to (batch * seq, vocab_size)
            and the final hidden state.
        """
        # Embedding
        x = self.embedding(x)

        # Initialize hidden state if not provided
        if hidden is None:
            hidden = torch.zeros(self.num_layers, x.size(0), self.hidden_dim, device=x.device)

        # RNN output along with new hidden state
        out, hidden = self.model(x, hidden)

        # Flatten batch and time so the linear head (and the loss) see one row per token
        out = out.reshape(-1, self.hidden_dim)
        out = self.fc(out)
        return out, hidden

    def generate(self, ctx, hidden=None, max_len=256, temperature=1.0):
        """Sample a continuation of `ctx`, one token at a time.

        Args:
            ctx: non-empty list of prompt token ids.
            hidden: optional initial hidden state (see `forward`).
            max_len: total length of the returned sequence, prompt included.
            temperature: value in [0, 1]; 0 selects the most likely token at
                every step (greedy decoding), larger values sample more freely.

        Returns:
            A flat list of token ids: the prompt followed by the sampled tokens.

        Note:
            Switches the module to eval mode and leaves it there.
        """
        assert 0 <= temperature <= 1, "Temperature has to be between 0 and 1"

        # Set the model to evaluation
        self.eval()

        # Work on whatever device the weights live on instead of a notebook global
        model_device = self.embedding.weight.device

        # Convert the context to a (1, prompt_len) tensor
        ctx = torch.tensor(ctx, device=model_device).view(1, -1)

        result = [ctx]

        with torch.no_grad():
            for _ in range(max_len - ctx.size(1)):
                # The first pass consumes the whole prompt; later passes feed only
                # the newest token and rely on the carried hidden state.
                output, hidden = self(ctx, hidden)

                # `output` has one row per input position. Only the last row
                # predicts the next token; sampling from the flattened matrix
                # could return ids >= vocab_size on the first step.
                logits = output[-1]

                if temperature == 0:
                    # Greedy decoding; also avoids dividing by zero
                    next_token = torch.argmax(logits).view(1, 1)
                else:
                    # softmax is the normalised, overflow-safe form of exp(logits / T)
                    probs = torch.softmax(logits / temperature, dim=-1)
                    next_token = torch.multinomial(probs, 1).view(1, 1)

                # Append to the result
                result.append(next_token)

                # The next step only needs the token just produced
                ctx = next_token

        return torch.cat(result, dim=-1).view(-1).tolist()

# Hyper-parameters and instantiation of the RNN variant of the model.
vocab_size = 512  # rows in the embedding table / logits per position; presumably the tokenizer's vocabulary size — confirm
embed_dim = 128  # size of each token embedding
hidden_dim = 256  # size of the recurrent hidden state
num_layers = 6  # number of stacked recurrent layers

# Build the model and move its parameters to the selected device.
model = Model(vocab_size, embed_dim, hidden_dim, num_layers).to(device)
# Running count of optimizer steps; incremented by the training loop.
num_train_steps = 0
print(f"Model created with {sum(p.numel() for p in model.parameters()):,} parameters")

In [None]:
# NOTE(review): a class named `Model` is defined in more than one cell of this
# notebook; whichever cell ran last is the one later cells use.
class Model(nn.Module):
    """Token-level language model: embedding -> multi-layer GRU -> linear head."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers):
        """Build the layers.

        Args:
            vocab_size: number of distinct token ids (embedding rows / output logits).
            embed_dim: size of each token embedding.
            hidden_dim: size of the GRU hidden state.
            num_layers: number of stacked GRU layers.
        """
        super(Model, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # GRU (batch_first=True: inputs are (batch, seq, feature))
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers, batch_first=True)

        # Fully connected layer producing one logit per token id
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Run the model on a batch of token ids.

        Args:
            x: integer tensor of token ids, shape (batch, seq).
            hidden: optional initial hidden state of shape
                (num_layers, batch, hidden_dim); zeros are used when omitted.

        Returns:
            (logits, hidden): logits flattened to (batch * seq, vocab_size)
            and the final hidden state.
        """
        # Embedding
        x = self.embedding(x)

        # Initialize hidden state if not provided
        if hidden is None:
            hidden = torch.zeros(self.num_layers, x.size(0), self.hidden_dim, device=x.device)

        # GRU output along with new hidden state
        out, hidden = self.gru(x, hidden)

        # Flatten batch and time so the linear head (and the loss) see one row per token
        out = out.reshape(-1, self.hidden_dim)
        out = self.fc(out)
        return out, hidden

    def generate(self, ctx, hidden=None, max_len=256, temperature=1.0):
        """Sample a continuation of `ctx`, one token at a time.

        Args:
            ctx: non-empty list of prompt token ids.
            hidden: optional initial hidden state (see `forward`).
            max_len: total length of the returned sequence, prompt included.
            temperature: value in [0, 1]; 0 selects the most likely token at
                every step (greedy decoding), larger values sample more freely.

        Returns:
            A flat list of token ids: the prompt followed by the sampled tokens.

        Note:
            Switches the module to eval mode and leaves it there.
        """
        assert 0 <= temperature <= 1, "Temperature has to be between 0 and 1"

        # Set the model to evaluation
        self.eval()

        # Work on whatever device the weights live on instead of a notebook global
        model_device = self.embedding.weight.device

        # Convert the context to a (1, prompt_len) tensor
        ctx = torch.tensor(ctx, device=model_device).view(1, -1)

        result = [ctx]

        with torch.no_grad():
            for _ in range(max_len - ctx.size(1)):
                # The first pass consumes the whole prompt; later passes feed only
                # the newest token and rely on the carried hidden state.
                output, hidden = self(ctx, hidden)

                # `output` has one row per input position. Only the last row
                # predicts the next token; sampling from the flattened matrix
                # could return ids >= vocab_size on the first step.
                logits = output[-1]

                if temperature == 0:
                    # Greedy decoding; also avoids dividing by zero
                    next_token = torch.argmax(logits).view(1, 1)
                else:
                    # softmax is the normalised, overflow-safe form of exp(logits / T)
                    probs = torch.softmax(logits / temperature, dim=-1)
                    next_token = torch.multinomial(probs, 1).view(1, 1)

                # Append to the result
                result.append(next_token)

                # The next step only needs the token just produced
                ctx = next_token

        return torch.cat(result, dim=-1).view(-1).tolist()

# Hyper-parameters and instantiation of the GRU variant of the model.
vocab_size = 512  # rows in the embedding table / logits per position; presumably the tokenizer's vocabulary size — confirm
embed_dim = 128   # embedding dimension
hidden_dim = 256  # GRU hidden state size
num_layers = 6  # number of GRU layers

# Build the model and move its parameters to the selected device.
model = Model(vocab_size, embed_dim, hidden_dim, num_layers).to(device)
# Running count of optimizer steps; incremented by the training loop.
num_train_steps = 0
print(f"Model created with {sum(p.numel() for p in model.parameters()):,} parameters")

In [None]:
# Cross-entropy between the flattened logits and the next-token targets.
criterion = nn.CrossEntropyLoss()
# AdamW over the parameters of the current `model` object.
# NOTE(review): if a model cell is re-run, `model` is a new object and this
# cell must be re-run too, otherwise the optimizer keeps updating the old weights.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)

In [None]:
import tqdm.notebook as tqdm

In [None]:
@torch.no_grad()
def evaluate(model, loader):
    """Return the mean of the per-batch losses of `model` over `loader`.

    The model is switched to eval mode and is not switched back. Every batch
    is moved to the notebook-level `device`, split into inputs (all tokens but
    the last) and targets (all tokens but the first), and scored with the
    notebook-level `criterion`.
    """
    model.eval()
    batch_losses = []
    for tokens in tqdm.tqdm(loader, desc="Evaluation"):
        tokens = tokens.to(device)
        inputs = tokens[:, :-1]
        targets = tokens[:, 1:]

        logits, _ = model(inputs)
        batch_losses.append(criterion(logits, targets.flatten()).item())
    return sum(batch_losses) / len(loader)

# Loss on the validation loader; in a top-to-bottom run this is measured
# before any training step has been taken.
test_loss = evaluate(model, val_loader)

In [None]:
from ema_pytorch import EMA
# Wrap the model in ema_pytorch's EMA helper (beta=0.99); the training loop
# calls `ema.update()` after every optimizer step.
# NOTE(review): the visible cells evaluate, generate from and save `model`
# directly, never `ema` — confirm whether the averaged weights were meant to be used.
ema = EMA(model, beta=0.99)

In [None]:
# Training loop: 20 epochs of next-token prediction over the training loader.
epoch_bar = tqdm.tqdm(range(20), desc="Training")
for epoch in epoch_bar:
    # Held-out loss at the start of the epoch. This is measured on the
    # validation loader (not the training loader) so the "Test Loss" shown in
    # the progress bar actually reflects generalisation.
    test_loss = evaluate(model, val_loader)
    # evaluate() leaves the model in eval mode; switch back before training.
    model.train()
    # A separate name for the per-batch bar keeps the epoch bar above intact.
    batch_bar = tqdm.tqdm(train_loader, leave=True, desc=f"Epoch {epoch}")
    for seq in batch_bar:
        seq = seq.to(device)
        # Inputs are all tokens but the last; targets are the same row shifted by one.
        x = seq[:, :-1]
        y = seq[:, 1:]

        # Forward pass
        output, _ = model(x)
        loss = criterion(output, y.flatten())

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Fold the freshly updated weights into the moving average
        ema.update()

        batch_bar.set_postfix_str(f"Loss: {loss.item():.4f}, Test Loss: {test_loss:.4f}, Step: {num_train_steps}")
        num_train_steps += 1

In [None]:
# Generate a continuation of the prompt "First Citizen" (256 tokens in total,
# prompt included) and print it as text.
context = tokenizer.encode("First Citizen").ids
generated = model.generate(context, max_len=256, temperature=1)
# A single print: wrapping it in a second print() would also emit "None".
print(tokenizer.decode(generated))


In [None]:
# Save Model
import os

# NOTE(review): every other path in this notebook lives under
# 'projects/blog/4-all-shakespeare/'; confirm this target directory is intended.
model_path = 'projects/4-shakespeare/model.pt'
# Create the target directory first: writing into a missing directory would
# fail and lose the trained weights.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)