# Baby Names

## Intro

### Download and Split Dataset

In [None]:
import os
import pandas as pd
import random


def extract_names() -> list[str]:
    """Download the CMU female and male name lists and return them combined.

    Each remote file is parsed with ``pandas.read_csv`` without a header row.
    The first five parsed rows of each file are dropped (presumably the
    corpus' leading comment lines -- TODO confirm against the remote files)
    and the remaining first-column values are collected.

    Returns:
        All female names followed by all male names. Names occurring in both
        lists are not de-duplicated here.
    """
    sources = (
        "https://www.cs.cmu.edu/Groups/AI/util/areas/nlp/corpora/names/female.txt",
        "https://www.cs.cmu.edu/Groups/AI/util/areas/nlp/corpora/names/male.txt",
    )

    collected: list[str] = []
    for url in sources:
        frame = pd.read_csv(url, header=None)
        # Skip the first five rows, then take the single (0th) column.
        collected.extend(frame[5:][0].tolist())
    return collected


def save_train_test_split(path: str, ratio: float) -> None:
    """Write a shuffled train/test split of the name corpus to ``path``.

    Names are normalised (non-ASCII characters dropped, lower-cased) *before*
    de-duplication, so two raw names that collapse to the same normalised form
    can never end up in both ``train.txt`` and ``test.txt``. Names that become
    empty after normalisation are discarded.

    Args:
        path: Directory that receives ``train.txt`` and ``test.txt``; created
            if it does not exist.
        ratio: Fraction of the unique names written to ``train.txt``; the
            remainder goes to ``test.txt``.
    """
    normalised = {
        name.encode("ascii", errors="ignore").decode("ascii").lower()
        for name in extract_names()
    }
    # A name made only of non-ASCII characters would otherwise become a blank line.
    normalised.discard("")

    # Sort before shuffling: set iteration order varies between runs, so this
    # makes the split reproducible whenever the caller seeds ``random``.
    names = sorted(normalised)
    random.shuffle(names)

    cut = int(len(names) * ratio)
    os.makedirs(path, exist_ok=True)

    splits = (("train.txt", names[:cut]), ("test.txt", names[cut:]))
    for filename, split in splits:
        # One name per line, alphabetically sorted, no trailing newline.
        with open(os.path.join(path, filename), "w", encoding="ascii") as f:
            f.write("\n".join(sorted(split)))


# Keep the raw (un-normalised, possibly duplicated) corpus in `names`, then
# write a 90% / 10% train/test split under data/names.
names = extract_names()
save_train_test_split(path="data/names", ratio=0.9)

In [None]:
# Read both splits back from disk; each file stores one name per line.
with open("data/names/train.txt") as train_file:
    train_names = train_file.read().split("\n")

with open("data/names/test.txt") as test_file:
    test_names = test_file.read().split("\n")

### Basic Data Viz

In [None]:
# Quick sanity check: size of each split and its first five names.
print(f"Train: {len(train_names)}")
print(f"Test: {len(test_names)}")
print(f"Training names: {train_names[:5]}")
print(f"Test names: {test_names[:5]}")

In [None]:
# Character vocabulary: every distinct character seen in the training names,
# in sorted order.
vocab = sorted({char for name in train_names for char in name})
print(f"Vocab: {vocab}")
print(f"Vocab size: {len(vocab)}")

#### Letter Pairings

In [None]:
import matplotlib.pyplot as plt

# Three side-by-side summaries of the training names.
plt.figure(figsize=(30, 5))

# Panel 1: name lengths, counting the two extra slots for <s> and </s>.
plt.subplot(1, 3, 1)
plt.title("Distribution of name lengths")
padded_lengths = [len(name) + 2 for name in train_names]
plt.hist(padded_lengths, bins=20)
plt.xlabel("Name length")
plt.ylabel("Frequency")

# Panel 2: how often each alphabetic character occurs; non-alphabetic vocab
# entries keep a count of zero.
plt.subplot(1, 3, 2)
plt.title("Frequency per letter")
letter_freq = dict.fromkeys(vocab, 0)
for name in train_names:
    for letter in filter(str.isalpha, name):
        letter_freq[letter.lower()] += 1

plt.bar(letter_freq.keys(), letter_freq.values())  # type: ignore
plt.xlabel("Letter")
plt.ylabel("Frequency")

# Panel 3: which character each name starts with (the <s> marker is ignored).
plt.subplot(1, 3, 3)
plt.title("Distribution of first letters")
first_letter_freq = dict.fromkeys(vocab, 0)
for name in train_names:
    initial = name[0]
    first_letter_freq[initial] += 1
plt.bar(first_letter_freq.keys(), first_letter_freq.values())  # type: ignore

plt.xlabel("First letter")
plt.ylabel("Frequency")

plt.tight_layout()
plt.show()

In [None]:
# Plot confusion matrix of letter pairs
import numpy as np

# Count how often each ordered pair of adjacent characters occurs in the
# training names. Every possible pair starts at zero so unseen pairs still
# appear in the matrix.
letter_pairs = [first + second for first in vocab for second in vocab]
letter_pair_freq = dict.fromkeys(letter_pairs, 0)
for name in train_names:
    for first, second in zip(name, name[1:]):
        letter_pair_freq[(first + second).lower()] += 1

# Rows index the first character of the pair, columns the second.
confusion_matrix = np.array(
    [[letter_pair_freq[first + second] for second in vocab] for first in vocab],
    dtype=float,
).reshape(len(vocab), len(vocab))

tick_positions = range(len(vocab))
plt.figure(figsize=(10, 10))
plt.imshow(confusion_matrix)
plt.xticks(tick_positions, vocab)
plt.yticks(tick_positions, vocab)
plt.xlabel("Second letter")
plt.ylabel("First letter")
plt.title("Confusion matrix of letter pairs")
plt.colorbar()
plt.tight_layout()
plt.show()

## Training

In [None]:
import torch

# Prefer Apple's Metal backend (the original hard-coded choice), but fall back
# to CUDA or the CPU so the notebook also runs on machines without MPS.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
import torch
import torch.nn as nn
from tqdm import tqdm

### Encoded Corpus

In [None]:
# Prepend the start, end, padding and unknown tokens so they get ids 0-3.
special_tokens = ["<s>", "</s>", "<pad>", "<unk>"]
# Skip tokens that are already present: re-running this cell must not prepend
# a second copy of the special tokens, which would shift every character id.
vocab = special_tokens + [token for token in vocab if token not in special_tokens]
print(f"Vocab: {vocab}")
print(f"Vocab size: {len(vocab)}")

In [None]:
# Fixed sequence length: the longest training name plus two slots for the
# <s> and </s> markers.
seq_len = len(max(train_names, key=len)) + 2
print(f"Max sequence length: {seq_len}")


def pad_sequence(sequence: list[int], seq_len: int) -> list[int]:
    """Return a new list: ``sequence`` right-padded with the ``<pad>`` id.

    Sequences that are already ``seq_len`` long or longer come back unchanged
    in content (no truncation happens here).
    """
    pad_id = vocab.index("<pad>")
    missing = seq_len - len(sequence)
    # A non-positive ``missing`` yields an empty filler list.
    filler = [pad_id] * missing
    return sequence + filler


def encode_name(name: str) -> list[int]:
    """Encode ``name`` as a fixed-length list of vocabulary ids.

    The layout is ``<s>``, one id per lower-cased character (``<unk>`` for
    characters missing from ``vocab``), ``</s>``, then ``<pad>`` up to the
    global ``seq_len``.

    Names longer than ``seq_len - 2`` characters are truncated so that the
    result is never longer than ``seq_len`` (for ``seq_len >= 2``) and still
    ends with ``</s>``. Without this, a name longer than every training name
    would produce a ragged row that cannot be stacked into a tensor.
    """
    # First position of each token, matching ``vocab.index`` semantics while
    # avoiding a linear scan per character.
    token_ids: dict[str, int] = {}
    for idx, token in enumerate(vocab):
        token_ids.setdefault(token, idx)
    unk_id = token_ids["<unk>"]

    # Reserve two slots for the <s> and </s> markers.
    letters = name.lower()[: max(seq_len - 2, 0)]

    result = [token_ids["<s>"]]
    result.extend(token_ids.get(letter, unk_id) for letter in letters)
    result.append(token_ids["</s>"])
    return pad_sequence(result, seq_len)


def decode_name(name: list[int]) -> str:
    """Turn a list of vocabulary ids back into a capitalised name.

    Every id belonging to ``special_tokens`` is dropped wherever it occurs;
    the remaining ids are mapped through ``vocab`` and joined, and the first
    character is upper-cased.

    The input list is left untouched (the previous implementation removed the
    special ids from the caller's list in place). A sequence consisting only
    of special tokens decodes to ``""`` instead of raising ``IndexError``.
    """
    special_ids = {vocab.index(token) for token in special_tokens}
    letters = "".join(vocab[i] for i in name if i not in special_ids)
    # Slicing (rather than indexing) keeps the empty string safe.
    return letters[:1].upper() + letters[1:]


# Round-trip sanity check: show the padded id sequence for "Alice" and the
# string recovered from it.
print(encode_name("Alice"))
print(decode_name(encode_name("Alice")))

In [None]:
# Encode every name in both splits into its fixed-length id sequence.
train_encoded_names = list(map(encode_name, train_names))
test_encoded_names = list(map(encode_name, test_names))

In [None]:
# Number of encoded sequences in each split (displayed as the cell output).
len(train_encoded_names), len(test_encoded_names)

### Create Dataset and Loader

In [None]:
# Training split: encoded names -> tensor on `device` -> dataset -> shuffled
# loader that drops the final incomplete batch.
train_encoded_corpus = torch.Tensor(train_encoded_names).to(device)
train_dataset = torch.utils.data.TensorDataset(train_encoded_corpus)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    drop_last=True,
)

# Test split: same pipeline, but kept in order and with every batch retained.
test_encoded_corpus = torch.Tensor(test_encoded_names).to(device)
test_dataset = torch.utils.data.TensorDataset(test_encoded_corpus)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
)

### Helper functions

In [None]:
@torch.no_grad()
def validate(model):
    """Return the mean per-batch cross-entropy of ``model`` over ``test_loader``.

    Each batch is split into inputs (all positions but the last) and targets
    (all positions but the first), i.e. next-token prediction. ``model`` is
    called as ``model(inputs)`` and must return ``(logits, state)``.

    The model is put in eval mode for the duration of the call and afterwards
    restored to whatever mode it was in before (previously it was always
    switched to train mode, even when the caller had set eval mode).

    Raises:
        ZeroDivisionError: if ``test_loader`` has no batches.
    """
    was_training = model.training
    model.eval()
    total_loss = 0.0
    try:
        for (x,) in test_loader:
            inputs = x[:, :-1].long()
            targets = x[:, 1:].long()

            outputs, _ = model(inputs)

            # cross_entropy wants the class dimension second:
            # (batch, seq, vocab) -> (batch, vocab, seq).
            loss = torch.nn.functional.cross_entropy(outputs.transpose(1, 2), targets)
            total_loss += loss.item()
    finally:
        model.train(was_training)

    return total_loss / len(test_loader)

### RNN Model

In [None]:
class RNN(nn.Module):
    def __init__(
        self, vocab_size, hidden_size, num_layers, dropout, batch_first=True
    ):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.RNN(
            hidden_size,
            hidden_size,
            num_layers,
            batch_first=batch_first,
            dropout=dropout,
            nonlinearity="relu",
        )
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, vocab_size),
        )

    def forward(self, x, hidden: torch.Tensor | None = None):
        if hidden is None:
            hidden = torch.zeros(
                self.rnn.num_layers, x.size(0), self.rnn.hidden_size, device=device
            )

        x = self.embedding(x)
        x, hidden = self.rnn(x, hidden)
        x = self.mlp(x)

        return x, hidden
    
    def forward_with_gradient(self, x, hidden=None):
        if hidden is None:
            hidden = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size, device=x.device)
        x = self.embedding(x)
        x, hidden = self.rnn(x, hidden)
        output = self.mlp(x)
        return output, hidden, x  # return embeddings for gradient computation

    @torch.no_grad()
    def generate(
        self,
        start_seq: str | None = None,
        max_len: int = 20,
        hidden: torch.Tensor | None = None,
    ) -> str:
        self.eval()
        if start_seq is None:
            start_seq = "<s>"

        if hidden is None:
            hidden = torch.zeros(
                self.rnn.num_layers, 1, self.rnn.hidden_size, device=device
            )
            
        x = torch.Tensor([vocab.index(start_seq)]).long().unsqueeze(0).to(device)

        output = [x.flatten()]
        for _ in range(max_len):
            x, hidden = self(x, hidden)
            
            if x.shape[1] > 1:
                x = x[:, -1:]

            x = x.softmax(dim=-1).argmax(dim=-1)
            
            if x.item() == vocab.index("</s>") and len(output) > 2:
                break
            output.append(x.flatten())
        self.train()
        
        return decode_name(torch.cat(output).flatten().tolist())


# Hyper-parameters for the vanilla RNN run.
hidden_size = 32
num_layers = 12
dropout = 0

model = RNN(
    vocab_size=len(vocab),
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout,
)
model = model.to(device)
num_train_steps = 0  # incremented once per optimizer step in the training cell

print(f"Model has {sum(p.numel() for p in model.parameters()):,} parameters")
# Sample once from the untrained model (shown as the cell output).
model.generate()

In [None]:
# Adam on all model parameters; the learning rate is halved once the monitored
# loss has not improved for more than 10 consecutive scheduler steps.
optimizer = torch.optim.Adam(params=model.parameters(), lr=4e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    mode="min",
    factor=0.5,
    patience=10,
)

In [None]:
# Train the model
roll_loss = 0  # exponential moving average of the training loss
model.train()
for epoch in range(100):
    pbar = tqdm(train_loader, leave=True, desc=f"Epoch {epoch + 1:02d}")
    # Validation loss is measured once per epoch, before that epoch's updates.
    val_loss = validate(model)

    for batch in pbar:
        batch = batch[0].to(device)
        model.train()
        optimizer.zero_grad()

        # Next-token prediction: position t predicts position t + 1.
        inputs = batch[:, :-1].long()
        targets = batch[:, 1:].long()

        outputs, _ = model(inputs)

        # cross_entropy expects the class dimension second.
        loss = torch.nn.functional.cross_entropy(outputs.transpose(1, 2), targets)

        loss.backward()
        optimizer.step()
        num_train_steps += 1

        # Read the loss once as a Python float so the rolling average does not
        # stay on the device as a tensor.
        loss_value = loss.item()
        roll_loss = 0.9 * roll_loss + 0.1 * loss_value

        pbar.set_postfix_str(f"Loss: {loss_value:.4f}, Val loss: {val_loss:.4f}, Steps: {num_train_steps}, LR: {optimizer.param_groups[0]['lr']:.2e}, Roll loss: {roll_loss:.4f}")
        # No manual pbar.update(): iterating the tqdm wrapper already advances
        # the bar, so an extra update() would count every batch twice.

    # The scheduler monitors the smoothed training loss.
    scheduler.step(roll_loss)

model.eval()
print("\n\nGenerated names:")
model.generate()

In [None]:
# Sample ten names. Decoding is greedy, so variety comes only from starting
# each run at a different random hidden state.
for _ in range(10):
    random_hidden = torch.randn(num_layers, 1, hidden_size, device=device)
    print(model.generate(hidden=random_hidden))


In [None]:
# torch.save does not create missing parent directories, so create it first.
os.makedirs("blog/4-names/models", exist_ok=True)
torch.save(model.state_dict(), "blog/4-names/models/rnn.pth")

### LSTM

In [None]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers, dropout, batch_first=True):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, num_layers,dropout=dropout, batch_first=batch_first)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden: tuple[torch.Tensor, torch.Tensor] | None = None):
        if hidden is None:
            hidden = (
                torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size, device=x.device),
                torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size, device=x.device),
            )

        x = self.embedding(x)
        x, hidden = self.rnn(x, hidden)
        x = self.fc(x)

        return x, hidden

    @torch.no_grad()
    def generate(
        self,
        start_seq: str | None = None,
        max_len: int = 20,
        hidden: tuple[torch.Tensor, torch.Tensor] | None = None,
    ) -> str:
        self.eval()
        if start_seq is None:
            start_seq = "<s>"

        if hidden is None:
            hidden = (
                torch.zeros(self.rnn.num_layers, 1, self.rnn.hidden_size, device=device),
                torch.zeros(self.rnn.num_layers, 1, self.rnn.hidden_size, device=device)
            )
        x = torch.Tensor([vocab.index(start_seq)]).long().unsqueeze(0).to(device)

        output = [x.flatten()]
        for _ in range(max_len):
            x, hidden = self(x, hidden)
            if x.shape[1] > 1:
                x = x[:, -1:]

            x = x.softmax(dim=-1).argmax(dim=-1)
            if x.item() == vocab.index("</s>") and len(output) > 2:
                break
            output.append(x.flatten())
        self.train()
        
        return decode_name(torch.cat(output).flatten().tolist())


# Hyper-parameters for the LSTM run.
hidden_size = 32
num_layers = 8
dropout = 0.0

model = LSTM(
    vocab_size=len(vocab),
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout,
)
model = model.to(device)
num_train_steps = 0  # incremented once per optimizer step in the training cell

print(f"Model has {sum(p.numel() for p in model.parameters()):,} parameters")
# Sample once from the untrained model (shown as the cell output).
model.generate()

In [None]:
# Fresh Adam optimizer bound to the LSTM's parameters (no LR scheduler here).
optimizer = torch.optim.Adam(params=model.parameters(), lr=4e-4)

In [None]:
# Train the model
for epoch in range(100):
    pbar = tqdm(train_loader, leave=True, desc=f"Epoch {epoch + 1:02d}")
    # Validation loss is measured once per epoch, before that epoch's updates.
    val_loss = validate(model)

    for batch in pbar:
        batch = batch[0].to(device)
        model.train()
        optimizer.zero_grad()

        # Next-token prediction: position t predicts position t + 1.
        inputs = batch[:, :-1].long()
        targets = batch[:, 1:].long()

        outputs, _ = model(inputs)

        # cross_entropy expects the class dimension second.
        loss = torch.nn.functional.cross_entropy(outputs.transpose(1, 2), targets)
        loss.backward()
        optimizer.step()
        num_train_steps += 1

        pbar.set_postfix(loss=loss.item(), val_loss=val_loss, step=num_train_steps)
        # No manual pbar.update(): iterating the tqdm wrapper already advances
        # the bar, so an extra update() would count every batch twice.

model.eval()
print("\n\nGenerated names:")
model.generate()

In [None]:
# Decoding is greedy, so starting from the default zero state would print the
# same name ten times. Start each run from a random (h, c) state instead, as
# the RNN sampling cell does.
for _ in range(10):
    random_state = (
        torch.randn(num_layers, 1, hidden_size, device=device),
        torch.randn(num_layers, 1, hidden_size, device=device),
    )
    print(model.generate(hidden=random_state))

In [None]:
# torch.save does not create missing parent directories, so create it first.
os.makedirs("blog/4-names/models", exist_ok=True)
torch.save(model.state_dict(), "blog/4-names/models/lstm.pth")

### GRU

In [None]:
class GRU(nn.Module):
    def __init__(
        self, vocab_size, hidden_size, num_layers, dropout, batch_first=True
    ):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            num_layers,
            batch_first=batch_first,
            dropout=dropout,
        )
        self.fc = nn.Linear(hidden_size, vocab_size)
    def forward(self, x, hidden: torch.Tensor | None = None):
        if hidden is None:
            hidden = torch.zeros(
                self.gru.num_layers, x.size(0), self.gru.hidden_size, device=device
            )

        x = self.embedding(x)
        x, hidden = self.gru(x, hidden)
        x = self.fc(x)

        return x, hidden
    
    def forward_with_gradient(self, x, hidden=None):
        if hidden is None:
            hidden = torch.zeros(self.gru.num_layers, x.size(0), self.gru.hidden_size, device=x.device)
        x = self.embedding(x)
        x, hidden = self.gru(x, hidden)
        output = self.fc(x)
        return output, hidden, x  # return embeddings for gradient computation

    @torch.no_grad()
    def generate(
        self,
        start_seq: str | None = None,
        max_len: int = 20,
        hidden: torch.Tensor | None = None,
    ) -> str:
        self.eval()
        if start_seq is None:
            start_seq = "<s>"

        if hidden is None:
            hidden = torch.zeros(
                self.gru.num_layers, 1, self.rnn.hidden_size, device=device
            )
        x = torch.Tensor([vocab.index(start_seq)]).long().unsqueeze(0).to(device)

        output = [x.flatten()]
        for _ in range(max_len):
            x, hidden = self(x, hidden)
            if x.shape[1] > 1:
                x = x[:, -1:]

            x = x.softmax(dim=-1).argmax(dim=-1)
            if x.item() == vocab.index("</s>") and len(output) > 2:
                break
            output.append(x.flatten())
        self.train()
        
        return decode_name(torch.cat(output).flatten().tolist())


# Hyper-parameters for the GRU run.
hidden_size = 16
num_layers = 8
dropout = 0.1

model = GRU(
    vocab_size=len(vocab),
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout,
)
model = model.to(device)
num_train_steps = 0  # incremented once per optimizer step in the training cell

print(f"Model has {sum(p.numel() for p in model.parameters()):,} parameters")
# Sample once from the untrained model (shown as the cell output).
model.generate()

In [None]:
# Fresh Adam optimizer bound to the GRU's parameters (no LR scheduler here).
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-4)

In [None]:
# Train the model
for epoch in range(100):
    pbar = tqdm(train_loader, leave=True, desc=f"Epoch {epoch + 1:02d}")
    # Validation loss is measured once per epoch, before that epoch's updates.
    val_loss = validate(model)

    for batch in pbar:
        batch = batch[0].to(device)
        model.train()
        optimizer.zero_grad()

        # Next-token prediction: position t predicts position t + 1.
        inputs = batch[:, :-1].long()
        targets = batch[:, 1:].long()

        outputs, _ = model(inputs)

        # cross_entropy expects the class dimension second.
        loss = torch.nn.functional.cross_entropy(outputs.transpose(1, 2), targets)
        loss.backward()
        optimizer.step()
        num_train_steps += 1

        pbar.set_postfix(loss=loss.item(), val_loss=val_loss, step=num_train_steps)
        # No manual pbar.update(): iterating the tqdm wrapper already advances
        # the bar, so an extra update() would count every batch twice.

print("\n\nGenerated names:")
model.generate()

In [None]:
# Decoding is greedy, so starting from the default zero state would print the
# same name ten times. Start each run from a random hidden state instead, as
# the RNN sampling cell does.
for _ in range(10):
    random_hidden = torch.randn(num_layers, 1, hidden_size, device=device)
    print(model.generate(hidden=random_hidden))

In [None]:
# torch.save does not create missing parent directories, so create it first.
os.makedirs("blog/4-names/models", exist_ok=True)
torch.save(model.state_dict(), "blog/4-names/models/gru.pth")