In [238]:
import torch 
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np


In [239]:
with open("data/names.txt") as f:
    words = f.read().splitlines()

chars = [".", "@"] + sorted(list(set(''.join(words))))
vocab_size = len(chars) # 28 in this case

str_to_idx = {s: i for i, s in enumerate(chars)}
idx_to_str = {i: s for s, i in str_to_idx.items()}

# Build the dataset

In [240]:
from torch.nn.utils.rnn import pad_sequence

dataset = []

for word in words:
    sequence = [0] + [str_to_idx[c] for c in word] + [0]
    dataset.append(torch.tensor(sequence))

dataset = pad_sequence(dataset, batch_first=True, padding_value=1)

print(dataset[:3])

dataloader = DataLoader(TensorDataset(dataset), batch_size=500, shuffle=True)

tensor([[ 0,  6, 14, 14,  2,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [ 0, 16, 13, 10, 23, 10,  2,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [ 0,  2, 23,  2,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1]])


## Define the model architecture

In this case, we will be making an RNN from scratch

In [241]:
class RNN(nn.Module):
    def __init__(self, hidden_size=500, embedding_size=20):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.character_embeddings = nn.Embedding(vocab_size, embedding_size)
        # self.i2h = nn.Linear(embedding_size+hidden_size, hidden_size)
        self.i2h = nn.Sequential(
            nn.Linear(embedding_size+hidden_size, hidden_size),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh()
        )
        # self.i2o = nn.Linear(embedding_size+hidden_size, vocab_size)
        self.i2o = nn.Sequential(
            nn.Linear(embedding_size+hidden_size, hidden_size),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, vocab_size),
        )

    def forward(self, input, hidden):
        input = self.character_embeddings(input) 
        combined = torch.cat([input, hidden], 1)
        # print(combined.shape)
        hidden = self.i2h(combined)
        output = self.i2o(combined)

        return output, hidden

    def initialize_hidden(self, batch_size=1):
        return torch.zeros(batch_size, self.hidden_size)

In [242]:
model = RNN().cuda()
optim = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=1, reduction='sum') # Ignore the padding character

for epoch in range(20):
    losses = []
    counts = 0
    with tqdm(dataloader) as dataloader:
        for batch in dataloader:
            batch = batch[0].cuda()

            hidden = model.initialize_hidden(len(batch)).cuda()
            # Initialise the hidden state

            loss = 0
            counts = 0

            for i in range(batch.shape[1]-1):
                inpt = batch[:, i]
                target = batch[:, i+1]
                # The input character and the target character (next character)
                # for each batch
                # We are trying to predict the next character

                output, hidden = model(inpt, hidden) # Pass the input and the hidden state
                # This gives us a new hidden state and an output
                loss += criterion(output, target)
                counts += 1


            losses.append(loss.item()/(counts*len(batch)))
            optim.zero_grad()
            loss.backward()
            optim.step()
            
    # print(losses)
    print(f"Epoch {epoch} loss: {np.mean(losses)}")

100%|██████████| 65/65 [00:01<00:00, 47.02it/s]


Epoch 0 loss: 1.1138403007738515


100%|██████████| 65/65 [00:01<00:00, 50.78it/s]


Epoch 1 loss: 1.0073226393764114


100%|██████████| 65/65 [00:01<00:00, 48.96it/s]


Epoch 2 loss: 0.9727418524753242


100%|██████████| 65/65 [00:01<00:00, 48.20it/s]


Epoch 3 loss: 0.9487377819781537


100%|██████████| 65/65 [00:01<00:00, 48.87it/s]


Epoch 4 loss: 0.9296817814113376


100%|██████████| 65/65 [00:01<00:00, 48.63it/s]


Epoch 5 loss: 0.9144846200227181


100%|██████████| 65/65 [00:01<00:00, 56.77it/s]


Epoch 6 loss: 0.9000249926231404


100%|██████████| 65/65 [00:01<00:00, 58.76it/s]


Epoch 7 loss: 0.8890579865426729


100%|██████████| 65/65 [00:01<00:00, 57.73it/s]


Epoch 8 loss: 0.8770256451729017


100%|██████████| 65/65 [00:01<00:00, 57.75it/s]


Epoch 9 loss: 0.8675789488536773


100%|██████████| 65/65 [00:01<00:00, 56.75it/s]


Epoch 10 loss: 0.8586517209993376


100%|██████████| 65/65 [00:01<00:00, 60.29it/s]


Epoch 11 loss: 0.8511194804087107


100%|██████████| 65/65 [00:01<00:00, 59.23it/s]


Epoch 12 loss: 0.8436176445096245


100%|██████████| 65/65 [00:01<00:00, 57.05it/s]


Epoch 13 loss: 0.837214591535377


100%|██████████| 65/65 [00:01<00:00, 54.78it/s]


Epoch 14 loss: 0.8299169315293675


100%|██████████| 65/65 [00:01<00:00, 59.85it/s]


Epoch 15 loss: 0.8240024350608701


100%|██████████| 65/65 [00:01<00:00, 60.44it/s]


Epoch 16 loss: 0.816561248964252


100%|██████████| 65/65 [00:01<00:00, 58.80it/s]


Epoch 17 loss: 0.8112552887034028


100%|██████████| 65/65 [00:01<00:00, 58.62it/s]


Epoch 18 loss: 0.8051690009270515


100%|██████████| 65/65 [00:01<00:00, 57.51it/s]

Epoch 19 loss: 0.8009239397738085





In [243]:
def sample_names():
    model.cpu()
    model.eval()
    for i in range(20):
        hidden = model.initialize_hidden()
        input = torch.tensor([0])
        name = ""
        for i in range(20):
            output, hidden = model(input, hidden)
            input = torch.multinomial(F.softmax(output, dim=1), 1)
            name += idx_to_str[input.item()]
            if input.item() == 0:
                break
            input = torch.tensor([input])
        if name.strip('.') in words:
            print("FOUND:", name)
            continue
        print("NEW:  ",name)

sample_names()

NEW:   bros.
NEW:   mohendry.
FOUND: terrius.
NEW:   rays.
NEW:   myanee.
FOUND: canan.
NEW:   melca.
NEW:   rabin.
NEW:   jhaden.
NEW:   xovann.
NEW:   glain.
NEW:   zadric.
NEW:   lariana.
NEW:   jenevicto.
FOUND: kailani.
NEW:   yasari.
NEW:   deara.
FOUND: siren.
NEW:   calemay.
NEW:   elyzia.


In [244]:
@torch.no_grad()
def evaluate_model(model):
    model.eval()
    correct = 0
    total = 0
    losses = []

    with tqdm(total=len(dataset)) as pbar:
        for name in dataset:
            hidden = model.initialize_hidden()
            loss = 0
            counts = 0
            for i in range(len(name)-1):
                if name[i] == 0 and i > 0:
                    break
                input = torch.tensor([name[i]])
                target = torch.tensor([name[i+1]])
                output, hidden = model(input, hidden)
                loss += F.cross_entropy(output, target)
                pred = output.argmax(dim=1)
                correct += (pred == target).sum()
                counts += 1
                total += 1
            losses.append(loss.item()/counts)
            pbar.update(1)
    print(f"Accuracy: {correct/total}")
    print(f"Loss: {np.mean(losses)}")



evaluate_model(model)

100%|██████████| 32033/32033 [00:59<00:00, 536.48it/s]

Accuracy: 0.4321531057357788
Loss: 1.8199594114819886



