In [1]:
import torch
import os
import random
import string

# Prefer the GPU whenever PyTorch can see one; otherwise run everything on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"using: {device}")

using: cuda


In [19]:
class NamesDataset(torch.utils.data.Dataset):
    """Next-character prediction samples built from a list of names.

    Every name read from ``English.txt`` is lower-cased and wrapped as
    ``<name>``. Each prefix of at least two characters becomes one sample:
    the prefix, left-padded with spaces to the longest wrapped name, is the
    input and the character that follows it is the target. The complete
    name is paired with the blank symbol ``" "`` as its target.
    """

    # Single symbol table shared by encoding and decoding: index 0 is the
    # blank/padding symbol, then a-z, then the start/end markers "<" and ">".
    ALPHABET = " " + string.ascii_lowercase + "<>"

    def __init__(self, dataset_path: str, test: bool):
        """
        Args:
            dataset_path: directory containing ``English.txt`` (one name per line).
            test: accepted for interface compatibility; not used by this class.
        """
        names = []
        max_name_len = 0
        with open(os.path.join(dataset_path, "English.txt"), "r", encoding="utf-8") as f:
            for line in f:
                name = line.strip().lower()
                if not name:
                    # A blank line would otherwise become the bogus name "<>".
                    continue
                name = f"<{name}>"
                max_name_len = max(max_name_len, len(name))
                names.append(name)

        # NOTE: this reseeds the global `random` generator so the name order
        # is the same on every construction.
        random.seed(42)
        random.shuffle(names)

        samples = []
        for name in names:
            # `end` is the exclusive end of the prefix; the shortest prefix is
            # "<" plus the first character.
            for end in range(2, len(name) + 1):
                target = name[end] if end < len(name) else " "
                samples.append((self.padding(name[:end], max_name_len), target))

        self.max_name_len = max_name_len
        self.xy = samples

    def __getitem__(self, idx):
        """Return ``(x, y, 1)`` for sample ``idx``.

        ``x`` is the one-hot encoded padded prefix with shape
        ``(max_name_len, len(ALPHABET))``; ``y`` is the one-hot encoded target
        character with shape ``(len(ALPHABET),)``. Both are moved to the
        module-level ``device``. The trailing constant ``1`` is returned
        unchanged for every sample; its purpose is not evident from this
        class (presumably a weight/mask placeholder -- confirm with callers).
        """
        x_str, y_str = self.xy[idx]
        x = self.name2tensor(x_str).to(device=device)
        y = self.name2tensor(y_str).to(device=device)
        y.squeeze_()  # (1, n_symbols) -> (n_symbols,)

        return x, y, 1

    def __len__(self):
        """Number of (prefix, next character) samples."""
        return len(self.xy)

    @staticmethod
    def padding(x: str, target_len):
        """Left-pad ``x`` with spaces up to ``target_len`` characters.

        Raises:
            AssertionError: if ``x`` is already longer than ``target_len``.
        """
        x_padded = x.rjust(target_len)
        assert len(x_padded) == target_len, "padding has gone wrong!"
        return x_padded

    @staticmethod
    def name2tensor(name):
        """One-hot encode ``name`` into a ``(len(name), len(ALPHABET))`` tensor.

        Characters that are not part of ``ALPHABET`` are encoded as an
        all-zero row. (``str.find`` returns -1 for them, which previously
        indexed the last column and silently turned them into ``">"``.)
        """
        alphabet = NamesDataset.ALPHABET
        tensor = torch.zeros(len(name), len(alphabet))
        for i, char in enumerate(name):
            col = alphabet.find(char)
            if col >= 0:
                tensor[i, col] = 1
        return tensor

    @staticmethod
    def tensor2name(tensor):
        """Decode each row of ``tensor`` to the symbol with the highest score."""
        alphabet = NamesDataset.ALPHABET
        return "".join(alphabet[row.argmax(-1).item()] for row in tensor)

# Build the prefix -> next-character dataset from the local names directory and
# serve it in fixed-order batches of 512 samples.
ds = NamesDataset("../datasets/names/", test=False)
ds_loader = torch.utils.data.DataLoader(dataset=ds, batch_size=512, shuffle=False)

In [3]:
class RNNModel(torch.nn.Module):
    """Recurrent network followed by a linear read-out over the last time step.

    Args:
        feat_size: size of each input feature vector; also the number of
            output classes.
        hidden_size: number of features in the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
    """

    def __init__(self, feat_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.i2r = torch.nn.RNN(feat_size, hidden_size, num_layers, batch_first=True)
        self.r2o = torch.nn.Linear(hidden_size, feat_size)

        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, X):
        """
        Args:
            X (Tensor): input tensor, shape: [nbatches, ncharacters, len_character_tensor]

        Returns:
            Tensor: output tensor of log-probabilities, shape: [nbatches, nclasses]
        """
        # Allocate the zero initial state next to the input (same device and
        # dtype) instead of relying on a module-level `device` global, so the
        # model works wherever it and its input have been moved.
        hidden = X.new_zeros(self.num_layers, X.shape[0], self.hidden_size)
        out, _ = self.i2r(X, hidden)
        # Only the output at the final time step feeds the classifier.
        out = self.r2o(out[:, -1, :])
        out = self.softmax(out)
        return out

# Feature size: 26 lowercase letters + 1 blank/padding symbol + 2 markers ("<" and ">").
rnn = RNNModel(feat_size=26 + 1 + 2, hidden_size=100, num_layers=1).to(device=device)

In [4]:
# Training objective and optimiser for the parameters of `rnn`.
lossfn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=rnn.parameters(), lr=0.005)

def train_one_batch(X, Y):
    """
    Perform a single optimisation step of the module-level ``rnn`` on one batch.

    Args:
        X (Tensor): input tensors.
        Y (Tensor): one-hot encoded target tensors.

    Returns:
        float: loss value of this batch.
    """
    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()
    batch_loss = lossfn(rnn(X), Y)
    batch_loss.backward()
    optimizer.step()
    return batch_loss.item()

def eval(X, Y):
    """
    Compute the loss of the module-level ``rnn`` on one batch without
    tracking gradients.

    Note: this name shadows Python's built-in ``eval`` in this namespace.

    Args:
        X (Tensor): input tensors.
        Y (Tensor): one-hot encoded target tensors.
    Returns:
        float: loss value (no accuracy is computed).
    """
    with torch.no_grad():
        return lossfn(rnn(X), Y).item()

In [5]:
# Train for a fixed number of epochs, printing one progress line per epoch
# that is overwritten in place (carriage return) as batches complete.
n_epochs = 10
print("epoch\t\tbatch\t\ttraining loss")
print("----------------------------------------------")
train_loss = []  # one entry per epoch: the SUM of that epoch's batch losses
all_batches = len(ds_loader)
for epoch in range(n_epochs):
    train_loss.append(0)
    # The dataset yields (x, y, 1); only the first two items are needed here,
    # so any extra per-sample items are collected and ignored instead of
    # breaking the unpacking.
    for ibatch, (X, Y, *_extra) in enumerate(ds_loader, start=1):
        train_loss[-1] += train_one_batch(X, Y)
        # Divide by the batches seen so far so the displayed value is the
        # running mean loss at every step, not only after the final batch.
        print(f"{epoch+1}\t\t{ibatch}/{all_batches}\t\t{train_loss[-1]/ibatch:.2f}", end="\r")
    print()

epoch		batch		training loss
----------------------------------------------
1		53/53		2.53
2		53/53		1.95
3		53/53		1.83
4		53/53		1.78
5		53/53		1.74
6		53/53		1.71
7		53/53		1.68
8		53/53		1.66
9		53/53		1.64
10		53/53		1.61


In [15]:
def generate_name(start_str, max_len=64):
    """
    Greedily extend ``start_str`` one character at a time with the
    module-level ``rnn`` and yield the growing string after each step.

    Generation stops when the model emits the end marker ``">"`` (which is
    included in the last yielded string), when the model emits the blank
    symbol (nothing would be appended, and because prediction is
    deterministic the loop would otherwise never terminate), or when the
    string reaches ``max_len`` characters.

    Args:
        start_str (str): initial characters, e.g. ``"<og"``.
        max_len (int): safety cap on the total length of the generated string.

    Yields:
        str: the string generated so far.
    """
    result = start_str
    while len(result) < max_len:
        encoded = ds.name2tensor(result).to(device=device).unsqueeze(0)
        # Keep no_grad scoped to the forward pass only: a context manager held
        # open across a `yield` would also disable gradients in the caller.
        with torch.no_grad():
            log_probs = rnn(encoded)
        pred = ds.tensor2name(log_probs).strip()
        if not pred:
            break
        result += pred
        yield result
        if pred == ">":
            break


In [18]:
import time
# Animate the generation: each partial name is printed over the previous one
# (the carriage return moves the cursor back to the start of the line), with
# a 0.3 second pause between steps.
for res in generate_name("<og"):
    print(res, end="\r")
    time.sleep(0.3)

<oggerson>