## Importing modules

In [None]:
import os
import time
import math
import string
import random

In [None]:
import collections
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import torch
from torch import nn
from torch import optim
from torch.functional import F
from torch.utils.data import DataLoader

In [None]:
from utils import evaluate
from utils import training

In [None]:
from importlib import reload

## Loading data

In [None]:
# Load the cleaned-names data dictionary from disk.
# NOTE(review): torch.load unpickles the file, so only open files from a trusted source.
data_dict_re = torch.load("./saves/data/clean_names.pt")

In [None]:
# Pull out the two entries this notebook uses: the samples and the vocabulary.
data_in_char, char_vocab = (
    data_dict_re[key] for key in ("data_in_char", "char_vocab"))

**NOTE:** `char_vocab` contains a **PAD token**, which is only needed to pad sequences to a common length when the training data is batched. This notebook feeds the model one sequence at a time (batch size 1), so the token is removed from the vocabulary.

In [35]:
# Remove the padding token from the vocabulary in place.
# The membership guard keeps this cell safe to re-run: a second run finds
# no "<PAD>" entry and does nothing.
if "<PAD>" in char_vocab:
    char_vocab.remove("<PAD>")

In [None]:
# Report how many samples and how many distinct symbols were loaded.
for label, collection in (("data length:", data_in_char),
                          ("vocab size:", char_vocab)):
    print(label, len(collection))

In [None]:
# Peek at the first five samples.
print(data_in_char[:5])

In [None]:
# Forward lookup: symbol -> integer index, numbered in vocabulary order.
char_to_ix = dict(zip(char_vocab, range(len(char_vocab))))
# Reverse lookup, derived from the forward one so the two always agree.
ix_to_char = dict((ix, ch) for ch, ix in char_to_ix.items())

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Next-symbol prediction dataset over integer-encoded sequences.

    Every sample is a pair ``(x, y)`` in which ``y`` is ``x`` shifted left by
    one position, so the target at each step is the symbol that follows.

    Attributes:
        data_as_int: The encoded sequences, one entry per input sequence.
        max_seqlen: Longest input length (sequence length minus one).
        min_seqlen: Shortest input length (sequence length minus one).
            Both stay at -inf / inf when ``data_as_str`` is empty.
    """

    def __init__(self, data_as_str, _map):
        """
        Input:
            data_as_str: Iterable of symbol sequences.
            _map: char_to_ix (symbol -> integer index).
        """
        self.data_as_int = []
        self.max_seqlen = float("-inf")
        self.min_seqlen = float("inf")

        # The fallback for a symbol missing from _map must itself be an
        # integer index, i.e. one of the *values* of _map. Drawing from the
        # keys would put a string into the encoded sequence and make
        # torch.tensor() fail in __getitem__.
        fallback_pool = list(_map.values())

        # Convert characters to integers (one random fallback per sequence)
        for seq_as_str in data_as_str:
            seq_as_int = evaluate.keys_to_values(
                seq_as_str, _map, random.choice(fallback_pool))

            self.data_as_int.append(seq_as_int)
            self.max_seqlen = max(self.max_seqlen, len(seq_as_int) - 1)
            self.min_seqlen = min(self.min_seqlen, len(seq_as_int) - 1)

    def __len__(self):
        """Number of sequences in the dataset."""
        return len(self.data_as_int)

    def __getitem__(self, ix):
        """Return the (input, target) tensors for the sequence at index ix."""
        # Get data sample at index ix
        item = self.data_as_int[ix]

        # Slice x and y from sample: y is x advanced by one step
        x = item[:-1]
        y = item[1:]

        # Pin the dtype so that even an empty slice yields index tensors.
        return (torch.tensor(x, dtype=torch.long),
                torch.tensor(y, dtype=torch.long))

In [None]:
# One sequence per step, reshuffled every epoch.
dataset = Dataset(data_as_str=data_in_char, _map=char_to_ix)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [None]:
# Summarise the encoded dataset.
for label, value in (("Dataset size:", len(dataset)),
                     ("Max sequence length:", dataset.max_seqlen),
                     ("Min sequence length:", dataset.min_seqlen)):
    print(label, value)

## Model

In [None]:
class Model(nn.Module):
    """Sequence model: embedding -> LSTM -> dropout -> linear projection."""

    def __init__(self, _map, hidden_size, emb_dim=8, n_layers=1, dropout_p=0.2):
        """
        Input:
            _map: char_to_ix; only its length (the vocabulary size) is used.
            hidden_size: Number of features in the LSTM hidden state.
            emb_dim: Size of embedding vector.
            n_layers: Number of stacked LSTM layers.
            dropout_p: Dropout probability applied to the LSTM output.
        """
        super().__init__()

        # Hyper-parameters are kept as attributes so save_model can store them.
        self.vocab_size  = len(_map)
        self.hidden_size = hidden_size
        self.emb_dim     = emb_dim
        self.n_layers    = n_layers
        self.dropout_p   = dropout_p

        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim =self.emb_dim)

        self.lstm = nn.LSTM(
            input_size =self.emb_dim,
            hidden_size=self.hidden_size,
            num_layers =self.n_layers,
            batch_first=True)

        self.dropout = nn.Dropout(self.dropout_p)

        self.fc = nn.Linear(
            in_features =self.hidden_size,
            out_features=self.vocab_size)

    def forward(self, x, prev_state=None):
        """
        Input:
            x: Integer tensor of symbol indices, shape (batch, sequence).
            prev_state: (h, c) tuple as returned by init_state() or by a
                previous call. None lets nn.LSTM start from a zero state.

        Output:
            out: Unnormalised scores, shape (batch, sequence, vocab_size).
            state: The (h, c) state of the LSTM after the last step.

        Raises:
            ValueError: If x is not 2-dimensional.
        """
        if x.dim() != 2:
            raise ValueError(
                "expected x of shape (batch, sequence), got {}".format(
                    tuple(x.shape)))

        embed = self.embedding(x)
        yhat, state = self.lstm(embed, prev_state)

        yhat = self.dropout(yhat)
        out = self.fc(yhat)
        return out, state

    def init_state(self, b_size=1):
        """Return a zero (h, c) state for a batch of b_size sequences.

        The tensors take the dtype and device of the model parameters, so the
        state remains usable after the model is moved with .to(device) or
        converted with .double()/.half().
        """
        reference = next(self.parameters())
        shape = (self.n_layers, b_size, self.hidden_size)
        return (reference.new_zeros(shape), reference.new_zeros(shape))

In [None]:
# Build a fresh, untrained network; every hyper-parameter is spelled out by name.
model = Model(
    _map=char_to_ix,
    hidden_size=64,
    emb_dim=8,
    n_layers=1,
    dropout_p=0.2)
model

In [None]:
# Training losses collected so far; the training cell extends this list.
# Re-running this cell discards the history accumulated up to that point.
loss_history = []

## Loading and Saving Model

In [None]:
def load_model(path):
    """Rebuild a Model and its loss history from a saved checkpoint.

    Input:
        path: Location of the checkpoint file.

    Output:
        m: Model built with the stored hyper-parameters and weights.
        l_hist: The loss history stored in the checkpoint.
    """
    # map_location="cpu" lets a checkpoint written from a GPU be opened on a
    # CPU-only machine. The freshly built Model lives on the CPU anyway.
    # NOTE: torch.load unpickles the file -- only open trusted checkpoints.
    m_data = torch.load(path, map_location="cpu")

    m = Model(
        _map       =m_data["_map"],
        hidden_size=m_data["hidden_size"],
        emb_dim    =m_data["emb_dim"],
        n_layers   =m_data["n_layers"],
        dropout_p  =m_data["dropout_p"])

    m.load_state_dict(m_data["state_dict"])
    l_hist = m_data["loss_history"]
    return m, l_hist

**Uncomment cell to load the trained model**

In [None]:
# model, loss_history = load_model("./saves/model/dino-name.pt")
# model

In [None]:
def save_model(m, l_hist, _map, path=None):
    """Write the model weights, hyper-parameters and loss history to disk.

    Input:
        m: Model to save; its hidden_size, emb_dim, n_layers and dropout_p
            attributes and its state_dict() are stored.
        l_hist: Loss history to store alongside the model.
        _map: char_to_ix used to build the model.
        path: Destination file; defaults to "./saves/model/dino-name.pt".
    """
    if not path:
        path = "./saves/model/dino-name.pt"

    # torch.save does not create missing folders, so make sure the target
    # directory exists. File-like destinations are passed through untouched.
    if isinstance(path, (str, os.PathLike)):
        folder = os.path.dirname(path)
        if folder:
            os.makedirs(folder, exist_ok=True)

    m_data = {
        "_map"        : _map,
        "hidden_size" : m.hidden_size,
        "emb_dim"     : m.emb_dim,
        "n_layers"    : m.n_layers,
        "dropout_p"   : m.dropout_p,
        "state_dict"  : m.state_dict(),
        "loss_history": l_hist}
    torch.save(m_data, path)

## Training

In [None]:
# Loss function handed to training.train in the training cell.
criterion = nn.CrossEntropyLoss()

In [None]:
# Train in rounds of `per_iter` iterations, checkpointing after every round
# so an interrupted run loses at most one round of work.
iteration = 50000
per_iter = 5000
start_t = time.time()

n_rounds = iteration // per_iter
banner = "=" * 50

for round_no in range(1, n_rounds + 1):
    model, costs = training.train(
        model, dataloader, per_iter, criterion,
        clip=0.25, lr=1e-3, print_every=1000)

    loss_history.extend(costs)
    save_model(model, loss_history, char_to_ix)
    time.sleep(5)  # pause between rounds

    print("\n" + banner)
    print("Round: {:2} of {:2}, Running Time: {:7.2f} sec".format(
        round_no, n_rounds, time.time() - start_t))
    print(banner + "\n")

In [None]:
# Smooth the loss curve by averaging over consecutive windows of `cum` values.
cum = 250
loss_windows = [loss_history[i:i + cum] for i in range(0, len(loss_history), cum)]

# Divide by the real window length: the final window is shorter than `cum`
# whenever len(loss_history) is not a multiple of it, and dividing by `cum`
# would drag the last point towards zero.
window_means = [sum(window) / len(window) for window in loss_windows]

# Place each point at the last iteration its window covers, so that the
# x axis really is measured in iterations as its label says.
window_ends = [min((k + 1) * cum, len(loss_history))
               for k in range(len(loss_windows))]

plt.xlabel("Iteration")
plt.ylabel("Cross-Entropy Loss")
plt.plot(window_ends, window_means);

In [None]:
# One-line summary of the recorded losses.
n_recorded = len(loss_history)
summary = "Iter: {} | Min: {:.4f} | Max: {:.4f} | Last: {:.4f} | Ave: {:.4f}"
print(summary.format(
    n_recorded,
    min(loss_history),
    max(loss_history),
    loss_history[-1],
    sum(loss_history) / n_recorded))

## Evaluating

In [None]:
# Number of samples to draw when scoring the model.
n_samp = 100
# Candidate seed indices: every index except the first one in the mapping.
# NOTE(review): presumably the first entry is a special token that should not
# be used as a seed -- confirm against the vocabulary order.
ix_list = [*char_to_ix.values()][1:]

In [None]:
# Score the model with evaluate.originality using n_samp, the encoded training
# data, the evaluate.sample function, the model and the seed candidates.
# NOTE(review): the remaining positional arguments (4, False, max length,
# <EOS> index, False) have the same pattern as those passed to evaluate.sample
# in the cells below -- confirm their meaning against utils.evaluate.
# The result is stored but not displayed by this cell.
originality = evaluate.originality(
    n_samp, dataset.data_as_int, evaluate.sample, model, ix_list,
    4, False, dataset.max_seqlen, char_to_ix["<EOS>"], False)

Initialise sampling with a **randomly chosen character**

In [None]:
# Draw ten names, each seeded with one randomly chosen symbol index.
for draw in range(10):
    seed = random.choice(ix_list)

    generated = evaluate.sample(
        model, seed, 5, False, 30, char_to_ix["<EOS>"], False)
    decoded = evaluate.keys_to_values(generated, ix_to_char, "<?>")
    print(ix_to_char[seed], "=>", "".join(decoded))

Initialise sampling with **a list of characters** instead of a single character

In [None]:
# Draw ten names that all start from the same multi-symbol prefix.
word = "python"
for draw in range(10):
    # Encode the prefix afresh on every pass; unknown symbols map to <EOS>.
    seed = evaluate.keys_to_values(list(word), char_to_ix, char_to_ix["<EOS>"])

    generated = evaluate.sample(
        model, seed, 5, False, 30, char_to_ix["<EOS>"], False)
    decoded = evaluate.keys_to_values(generated, ix_to_char, "<?>")
    print(word, "=>", "".join(decoded))

Sample the next **most likely character** instead of the next **topk most likely characters**

In [None]:
# One sample per vocabulary symbol, using that symbol's index as the seed.
for ch in char_vocab:
    seed = char_to_ix[ch]

    generated = evaluate.sample(
        model, seed, 1, True, 30, char_to_ix["<EOS>"], False)
    decoded = evaluate.keys_to_values(generated, ix_to_char, "<?>")
    print("{:->5}".format(ch), "=>", "".join(decoded))