In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler, Adam

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

import pandas as pd
import string

In [2]:
df = pd.read_csv("resources/names.csv")
df

Unnamed: 0.1,Unnamed: 0,Category,Name
0,0,Arabic,b'Khoury'
1,1,Arabic,b'Nahas'
2,2,Arabic,b'Daher'
3,3,Arabic,b'Gerges'
4,4,Arabic,b'Nazari'
...,...,...,...
20069,20069,Vietnamese,b'Truong'
20070,20070,Vietnamese,b'Van'
20071,20071,Vietnamese,b'Vinh'
20072,20072,Vietnamese,b'Vuong'


In [15]:
all_letters = ["<pad>"] + list(string.ascii_letters + "/1234567890 .,;:'-\"") + ["<eos>"]
n_letters = len(all_letters)
n_letters

73

In [16]:
stoi = {letter : idx for idx, letter in enumerate(all_letters)}
itos = [letter for idx, letter in enumerate(all_letters)]

In [17]:
stoi["<eos>"], itos[59]

(72, '6')

In [18]:
len(stoi)

73

In [19]:
class NamesDataset(Dataset):
    def __init__(self, df, stoi, eos_token="<eos>"):
        self.stoi = stoi
        self.eos_token = eos_token
        self.n_tokens = len(self.stoi)
        
        self.categories = df["Category"].tolist()
        self.names = df["Name"].tolist()
        
        
        self.all_categories = list(set(self.categories))
        self.n_categories = len(self.all_categories)

    def __getitem__(self, item):
        category = self.categories[item]
        name = self.names[item]
        
        category_tensor = self.get_category_tensor(category)
        
        input_tensor = torch.tensor([stoi[char] for char in name])
        target_tensor = torch.tensor([stoi[char] for char in list(name[1:])+[self.eos_token]])
        
        item_dict = {"category": category,
        "name": name,
        "category_tensor": category_tensor,
        "input_tensor": input_tensor,
        "target_tensor": target_tensor}
        
        
        return item_dict

    def __len__(self):
        return len(self.categories)
    
    
    def get_category_tensor(self, category):
        li = self.all_categories.index(category)
        tensor = torch.zeros(1, self.n_categories)
        tensor[0][li] = 1
        return tensor

In [20]:
ds = NamesDataset(df, stoi)

ds[0]

{'category': 'Arabic',
 'name': "b'Khoury'",
 'category_tensor': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 'input_tensor': tensor([ 2, 69, 37,  8, 15, 21, 18, 25, 69]),
 'target_tensor': tensor([69, 37,  8, 15, 21, 18, 25, 69, 72])}

In [21]:
def collate_fn(data):
    def merge(sequences):
        "https://github.com/yunjey/seq2seq-dataloader/blob/master/data_loader.py"
        
        lengths = [len(seq) for seq in sequences]
        padded_seqs = torch.zeros(len(sequences), max(lengths)).long()
        for i, seq in enumerate(sequences):
            end = lengths[i]
            padded_seqs[i, :end] = seq[:end]
        return padded_seqs, lengths

    categories = [x["category"] for x in data]          
    names = [x["name"] for x in data]          
    category_tensors = torch.cat([x["category_tensor"] for x in data])
    
    input_tensors = [x["input_tensor"] for x in data]
    input_tensors, _ = merge(input_tensors)
    
    target_tensors = [x["target_tensor"] for x in data]
    target_tensors, _ = merge(target_tensors)
    
    return categories, names, category_tensors, input_tensors, target_tensors

In [22]:
dl = DataLoader(ds, batch_size=1, collate_fn=collate_fn, shuffle=True)

In [23]:
next(iter(dl))

(['Russian'],
 ["b'Anofriev'"],
 tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 tensor([[ 2, 69, 27, 14, 15,  6, 18,  9,  5, 22, 69]]),
 tensor([[69, 27, 14, 15,  6, 18,  9,  5, 22, 69, 72]]))

In [24]:
class NamesDatamodule(pl.LightningDataModule):
    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.df = pd.read_csv("resources/names.csv")
        
        self.all_letters = all_letters = ["<pad>"] + list(string.ascii_letters + " .,;'-") + ["<eos>"]
        self.stoi = {letter : idx for idx, letter in enumerate(self.all_letters)}

    def setup(self, stage=None):
        self.train_set = NamesDataset(df, self.stoi)

    def train_dataloader(self):
        return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=True, collate_fn=self.collate_fn)
    
    def collate_fn(self, data):
        def merge(sequences):
            "https://github.com/yunjey/seq2seq-dataloader/blob/master/data_loader.py"

            lengths = [len(seq) for seq in sequences]
            padded_seqs = torch.zeros(len(sequences), max(lengths)).long()
            for i, seq in enumerate(sequences):
                end = lengths[i]
                padded_seqs[i, :end] = seq[:end]
            return padded_seqs, lengths

        categories = [x["category"] for x in data]          
        names = [x["name"] for x in data]          
        category_tensors = torch.cat([x["category_tensor"] for x in data])

        input_tensors = [x["input_tensor"] for x in data]
        input_tensors, _ = merge(input_tensors)

        target_tensors = [x["target_tensor"] for x in data]
        target_tensors, _ = merge(target_tensors)
        
        item_dict = {"categories": categories, 
                     "names": names, 
                     "category_tensors": category_tensors,
                     "input_tensors": input_tensors,
                     "target_tensors": target_tensors}

        return item_dict

In [25]:
class RNN(pl.LightningModule):
    lr = 5e-4

    def __init__(self, input_size, hidden_size, embeding_size, n_categories, n_layers, output_size, p):
        super().__init__()

        self.criterion = nn.CrossEntropyLoss()
        
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        
        
        self.embeding = nn.Embedding(input_size+n_categories, embeding_size)
        self.lstm = nn.LSTM(embeding_size+n_categories, hidden_size, n_layers, dropout=p)
        self.out_fc = nn.Linear(hidden_size, output_size)
        
        self.dropout = nn.Dropout(p)
        

    def forward(self, batch_of_category, batch_of_letter, hidden, cell):
        ## letter level operations
        
        embeding = self.dropout(self.embeding(batch_of_letter))
        category_plus_letter = torch.cat((batch_of_category, embeding), 1)

        #sequence_length = 1
        category_plus_letter = category_plus_letter.unsqueeze(1)
        
        out, (hidden, cell) = self.lstm(category_plus_letter, (hidden, cell))
        out = self.out_fc(out)
        out = out.squeeze(1)
        
        return out, (hidden, cell)
        

    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), self.lr)
        scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        item_dict = batch
        loss = 0
        batch_of_category = item_dict["category_tensors"]

        #to(device) needed due to some problem with PL
        hidden = torch.zeros(self.n_layers, 1, self.hidden_size).to(self.device)
        cell = torch.zeros(self.n_layers, 1, self.hidden_size).to(self.device)

        #we loop over letters, single batch at the time 
        for t in range(item_dict["input_tensors"].size(1)):
            batch_of_letter = item_dict["input_tensors"][:, t]
            
            output, (hidden, cell) = self(batch_of_category, batch_of_letter, hidden, cell)
            
            loss += self.criterion(output, item_dict["target_tensors"][:, t])

        loss = loss/(t+1)

        tensorboard_logs = {'train_loss': loss}

        return {'loss': loss, 'log': tensorboard_logs}
    
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        cell = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        
        return hidden, cell

In [26]:
dm = NamesDatamodule(1)

rnn_model = RNN(input_size=ds.n_tokens,
            hidden_size=256,
            embeding_size = 128, 
            n_layers=2,    
            n_categories=ds.n_categories,
            output_size=ds.n_tokens,
            p=0.3)


trainer = Trainer(max_epochs=3, 
                  logger=None,
                  gpus=1,
                  checkpoint_callback=False,
                  )

trainer.fit(rnn_model, dm)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | criterion | CrossEntropyLoss | 0     
1 | embeding  | Embedding        | 11 K  
2 | lstm      | LSTM             | 940 K 
3 | out_fc    | Linear           | 18 K  
4 | dropout   | Dropout          | 0     


Training: 0it [00:00, ?it/s]

1