# Classifing last names with character-level RNN

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

## Dataset
`wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz`

`wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz`

In [2]:
PATH = Path("/data2/yinterian/name_dataset/")
list(PATH.iterdir())

[PosixPath('/data2/yinterian/name_dataset/names_test.csv'),
 PosixPath('/data2/yinterian/name_dataset/names_train.csv.gz'),
 PosixPath('/data2/yinterian/name_dataset/names_train.csv'),
 PosixPath('/data2/yinterian/name_dataset/names_test.csv.gz')]

In [3]:
! head /data2/yinterian/name_dataset/names_train.csv

"Adsit","Czech"
"Ajdrna","Czech"
"Antonowitsch","Czech"
"Antonowitz","Czech"
"Ballalatak","Czech"
"Ballaltick","Czech"
"Bastl","Czech"
"Baroch","Czech"
"Betlach","Czech"
"Biganska","Czech"


### Processing data

In [4]:
df = pd.read_csv(PATH/"names_train.csv", header=None)

In [5]:
# getting a vocabulary of characters
letters = [list(l) for l in df[0].values]
vocab = sorted(list(set(np.concatenate(np.array(letters)))))
vocab[:10]

[' ', "'", ',', 'A', 'B', 'C', 'D', 'E', 'F', 'G']

In [6]:
vocab2id = {key:i for i, key in enumerate(vocab)}
vocab2id[" "] # I am going to use 0 to pad sequences

0

In [7]:
labels = sorted(df[1].unique())
label2id = {key:i for i, key in enumerate(labels)}
label2id

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [8]:
def pad_seq(x, seq_len=15, vocab2id=vocab2id):
    x = list(x)
    x = np.array([vocab2id[k] for k in x])
    z = np.zeros(seq_len, dtype=np.int32)
    n = min(seq_len, x.shape[0])
    z[seq_len - n:] = x[0:n]
    return z

In [9]:
x = pad_seq("aabbb")
x

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 29, 29, 30, 30, 30],
      dtype=int32)

In [10]:
# one hot encoding
def seq2matrix(x, vocab_len=55):
    z = np.zeros((x.shape[0], vocab_len))
    z[np.arange(len(x)), x] = 1
    return z

In [11]:
class NameDataset(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        x = seq2matrix(x, self.vocab_len)
        return x, self.y[idx]

In [43]:
train = NameDataset(PATH/"names_train.csv", vocab2id, label2id)
val = NameDataset(PATH/"names_test.csv", vocab2id, label2id)

In [44]:
batch_size = 2000
n=len(val)
train_dl = DataLoader(train, batch_size=batch_size)
val_dl = DataLoader(val, batch_size=n)

In [45]:
len(train), len(val)

(13374, 13374)

In [46]:
x,y = train[0]
print(x.shape,y)

(15, 55) 2


## Model

In [88]:
class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CharRNN, self).__init__()

        self.hidden_size = hidden_size
        self.linear_i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.linear_h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        combined = torch.cat((x, hidden), 1)
        hidden = torch.tanh(self.linear_i2h(combined))
        output = self.linear_h2o(hidden)
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Debugging model

In [89]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [90]:
x, y = next(iter(train_dl))

In [91]:
x.shape, y.shape

(torch.Size([2000, 15, 55]), torch.Size([2000]))

In [92]:
batch = x.shape[0]
h = model.initHidden(batch).cuda()
x = x.cuda().float()
y = y.cuda().long()

In [93]:
torch.cat((x[:,0], h), 1).size()

torch.Size([2000, 155])

In [94]:
for ei in range(x.shape[1]):
    y_t, h = model(x[:,ei], h)

In [95]:
# note that just the last x_t is used in the loss
# update
loss = F.cross_entropy(y_t, y)
loss.item()

2.870267152786255

## Training

In [96]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [97]:
def get_optimizer(model, lr = 0.01, wd = 0.00001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

In [98]:
def train(model, optim, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x, y in train_dl:
        batch = x.shape[0]
        h = model.initHidden(batch).cuda()
        loss = 0
        x = x.cuda().float()
        y = y.cuda().long()
        
        for t in range(x.shape[1]):
            out, h = model(x[:,t], h)
        
        loss = F.cross_entropy(out, y)
        optim.zero_grad()
        loss.backward()
        #loss.backward(retain_graph=True)
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

In [99]:
def val_metric(model, val_dl):
    model.eval()
    x, y = next(iter(val_dl))
    x = x.cuda().float()
    y = y.cuda().long()
    N = x.shape[0]
    h = model.initHidden(N).cuda()
    for t in range(x.shape[1]):
        out, h = model(x[:,t], h)
    loss = F.cross_entropy(out, y)
    _, pred = torch.max(out, 1)
    acc = pred.eq(y).sum().float()/N
    return loss.item(), acc.item()

In [100]:
vocab_size = 55
hidden_size = 80
n_classes = 18

In [101]:
def train_loop(model, lr, train_dl, val_dl, epochs=20):
    optim = get_optimizer(model, lr =lr, wd = 0.0)
    for i in range(epochs):
        loss = train(model, optim, train_dl)
        val_loss, val_acc = val_metric(model, val_dl)
        if i%5 == 1: print("train loss %.3f val loss %.3f and val accuracy %.3f" % (loss, val_loss, val_acc))

In [102]:
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [103]:
train_loop(model, 0.01, train_dl, val_dl, epochs=20)

train loss 2.245 val loss 1.791 and val accuracy 0.469
train loss 1.623 val loss 1.534 and val accuracy 0.523
train loss 1.418 val loss 1.318 and val accuracy 0.619
train loss 1.258 val loss 1.177 and val accuracy 0.664


In [104]:
train_loop(model, 0.001, train_dl, val_dl, epochs=20)

train loss 1.174 val loss 1.167 and val accuracy 0.649
train loss 1.132 val loss 1.119 and val accuracy 0.667
train loss 1.102 val loss 1.090 and val accuracy 0.677
train loss 1.078 val loss 1.064 and val accuracy 0.685


In [105]:
train_loop(model, 0.001, train_dl, val_dl, epochs=20)

train loss 1.056 val loss 1.046 and val accuracy 0.682
train loss 1.023 val loss 1.008 and val accuracy 0.700
train loss 0.997 val loss 0.980 and val accuracy 0.706
train loss 0.977 val loss 0.959 and val accuracy 0.711


# Model with character embeddings 

In [67]:
class NameDatasetEmb(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        return x, self.y[idx]

In [68]:
train_2 = NameDatasetEmb(PATH/"names_train.csv", vocab2id, label2id)
val_2 = NameDatasetEmb(PATH/"names_test.csv", vocab2id, label2id)

In [69]:
batch_size = 2000
n = len(val_2)
train_dl_2 = DataLoader(train_2, batch_size=batch_size)
val_dl_2 = DataLoader(val_2, batch_size=n)

In [70]:
train_2[0]

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3, 32, 47, 37, 48],
       dtype=int32), 2)

In [79]:
class CharEmbRNN(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, output_size):
        super(CharEmbRNN, self).__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.hidden_size = hidden_size
        self.linear_i2h = nn.Linear(emb_size + hidden_size, hidden_size)
        self.linear_h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        x = x.long()
        x = self.emb(x)
        combined = torch.cat((x, hidden), 1)
        hidden = torch.tanh(self.linear_i2h(combined))
        output = self.linear_h2o(hidden)
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Train 

In [80]:
vocab_size = 55
emb_size = 30
hidden_size = 80
n_classes = 18
model = CharEmbRNN(vocab_size, emb_size, hidden_size, n_classes).cuda()

In [81]:
train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=50)

train loss 4.827 val loss 2.221 and val accuracy 0.392
train loss 1.514 val loss 1.419 and val accuracy 0.567
train loss 1.343 val loss 1.251 and val accuracy 0.618
train loss 1.226 val loss 1.144 and val accuracy 0.642
train loss 1.140 val loss 1.059 and val accuracy 0.668
train loss 1.059 val loss 0.982 and val accuracy 0.690
train loss 0.988 val loss 0.918 and val accuracy 0.720
train loss 0.932 val loss 0.862 and val accuracy 0.739
train loss 0.883 val loss 0.816 and val accuracy 0.756
train loss 0.871 val loss 0.853 and val accuracy 0.733


In [82]:
train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=50)

train loss 0.841 val loss 0.917 and val accuracy 0.708
train loss 0.826 val loss 0.759 and val accuracy 0.770
train loss 0.755 val loss 0.704 and val accuracy 0.787
train loss 0.723 val loss 0.667 and val accuracy 0.796
train loss 0.703 val loss 0.669 and val accuracy 0.797
train loss 0.666 val loss 0.615 and val accuracy 0.812
train loss 0.638 val loss 0.586 and val accuracy 0.821
train loss 0.633 val loss 0.595 and val accuracy 0.818
train loss 0.578 val loss 0.536 and val accuracy 0.837
train loss 0.553 val loss 0.508 and val accuracy 0.844


In [83]:
train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=20)

train loss 0.650 val loss 0.697 and val accuracy 0.782
train loss 0.539 val loss 0.507 and val accuracy 0.843
train loss 0.503 val loss 0.467 and val accuracy 0.856
train loss 0.479 val loss 0.444 and val accuracy 0.865


In [84]:
train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=20)

train loss 0.588 val loss 0.639 and val accuracy 0.796
train loss 0.491 val loss 0.460 and val accuracy 0.857
train loss 0.446 val loss 0.415 and val accuracy 0.871
train loss 0.422 val loss 0.390 and val accuracy 0.880


In [85]:
train_loop(model, 0.001, train_dl_2, val_dl_2, epochs=50)

train loss 0.378 val loss 0.374 and val accuracy 0.886
train loss 0.373 val loss 0.370 and val accuracy 0.889
train loss 0.370 val loss 0.366 and val accuracy 0.890
train loss 0.367 val loss 0.363 and val accuracy 0.891
train loss 0.363 val loss 0.359 and val accuracy 0.893
train loss 0.360 val loss 0.356 and val accuracy 0.893
train loss 0.356 val loss 0.352 and val accuracy 0.894
train loss 0.353 val loss 0.349 and val accuracy 0.895
train loss 0.350 val loss 0.346 and val accuracy 0.896
train loss 0.347 val loss 0.343 and val accuracy 0.897


## Exercise
* Change the first model to learn a character language model that generates last names.
* Use one cycle training on this problem.

# References
This notebook is a modified version of this tutorial
http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html. Here I implement vanilla RNNs.