<a href="https://colab.research.google.com/github/lilnoes/notebooks/blob/main/rnn_generate_names.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path

In [None]:
# Download and unpack the PyTorch tutorial data into the current directory.
# `unzip -o` overwrites without prompting, so re-running the cell does not block on input.
!wget https://download.pytorch.org/tutorial/data.zip && unzip -o data.zip && pwd

--2020-06-30 06:46:46--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 52.84.16.73, 52.84.16.5, 52.84.16.104, ...
Connecting to download.pytorch.org (download.pytorch.org)|52.84.16.73|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip’


2020-06-30 06:46:47 (9.08 MB/s) - ‘data.zip’ saved [2882130/2882130]

Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt   

In [None]:
# Root folder of the extracted archive (the `data/` directory created by unzip).
data = Path('/content') / 'data'


In [None]:
import unicodedata
import re
import string
# Alphabet the model can emit: ASCII letters followed by a few punctuation marks.
letters = "".join((string.ascii_letters, " .,-;'"))
# One extra slot is reserved for the end-of-sequence index (== len(letters)).
vocab_size = 1 + len(letters)
def uni_to_asci(s):
  """Reduce `s` to the characters of `letters`.

  The string is NFD-decomposed so accents become separate combining marks
  (Unicode category 'Mn'); those marks and any character not present in
  `letters` are dropped.
  """
  kept = []
  for ch in unicodedata.normalize('NFD', s):
    if unicodedata.category(ch) == 'Mn':
      continue
    if ch not in letters:
      continue
    kept.append(ch)
  return ''.join(kept)

def process_word(word, shift=False):
  """Encode a word as a 1-D tensor of indices into `letters`.

  The word is first passed through `uni_to_asci`, so characters outside
  `letters` are dropped rather than encoded.

  Args:
    word: text to encode.
    shift: when True, append the end-of-sequence index `len(letters)`.

  Returns:
    A 1-D int64 tensor with one index per kept character, plus the
    end-of-sequence index when `shift` is True.
  """
  indices = [letters.find(ch) for ch in uni_to_asci(word)]
  if shift:
    indices.append(len(letters))
  # Explicit dtype: torch.tensor([]) would otherwise default to float32, so an
  # empty word would come back with a different dtype than every other word.
  return torch.tensor(indices, dtype=torch.long)


In [None]:
def load_dataset(filedir):
  """Load every `*.txt` name list in `filedir`, one category per file.

  Args:
    filedir: directory containing the `<Category>.txt` files.

  Returns:
    categories: category names (file stems), in sorted file-path order.
    names_dict: category -> raw lines of the file.
    tensor_dict: category -> encoded input tensor per name.
    shifted_dict: category -> target tensor per name, i.e. the input shifted
      left by one position with the end-of-sequence index appended.
  """
  categories = []
  names_dict = {}
  tensor_dict = {}
  shifted_dict = {}
  # Sorted so that a category keeps the same index regardless of the order in
  # which the filesystem happens to list the files.
  for filename in sorted(Path(filedir).glob('*.txt')):
    cat = filename.stem
    lines = filename.read_text(encoding='utf-8').strip().split('\n')
    # Normalise before slicing: taking name[1:] on the raw string misaligns
    # input and target whenever the first raw character is dropped by
    # uni_to_asci (the target would then start at the same letter as the input).
    ascii_names = [uni_to_asci(name) for name in lines]
    categories.append(cat)
    names_dict[cat] = lines
    tensor_dict[cat] = [process_word(name) for name in ascii_names]
    shifted_dict[cat] = [process_word(name[1:], True) for name in ascii_names]

  return categories, names_dict, tensor_dict, shifted_dict

In [None]:
# Load all name lists; `data_dict` receives the raw names per category
# (the second value returned by load_dataset).
categories, data_dict, tensor_dict, shifted_dict = load_dataset('/content/data/names')

In [None]:
# Sanity check: encode a lowercase word; indices are positions in `letters`.
x = process_word('emma')
x

tensor([ 4, 12, 12,  0])

In [None]:
import numpy as np
def get_random():
  """Draw one random training example.

  A category index is drawn first, then a name index within that category.

  Returns:
    A 5-tuple: category name, category index as a 1-element tensor, raw name,
    encoded input tensor, and shifted target tensor.
  """
  # Deliberately no np.random.seed() here: re-seeding on every call would
  # silently discard any seed the notebook set for reproducibility.
  i = np.random.randint(0, len(categories))
  cat = categories[i]
  j = np.random.randint(0, len(data_dict[cat]))
  name = data_dict[cat][j]
  tensor = tensor_dict[cat][j]
  shift = shifted_dict[cat][j]
  return cat, torch.tensor([i]), name, tensor, shift

In [None]:
# Draw one example to inspect the layout of the returned tuple.
get_random()

('Arabic',
 tensor([3]),
 'Atiyeh',
 tensor([26, 19,  8, 24,  4,  7]),
 tensor([19,  8, 24,  4,  7, 58]))

In [None]:
class Model(nn.Module):
  """Single-step GRU that predicts the next letter given a letter and a category.

  Each call to `forward` consumes one letter index and one category index,
  advances the hidden state by one GRUCell step and returns a distribution
  over `vocab_size` symbols.

  Args:
    tx: stored as `self.tx`; not used by the forward pass.
    units: GRU hidden size.
    batch_size: leading dimension of the state made by `initialize_state`.
    vocab_size: number of output symbols (letters plus end-of-sequence).
    n_categories: number of rows of the category embedding. Defaults to
      `len(categories)` from the enclosing notebook namespace.
  """
  def __init__(self, tx, units, batch_size, vocab_size, n_categories=None):
    super(Model, self).__init__()
    if n_categories is None:
      # Backward-compatible default: size the embedding from the global list.
      n_categories = len(categories)
    self.units = units
    self.batch_size = batch_size
    self.tx = tx
    self.embedding_letter = nn.Embedding(vocab_size, 128)
    self.embedding_cat = nn.Embedding(n_categories, 32)
    self.state = self.initialize_state()
    self.vocab_size = vocab_size
    # Input is the letter embedding (128) concatenated with the category embedding (32).
    self.rnn = nn.GRUCell(input_size=128+32, hidden_size=units)
    self.linear1 = nn.Linear(units, 64)
    self.linear2 = nn.Linear(64, vocab_size)

  def forward(self, cat, x, state, train=True):
    """Advance one step.

    Args:
      cat: category index tensor.
      x: letter index tensor.
      state: previous hidden state.
      train: True returns log-probabilities (for NLLLoss) with dropout
        enabled; False returns probabilities with dropout disabled.

    Returns:
      (distribution over the vocabulary, new hidden state).
    """
    cat = self.embedding_cat(cat)
    x = torch.cat((self.embedding_letter(x), cat), dim=-1)
    state = self.rnn(x, state)
    hidden = F.relu(self.linear1(state))
    # p and training must be passed to dropout explicitly: F.dropout defaults
    # to p=0.5 and training=True, which would keep it active at inference.
    hidden = F.dropout(hidden, p=0.2, training=train and self.training)
    logits = self.linear2(hidden)
    if not train:
      return F.softmax(logits, dim=-1), state
    return F.log_softmax(logits, dim=-1), state

  def initialize_state(self):
    """Return an all-zero hidden state of shape (batch_size, units)."""
    return torch.zeros(self.batch_size, self.units)

In [None]:
# Hyperparameters for the model (the no-op `vocab_size = vocab_size` was dropped).
tx = vocab_size   # stored on the model as `tx`; Model.forward does not read it
units = 256       # GRU hidden size
batch_size = 1    # one name is processed at a time

model = Model(tx, units, batch_size, vocab_size)

In [None]:
# Smoke test: run one model step on the first letter of a random name.
cat, catx, name, tensor, shift = get_random()
state = model.initialize_state()
y, _= model(catx, tensor[0].view(1),state)
# forward defaults to train=True, so y holds log-probabilities.
print(y)

torch.Size([1, 32])
tensor([[-3.9879, -4.0923, -4.0870, -4.2259, -4.1210, -4.0089, -4.0537, -4.0829,
         -4.0913, -4.0510, -4.0314, -3.9797, -4.0686, -4.1866, -4.0586, -4.2074,
         -4.0101, -4.1678, -3.9693, -4.1856, -4.1325, -4.0690, -4.1766, -4.1277,
         -4.0724, -4.1148, -4.0171, -3.9162, -4.1556, -4.0791, -4.0098, -4.1327,
         -3.9420, -3.9905, -4.0899, -4.1620, -4.0165, -4.0605, -4.0727, -3.9840,
         -4.1216, -4.0531, -4.1733, -4.1897, -4.0925, -4.0279, -4.0463, -4.0862,
         -4.1765, -4.0428, -4.1337, -4.0614, -4.1824, -3.9716, -3.9224, -4.1788,
         -3.9922, -4.1719, -4.1321]], grad_fn=<LogSoftmaxBackward>)


In [None]:
# Loss of the single-step prediction `y` against the first target letter.
# loss_fn is created here because this cell runs before the training-setup
# cell; without it a top-to-bottom run fails with NameError.
loss_fn = nn.NLLLoss()
loss_fn(y, shift[0].view(1))

tensor(3.9879, grad_fn=<NllLossBackward>)

In [None]:
# NLLLoss expects log-probabilities, which Model.forward returns when train=True.
loss_fn = nn.NLLLoss()
# Re-create the model so the optimizer below is bound to this instance's parameters.
model = Model(tx, units, batch_size, vocab_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Zero hidden state used as the starting state of every train_step call.
istate = model.initialize_state()
def train_step(cat, x, y):
  """Run one optimisation step on a single name.

  The model is unrolled one letter at a time starting from `istate`; the
  per-letter losses are summed and back-propagated once.

  Args:
    cat: category index tensor passed to the model at every step.
    x: 1-D tensor of input letter indices.
    y: 1-D tensor of target indices; y[i] is the target for x[i].

  Returns:
    Mean per-letter loss as a float, or 0.0 when `x` is empty.
  """
  n_letters = x.size(0)
  if n_letters == 0:
    # Nothing to learn from. Without this guard `loss` would stay the int 0
    # (no .backward()) and the final division would be by zero.
    return 0.0
  state = istate
  optimizer.zero_grad()
  loss = 0
  for i in range(n_letters):
    out, state = model(cat, x[i].view(1), state)
    loss += loss_fn(out, y[i].view(1))
  loss.backward()
  optimizer.step()
  return loss.item() / n_letters


In [None]:
def train(epochs, steps=1000):
  """Train on randomly drawn names, printing generated samples along the way.

  Every time the running step counter reaches a multiple of 100, one greedy
  sample and four temperature samples are printed for a random category and
  a random uppercase start letter.

  Args:
    epochs: number of epochs.
    steps: number of random examples per epoch.

  Note: calls `evaluate` and `evaluate1`, which must be defined before this
  function is run.
  """
  count = 1
  for epoch in range(epochs):
    batch_loss = 0
    for step in range(steps):
      count += 1
      _, catx, _, x, y = get_random()
      batch_loss += train_step(catx, x, y)
      if count % 100 == 0:
        rid = np.random.randint(0, len(categories))
        # Indices 26..51 of `letters` are the uppercase ASCII letters.
        start = letters[np.random.randint(26, 52)]
        print('Evaluation ', evaluate(rid, start))
        # The temperature is an argument of evaluate1; it used to be passed to
        # print() instead, so every sample was drawn at the default 1.0.
        # The "Evaluation 1" line previously repeated the greedy evaluate()
        # call; it now samples at temperature 1 like its siblings.
        for temp in (1, 0.5, 0.75, 0.25):
          print(f'Evaluation {temp}', evaluate1(rid, start, temp))
    print(f'epoch {epoch+1} loss {batch_loss/steps:.3f}')

In [None]:
# Train for 10 epochs with the default 1000 steps each.
# NOTE(review): train() calls evaluate/evaluate1, which are defined in cells
# further down — on a fresh top-to-bottom run those cells must be executed first.
train(10)

Evaluation  ('Irish', 'Xand')
Evaluation 1 ('Irish', 'Xinghe')
Evaluation 0.5 ('Irish', 'Xanghona') 0.5
Evaluation 0.75 ('Irish', 'Xemgeon') 0.75
Evaluation 0.25 ('Irish', 'Xinner') 0.25
Evaluation  ('Portuguese', 'Dara')
Evaluation 1 ('Portuguese', 'Darro')
Evaluation 0.5 ('Portuguese', 'Datca') 0.5
Evaluation 0.75 ('Portuguese', 'Das') 0.75
Evaluation 0.25 ('Portuguese', "D'egrijes") 0.25
Evaluation  ('Greek', 'Quras')
Evaluation 1 ('Greek', 'Quralis')
Evaluation 0.5 ('Greek', 'Qudes') 0.5
Evaluation 0.75 ('Greek', 'Qupis') 0.75
Evaluation 0.25 ('Greek', 'Quttalarigos') 0.25
Evaluation  ('Spanish', 'Xarak')
Evaluation 1 ('Spanish', 'Xarcha')
Evaluation 0.5 ('Spanish', 'Xures') 0.5
Evaluation 0.75 ('Spanish', 'Xari') 0.75
Evaluation 0.25 ('Spanish', 'Xonma') 0.25
Evaluation  ('German', 'Yoman')
Evaluation 1 ('German', 'Yous')
Evaluation 0.5 ('German', 'Yhin') 0.5
Evaluation 0.75 ('German', 'Yaurogs') 0.75
Evaluation 0.25 ('German', 'Yeilt') 0.25
Evaluation  ('Vietnamese', 'Rhin')
Eval

In [None]:
# Index 51 is the last character of string.ascii_letters inside `letters`.
letters[51]

'Z'

In [None]:
# Scratch check: draw a single random float from NumPy's global generator.
np.random.rand()

0.5955402518903686

In [None]:
def evaluate(category_id, name='A'):
  """Greedily extend `name` with the most likely next letter at each step.

  Args:
    category_id: index into `categories`.
    name: prefix to start from; it must contain at least one character that
      `process_word` can encode.

  Returns:
    (category name, generated name). Generation stops when the
    end-of-sequence index is predicted or after 15 added letters.

  Raises:
    ValueError: if `name` encodes to an empty tensor.
  """
  eos = len(letters)  # index that process_word appends when shift=True
  with torch.no_grad():
    cat = categories[category_id]
    catx = torch.tensor([category_id])
    tensor = process_word(name)
    if tensor.size(0) == 0:
      # Without a prefix the warm-up loop never assigns `y`.
      raise ValueError('name must contain at least one encodable character')
    state = model.initialize_state()

    # Feed the prefix; afterwards y is the distribution following its last letter.
    for i in range(tensor.size(0)):
      y, state = model(catx, tensor[i].view(1), state, False)
    for step in range(15):
      _, y = y.topk(1, dim=-1)
      if y.item() == eos:
        return cat, name
      name += letters[y.item()]
      y, state = model(catx, y.view(1), state, False)
    return cat, name

In [None]:
def evaluate1(category_id, name='A', temp=1.0):
  """Extend `name` by sampling next letters with a temperature.

  Args:
    category_id: index into `categories`.
    name: prefix to start from; it must contain at least one character that
      `process_word` can encode.
    temp: sampling temperature (> 0). Values below 1 sharpen the
      distribution, values above 1 flatten it; 1.0 samples from the model's
      probabilities unchanged.

  Returns:
    (category name, generated name). Generation stops when the
    end-of-sequence index is sampled or after 15 added letters.

  Raises:
    ValueError: if `temp` is not positive or `name` encodes to an empty tensor.
  """
  if temp <= 0:
    raise ValueError('temp must be positive')
  eos = len(letters)  # index that process_word appends when shift=True
  with torch.no_grad():
    cat = categories[category_id]
    catx = torch.tensor([category_id])
    tensor = process_word(name)
    if tensor.size(0) == 0:
      # Without a prefix the warm-up loop never assigns `y`.
      raise ValueError('name must contain at least one encodable character')
    state = model.initialize_state()

    # Feed the prefix; afterwards y is the distribution following its last letter.
    for i in range(tensor.size(0)):
      y, state = model(catx, tensor[i].view(1), state, False)
    for step in range(15):
      # y holds probabilities. Dividing them by temp has no effect because
      # multinomial renormalises its weights, so the temperature is applied
      # in log space instead: softmax(log p / temp).
      probs = torch.softmax(y.log() / temp, dim=-1)
      y = torch.multinomial(probs, 1)
      if y.item() == eos:
        return cat, name
      name += letters[y.item()]
      y, state = model(catx, y.view(1), state, False)
    return cat, name

In [None]:
# Sample a name for category index 4 starting from 'H' (default temperature).
evaluate1(4, 'H')

('Polish', 'Holgi')

In [None]:
# Sample one index using exp(y) as weights.
# NOTE(review): presumably y still holds log-probabilities from an earlier cell — confirm.
torch.multinomial(y.exp(), 1)

tensor([[33]])

In [None]:
# cat, catx,name, tensor = get_random()
# NOTE(review): this cell calls model(tensor) with a single argument, but
# Model.forward in this notebook takes (cat, x, state); it looks like a
# leftover from a category-classification version — confirm and remove or port.
tensor = process_word('satoshi')
tensor
y = model(tensor)
# Print the category names of the three highest-scoring entries of y.
_,ind = y.topk(3)
for i in range(ind.shape[1]):
  print(categories[ind[0][i]])

Japanese
Italian
Russian


In [None]:
# Show `cat` next to the category name at the top-1 index of y.
# NOTE(review): treats y as per-category scores; presumably stale — confirm.
_,ind = y.topk(1)
ind = ind.item()
cat, categories[ind]

('English', 'Polish')

In [None]:
# Display the list of category names.
categories

In [None]:
# NOTE(review): get_random() in this notebook returns five values, so this
# four-name unpacking would fail; `rnn` is also not defined in any cell shown
# here. Presumably a leftover from an earlier version — confirm.
cat, catx,name, tensor = get_random()
y = rnn(tensor)
y.shape

torch.Size([1, 1, 18])

In [None]:
# Loss of y (flattened to a single row) against the category index tensor.
# NOTE(review): depends on the `y` produced by the scratch cell above — confirm it is still needed.
loss_fn(y.view(1,-1), catx)

tensor(2.7988, grad_fn=<NllLossBackward>)

In [None]:
# NOTE(review): `rnn` is not defined in any cell shown here; presumably stale — confirm.
x.dtype
y = rnn(x)
y.size()

torch.Size([1, 1, 18])

In [None]:
# get_random() returns five values (the last is the shifted target tensor);
# unpacking into four names raises ValueError.
cat, catx, name, tensor, shift = get_random()

In [None]:
# Shape of the category index tensor.
catx.shape

torch.Size([1])

In [None]:
# train_step takes (cat, x, y): category index, input letters, target letters.
train_step(catx, tensor, shift)

RuntimeError: ignored

In [None]:
# Scratch check: cross-entropy of y (flattened to one row) against class 0.
target = torch.tensor(0).expand(1)  # scalar 0 expanded to a 1-element tensor
target.size()
loss = nn.CrossEntropyLoss()
# NOTE(review): depends on a `y` left over from an earlier scratch cell — confirm.
loss(y.view(1, -1), target)

tensor(2.9066, grad_fn=<NllLossBackward>)