<a href="https://colab.research.google.com/github/jdasam/mas1004-2022/blob/main/notebooks/Data_AI_Week13_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RNN Tutorial
### With Names from Different Countries

In [1]:
import torch
import torch.nn as nn
from pathlib import Path
import matplotlib.pyplot as plt

import random
from tqdm.auto import tqdm

DEV= 'cuda'

## 0. Get Dataset

In [2]:
!wget https://download.pytorch.org/tutorial/data.zip
!unzip data.zip

--2022-11-24 06:38:47--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 13.227.219.81, 13.227.219.43, 13.227.219.79, ...
Connecting to download.pytorch.org (download.pytorch.org)|13.227.219.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip’


2022-11-24 06:38:47 (101 MB/s) - ‘data.zip’ saved [2882130/2882130]

Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean

In [3]:
data_dir = Path('data/names')
txt_fns = data_dir.glob("*.txt")

names_in_dict = {}
for txt_fn in txt_fns:
  with open(txt_fn) as f:
    name_of_countries = f.readlines()
    print(f"Category: {txt_fn.stem}")
    names_in_dict[txt_fn.stem] = name_of_countries

Category: Chinese
Category: Irish
Category: Scottish
Category: Greek
Category: Portuguese
Category: French
Category: Korean
Category: Russian
Category: Czech
Category: Arabic
Category: Dutch
Category: German
Category: Vietnamese
Category: English
Category: Polish
Category: Japanese
Category: Spanish
Category: Italian


In [4]:
for key in names_in_dict:
  print(f"{key}: {names_in_dict[key][0][:-2]}, {len(names_in_dict[key])}")

Chinese: An, 268
Irish: Ada, 232
Scottish: Smit, 100
Greek: Adamidi, 203
Portuguese: Abre, 74
French: Abe, 277
Korean: Ah, 94
Russian: Ababk, 9408
Czech: Ab, 519
Arabic: Khour, 2000
Dutch: Aalsbur, 297
German: Abbin, 724
Vietnamese: Nguye, 73
English: Abba, 3668
Polish: Adamcza, 139
Japanese: Ab, 991
Spanish: Aban, 298
Italian: Abandonat, 709


## 1. Define Dataset

In [10]:
def normalize_name(name):
  return name.replace('\n', '').replace(u'\xa0', u' ').lower()


In [41]:
class NameSet:
  def __init__(self, data_dir='data/names'):
    self.dir = Path(data_dir)
    txt_fns = list(self.dir.glob('*.txt'))

    total_names = [] 
    for txt_fn in txt_fns:
      with open(txt_fn, 'r') as f:
        names = f.readlines()
      origin = txt_fn.stem.lower()
      names = [(normalize_name(name), origin) for name in names]
      # print(names)
      total_names += names
    # print(total_names)
    self.data = total_names
    self.unique_characters = self.get_unique_characters_of_dataset()
    self.char2idx = {character: idx for idx, character in enumerate(self.unique_characters)}
    self.origin_vocab = self.get_unique_origin_of_dataset()
    self.origin2idx = {origin:idx for idx, origin in enumerate(self.origin_vocab)}

  def get_unique_characters_of_dataset(self):
    # first, get name of every datasample
    names = [name_origin[0] for name_origin in self.data]
    characters = [character for name in names for character in name]
    return sorted(list(set(characters)))
    # characters = []
    # for name in names:
    #   # name: "kim"
    #   for character in name:
    #     # character: "k", "i", "m"
    #     characters.append(character)

  def get_unique_origin_of_dataset(self):
    origins = [name_origin[1] for name_origin in self.data]
    return sorted(list(set(origins)))

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    name, origin = self.data[idx]
    name_in_indices = [self.char2idx[char]  for char in name]
    origin_in_indices = self.origin2idx[origin]
    return torch.tensor(name_in_indices), origin_in_indices

dataset = NameSet()
print(dataset.unique_characters)
print(len(dataset.unique_characters))

dataset[0]
# for idx, char in enumerate(dataset.unique_characters):
#   print(idx, char)

[' ', "'", ',', '-', '/', '1', ':', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ß', 'à', 'á', 'ã', 'ä', 'ç', 'è', 'é', 'ê', 'ì', 'í', 'ñ', 'ò', 'ó', 'õ', 'ö', 'ù', 'ú', 'ü', 'ą', 'ł', 'ń', 'ś', 'ż']
57


(tensor([ 7, 20, 13]), 1)

In [38]:
dataset.origin2idx['korean']

11

### 1-1 Add zero-padding 
- Each name in the dataset has a different length
- Therefore, you have to add zero padding so that each data sample in a batch has same length
  - If you use zero-padding, it is better to not use index 0 for input category
  - For example, if you represent `"a"` as a categorical index 0 and then use zero padding, you cannot figure out whether `[0, 0, 0, 0]` is just padded zeros or `[a, a, a, a]`

In [46]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)

batch = next(iter(dataloader))
name, origin = batch

In [47]:
emb_dim = 16
char_emb = nn.Embedding(len(dataset.unique_characters), emb_dim)

print(name)
char_emb(name)

tensor([[14, 27, 10, 15, 11, 28]])


tensor([[[ 0.2511, -0.1842,  1.1461, -1.0116, -0.7139, -0.8785, -1.0704,
           0.7881, -0.3323,  1.1519,  0.0246, -0.9147, -0.1136, -0.6151,
          -0.4166, -0.0124],
         [-1.1343, -1.5755, -0.0450, -0.3907,  0.1222,  0.9830,  0.2772,
          -2.1569, -0.1153, -0.4699, -0.9344, -1.3191,  0.0541,  0.5153,
           0.3268, -0.4650],
         [-1.6256,  0.4459,  1.6011, -0.8895, -0.0613,  0.3166, -0.0920,
          -0.8784,  1.1307, -1.0302,  1.6094,  0.8842, -0.9604,  1.3050,
           0.9393,  1.1049],
         [-2.1076, -1.2149,  0.0164, -2.3491, -1.3630, -0.2470, -1.7833,
           0.1954, -0.4038, -1.5208, -0.7799,  0.4148, -0.4923, -0.6173,
          -2.4094, -0.0368],
         [ 0.9011, -0.1217, -0.6886, -0.2963, -1.9161, -0.3839, -0.1186,
          -0.9541, -0.5131, -0.1391,  1.6145,  0.4353, -0.7381, -0.5653,
          -0.6574,  0.8613],
         [-0.4882, -0.4340,  0.3054,  0.6388, -0.9687, -0.1892, -1.9232,
           0.6675,  0.4900, -1.8803, -0.1799, -0.900

In [48]:
rnn_layer = nn.RNN(16, 4)
rnn_layer(char_emb(name))

(tensor([[[-0.5916,  0.2416, -0.8721, -0.6358],
          [-0.2858,  0.8351,  0.4141,  0.8669],
          [ 0.5969,  0.9392,  0.9313, -0.9414],
          [ 0.7534, -0.2403, -0.4808, -0.7936],
          [ 0.0925, -0.1601, -0.0582, -0.7800],
          [ 0.3210,  0.9719,  0.6172,  0.0500]]], grad_fn=<StackBackward0>),
 tensor([[[-0.5916,  0.2416, -0.8721, -0.6358],
          [-0.2858,  0.8351,  0.4141,  0.8669],
          [ 0.5969,  0.9392,  0.9313, -0.9414],
          [ 0.7534, -0.2403, -0.4808, -0.7936],
          [ 0.0925, -0.1601, -0.0582, -0.7800],
          [ 0.3210,  0.9719,  0.6172,  0.0500]]], grad_fn=<StackBackward0>))

## 2. Make RNN Model
### 2-1. See how RNN works
- Since our input is a categorical index, we will use nn.Embedding
- ![Diagram](https://datascience-enthusiast.com/figures/rnn_step_forward.png)

In [22]:
torch.manual_seed(0)
test_input = torch.randn([11, 4])
test_input[7] = torch.zeros(4) # let's just change the 7th input

input_to_hidden_layer = nn.Linear(4, out_features=5)
hidden_to_hidden_layer = nn.Linear(5, 5)

initial_hidden_state = torch.zeros(5)
initial_hidden_state
prev_hidden_state = initial_hidden_state

time_step = 0

hidden_outputs = []

for time_step in range(len(test_input)):
  current_input = test_input[time_step]
  current_input_to_hidden = input_to_hidden_layer(current_input)
  print(current_input_to_hidden)
  current_hidden_to_hidden = hidden_to_hidden_layer(prev_hidden_state)
  print(current_hidden_to_hidden)
  current_hidden_state = current_input_to_hidden + current_hidden_to_hidden
  current_hidden_state = torch.tanh(current_hidden_state)
  print(current_hidden_state)
  prev_hidden_state = current_hidden_state
  hidden_outputs.append(current_hidden_state)
hidden_outputs = torch.stack(hidden_outputs, dim=0)

tensor([-0.9437,  0.1529, -0.3462, -1.4978,  0.6068], grad_fn=<AddBackward0>)
tensor([-0.1831,  0.2654, -0.2721,  0.4058,  0.3065], grad_fn=<AddBackward0>)
tensor([-1.1269,  0.4183, -0.6184, -1.0920,  0.9133], grad_fn=<AddBackward0>)
tensor([-0.3607,  0.3824,  1.4850, -1.1160,  0.5243], grad_fn=<AddBackward0>)
tensor([-0.4322, -0.6096,  0.2361, -0.0878,  0.4762], grad_fn=<AddBackward0>)
tensor([-0.7929, -0.2272,  1.7211, -1.2038,  1.0005], grad_fn=<AddBackward0>)
tensor([-0.7042,  0.6548, -0.3177, -0.6706,  0.5947], grad_fn=<AddBackward0>)
tensor([-0.9311, -0.0091,  0.2306, -0.6226, -0.1075], grad_fn=<AddBackward0>)
tensor([-1.6353,  0.6457, -0.0871, -1.2932,  0.4871], grad_fn=<AddBackward0>)
tensor([-0.5581,  0.3917,  0.6266,  0.4143, -0.3039], grad_fn=<AddBackward0>)
tensor([-0.4411, -0.3164,  0.6640, -0.1326,  0.1968], grad_fn=<AddBackward0>)
tensor([-0.9992,  0.0753,  1.2905,  0.2817, -0.1071], grad_fn=<AddBackward0>)
tensor([-1.1610,  0.2827, -1.2173, -0.8705,  0.4742], grad_fn=<A

In [21]:
# hidden_outputs = torch.stack(hidden_outputs, dim=0)
print(hidden_outputs.shape)
print(hidden_outputs)

torch.Size([11, 5])
tensor([[-1.1269,  0.4183, -0.6184, -1.0920,  0.9133],
        [-0.7929, -0.2272,  1.7211, -1.2038,  1.0005],
        [-1.6353,  0.6457, -0.0871, -1.2932,  0.4871],
        [-0.9992,  0.0753,  1.2905,  0.2817, -0.1071],
        [-1.4618,  0.9090, -1.1327, -1.1002,  0.3041],
        [-0.4400,  0.4650,  0.1657, -0.0765,  0.6844],
        [-0.7475,  0.7408,  0.1903, -0.5428,  0.5015],
        [-1.2924,  0.5724, -0.2626, -0.0653,  0.3060],
        [-0.8949,  0.1977,  0.3012,  0.2309, -0.1231],
        [-0.0384,  0.8418,  0.3796, -0.6060, -0.3019],
        [-0.6968,  1.3023, -0.3603,  2.6543, -1.3842]],
       grad_fn=<StackBackward0>)


In [23]:
print(hidden_outputs.shape)
print(hidden_outputs)

torch.Size([11, 5])
tensor([[-1.1269,  0.4183, -0.6184, -1.0920,  0.9133],
        [-0.7929, -0.2272,  1.7211, -1.2038,  1.0005],
        [-1.6353,  0.6457, -0.0871, -1.2932,  0.4871],
        [-0.9992,  0.0753,  1.2905,  0.2817, -0.1071],
        [-1.4618,  0.9090, -1.1327, -1.1002,  0.3041],
        [-0.4400,  0.4650,  0.1657, -0.0765,  0.6844],
        [-0.7475,  0.7408,  0.1903, -0.5428,  0.5015],
        [-0.9125,  0.5028,  0.5683, -0.3739,  0.1380],
        [-1.0971,  0.5330,  0.4041,  0.3141, -0.3931],
        [ 0.0169,  1.0509,  0.5633, -0.5044, -0.4689],
        [-0.6922,  1.5083, -0.3049,  2.7645, -1.5321]],
       grad_fn=<StackBackward0>)


## 2.2 Make Name Classification Model
- Input: Sequence of characters, in categorical indices
  - Length of the input sequence is arbitrary 
- Output: Probability of the corresponding nationality for a given name in a sequence of characters
  - Regardless of input length, the output is a single vector
  - Softmax output of each class

## 2.3 Complete Trainer

In [None]:
class Trainer:
  def __init__(self, model, train_loader, valid_loader, model_name='resnet'):
    self.model = model
    self.train_loader = train_loader
    self.valid_loader = valid_loader
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.model.to(self.device)
    self.criterion = nn.NLLLoss()
    self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
    self.best_loss = np.inf
    self.best_acc = 0.0
    self.train_losses = []
    self.valid_losses = []
    self.train_accs = []
    self.valid_accs = []
    self.model_name = model_name

  def validation(self):
    self.model.eval() # change the model from train mode to evaluation mode
    # Some models work in different ways based on whtehter it is on training step
    # or on inference step

    # In validation step, you don't have to calculate the gradient
    # with torch.no_grad():

    current_loss = 0
    num_total_correct_pred = 0
    with torch.inference_mode(): # every torch computation under this indent
    # will be run without calculating the gradient or computation history
      for batch in self.valid_loader:
        images, labels = batch
        images, labels = images.to(self.device), labels.to(self.device)
        outputs = self.model(images)
        probs = torch.softmax(outputs, dim=-1)
        log_probs = torch.log(probs)

        loss = self.criterion(log_probs, labels)
        predicted_classes = torch.argmax(outputs, dim=-1)
        num_acc_pred = (predicted_classes == labels.to(self.device)).sum()
        #num_acc_pred is on self.device
        num_total_correct_pred += num_acc_pred.item()
        # in validation stage, we don't care about single batch's loss
        # we want to see the result for total images of validation set

        current_loss += loss.item() * len(labels)
        # instead of adding the mean loss, we add sum of loss
        # because the batch size can be different
    mean_loss = current_loss / len(self.valid_loader.dataset)
    mean_acc = num_total_correct_pred / len(self.valid_loader.dataset) # number of total datasample in the validation loader
    return mean_loss, mean_acc
    # return {'loss': mean_loss, 'acc': mean_acc}



  def train_by_number_of_epochs(self, num_epochs):
    for epoch in tqdm(range(num_epochs)):
      self.model.train()
      for batch in tqdm(self.train_loader, leave=False):
        images, labels = batch
        images, labels = images.to(self.device), labels.to(self.device)
        self.optimizer.zero_grad()
        outputs = self.model(images) # this is logits
        probs = torch.softmax(outputs, dim=-1)
        log_probs = torch.log(probs)
        loss = self.criterion(log_probs, labels) # you have to feed log_probs

        acc = (torch.argmax(outputs, dim=-1) == labels.to(self.device)).sum() / len(labels)
        # for torch.nn.NLLLoss
        loss.backward()
        self.optimizer.step()

        self.train_losses.append(loss.item())
        self.train_accs.append(acc.item())
        # don't try self.train_losses.append(loss)
      # training step has ended
      # we want to test our model on the validation set
      valid_loss, valid_acc = self.validation()

      # is this model the best? 
      # let's decide it based on valid_acc
      if valid_acc > self.best_acc:
        self.best_acc = valid_acc

        # If it is the best model, save the model's weight'
        models_parameters = self.model.state_dict()
        print(f"Saving best model at epoch {len(self.valid_accs)}, acc: {valid_acc}")
        torch.save(models_parameters, f'{self.model_name}_best.pt')

      self.valid_losses.append(valid_loss)
      self.valid_accs.append(valid_acc)

    # Plot Accuracy curve
    plt.plot(self.train_accs)
    plt.plot(range(len(self.train_loader)-1, len(self.train_accs), len(self.train_loader)) ,self.valid_accs)
    plt.title("Accuracy")