## PyTorch RNN Tutorial - Name Classification Using A Recurrent Neural Net

https://www.youtube.com/watch?v=WEV61GmmPrk

<img src="images/rnn-first.png">

<img src="images/rnn-unfolded.png">

<img src="images/rnn-applications.png">

<img src="images/rnn-pros-cons.png">

<img src="images/rnn-one-hot.png">

<img src="images/rnn-name-classification.png">

Further readings:  

https://karpathy.github.io/2015/05/21/rnn-effectiveness/

https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-recurrent-neural-networks#architecture

https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html


**Note: the pictures and content are taken from https://github.com/python-engineer/pytorch-examples; I reproduced them here while learning the PyTorch APIs myself.**


In our examples we will do name classification - data is from names of people from 18 different countries.

Data taken from [here](https://download.pytorch.org/tutorial/data.zip)

and stored here  /Users/jvsingh/work/github/python_codes/learn/data/rnn-data 


We will treat each whole name as a sequence of characters and feed it to the RNN as a single input.

In [36]:
import io
import os
import unicodedata
import string
import glob

import torch
import random

# Alphabet used for one-hot encoding: the lowercase and uppercase ASCII
# letters followed by five extra characters (space . , ; ').
ALL_LETTERS = string.ascii_letters + " .,;'"
# Number of symbols in the alphabet, i.e. the width of a one-hot letter vector.
N_LETTERS = len(ALL_LETTERS)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Reduce a Unicode string to the characters present in ``ALL_LETTERS``.

    The string is NFD-normalized so accented characters split into a base
    character plus combining marks. Combining marks (category ``Mn``) are
    dropped, and so is any remaining character that is not in ``ALL_LETTERS``.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        # Skip the combining accents produced by the NFD decomposition.
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch in ALL_LETTERS:
            kept.append(ch)
    return ''.join(kept)

def load_data():
    """Load the name lists, one text file per category.

    Every ``*.txt`` file under ``data/rnn-data/data/names/`` is read; the file
    name without its extension becomes the category, and each line of the file
    is converted with ``unicode_to_ascii``.

    Returns:
        A ``(category_lines, all_categories)`` tuple where ``category_lines``
        maps a category to its list of names and ``all_categories`` lists the
        categories in the order they were loaded.
    """
    category_lines = {}
    all_categories = []

    def find_files(path):
        # glob returns matches in arbitrary order; sort so that category
        # indices are the same on every machine and every run.
        return sorted(glob.glob(path))

    # Read a file and split into lines
    def read_lines(filename):
        # Use a context manager so the file handle is closed promptly.
        with io.open(filename, encoding='utf-8') as handle:
            lines = handle.read().strip().split('\n')
        return [unicode_to_ascii(line) for line in lines]

    for filename in find_files('data/rnn-data/data/names/*.txt'):
        category = os.path.splitext(os.path.basename(filename))[0]
        all_categories.append(category)
        category_lines[category] = read_lines(filename)

    return category_lines, all_categories



"""
To represent a single letter, we use a “one-hot vector” of 
size <1 x n_letters>. A one-hot vector is filled with 0s
except for a 1 at index of the current letter, e.g. "b" = <0 1 0 0 0 ...>.
To make a word we join a bunch of those into a
3D tensor <line_length x 1 x n_letters>.
That extra 1 dimension is because PyTorch assumes
everything is in batches - we’re just using a batch size of 1 here.
"""

# Find letter index from all_letters, e.g. "a" = 0
def letter_to_index(letter):
    """Return the position of ``letter`` within ``ALL_LETTERS``.

    The lookup uses ``str.find``, so a character that is not in the alphabet
    yields -1 instead of raising.
    """
    position = ALL_LETTERS.find(letter)
    return position

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letter_to_tensor(letter):
    """Return a ``<1 x N_LETTERS>`` one-hot tensor for a single letter.

    The column given by ``letter_to_index(letter)`` is set to 1 and every
    other entry stays 0.
    """
    one_hot = torch.zeros(1, N_LETTERS)
    column = letter_to_index(letter)
    one_hot[0, column] = 1
    return one_hot

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def line_to_tensor(line):
    """Encode ``line`` as a ``<len(line) x 1 x N_LETTERS>`` one-hot tensor.

    Row ``i`` holds the one-hot vector of the i-th character; the middle
    dimension of size 1 is the batch dimension.
    """
    encoded = torch.zeros(len(line), 1, N_LETTERS)
    for position, char in enumerate(line):
        column = letter_to_index(char)
        encoded[position, 0, column] = 1
    return encoded

# Pick a random category and a random name from it; return both with their tensors
def random_training_example(category_lines, all_categories):
    """Draw one random (category, name) training pair.

    A category is picked uniformly from ``all_categories``, then a name is
    picked uniformly from that category's list in ``category_lines``.

    Returns:
        ``(category, line, category_tensor, line_tensor)`` where
        ``category_tensor`` is a one-element ``torch.long`` tensor holding the
        category's index in ``all_categories`` and ``line_tensor`` is the
        result of ``line_to_tensor(line)``.
    """
    def pick(seq):
        # randint is inclusive on both ends, hence the "- 1".
        return seq[random.randint(0, len(seq) - 1)]

    category = pick(all_categories)
    line = pick(category_lines[category])

    label = all_categories.index(category)
    category_tensor = torch.tensor([label], dtype=torch.long)
    line_tensor = line_to_tensor(line)
    return category, line, category_tensor, line_tensor

In [37]:
print(ALL_LETTERS)
# Accents are stripped, leaving only characters from ALL_LETTERS
print(unicode_to_ascii('Ślusàrski'))

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'
Slusarski


In [38]:
# We use one-hot encoding 
print(letter_to_tensor('J'))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])


In [39]:
# each letter is 57-dim vector (1 x 57) and 5 letter word => 5 x 1 x 57
print(line_to_tensor('Jones').size()) 

torch.Size([5, 1, 57])


In [43]:
# Loads data from /Users/jvsingh/work/github/python_codes/ml-python/pytorch_tut
category_lines, all_categories = load_data()
category_lines['Italian'][:5]

['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']

In [44]:
#all_categories
#category_lines

## WRITE RNN HERE

<img src="images/rnn-name-classification.png">

In [None]:
import torch
import torch.nn as nn 
import matplotlib.pyplot as plt

# No import needed: these helpers are defined earlier in this notebook
# from utils import ALL_LETTERS, N_LETTERS
# from utils import load_data, letter_to_tensor, line_to_tensor, random_training_example


class RNN(nn.Module):
    """Single-step recurrent cell for sequence classification.

    PyTorch already provides ``nn.RNN``; this version is written from scratch
    to make the mechanics explicit. At each step the current input and the
    previous hidden state are concatenated, and that combined vector feeds two
    independent linear layers: one yields the next hidden state, the other
    the class scores.

    Args:
        input_size: Number of features in one input step.
        hidden_size: Number of features in the hidden state.
        output_size: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Both layers read the concatenation [input, hidden], which is why
        # their in_features is input_size + hidden_size.
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        # Log-softmax over dim 1 (the class dimension of a 2-D output).
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_tensor, hidden_tensor):
        """Run one time step.

        Args:
            input_tensor: Tensor of shape ``(batch, input_size)``.
            hidden_tensor: Tensor of shape ``(batch, hidden_size)``.

        Returns:
            ``(output, hidden)``: log-probabilities of shape
            ``(batch, output_size)`` and the next hidden state of shape
            ``(batch, hidden_size)``.
        """
        # Join input and previous hidden state along the feature dimension.
        combined = torch.cat((input_tensor, hidden_tensor), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden
        
        
        
    
