# character rnn workshop

Designed to give some hands-on exposure to a character RNN.

In [None]:
# filesystem and data downloading libraries
import os # for filesystem operations
import requests # for interacting with webpages to get data
from zipfile import ZipFile # unzip in python to keep it all in one place\
import glob # easy file matching

# numerical libraries
import random
import numpy as np # linear algebra library

# text processing libraries
import string
import unicodedata

import time

# deep learning library
import torch # mostly for tensors
import torch.nn as nn # the Neural Networks module from torch. nn by convention

# plotting library
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


## Getting the data

In [None]:
# premade
url = 'https://download.pytorch.org/tutorial/data.zip'
zip_path = 'data/name_data.zip'  # local copy of the downloaded archive

# data download and file management
os.makedirs('data', exist_ok=True)  # no error if the folder is already there
if not os.path.isfile(zip_path):
    r = requests.get(url, timeout=60)  # download the data (do not hang forever)
    # fail loudly on an HTTP error instead of saving the error page as a "zip"
    r.raise_for_status()
    with open(zip_path, 'wb') as f:
        f.write(r.content)  # save the data to a file

# extract the data
with ZipFile(zip_path, 'r') as data_zip:
    # No target path is passed, so the members are unpacked relative to the
    # current working directory.
    # NOTE(review): presumably the archive carries its own top-level data/
    # folder, so the files end up next to the zip - confirm.
    data_zip.extractall()

# Look at the folders and the file contents

In [None]:
# print out the data directory contents
# print out some of the file contents


In [None]:
# set all language files as a list
# TODO(workshop): placeholder - replace the empty list with the paths of the
# language files; the last line only displays the value in the notebook
fnames = []
fnames

## Loading the data

### Helper functions

In [None]:
# string cleaner helper functions
def unicode_to_ascii(s):
    """Strip accents from `s` and keep only characters listed in all_letters.

    The string is NFD-normalised first, which splits accented characters into
    a base character plus combining marks; the marks (category 'Mn') and any
    character not contained in the global all_letters are then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [
        ch
        for ch in decomposed
        if unicodedata.category(ch) != 'Mn' and ch in all_letters
    ]
    return ''.join(kept)

def clean_names(name):
    """Trim surrounding whitespace from `name`, then pass it through unicode_to_ascii."""
    trimmed = name.strip()
    return unicode_to_ascii(trimmed)

In [None]:
# set global variables of the total character set
# TODO(workshop): placeholder - fill in the allowed characters. While this is
# the empty string, n_letters is 0 and unicode_to_ascii() filters out every
# character (it keeps only characters found in all_letters).
all_letters = ""
n_letters = len(all_letters)
n_letters, all_letters

In [None]:
# load the language files into a list of languages and a {language:names} dict
# TODO(workshop): placeholder - both containers start empty and nothing in this
# cell fills them yet, so n_languages is 0 as written
all_languages = [] # list of language names
language_names = {} # dict of key = language, value = list of all names in that language

n_languages = len(all_languages)
print(n_languages, all_languages)

## Data as tensors (vectors)

In [None]:
# livecode helper functions
def character_to_tensor(c):
    """Convert the character `c` into a tensor (body to be live-coded).

    NOTE(review): workshop placeholder - `t` is never assigned in this
    function, so calling it as written raises NameError unless a global `t`
    exists. The intended tensor layout is not visible here.
    """
    return t

def word_to_tensor(word):
    """Convert the string `word` into a tensor (body to be live-coded).

    NOTE(review): workshop placeholder - `w` is never assigned in this
    function, so calling it as written raises NameError unless a global `w`
    exists. The intended tensor layout is not visible here.
    """
    return w

In [None]:
# try out some letter/word to tensor examples


## Sampling the data

In [None]:
# 
def random_choice(l):
    """Pick one element of the sequence `l` uniformly at random.

    The position is drawn with random.randint over the valid index range, so
    an empty sequence raises ValueError.
    """
    last_index = len(l) - 1
    position = random.randint(0, last_index)
    return l[position]
    
def random_language_name(language=None):
    """Return all the information for a random language-name pair.

    args:
        language = language to draw the name from; if None, select a random
            language from all_languages

    returns:
        (language, name, language_tensor, name_tensor), where language_tensor
        is a torch tensor holding the position of `language` in
        all_languages and name_tensor is word_to_tensor(name)

    raises:
        KeyError if `language` is not a key of language_names
    """
    # identity check: None is a singleton, so `is` is the correct comparison
    if language is None:
        language = random_choice(all_languages)

    name = random_choice(language_names[language])

    # the target class is the language's position in the global language list
    language_index = all_languages.index(language)
    language_tensor = torch.tensor(language_index)

    name_tensor = word_to_tensor(name)

    return language, name, language_tensor, name_tensor

In [10]:
# explore some random names


## The network!

This is the specific RNN structure that we want to be building:

![network diagram](network_structure.png)

In [None]:
#livecode
class RNN(nn.Module):
    """This dictates the structure and size of the network.

    NOTE(review): workshop skeleton - no layers are created yet, and
    `forward` returns names that are not assigned anywhere in the class, so
    calling it as written raises NameError.
    """
    def __init__(self, data_size, hidden_size, output_size):
        """Sets up class attributes (mostly data dimensions) and network layers.

        args:
            data_size = presumably the size of one input character vector
                (the notebook passes n_letters) - confirm when implementing
            hidden_size = presumably the size of the hidden state
                (the notebook passes n_hidden) - confirm when implementing
            output_size = presumably the number of output classes
                (the notebook passes n_languages) - confirm when implementing
        """
        super(RNN, self).__init__()
        # TODO(workshop): store the sizes and define the layers here

    
    def forward(self, x, last_hidden):
        """Describes how data moves through the RNN layers.

        Returns a pair (output_probabilities, hidden).
        """
        # TODO(workshop): compute output_probabilities and hidden from
        # x and last_hidden; neither name is assigned yet
        return output_probabilities, hidden
        

In [None]:
# initialise an RNN
n_hidden = 128  # passed as the hidden_size argument below
rnn = RNN(n_letters, n_hidden, n_languages)  # (data_size, hidden_size, output_size)

In [None]:
# pass a character through it


In [None]:
#look at the outputs


### making sense of model output

In [None]:
def language_from_output(output):
    """Pick the highest-scoring language from a model output tensor.

    Returns (language, index, probability): the name looked up in
    all_languages, its position in that list, and the top score itself.
    NOTE(review): the score is whatever scale `output` is on - if the network
    ends in a log-softmax this is a log-probability; confirm.
    """
    best_values, best_indices = output.topk(1)
    index = best_indices[0].item()
    probability = best_values[0].item()
    return all_languages[index], index, probability

In [12]:
# explore some output

## Training

### Predict and update model

In [None]:
criterion = nn.NLLLoss() # negative log likelihood loss; expects log-probabilities as its input
lr = 0.005 # totally arbitrary starter learning rate

def predict(rnn, name_t):
    """pass each character in an input name tensor through the network

    NOTE(review): workshop placeholder - `output` is never assigned in this
    function, so calling it as written raises NameError unless a global
    `output` exists.
    """
    
    return output

def train_step(lang_t, name_t):
    """predict language, compare with target, and update model parameters

    Returns a pair (output, loss.item()).
    NOTE(review): workshop placeholder - neither `output` nor `loss` is
    assigned in this function, so calling it as written raises NameError
    unless globals of those names exist.
    """
    
    return output, loss.item()

In [None]:
#try out a train step on a random name


### Training loop

In [None]:
def time_since(start):
    """Format the time elapsed since `start` as 'MM:SS.ss(m:s)'.

    args:
        start = a timestamp previously taken with time.time()

    returns:
        string with zero-padded minutes and seconds (two decimals)
    """
    now = time.time()
    dt = now - start
    # Round to whole hundredths of a second *before* splitting off the
    # minutes. Rounding only inside the format spec let e.g. 59.998 s print
    # as '00:60.00' instead of '01:00.00'.
    total_cs = round(dt * 100)
    mins = int(total_cs / 6000)  # truncates toward zero, as int(dt/60) did
    secs = (total_cs - mins * 6000) / 100
    return f'{mins:>02}:{secs:>05.2f}(m:s)'

In [None]:
def train(rnn, n_iters):
    """loop the train_step and view progress

    Returns a pair (losses, average_losses).
    NOTE(review): workshop placeholder - neither returned name is assigned
    in this function, so calling it as written raises NameError unless
    globals of those names exist.
    """
    
    return losses, average_losses

#### Training run and performance

In [11]:
# run some training, and look at the losses/outputs

## Measure of success: Confusion

In [None]:
# live code 
def language_confusion(rnn, language, n_samples = 100):
    """evaluate how often the model predicts names from a language correctly

    NOTE(review): workshop placeholder - `norm_predictions` is never assigned
    in this function, so calling it as written raises NameError unless a
    global of that name exists. The notebook later calls `.values()` on the
    result, so it is presumably a dict - confirm when implementing.
    """
    
    return norm_predictions

In [None]:
# confusion for one language, shown together with the total of its values
preds = language_confusion(rnn, "Polish")
preds, sum(preds.values())

### Visualise the confusion

In [None]:
def evaluate_total_confusion(rnn,samples_per_language=100):
    """Calculates confusion array for all languages

    Returns the entries of confusion_list joined along axis 0.
    NOTE(review): workshop placeholder - nothing is appended to
    confusion_list yet, and np.concatenate raises ValueError when given an
    empty list.
    """
    confusion_list = []
    
    return np.concatenate(confusion_list, axis = 0)

In [None]:
def plot_confusion(confusion):
    """Display an n x n confusion numpy array as a colour-coded matrix.

    args:
        confusion = 2D array to draw; the y axis is labelled as the target
            language and the x axis as the predicted language, with tick
            labels taken in order from the global all_languages list
            (assumes one row and one column per language)
    """
    # Set up plot
    fig = plt.figure(facecolor='white', figsize=(5, 5))
    ax = fig.add_subplot(111)

    # add the color scale
    cax = ax.matshow(confusion)
    cbar = fig.colorbar(cax)

    # Set up axes: pin one tick to every row/column index first, then label
    # those ticks. Labelling without fixed tick positions (and padding the
    # label list with '') depends on where the automatic locator happens to
    # place ticks, and newer matplotlib versions warn about it.
    tick_positions = list(range(len(all_languages)))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(all_languages, rotation=90)
    ax.set_yticklabels(all_languages)

    # labeling (axis label typos "languag" fixed)
    ax.set_xlabel("predicted language")
    ax.set_ylabel("target language")
    cbar.ax.set_ylabel('predicted fraction')
    plt.show()

In [None]:
# evaluate_total_confusion takes the model as its first (required) argument;
# calling it with only samples_per_language raises TypeError
confusion = evaluate_total_confusion(rnn, samples_per_language=100)
plot_confusion(confusion)