## 1. Systematic processing of data (15 points)

#### 1.1.Effect of hidden state length: run the script for hidden state sizes of 32, 64 and 128 by modifying the value of variable n_hidden. Report the accuracy numbers yielded by the different algorithm variants. (5 points).

Hidden size = 32 -> accuracy = 0.933197

Hidden size = 64 -> accuracy = 0.940072

Hidden size = 128 -> accuracy = 0.943559

Average accuracy  = 0.93894267

In [None]:
import torch.nn as nn
import random
import time
import math

In [2]:
# runned the same code 3 times by only changing the hidden states sizes to 32, 64 and 128

from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

def findFiles(path): return glob.glob(path)

import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

# Build the names dictionary, a list of names per language
# dictionary keys are languages, values are names

names = {}
languages = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    languages.append(category)
    lines = readLines(filename)
    names[category] = lines

n_categories = len(languages)

def findName(dict, name):
    keys = dict.keys()
    for key in keys:
        if name in dict[key]:
            return key
    return ''

In [3]:
# function to return key for any value 
def get_key(val): 
    for key, value in names.items(): 
         if val in value: 
            return key 

# X list - names
x_list = []
for key in names.keys():
    x_list.extend(names[key])

    
# Y list - categories
y_list = []
for name in x_list:
    y_list.append(get_key(name))

In [4]:
# Turning Names into Tensors
# --------------------------
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def nameToTensor(name):
    tensor = torch.zeros(len(name), 1, n_letters)
    for li, letter in enumerate(name):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [5]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)

In [6]:
# %% Training
# ========
# Preparing for Training
# ----------------------
def categoryFromOutput(output):
    # compute max
    top_n, top_i = output.topk(1)
    # output index of max
    category_i = top_i.item()
    return languages[category_i], category_i

# We will also want a quick way to get a training example (a name and its
# language):
    
def randomTrainingExample(i):
    category = y_list[i]
    name = x_list[i]
    category_tensor = torch.tensor([languages.index(category)], dtype=torch.long)
    name_tensor = nameToTensor(name)
    return category, name, category_tensor, name_tensor

In [7]:
# %% Training the Network
# --------------------
criterion = nn.NLLLoss()

learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn

def train(category_tensor, name_tensor):
    # initialize hidden state - do this every time before passing an input sequence
    hidden = rnn.initHidden()
    # reset grad counters - do this every time after backprop
    rnn.zero_grad()
    # manually go through each element in input sequence
    for i in range(name_tensor.size()[0]):
        output, hidden = rnn(name_tensor[i], hidden)
    # backpropagate based on loss at last element only
    loss = criterion(output, category_tensor)
    loss.backward()

    # Update network parameters
    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return output, loss.item()

In [8]:
n_iters = len(x_list)
print_every = 5
plot_every = 1000

# Keep track of loss for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

n_correct = 0

for iter in range(0, n_iters):
    category, name, category_tensor, name_tensor = randomTrainingExample(iter)
    output, loss = train(category_tensor, name_tensor)
    current_loss += loss
    guess, guess_i = categoryFromOutput(output)
    if guess == category:
        n_correct += 1
            
accuracy = n_correct / len(x_list)
print('Accuracy is %f' % accuracy)

Accuracy is 0.944605


"\nfor i in range(n_confusion):\n    category, name, category_tensor, name_tensor = randomTrainingExample(i)\n    print(i)\n    output = evaluate(name_tensor)\n    guess, guess_i = categoryFromOutput(output)\n    category_i = languages.index(category)\n    confusion[category_i][guess_i] += 1\n    \naccuracy = sum(confusion.diag())/sum(sum(confusion))\nprint('Accuracy is %f' % accuracy.item())\n\n#n_correct = 0\n#for iter in range(1, n_iters + 1):\n#        category, line, category_tensor, line_tensor = randomTrainingExample()\n#        print(iter)\n#        output, loss = train(category_tensor, line_tensor)\n#        current_loss += loss\n#        guess, guess_i = categoryFromOutput(output)\n#        category_i = languages.index(category)\n#        # compare category_i, guess_i\n#        if category_i == guess_i:\n#            n_correct += 1\n\n#accuracy = n_correct / len(x_list)\n#print('Accuracy is %f' % accuracy)\n"

#### 1.2.Effect of systematic training: the script trains the network by going through 100 thousand data samples (see variable n_iters) one by one in a random manner. The network parameters are updated by backpropagating losses computed on a per-sample basis. Modify the script so that, instead of picking each training sample randomly, it goes through every available sample exactly once per training epoch. Randomize the order of the samples within each epoch. Train the network for five epochs, and report accuracy results as you change the hidden state size as in problem 1.1. Since the dataset comprises around 20 thousand datapoints, the total number of data passes for this modified training process are similar to those required by problem 1.1. Report the accuracy numbers yielded by the different algorithm variants. (10 points).

In [1]:
import torch.nn as nn
import random
import time
import math

In [2]:
# runned this code 3 times for each different hidden state size (32, 64, 128)

from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os


def findFiles(path): return glob.glob(path)

import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

# Build the names dictionary, a list of names per language
# dictionary keys are languages, values are names
names = {}
languages = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    languages.append(category)
    lines = readLines(filename)
    names[category] = lines

n_categories = len(languages)

def findName(dict, name):
    keys = dict.keys()
    for key in keys:
        if name in dict[key]:
            return key
    return ''

In [3]:
# function to return key for any value 
def get_key(val): 
    for key, value in names.items(): 
         if val in value: 
            return key 

# X list - names
x_list = []
for key in names.keys():
    x_list.extend(names[key])

    
# Y list - categories
y_list = []
for name in x_list:
    y_list.append(get_key(name))

In [4]:
# Turning Names into Tensors
# --------------------------
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def nameToTensor(name):
    tensor = torch.zeros(len(name), 1, n_letters)
    for li, letter in enumerate(name):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [5]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 32
rnn = RNN(n_letters, n_hidden, n_categories)

In [6]:
# %% Training
# ========
# Preparing for Training
# ----------------------
def categoryFromOutput(output):
    # compute max
    top_n, top_i = output.topk(1)
    # output index of max
    category_i = top_i.item()
    return languages[category_i], category_i

# We will also want a quick way to get a training example (a name and its
# language):

# tuple (category, name) list
category_name_list = []

# list of index from 0 to 20073
index_list=[x for x in range(0,len(y_list))]

for i in index_list:
    category_name_list.append( (y_list[i], x_list[i]) ) 
    
    
def randomTrainingExample():
    random_index = random.choice(index_list)
    index_list.remove(random_index)
    category = category_name_list[random_index][0]
    name = category_name_list[random_index][1]
    category_tensor = torch.tensor([languages.index(category)], dtype=torch.long)
    name_tensor = nameToTensor(name)
    return category, name, category_tensor, name_tensor

In [7]:
# %% Training the Network
# --------------------
criterion = nn.NLLLoss()

learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn

def train(category_tensor, name_tensor):
    # initialize hidden state - do this every time before passing an input sequence
    hidden = rnn.initHidden()
    # reset grad counters - do this every time after backprop
    rnn.zero_grad()
    # manually go through each element in input sequence
    for i in range(name_tensor.size()[0]):
        output, hidden = rnn(name_tensor[i], hidden)
    # backpropagate based on loss at last element only
    loss = criterion(output, category_tensor)
    loss.backward()

    # Update network parameters
    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return output, loss.item()

In [8]:
# number of total data samples
n_iters = len(x_list)

# Keep track of loss for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

nepoch = 5
print_every = 100
n_correct = 0

for epoch in range(nepoch):
    # number of category_i equals to guess_i
    n_correct = 0
    index_list=[x for x in range(0,len(y_list))]
    for iter in range(1, n_iters + 1):
        category, name, category_tensor, name_tensor = randomTrainingExample()
        output, loss = train(category_tensor, name_tensor)
        current_loss += loss
        guess, guess_i = categoryFromOutput(output)
        if guess == category:
            n_correct += 1
        # Print iter number, loss, name and guess
        #correct = '✓' if guess == category else '✗ (%s)' % category
        #print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, name, guess, correct))

accuracy = n_correct / len(x_list)
print('Accuracy is %f' % accuracy)

Accuracy is 0.725267
