## 1. Batch training of data (15 + 10 points)

#### 1.1.Modify the implementation of the network to leverage the RNN subclass of module torch.nn, which readily incorporates support for batch training. Set the hidden state size to 128 and train the network through five epochs with a batch size equal to the total number of samples. Note that, since the data samples are of different lengths, you will need to pad the length of the samples to a unique sequence length (e.g., at least the length of the longest sequence) in order to be able to feed the batch to the network. This is because RNN expects the input to be a tensor of shape (seq_len, batch, input_size). You can either manually pad with 0s, or you can use built-in functions such as torch.nn.utils.rnn.pad_sequence to perform the padding. Report the accuracy number yielded by this approach on the full training set. (15 points).

In [1]:
import torch
import torch.nn as nn
import random
import time
import math
import numpy as np

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

# Hyper Parameters
NEPOCHS = 5
BATCH_SIZE = 20074
TIME_STEP = 57          
INPUT_SIZE = 57         
LR = 0.01               # learning rate

def findFiles(path): return glob.glob(path)

import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

# Build the names dictionary, a list of names per language
# dictionary keys are languages, values are names

names = {}
languages = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    languages.append(category)
    lines = readLines(filename)
    names[category] = lines

n_categories = len(languages)

def findName(dict, name):
    keys = dict.keys()
    for key in keys:
        if name in dict[key]:
            return key
    return ''

In [3]:
# function to return key for any value 
def get_key(val): 
    for key, value in names.items(): 
         if val in value: 
            return key 

# X list - names
x_list = []
for key in names.keys():
    x_list.extend(names[key])
x_list = np.array(x_list)
    
# Y list - categories
y_list = []
for name in x_list:
    y_list.append(get_key(name))
y_list = np.array(y_list)
    
#length of the longest sequence
max_len = len(x_list[0])
for names in x_list:
    if len(names) > max_len:  
        max_len = len(names)

In [4]:
# Turning Names into Tensors
# --------------------------
from torch.nn.utils.rnn import pad_sequence
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def nameToTensor(name):
    tensor = torch.zeros(len(name), 1, n_letters)
    for li, letter in enumerate(name):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

def categoryToTensor(category):
    category_tensor = torch.tensor([languages.index(category)], dtype=torch.long)
    return category_tensor
    
sequences_name = []
for name in x_list:
    sequences_name.append(nameToTensor(name))
feature = pad_sequence(sequences_name, batch_first=True, padding_value=0).squeeze()

sequences_category = []
for category in y_list:
    sequences_category.append(categoryToTensor(category))
target = pad_sequence(sequences_category, batch_first=True, padding_value=0).squeeze()

In [5]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()

        self.rnn = nn.RNN(         
            input_size = INPUT_SIZE,
            hidden_size = 128,         # number of hidden units
            num_layers = 1,           # number of layers
            batch_first = True,       # If your input data is of shape (seq_len, batch_size, features) then you don’t need batch_first=True and your RNN will output a tensor with shape (seq_len, batch.
            #If your input data is of shape (batch_size, seq_len, features) then you need batch_first=True and your RNN will output a tensor with shape (batch_size, seq_len, hidden_size).
        )
        self.out = nn.Linear(128, 18)

    def forward(self, x):
        # x shape (batch, time_step, input_size)
        # r_out shape (batch, time_step, output_size)
        # h_n shape (n_layers, batch, hidden_size)
        # h_c shape (n_layers, batch, hidden_size)
        # r_out, (h_n, h_c) = self.rnn(x, None)   # None represents zero initial hidden state
        r_out, h = self.rnn(x, None)   # None represents zero initial hidden state

        # choose last time step of output
        out = self.out(r_out[:, -1, :])
        return out
    
rnn = RNN()
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all rnn parameters
loss_func = nn.CrossEntropyLoss()                       # the target label is not one-hot

In [8]:
# Data Loader for easy mini-batch return in training
import torch.utils.data as data_utils
train_data = data_utils.TensorDataset(feature, target)
train_loader = torch.utils.data.DataLoader(dataset=train_data,
                                           batch_size=BATCH_SIZE, shuffle=True)

In [9]:
# %% training and testing
for epoch in range(NEPOCHS):
    for step, (x, y) in enumerate(train_loader):        # gives batch data
        #b_x = x.view(-1, 19, 19)                        # reshape x to (batch, time_step, input_size)
        b_x = x
        b_y = y                                         # batch y

        output = rnn(b_x)                               # rnn output
        loss = loss_func(output, b_y)                   # cross entropy loss
        optimizer.zero_grad()                           # clear gradients for this training step
        loss.backward()                                 # backpropagation, compute gradients
        optimizer.step()                                # apply gradients

# getting accuracy        
n_correct = 0
for i in range(0, len(sequences_name)):
    test_output = rnn(sequences_name[i].transpose(0, 1))                   # (samples, time_step, input_size)
    pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
    #import pdb;pdb.set_trace()
    if pred_y == sequences_category[i].data.numpy().squeeze():
        n_correct += 1
accuracy = n_correct / len(sequences_name)
print("Accuracy: ", accuracy)

Accuracy:  0.46861612035468764


#### 1.2.Modify the implementation from 1.1 to support arbitrary mini-batch sizes. In this case, instead of padding to a unique sequence length, adaptively pad the length of the mini batch to the length of the longest sample in the mini batch itself. Report the accuracy number (on the full training set) yielded by this approach on mini batch sizes of 1000, 2000 and 5000 after five epochs of training. (10 points).

In [1]:
import torch
import torch.nn as nn
import random
import time
import math
import numpy as np

In [2]:
# runned the same code 3 times by only changing the hidden states sizes to 32, 64 and 128

from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

# Hyper Parameters
NEPOCHS = 5
BATCH_SIZE = 1000
TIME_STEP = 57          
INPUT_SIZE = 57         
LR = 0.01               # learning rate

def findFiles(path): return glob.glob(path)

import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

# Build the names dictionary, a list of names per language
# dictionary keys are languages, values are names

names = {}
languages = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    languages.append(category)
    lines = readLines(filename)
    names[category] = lines

n_categories = len(languages)

def findName(dict, name):
    keys = dict.keys()
    for key in keys:
        if name in dict[key]:
            return key
    return ''

In [3]:
# function to return key for any value 
def get_key(val): 
    for key, value in names.items(): 
         if val in value: 
            return key 

# X list - names
x_list = []
for key in names.keys():
    x_list.extend(names[key])
x_list = np.array(x_list)
    
# Y list - categories
y_list = []
for name in x_list:
    y_list.append(get_key(name))
y_list = np.array(y_list)
    
#length of the longest sequence
max_len = len(x_list[0])
for names in x_list:
    if len(names) > max_len:  
        max_len = len(names)

In [4]:
# Turning Names into Tensors
# --------------------------
from torch.nn.utils.rnn import pad_sequence
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def nameToTensor(name):
    tensor = torch.zeros(len(name), 1, n_letters)
    for li, letter in enumerate(name):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

def categoryToTensor(category):
    category_tensor = torch.tensor([languages.index(category)], dtype=torch.long)
    return category_tensor
    
sequences_name = []
for name in x_list:
    sequences_name.append(nameToTensor(name))
feature = pad_sequence(sequences_name, batch_first=True, padding_value=0).squeeze()

sequences_category = []
for category in y_list:
    sequences_category.append(categoryToTensor(category))
target = pad_sequence(sequences_category, batch_first=True, padding_value=0).squeeze()

In [5]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()

        self.rnn = nn.RNN(         
            input_size = INPUT_SIZE,
            hidden_size = 128,         # number of hidden units
            num_layers = 1,           # number of layers
            batch_first = True,       # If your input data is of shape (seq_len, batch_size, features) then you don’t need batch_first=True and your RNN will output a tensor with shape (seq_len, batch.
            #If your input data is of shape (batch_size, seq_len, features) then you need batch_first=True and your RNN will output a tensor with shape (batch_size, seq_len, hidden_size).
        )
        self.out = nn.Linear(128, 18)

    def forward(self, x):
        # x shape (batch, time_step, input_size)
        # r_out shape (batch, time_step, output_size)
        # h_n shape (n_layers, batch, hidden_size)
        # h_c shape (n_layers, batch, hidden_size)
        # r_out, (h_n, h_c) = self.rnn(x, None)   # None represents zero initial hidden state
        r_out, h = self.rnn(x, None)   # None represents zero initial hidden state

        # choose last time step of output
        out = self.out(r_out[:, -1, :])
        return out
    
rnn = RNN()
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all rnn parameters
loss_func = nn.CrossEntropyLoss()                       # the target label is not one-hot

In [6]:
# Data Loader for easy mini-batch return in training
import torch.utils.data as data_utils
train_data = data_utils.TensorDataset(feature, target)
train_loader = torch.utils.data.DataLoader(dataset=train_data,
                                           batch_size=BATCH_SIZE, shuffle=True)

In [7]:
# %% training and testing
for epoch in range(NEPOCHS):
    for step, (x, y) in enumerate(train_loader):        # gives batch data
        #b_x = x.view(-1, 19, 19)                        # reshape x to (batch, time_step, input_size)
        b_x = x
        b_y = y                                         # batch y

        output = rnn(b_x)                               # rnn output
        loss = loss_func(output, b_y)                   # cross entropy loss
        optimizer.zero_grad()                           # clear gradients for this training step
        loss.backward()                                 # backpropagation, compute gradients
        optimizer.step()                                # apply gradients

# getting accuracy        
n_correct = 0
for i in range(0, len(sequences_name)):
    test_output = rnn(sequences_name[i].transpose(0, 1))                   # (samples, time_step, input_size)
    pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
    #import pdb;pdb.set_trace()
    if pred_y == sequences_category[i].data.numpy().squeeze():
        n_correct += 1
accuracy = n_correct / len(sequences_name)
print("Accuracy: ", accuracy)

Accuracy:  0.3248978778519478


## 2. Model cross-validation (20 + 15 points)

#### 2.1.Modify the implementation from 1.1 or 1.2 to enable support of five-fold cross-validation by leveraging the Kfold object from scikitlearn. Report the average accuracy on the full training and test sets across all five folds for two cases: (a) after 5 epochs of training, and (b) after however many epochs it takes the algorithm to converge. (10 + 5 points). For (b), feel free to use a batch size of your choosing.

##### 2.1 (a)

In [1]:
import torch
import torch.nn as nn
import random
import time
import math

In [2]:
# runned the same code 3 times by only changing the hidden states sizes to 32, 64 and 128

from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

# Hyper Parameters
NEPOCHS = 5
BATCH_SIZE = 20074
TIME_STEP = 57          
INPUT_SIZE = 57         
LR = 0.001               # learning rate

def findFiles(path): return glob.glob(path)

import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

# Build the names dictionary, a list of names per language
# dictionary keys are languages, values are names

names = {}
languages = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    languages.append(category)
    lines = readLines(filename)
    names[category] = lines

n_categories = len(languages)

def findName(dict, name):
    keys = dict.keys()
    for key in keys:
        if name in dict[key]:
            return key
    return ''

In [3]:
# function to return key for any value 
import numpy as np

def get_key(val): 
    for key, value in names.items(): 
         if val in value: 
            return key 

# X list - names
x_list = []
for key in names.keys():
    x_list.extend(names[key])
x_list = np.array(x_list)
    
# Y list - categories
y_list = []
for name in x_list:
    y_list.append(get_key(name))
y_list = np.array(y_list)
    
#length of the longest sequence
max_len = len(x_list[0])
for names in x_list:
    if len(names) > max_len:  
        max_len = len(names)

In [4]:
# Turning Names into Tensors
# --------------------------
from torch.nn.utils.rnn import pad_sequence
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def nameToTensor(name):
    tensor = torch.zeros(len(name), 1, n_letters)
    for li, letter in enumerate(name):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

def categoryToTensor(category):
    category_tensor = torch.tensor([languages.index(category)], dtype=torch.long)
    return category_tensor

In [5]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()

        self.rnn = nn.RNN(         
            input_size = INPUT_SIZE,
            hidden_size = 128,         # number of hidden units
            num_layers = 1,           # number of layers
            batch_first = True,       # If your input data is of shape (seq_len, batch_size, features) then you don’t need batch_first=True and your RNN will output a tensor with shape (seq_len, batch.
            #If your input data is of shape (batch_size, seq_len, features) then you need batch_first=True and your RNN will output a tensor with shape (batch_size, seq_len, hidden_size).
        )
        self.out = nn.Linear(128, 18)

    def forward(self, x):
        # x shape (batch, time_step, input_size)
        # r_out shape (batch, time_step, output_size)
        # h_n shape (n_layers, batch, hidden_size)
        # h_c shape (n_layers, batch, hidden_size)
        # r_out, (h_n, h_c) = self.rnn(x, None)   # None represents zero initial hidden state
        r_out, h = self.rnn(x, None)   # None represents zero initial hidden state

        # choose last time step of output
        out = self.out(r_out[:, -1, :])
        return out

In [6]:
from sklearn.model_selection import KFold
import sklearn
from statistics import mean

# Kfold five-vold cross-validation
kfold = KFold(n_splits=5, shuffle = True)
training_accuracy_list = []
testing_accuracy_list = []
# fold number
f = 1

for train, test in kfold.split(x_list):
    rnn = RNN()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all rnn parameters
    loss_func = nn.CrossEntropyLoss()
    
    x_train, x_test, y_train, y_test = x_list[train], x_list[test], y_list[train], y_list[test]
    
    x_train_tensor = []
    for name in x_train:
        x_train_tensor.append(nameToTensor(name))
    feature = pad_sequence(x_train_tensor, batch_first=True, padding_value=0).squeeze()

    y_train_tensor = []
    for category in y_train:
        y_train_tensor.append(categoryToTensor(category))
    target = pad_sequence(y_train_tensor, batch_first=True, padding_value=0).squeeze()
    
    x_test_tensor = []
    for name in x_test:
        x_test_tensor.append(nameToTensor(name))
    #x_test = pad_sequence(x_test_tensor, batch_first=True, padding_value=0).squeeze()

    y_test_tensor = []
    for category in y_test:
        y_test_tensor.append(categoryToTensor(category))
    #y_test = pad_sequence(y_test_tensor, batch_first=True, padding_value=0).squeeze()
    
    #Data Loader for easy mini-batch return in training
    import torch.utils.data as data_utils
    train_data = data_utils.TensorDataset(feature, target)
    train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
    
    # %% training and testing
    print("Fold",f)
    f += 1
    for epoch in range(NEPOCHS):
        for step, (x, y) in enumerate(train_loader):        # gives batch data
            #b_x = x.view(-1, 19, 19)                        # reshape x to (batch, time_step, input_size)
            b_x = x
            b_y = y                                         # batch y

            output = rnn(b_x)                               # rnn output
            loss = loss_func(output, b_y)                   # cross entropy loss
            optimizer.zero_grad()                           # clear gradients for this training step
            loss.backward()                                 # backpropagation, compute gradients
            optimizer.step()                                # apply gradients
    
    # getting accuracy of training set        
    n_correct1 = 0
    for i in range(0, len(x_train_tensor)):
        test_output = rnn(x_train_tensor[i].transpose(0, 1))                   # (samples, time_step, input_size)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        #import pdb;pdb.set_trace()
        if pred_y == y_train_tensor[i].data.numpy().squeeze():
            n_correct1 += 1
    accuracy = n_correct1 / len(x_train_tensor)
    print("Accuracy on training set: ", accuracy)
    training_accuracy_list.append(accuracy)
    
    # getting accuracy of test set        
    n_correct2 = 0
    for i in range(0, len(x_test_tensor)):
        test_output = rnn(x_test_tensor[i].transpose(0, 1))                   # (samples, time_step, input_size)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        #import pdb;pdb.set_trace()
        if pred_y == y_test_tensor[i].data.numpy().squeeze():
            n_correct2 += 1
    accuracy = n_correct2 / len(x_test_tensor)
    print("Accuracy on test set: ", accuracy)
    testing_accuracy_list.append(accuracy)

print("Average accuracy on full training: ", mean(training_accuracy_list))
print("Average accuracy on test sets: ", mean(testing_accuracy_list))

Fold 1
Accuracy on training set:  0.4674014571268448
Accuracy on test set:  0.4734744707347447
Fold 2
Accuracy on training set:  0.4703281648919609
Accuracy on test set:  0.46176836861768367
Fold 3
Accuracy on training set:  0.4656578865433713
Accuracy on test set:  0.4804483188044832
Fold 4
Accuracy on training set:  0.4682109720406003
Accuracy on test set:  0.4702366127023661
Fold 5
Accuracy on training set:  0.4714819427148194
Accuracy on test set:  0.4571499750871948
Average accuracy on full training:  0.46861608466351934
Average accuracy on test sets:  0.4686155491892945


##### 2.1 (b)

In [6]:
from sklearn.model_selection import KFold
import sklearn
from statistics import mean

# Kfold five-vold cross-validation
kfold = KFold(n_splits=5, shuffle = True)
training_accuracy_list = []
testing_accuracy_list = []
# fold number
f = 1

for train, test in kfold.split(x_list):
    rnn = RNN()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all rnn parameters
    loss_func = nn.CrossEntropyLoss()
    
    x_train, x_test, y_train, y_test = x_list[train], x_list[test], y_list[train], y_list[test]
    
    x_train_tensor = []
    for name in x_train:
        x_train_tensor.append(nameToTensor(name))
    feature = pad_sequence(x_train_tensor, batch_first=True, padding_value=0).squeeze()

    y_train_tensor = []
    for category in y_train:
        y_train_tensor.append(categoryToTensor(category))
    target = pad_sequence(y_train_tensor, batch_first=True, padding_value=0).squeeze()
    
    x_test_tensor = []
    for name in x_test:
        x_test_tensor.append(nameToTensor(name))
    #x_test = pad_sequence(x_test_tensor, batch_first=True, padding_value=0).squeeze()

    y_test_tensor = []
    for category in y_test:
        y_test_tensor.append(categoryToTensor(category))
    #y_test = pad_sequence(y_test_tensor, batch_first=True, padding_value=0).squeeze()
    
    #Data Loader for easy mini-batch return in training
    import torch.utils.data as data_utils
    train_data = data_utils.TensorDataset(feature, target)
    train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
    
    # %% training and testing
    print("Fold",f)
    f += 1
    NEPOCHS = 20
    for epoch in range(NEPOCHS):
        for step, (x, y) in enumerate(train_loader):        # gives batch data
            #b_x = x.view(-1, 19, 19)                        # reshape x to (batch, time_step, input_size)
            b_x = x
            b_y = y                                         # batch y

            output = rnn(b_x)                               # rnn output
            loss = loss_func(output, b_y)                   # cross entropy loss
            optimizer.zero_grad()                           # clear gradients for this training step
            loss.backward()                                 # backpropagation, compute gradients
            optimizer.step()                                # apply gradients
    
    # getting accuracy of training set        
    n_correct1 = 0
    for i in range(0, len(x_train_tensor)):
        test_output = rnn(x_train_tensor[i].transpose(0, 1))                   # (samples, time_step, input_size)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        #import pdb;pdb.set_trace()
        if pred_y == y_train_tensor[i].data.numpy().squeeze():
            n_correct1 += 1
    accuracy = n_correct1 / len(x_train_tensor)
    print("Accuracy on training set: ", accuracy)
    training_accuracy_list.append(accuracy)
    
    # getting accuracy of test set        
    n_correct2 = 0
    for i in range(0, len(x_test_tensor)):
        test_output = rnn(x_test_tensor[i].transpose(0, 1))                   # (samples, time_step, input_size)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        #import pdb;pdb.set_trace()
        if pred_y == y_test_tensor[i].data.numpy().squeeze():
            n_correct2 += 1
    accuracy = n_correct2 / len(x_test_tensor)
    print("Accuracy on test set: ", accuracy)
    testing_accuracy_list.append(accuracy)

print("Average accuracy on full training: ", mean(training_accuracy_list))
print("Average accuracy on test sets: ", mean(testing_accuracy_list))

Fold 1
Accuracy on training set:  0.4692072980882994
Accuracy on test set:  0.46625155666251555
Fold 2
Accuracy on training set:  0.46609377918923967
Accuracy on test set:  0.47870485678704855
Fold 3
Accuracy on training set:  0.4669032941029952
Accuracy on test set:  0.47546699875466997
Fold 4
Accuracy on training set:  0.47001681300205495
Accuracy on test set:  0.46301369863013697
Fold 5
Accuracy on training set:  0.47085927770859276
Accuracy on test set:  0.45964125560538116
Average accuracy on full training:  0.4686160924182364
Average accuracy on test sets:  0.46861567328795045


#### 2.2.Modify the implementation from 2.1 to incorporate the LSTM subclass of module torch.nn. Report the average accuracy on the full training and test sets across all five folds for two cases: (a) after 5 epochs of training, and (b) after however many epochs it takes the algorithm to converge. (10 + 5 points).

##### 2.2 (a)

In [1]:
import torch
import torch.nn as nn
import random
import time
import math
import numpy as np

In [2]:
# runned the same code 3 times by only changing the hidden states sizes to 32, 64 and 128

from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

# Hyper Parameters
NEPOCHS = 5
BATCH_SIZE = 20074
TIME_STEP = 57          
INPUT_SIZE = 57         
LR = 0.01               # learning rate

def findFiles(path): return glob.glob(path)

import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

# Build the names dictionary, a list of names per language
# dictionary keys are languages, values are names

names = {}
languages = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    languages.append(category)
    lines = readLines(filename)
    names[category] = lines

n_categories = len(languages)

def findName(dict, name):
    keys = dict.keys()
    for key in keys:
        if name in dict[key]:
            return key
    return ''

In [3]:
# function to return key for any value 
def get_key(val): 
    for key, value in names.items(): 
         if val in value: 
            return key 

# X list - names
x_list = []
for key in names.keys():
    x_list.extend(names[key])
x_list = np.array(x_list)
    
# Y list - categories
y_list = []
for name in x_list:
    y_list.append(get_key(name))
y_list = np.array(y_list)
    
#length of the longest sequence
max_len = len(x_list[0])
for names in x_list:
    if len(names) > max_len:  
        max_len = len(names)

In [4]:
# Turning Names into Tensors
# --------------------------
from torch.nn.utils.rnn import pad_sequence
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def nameToTensor(name):
    tensor = torch.zeros(len(name), 1, n_letters)
    for li, letter in enumerate(name):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

def categoryToTensor(category):
    category_tensor = torch.tensor([languages.index(category)], dtype=torch.long)
    return category_tensor

In [5]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()

        self.rnn = nn.LSTM(         
            input_size = INPUT_SIZE,
            hidden_size = 128,         # number of hidden units
            num_layers = 1,           # number of layers
            batch_first = True,       # If your input data is of shape (seq_len, batch_size, features) then you don’t need batch_first=True and your LSTM will give output of shape (seq_len, batch.
            #If your input data is of shape (batch_size, seq_len, features) then you need batch_first=True and your LSTM will give output of shape (batch_size, seq_len, hidden_size).
        )
        self.out = nn.Linear(128, 18)

    def forward(self, x):
        # x shape (batch, time_step, input_size)
        # r_out shape (batch, time_step, output_size)
        # h_n shape (n_layers, batch, hidden_size)
        # h_c shape (n_layers, batch, hidden_size)
        r_out, (h_n, h_c) = self.rnn(x, None)   # None represents zero initial hidden state

        # choose last time step of r_out
        out = self.out(r_out[:, -1, :])
        return out

In [6]:
from sklearn.model_selection import KFold
import sklearn
from statistics import mean

# Kfold five-vold cross-validation
kfold = KFold(n_splits=5, shuffle = True)
training_accuracy_list = []
testing_accuracy_list = []
# fold number
f = 1

for train, test in kfold.split(x_list):
    rnn = RNN()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all rnn parameters
    loss_func = nn.CrossEntropyLoss()
    
    x_train, x_test, y_train, y_test = x_list[train], x_list[test], y_list[train], y_list[test]
    
    x_train_tensor = []
    for name in x_train:
        x_train_tensor.append(nameToTensor(name))
    feature = pad_sequence(x_train_tensor, batch_first=True, padding_value=0).squeeze()

    y_train_tensor = []
    for category in y_train:
        y_train_tensor.append(categoryToTensor(category))
    target = pad_sequence(y_train_tensor, batch_first=True, padding_value=0).squeeze()
    
    x_test_tensor = []
    for name in x_test:
        x_test_tensor.append(nameToTensor(name))
    #x_test = pad_sequence(x_test_tensor, batch_first=True, padding_value=0).squeeze()

    y_test_tensor = []
    for category in y_test:
        y_test_tensor.append(categoryToTensor(category))
    #y_test = pad_sequence(y_test_tensor, batch_first=True, padding_value=0).squeeze()
    
    #Data Loader for easy mini-batch return in training
    import torch.utils.data as data_utils
    train_data = data_utils.TensorDataset(feature, target)
    train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
    
    # %% training and testing
    print("Fold",f)
    f += 1
    for epoch in range(NEPOCHS):
        for step, (x, y) in enumerate(train_loader):        # gives batch data
            #b_x = x.view(-1, 19, 19)                        # reshape x to (batch, time_step, input_size)
            b_x = x
            b_y = y                                         # batch y

            output = rnn(b_x)                               # rnn output
            loss = loss_func(output, b_y)                   # cross entropy loss
            optimizer.zero_grad()                           # clear gradients for this training step
            loss.backward()                                 # backpropagation, compute gradients
            optimizer.step()                                # apply gradients
    
    # getting accuracy of training set        
    n_correct1 = 0
    for i in range(0, len(x_train_tensor)):
        test_output = rnn(x_train_tensor[i].transpose(0, 1))                   # (samples, time_step, input_size)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        #import pdb;pdb.set_trace()
        if pred_y == y_train_tensor[i].data.numpy().squeeze():
            n_correct1 += 1
    accuracy = n_correct1 / len(x_train_tensor)
    print("Accuracy on training set: ", accuracy)
                #accuracy = sklearn.metrics.accuracy_score(y_test, pred_y)
                #print("Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| test accuracy: %.2f' % accuracy)
    training_accuracy_list.append(accuracy)
    
    # getting accuracy of test set        
    n_correct2 = 0
    for i in range(0, len(x_test_tensor)):
        test_output = rnn(x_test_tensor[i].transpose(0, 1))                   # (samples, time_step, input_size)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        #import pdb;pdb.set_trace()
        if pred_y == y_test_tensor[i].data.numpy().squeeze():
            n_correct2 += 1
    accuracy = n_correct2 / len(x_test_tensor)
    print("Accuracy on test set: ", accuracy)
                #accuracy = sklearn.metrics.accuracy_score(y_test, pred_y)
                #print("Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| test accuracy: %.2f' % accuracy)
    testing_accuracy_list.append(accuracy)

print("Average accuracy on full training: ", mean(training_accuracy_list))
print("Average accuracy on test sets: ", mean(testing_accuracy_list))

Fold 1
Accuracy on training set:  0.469394109222243
Accuracy on test set:  0.4655043586550436
Fold 2
Accuracy on training set:  0.4706395167818669
Accuracy on test set:  0.46052303860523036
Fold 3
Accuracy on training set:  0.4650351827635594
Accuracy on test set:  0.4829389788293898
Fold 4
Accuracy on training set:  0.46852232393050625
Accuracy on test set:  0.4689912826899128
Fold 5
Accuracy on training set:  0.46948941469489414
Accuracy on test set:  0.46512207274539114
Average accuracy on full training:  0.46861610947861393
Average accuracy on test sets:  0.46861594630499354


##### 2.2 (b)

In [6]:
from sklearn.model_selection import KFold
import sklearn
from statistics import mean

# Kfold five-vold cross-validation
kfold = KFold(n_splits=5, shuffle = True)
training_accuracy_list = []
testing_accuracy_list = []
# fold number
f = 1

for train, test in kfold.split(x_list):
    rnn = RNN()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all rnn parameters
    loss_func = nn.CrossEntropyLoss()
    
    x_train, x_test, y_train, y_test = x_list[train], x_list[test], y_list[train], y_list[test]
    
    x_train_tensor = []
    for name in x_train:
        x_train_tensor.append(nameToTensor(name))
    feature = pad_sequence(x_train_tensor, batch_first=True, padding_value=0).squeeze()

    y_train_tensor = []
    for category in y_train:
        y_train_tensor.append(categoryToTensor(category))
    target = pad_sequence(y_train_tensor, batch_first=True, padding_value=0).squeeze()
    
    x_test_tensor = []
    for name in x_test:
        x_test_tensor.append(nameToTensor(name))
    #x_test = pad_sequence(x_test_tensor, batch_first=True, padding_value=0).squeeze()

    y_test_tensor = []
    for category in y_test:
        y_test_tensor.append(categoryToTensor(category))
    #y_test = pad_sequence(y_test_tensor, batch_first=True, padding_value=0).squeeze()
    
    #Data Loader for easy mini-batch return in training
    import torch.utils.data as data_utils
    train_data = data_utils.TensorDataset(feature, target)
    train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
    
    # %% training and testing
    print("Fold",f)
    f += 1
    NEPOCHS = 30
    for epoch in range(NEPOCHS):
        for step, (x, y) in enumerate(train_loader):        # gives batch data
            #b_x = x.view(-1, 19, 19)                        # reshape x to (batch, time_step, input_size)
            b_x = x
            b_y = y                                         # batch y

            output = rnn(b_x)                               # rnn output
            loss = loss_func(output, b_y)                   # cross entropy loss
            optimizer.zero_grad()                           # clear gradients for this training step
            loss.backward()                                 # backpropagation, compute gradients
            optimizer.step()                                # apply gradients
    
    # getting accuracy of training set        
    n_correct1 = 0
    for i in range(0, len(x_train_tensor)):
        test_output = rnn(x_train_tensor[i].transpose(0, 1))                   # (samples, time_step, input_size)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        #import pdb;pdb.set_trace()
        if pred_y == y_train_tensor[i].data.numpy().squeeze():
            n_correct1 += 1
    accuracy = n_correct1 / len(x_train_tensor)
    print("Accuracy on training set: ", accuracy)
                #accuracy = sklearn.metrics.accuracy_score(y_test, pred_y)
                #print("Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| test accuracy: %.2f' % accuracy)
    training_accuracy_list.append(accuracy)
    
    # getting accuracy of test set        
    n_correct2 = 0
    for i in range(0, len(x_test_tensor)):
        test_output = rnn(x_test_tensor[i].transpose(0, 1))                   # (samples, time_step, input_size)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        #import pdb;pdb.set_trace()
        if pred_y == y_test_tensor[i].data.numpy().squeeze():
            n_correct2 += 1
    accuracy = n_correct2 / len(x_test_tensor)
    print("Accuracy on test set: ", accuracy)
                #accuracy = sklearn.metrics.accuracy_score(y_test, pred_y)
                #print("Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| test accuracy: %.2f' % accuracy)
    testing_accuracy_list.append(accuracy)

print("Average accuracy on full training: ", mean(training_accuracy_list))
print("Average accuracy on test sets: ", mean(testing_accuracy_list))

Fold 1
Accuracy on training set:  0.47088859829379165
Accuracy on test set:  0.45952677459526775
Fold 2
Accuracy on training set:  0.46677875334703284
Accuracy on test set:  0.4759651307596513
Fold 3
Accuracy on training set:  0.4689582165763746
Accuracy on test set:  0.4672478206724782
Fold 4
Accuracy on training set:  0.4678996201506943
Accuracy on test set:  0.4714819427148194
Fold 5
Accuracy on training set:  0.46855541718555416
Accuracy on test set:  0.46885899352267063
Average accuracy on full training:  0.4686161211106895
Average accuracy on test sets:  0.46861613245297745


#### 2.3.Modify the implementation from 2.2 to enable support of stratified five- fold cross-validation by leveraging the StratifiedKFold object from scikitlearn. Report the average accuracy on the full training and test sets across all five folds after 5 epochs of training. Feel free to use a batch size of your choosing. (5 points).

In [1]:
import torch
import torch.nn as nn
import random
import time
import math
import numpy as np

In [2]:
# runned the same code 3 times by only changing the hidden states sizes to 32, 64 and 128

from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

# Hyper Parameters
NEPOCHS = 5
BATCH_SIZE = 20074
TIME_STEP = 57          
INPUT_SIZE = 57         
LR = 0.01               # learning rate

def findFiles(path): return glob.glob(path)

import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

# Build the names dictionary, a list of names per language
# dictionary keys are languages, values are names

names = {}
languages = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    languages.append(category)
    lines = readLines(filename)
    names[category] = lines

n_categories = len(languages)

def findName(dict, name):
    keys = dict.keys()
    for key in keys:
        if name in dict[key]:
            return key
    return ''

In [3]:
# function to return key for any value 
def get_key(val): 
    for key, value in names.items(): 
         if val in value: 
            return key 

# X list - names
x_list = []
for key in names.keys():
    x_list.extend(names[key])
x_list = np.array(x_list)
    
# Y list - categories
y_list = []
for name in x_list:
    y_list.append(get_key(name))
y_list = np.array(y_list)
    
#length of the longest sequence
max_len = len(x_list[0])
for names in x_list:
    if len(names) > max_len:  
        max_len = len(names)

In [4]:
# Turning Names into Tensors
# --------------------------
from torch.nn.utils.rnn import pad_sequence
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def nameToTensor(name):
    tensor = torch.zeros(len(name), 1, n_letters)
    for li, letter in enumerate(name):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

def categoryToTensor(category):
    category_tensor = torch.tensor([languages.index(category)], dtype=torch.long)
    return category_tensor

In [5]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()

        self.rnn = nn.LSTM(         
            input_size = INPUT_SIZE,
            hidden_size = 128,         # number of hidden units
            num_layers = 1,           # number of layers
            batch_first = True,       # If your input data is of shape (seq_len, batch_size, features) then you don’t need batch_first=True and your LSTM will give output of shape (seq_len, batch.
            #If your input data is of shape (batch_size, seq_len, features) then you need batch_first=True and your LSTM will give output of shape (batch_size, seq_len, hidden_size).
        )
        self.out = nn.Linear(128, 18)

    def forward(self, x):
        # x shape (batch, time_step, input_size)
        # r_out shape (batch, time_step, output_size)
        # h_n shape (n_layers, batch, hidden_size)
        # h_c shape (n_layers, batch, hidden_size)
        r_out, (h_n, h_c) = self.rnn(x, None)   # None represents zero initial hidden state

        # choose last time step of r_out
        out = self.out(r_out[:, -1, :])
        return out

In [12]:
import sklearn
from sklearn.model_selection import StratifiedKFold
from statistics import mean

# Stratified Kfold five-vold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True)

training_accuracy_list = []
testing_accuracy_list = []
# fold number
f = 1

for train, test in skf.split(x_list, y_list):
    rnn = RNN()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all rnn parameters
    loss_func = nn.CrossEntropyLoss()
    
    x_train, x_test, y_train, y_test = x_list[train], x_list[test], y_list[train], y_list[test]
    
    x_train_tensor = []
    for name in x_train:
        x_train_tensor.append(nameToTensor(name))
    feature = pad_sequence(x_train_tensor, batch_first=True, padding_value=0).squeeze()

    y_train_tensor = []
    for category in y_train:
        y_train_tensor.append(categoryToTensor(category))
    target = pad_sequence(y_train_tensor, batch_first=True, padding_value=0).squeeze()
    
    x_test_tensor = []
    for name in x_test:
        x_test_tensor.append(nameToTensor(name))
    #x_test = pad_sequence(x_test_tensor, batch_first=True, padding_value=0).squeeze()

    y_test_tensor = []
    for category in y_test:
        y_test_tensor.append(categoryToTensor(category))
    #y_test = pad_sequence(y_test_tensor, batch_first=True, padding_value=0).squeeze()
    
    #Data Loader for easy mini-batch return in training
    import torch.utils.data as data_utils
    train_data = data_utils.TensorDataset(feature, target)
    train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
    
    # %% training and testing
    print("Fold",f)
    f += 1
    for epoch in range(NEPOCHS):
        for step, (x, y) in enumerate(train_loader):        # gives batch data
            #b_x = x.view(-1, 19, 19)                        # reshape x to (batch, time_step, input_size)
            b_x = x
            b_y = y                                         # batch y

            output = rnn(b_x)                               # rnn output
            loss = loss_func(output, b_y)                   # cross entropy loss
            optimizer.zero_grad()                           # clear gradients for this training step
            loss.backward()                                 # backpropagation, compute gradients
            optimizer.step()                                # apply gradients
    
    # getting accuracy of training set        
    n_correct1 = 0
    for i in range(0, len(x_train_tensor)):
        test_output = rnn(x_train_tensor[i].transpose(0, 1))                   # (samples, time_step, input_size)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        #import pdb;pdb.set_trace()
        if pred_y == y_train_tensor[i].data.numpy().squeeze():
            n_correct1 += 1
    accuracy = n_correct1 / len(x_train_tensor)
    print("Accuracy on training set: ", accuracy)
                #accuracy = sklearn.metrics.accuracy_score(y_test, pred_y)
                #print("Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| test accuracy: %.2f' % accuracy)
    training_accuracy_list.append(accuracy)
    
    # getting accuracy of test set        
    n_correct2 = 0
    for i in range(0, len(x_test_tensor)):
        test_output = rnn(x_test_tensor[i].transpose(0, 1))                   # (samples, time_step, input_size)
        pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
        #import pdb;pdb.set_trace()
        if pred_y == y_test_tensor[i].data.numpy().squeeze():
            n_correct2 += 1
    accuracy = n_correct2 / len(x_test_tensor)
    print("Accuracy on test set: ", accuracy)
                #accuracy = sklearn.metrics.accuracy_score(y_test, pred_y)
                #print("Epoch: ", epoch, "| train loss: %.4f" % loss.item(), '| test accuracy: %.2f' % accuracy)
    testing_accuracy_list.append(accuracy)

print("Average accuracy on full training: ", mean(training_accuracy_list))
print("Average accuracy on test sets: ", mean(testing_accuracy_list))



Fold 1
Accuracy on training set:  0.46878893595813603
Accuracy on test set:  0.4679264047737444
Fold 2
Accuracy on training set:  0.4686721474838067
Accuracy on test set:  0.4683922349427576
Fold 3
Accuracy on training set:  0.4685885063196563
Accuracy on test set:  0.46872663842511836
Fold 4
Accuracy on training set:  0.46855933258622834
Accuracy on test set:  0.4688434695912263
Fold 5
Accuracy on training set:  0.4684718331777155
Accuracy on test set:  0.46919431279620855
Average accuracy on full training:  0.46861615110510857
Average accuracy on test sets:  0.46861661210581107
