In [11]:
%matplotlib inline

In [12]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import random
import copy
from torch.utils.data import Dataset, DataLoader

def findFiles(path):
    """Return the list of filesystem paths that match the glob pattern ``path``."""
    matches = glob.glob(path)
    return matches

import unicodedata
import string

# Alphabet used to encode names: lowercase then uppercase ASCII letters,
# followed by a handful of punctuation characters.
all_letters = string.ascii_lowercase + string.ascii_uppercase + " .,;'"
# Alphabet size, i.e. the width of every one-hot letter vector.
n_letters = len(all_letters)

def unicodeToAscii(s):
    """Convert ``s`` to the restricted alphabet in ``all_letters``.

    The string is NFD-normalized, which splits accented characters into a
    base character plus combining marks. Combining marks (Unicode category
    ``Mn``) and any character not contained in ``all_letters`` are dropped.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

# Containers filled while loading the data files:
#   category_lines maps a category (language) to the names read for it,
#   all_categories records the category names in the order they were loaded.
category_lines, all_categories = {}, []

# Read a file and split into lines
def readLines(filename):
    """Read a UTF-8 text file and return its lines converted by unicodeToAscii.

    Args:
        filename: path of the file to read.

    Returns:
        A list with one converted string per line of the file (leading and
        trailing whitespace of the whole file is stripped before splitting).
    """
    # Context manager closes the file handle deterministically instead of
    # leaving it to the garbage collector.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# Partition data
def partition(data, ratio=0.3):
    """Randomly split ``data`` into two lists.

    Args:
        data: sequence of items to split; it is not modified.
        ratio: fraction of the items that goes into the first returned list.

    Returns:
        A tuple ``(first, second)`` where ``first`` holds
        ``int(len(data) * ratio)`` randomly chosen items and ``second`` holds
        the remaining ones.
    """
    # random.sample with k=len(data) yields a shuffled copy of the input
    shuffled = random.sample(data, k=len(data))
    # Honor the caller's ratio (it used to be ignored in favor of a literal 0.3)
    split_idx = int(len(shuffled) * ratio)
    return shuffled[:split_idx], shuffled[split_idx:]

# Load every names file; the file's base name up to the first '.' is the category.
# glob returns paths in arbitrary order, so sort them to get a reproducible
# category order (and therefore reproducible class indices).
for filename in sorted(findFiles('data/names/*.txt')):
    filename_norm = os.path.normpath(filename)
    # os.path.basename uses the platform's path separator, whereas splitting
    # on '\\' only works on Windows.
    category = os.path.basename(filename_norm).split('.')[0]
    all_categories.append(category)
    lines = readLines(filename_norm)
    # Store the pair of lists returned by partition() for this category
    category_lines[category] = partition(lines)

n_categories = len(all_categories)


In [13]:
import torch

# Map a letter to its position in all_letters, e.g. "a" -> 0
def letterToIndex(letter):
    """Return the index of ``letter`` in ``all_letters`` (-1 if it is absent)."""
    position = all_letters.find(letter)
    return position

# Demonstration helper: encode a single letter as a <1 x n_letters> Tensor
def letterToTensor(letter):
    """Return a 1 x n_letters tensor of zeros with a 1 at the letter's index."""
    one_hot = torch.zeros(1, n_letters)
    column = letterToIndex(letter)
    one_hot[0, column] = 1
    return one_hot

# Encode a line as a <line_length x n_letters> Tensor,
# i.e. one one-hot row per letter
def lineToTensor(line):
    """Return a len(line) x n_letters tensor whose i-th row one-hot encodes line[i]."""
    n_rows = len(line)
    encoded = torch.zeros(n_rows, n_letters)
    for row, ch in enumerate(line):
        encoded[row, letterToIndex(ch)] = 1
    return encoded

# print(letterToTensor('J'))
# print(lineToTensor('Jones').shape)

In [14]:
# Lookup table from category name to its integer class index
category_index = dict(zip(all_categories, range(len(all_categories))))
    
class WordDataset(Dataset):
    """Dataset of (one-hot name tensor, category index tensor) pairs.

    Args:
        wordnamelist: mapping from category name to an indexable collection of
            splits, each split being a list of names (e.g. ``category_lines``).
        train_test_idx: index of the split to load from every category.

    Each item is a tuple ``(word, cat)`` where ``word`` is the result of
    ``lineToTensor`` for the name and ``cat`` is a 1-element tensor holding
    the category's index from ``category_index``.
    """

    def __init__(self, wordnamelist, train_test_idx):
        self.word_cat_list = []

        # Read from the mapping that was passed in; previously the argument was
        # ignored and the module-level category_lines was used instead.
        for cat, splits in wordnamelist.items():
            label = category_index[cat]
            for word in splits[train_test_idx]:
                name = lineToTensor(word)
                category = torch.tensor([label])
                self.word_cat_list.append((name, category))

        # Mix the categories so items are not grouped by language
        random.shuffle(self.word_cat_list)

    def __getitem__(self, index):
        """Return the ``(word, cat)`` tensor pair stored at ``index``."""
        word, cat = self.word_cat_list[index]
        return word, cat

    def __len__(self):
        """Return the number of (name, category) pairs in the dataset."""
        return len(self.word_cat_list)


In [15]:
def collate(batch):
    """Merge a list of (sequence tensor, label tensor) pairs into one batch.

    The pairs are ordered by decreasing sequence length (first dimension of
    the sequence tensor), the sequences are packed into a PackedSequence and
    the labels are concatenated in that same order.

    Returns:
        (packed_sequences, labels)
    """
    # pack_sequence expects the longest sequence first
    ordered = list(batch)
    ordered.sort(key=lambda pair: pair[0].shape[0], reverse=True)

    # transpose [(seq, label), ...] into (seq, ...), (label, ...)
    sequences, labels = zip(*ordered)

    packed = torch.nn.utils.rnn.pack_sequence(sequences)
    stacked_labels = torch.cat(labels)
    return packed, stacked_labels


In [16]:
import torch.nn as nn

class LSTMmodel(nn.Module):
    """LSTM classifier: an LSTM followed by a linear layer on the last hidden state.

    Args:
        input_size: number of features per time step.
        hidden_size: number of hidden units in each LSTM layer.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output scores produced per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.lstm = torch.nn.LSTM(input_size=input_size,
                                  hidden_size=hidden_size,
                                  num_layers=num_layers)
        self.fc = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class scores of shape (batch, num_classes) for a PackedSequence ``x``."""
        # Leaving out (h_0, c_0) makes nn.LSTM start from zero states that it
        # allocates to match the input, so the model keeps working after
        # .to(device) / .double(); explicitly built CPU float32 zeros did not.
        _, (h_n, _) = self.lstm(x)
        # h_n[-1] is the final hidden state of the top LSTM layer
        return self.fc(h_n[-1])


In [17]:
def train(model, train_dataloader, test_dataloader, train_data_len, num_epochs=5, lr=0.1):
    """Train ``model`` with SGD and keep the weights with the best test accuracy.

    After every epoch the model is evaluated with ``test`` on
    ``test_dataloader``; the state dict with the highest accuracy seen is
    loaded back into ``model`` before returning.

    Args:
        model: module mapping a batch from the dataloader to class scores.
        train_dataloader: iterable of (input, label) training batches.
        test_dataloader: iterable of (input, label) batches passed to ``test``.
        train_data_len: number of training samples. Kept for backward
            compatibility; the batch count is now taken from the dataloader.
        num_epochs: number of passes over the training data.
        lr: learning rate of the SGD optimizer.
    """
    # reduction='sum' is the supported spelling of the deprecated size_average=False
    cross_entropy = nn.CrossEntropyLoss(reduction='sum')
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    best_model_weights = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    # Integer number of batches per epoch, correct even when the last batch is
    # partial (true division printed a float such as "14000.0").
    total_iter = len(train_dataloader)

    for epoch in range(1, num_epochs + 1):
        running_loss = 0
        totals = 0
        iteration = 0
        model.train()

        for packedseq, label in train_dataloader:
            iteration += 1
            optimizer.zero_grad()
            output = model(packedseq)

            loss = cross_entropy(output, label)
            loss.backward()
            optimizer.step()

            # the loss is summed over the batch, so normalize by sample count below
            running_loss += loss.item()
            totals += label.shape[0]

            print('iter: {}/{}, running loss = {:.4f}'.format(iteration, total_iter, running_loss), end='\r')

        training_loss = running_loss / totals
        test_loss, test_acc = test(model, test_dataloader)

        # snapshot the weights whenever the test accuracy improves
        if test_acc > best_acc:
            best_acc = test_acc
            best_model_weights = copy.deepcopy(model.state_dict())

        print('Epoch: {} \tTraining Loss: {:.4f} \tTest Loss: {:.4f} \tTest Accuracy: {:.4f}'.format(
               epoch, training_loss, test_loss, test_acc))

    # '{:.4f}' (4 decimals) matches the other reports; '{:4f}' only set a minimum width
    print('Best test acc: {:.4f}'.format(best_acc))
    model.load_state_dict(best_model_weights)

In [18]:
def test(model, dataloader):
    cross_entropy = nn.CrossEntropyLoss(size_average=False)
    
    model.eval()
    total_loss = 0
    total_corrects = 0
    count = 0
        
    with torch.no_grad():
        for packedseq, label in dataloader:
            output = model(packedseq)
            
            loss = cross_entropy(output, label)
            total_loss += loss.item()
            count += label.shape[0]
            correct = output.argmax(dim=1) == label
            total_corrects += correct.item()
            
    loss = total_loss/count
    accuracy = total_corrects/count
    
    return loss, accuracy

Task 1

In [19]:
# category_lines holds the pair returned by partition() for every category.
# Index 0 is the first slice (30% of the names) and is used as the test set;
# index 1 is the remainder and is used as the training set.
# NOTE(review): each constructor calls random.shuffle, so the order of these
# two statements affects the resulting item order.
train_data = WordDataset(category_lines, 1)
test_data = WordDataset(category_lines, 0)

# One name per batch; collate() packs every batch into a PackedSequence.
train_dataloader = DataLoader(train_data, batch_size=1, shuffle=True, collate_fn=collate)
test_dataloader = DataLoader(test_data, batch_size=1, shuffle=True, collate_fn=collate)

In [20]:
# Train and evaluate one model for every (number of layers, hidden size) combination.
for num_layers in (1, 2):
    for num_hidden in (10, 50, 250):
        model = LSTMmodel(input_size=n_letters, hidden_size=num_hidden, num_layers=num_layers, num_classes=n_categories)
        train(model, train_dataloader=train_dataloader, test_dataloader=test_dataloader, train_data_len=len(train_data))
        # train() reloads the best weights it saw, so this reports that snapshot
        test_loss, test_acc = test(model, test_dataloader)

        # num_hidden is the LSTM hidden size (units per layer), not a layer count,
        # so the label says "hidden units".
        print('No. of layers: {} \tNo. of hidden units: {} \tTest Loss: {:.4f} \tTest Accuracy: {:.4f}'.format(
               num_layers, num_hidden, test_loss, test_acc))

Epoch: 1 	Training Loss: 1.2722 	Test Loss: 1.0274 	Test Accuracy: 0.7002
Epoch: 2 	Training Loss: 0.9885 	Test Loss: 0.9178 	Test Accuracy: 0.7405
Epoch: 3 	Training Loss: 0.9075 	Test Loss: 0.8621 	Test Accuracy: 0.7431
Epoch: 4 	Training Loss: 0.8663 	Test Loss: 0.8542 	Test Accuracy: 0.7476
Epoch: 5 	Training Loss: 0.8351 	Test Loss: 0.8120 	Test Accuracy: 0.7573
Best val acc: 0.757273
No. of layers: 1 	No. of hidden layers: 10 	Test Loss: 0.8120 	Test Accuracy: 0.7573
Epoch: 1 	Training Loss: 1.2325 	Test Loss: 0.9278 	Test Accuracy: 0.7210
Epoch: 2 	Training Loss: 0.9067 	Test Loss: 0.8977 	Test Accuracy: 0.7124
Epoch: 3 	Training Loss: 0.7799 	Test Loss: 0.7338 	Test Accuracy: 0.7782
Epoch: 4 	Training Loss: 0.6936 	Test Loss: 0.6913 	Test Accuracy: 0.7914
Epoch: 5 	Training Loss: 0.6389 	Test Loss: 0.6696 	Test Accuracy: 0.7967
Best val acc: 0.796675
No. of layers: 1 	No. of hidden layers: 50 	Test Loss: 0.6696 	Test Accuracy: 0.7967
Epoch: 1 	Training Loss: 1.2668 	Test Loss: 