In [575]:
# -*- coding: utf-8 -*-
"""
   Introduction to Deep Learning (LDA-T3114)
   Skeleton code for Assignment 5: Language Identification using Recurrent Architectures

   Hande Celikkanat & Miikka Silfverberg
"""

'\n   Introduction to Deep Learning (LDA-T3114)\n   Skeleton code for Assignment 5: Language Identification using Recurrent Architectures\n\n   Hande Celikkanat & Miikka Silfverberg\n'

In [576]:
%matplotlib inline

In [577]:
from random import choice, random, shuffle
import sys

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import nltk

from data import read_datasets, WORD_BOUNDARY, UNK, HISTORY_SIZE
from paths import data_dir

# Ask PyTorch to use 10 threads for its CPU work.
# NOTE(review): 10 is hard-coded; confirm it suits the machine this runs on.
torch.set_num_threads(10)

In [648]:
#--- hyperparameters ---
N_EPOCHS = 20          # number of passes over the training set
LEARNING_RATE = 0.01   # learning rate handed to the optimizer
REPORT_EVERY = 10      # compute train/dev accuracy every this many epochs
EMBEDDING_DIM = 30     # size of each character embedding
HIDDEN_DIM = 20        # size of the recurrent hidden state
BATCH_SIZE = 10        # words per minibatch; 1 selects the 'uralic.mini' dataset
N_LAYERS = 1           # number of stacked recurrent layers

# this is an additional parameter for the jupyter notebook skeleton code only
# it covers for the command-line argument in the .py code
MODEL_CHOICE = 'lstm'  # one of 'lstm', 'gru', 'rnn'

# Seed every random number generator the notebook relies on so that
# "Restart Kernel -> Run All" gives repeatable weight initialisation and
# repeatable shuffling of the training set.
SEED = 0
import random as py_random  # aliased: the bare name `random` is already the function from `from random import random`
py_random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [649]:
#--- models ---
class LSTMModel(nn.Module):
    """Character-level classifier: embedding -> LSTM -> linear -> log-softmax.

    Args:
        embedding_dim: Size of each character embedding vector.
        character_set_size: Number of entries in the embedding table.
        n_layers: Number of stacked LSTM layers.
        hidden_dim: Size of the LSTM hidden state.
        n_classes: Number of output classes.
    """

    def __init__(self,
                 embedding_dim,
                 character_set_size,
                 n_layers,
                 hidden_dim,
                 n_classes):
        super(LSTMModel, self).__init__()
        self.embedding_dim = embedding_dim
        self.character_set_size = character_set_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.n_classes = n_classes

        self.embeds = nn.Embedding(character_set_size, embedding_dim)
        # Pass n_layers through; it used to be stored but ignored, so the
        # network always had a single layer regardless of the setting.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers)
        self.linear = nn.Linear(hidden_dim, n_classes)

    def forward(self, inputs):
        """Return log-probabilities over the classes for each sequence.

        Args:
            inputs: Tensor of character indices laid out as
                [sequence_len, batch_size].

        Returns:
            Tensor of shape [batch_size, n_classes] holding log-probabilities.
        """
        embeds = self.embeds(inputs)
        # No explicit initial state: the LSTM starts from its default state.
        lstm_out, _ = self.lstm(embeds)

        # Classify from the output at the final time step.
        # NOTE(review): for padded batches the final step of a short word is a
        # padding position - confirm this is acceptable or pack the sequences.
        output = self.linear(lstm_out[-1])
        # Normalise over the class axis (the last one); the explicit dim
        # replaces the deprecated implicit-dimension call.
        return F.log_softmax(output, dim=-1)

In [650]:
class GRUModel(nn.Module):
    """Character-level classifier: embedding -> GRU -> linear -> log-softmax.

    Args:
        embedding_dim: Size of each character embedding vector.
        character_set_size: Number of entries in the embedding table.
        n_layers: Number of stacked GRU layers.
        hidden_dim: Size of the GRU hidden state.
        n_classes: Number of output classes.
    """

    def __init__(self,
                 embedding_dim,
                 character_set_size,
                 n_layers,
                 hidden_dim,
                 n_classes):
        super(GRUModel, self).__init__()
        self.embedding_dim = embedding_dim
        self.character_set_size = character_set_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.n_classes = n_classes

        self.embeds = nn.Embedding(character_set_size, embedding_dim)
        # Pass n_layers through; it used to be stored but ignored, so the
        # network always had a single layer regardless of the setting.
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers)
        self.linear = nn.Linear(hidden_dim, n_classes)

    def forward(self, inputs):
        """Return log-probabilities over the classes for each sequence.

        Args:
            inputs: Tensor of character indices.
                Assumed layout [sequence_len, batch_size] - TODO confirm.

        Returns:
            Tensor of shape [batch_size, n_classes] holding log-probabilities.
        """
        embeds = self.embeds(inputs)
        # No explicit initial state: the GRU starts from its default state.
        gru_out, _ = self.gru(embeds)

        # Classify from the output at the final time step.
        # NOTE(review): for padded batches the final step of a short word is a
        # padding position - confirm this is acceptable or pack the sequences.
        output = self.linear(gru_out[-1])
        # Normalise over the class axis (the last one).
        return F.log_softmax(output, dim=-1)

In [651]:
class RNNModel(nn.Module):
    """Character-level classifier: embedding -> RNN -> linear -> log-softmax.

    Args:
        embedding_dim: Size of each character embedding vector.
        character_set_size: Number of entries in the embedding table.
        n_layers: Number of stacked RNN layers.
        hidden_dim: Size of the RNN hidden state.
        n_classes: Number of output classes.
    """

    def __init__(self,
                 embedding_dim,
                 character_set_size,
                 n_layers,
                 hidden_dim,
                 n_classes):
        super(RNNModel, self).__init__()
        self.embedding_dim = embedding_dim
        self.character_set_size = character_set_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.n_classes = n_classes

        self.embeds = nn.Embedding(character_set_size, embedding_dim)
        # Pass n_layers through; it used to be stored but ignored, so the
        # network always had a single layer regardless of the setting.
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=n_layers)
        self.linear = nn.Linear(hidden_dim, n_classes)

    def forward(self, inputs):
        """Return log-probabilities over the classes for each sequence.

        Args:
            inputs: Tensor of character indices.
                Assumed layout [sequence_len, batch_size] - TODO confirm.

        Returns:
            Tensor of shape [batch_size, n_classes] holding log-probabilities.
        """
        embeds = self.embeds(inputs)
        # No explicit initial state: the RNN starts from its default state.
        rnn_out, _ = self.rnn(embeds)

        # Classify from the output at the final time step.
        # NOTE(review): for padded batches the final step of a short word is a
        # padding position - confirm this is acceptable or pack the sequences.
        output = self.linear(rnn_out[-1])
        # Normalise over the class axis (the last one); the explicit dim
        # replaces the deprecated implicit-dimension call.
        return F.log_softmax(output, dim=-1)

In [615]:
# Practice padding (scratch): the snippet below right-pads three example lists
# to a common length of 5 with the filler value 75 via np.pad. It sits inside a
# string literal, so none of it is executed.
'''
max_length = 5
ordered = [[1,2,3],[1,2,3,4,5],[1,2]]
padded = [
    np.pad(li, pad_width=(0, max_length-len(li)), mode='constant', constant_values=75)
    for li in ordered
]
padded
'''

"\nmax_length = 5\nordered = [[1,2,3],[1,2,3,4,5],[1,2]]\npadded = [\n    np.pad(li, pad_width=(0, max_length-len(li)), mode='constant', constant_values=75)\n    for li in ordered\n]\npadded\n"

1

In [652]:
# --- auxiliary functions ---
def get_max_min_length(batch):
    """Return (longest, shortest) length of the "WORD" entries in batch.

    The sentinels -9999 and 9999 act as the starting maximum and minimum, so
    an empty batch yields (-9999, 9999) and the reported minimum never
    exceeds 9999.
    """
    lengths = [len(example["WORD"]) for example in batch]
    longest = max(lengths + [-9999])
    shortest = min(lengths + [9999])
    return longest, shortest
    
def make_words_same_length(batch, max_len, pad_value=75):
    """Right-pad each word's index sequence and store it under "BALANCED".

    Args:
        batch: Iterable of dicts, each with a "WORD" entry (its length decides
            how much padding is added) and a "TENSOR" entry (the sequence that
            gets padded).
        max_len: Target length; each sequence receives
            max_len - len(word["WORD"]) trailing pad values.
        pad_value: Value appended as padding. Defaults to 75, the value that
            used to be hard-coded here.
            NOTE(review): confirm the pad value is a valid embedding index
            that does not collide with a real character.

    Returns:
        The same batch object. Every dict is modified in place: its "BALANCED"
        entry is set to a tensor holding the padded sequence.
    """
    for word in batch:
        n_missing = max_len - len(word["WORD"])
        padded = np.pad(word["TENSOR"],
                        pad_width=(0, n_missing),
                        mode='constant',
                        constant_values=pad_value)
        word["BALANCED"] = torch.tensor(padded)
    return batch

def get_minibatch(minibatchwords, character_map, languages):
    """Convert a list of word examples into model inputs and target labels.

    Args:
        minibatchwords: Non-empty list of dicts with "WORD", "TENSOR" and
            "LANGUAGE" entries. Each dict gains a "BALANCED" entry as a side
            effect of the padding step.
        character_map: Not used here; kept so the signature stays unchanged.
        languages: Collection of language labels. A label's index is its
            position in the sorted collection.

    Returns:
        (mb_x, mb_y): mb_x stacks the padded sequences and transposes them,
        giving one column per word; mb_y is a long tensor with one label
        index per word.

    Raises:
        ValueError: If a word's "LANGUAGE" is not in languages.
    """
    # Sort the label inventory once per batch instead of once per word, and
    # build the label tensor from plain ints rather than 1-element tensors.
    languages_ordered = sorted(languages)
    mb_y = torch.tensor(
        [languages_ordered.index(word["LANGUAGE"]) for word in minibatchwords],
        dtype=torch.long)

    # Pad every word up to the longest one so they can be stacked.
    max_len, _ = get_max_min_length(minibatchwords)
    balanced_batch = make_words_same_length(minibatchwords, max_len)

    # stack -> one row per word; .t() -> one column per word.
    mb_x = torch.stack([word["BALANCED"] for word in balanced_batch]).t()

    return mb_x, mb_y

def label_to_idx(lan, languages):
    """Return the index of lan within the sorted languages as a 1-element LongTensor.

    Raises ValueError when lan is not among languages.
    """
    position = sorted(languages).index(lan)
    return torch.LongTensor([position])


def get_word_length(word_ex):
    """Return the length of the example's 'WORD' entry (usable as a sort key)."""
    word = word_ex['WORD']
    return len(word)

def evaluate(dataset,model,eval_batch_size,character_map,languages):
    """Return the percentage of examples in dataset that model labels correctly.

    Args:
        dataset: Sliceable sequence of word examples, as accepted by
            get_minibatch.
        model: Callable mapping a minibatch input tensor to per-class scores,
            one row per example.
        eval_batch_size: Number of examples scored per forward pass.
        character_map: Forwarded to get_minibatch.
        languages: Forwarded to get_minibatch.

    Returns:
        Accuracy in percent. For a non-empty dataset this is a 0-dim tensor,
        because the correct-count is accumulated as a tensor.

    Raises:
        ZeroDivisionError: If dataset is empty.
    """
    correct = 0

    # Pure inference: disable autograd so no graph is built or kept alive
    # for the forward passes.
    with torch.no_grad():
        for i in range(0,len(dataset),eval_batch_size):
            minibatchwords = dataset[i:i+eval_batch_size]
            mb_x, mb_y = get_minibatch(minibatchwords, character_map, languages)

            outputs = model(mb_x)
            # Index of the highest score along dim 1 is the predicted class.
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == mb_y).sum()
    return correct * 100.0 / len(dataset)

In [653]:
#--- initialization ---

# A batch size of 1 loads the 'uralic.mini' dataset; any other batch size
# loads 'uralic'.
data, character_map, languages = read_datasets(
    'uralic.mini' if BATCH_SIZE == 1 else 'uralic',
    data_dir,
)

# Flatten the per-language training lists into one list of examples.
trainset = [
    example
    for language in languages
    for example in data['training'][language]
]
n_languages = len(languages)
character_set_size = len(character_map)



In [654]:
# Reset first so that a failed selection cannot leave a model from an earlier
# run of this cell lying around in the kernel.
model = None

# All three architectures share one constructor signature, so pick the class
# from a table instead of repeating the call for each option.
MODEL_CLASSES = {
    'lstm': LSTMModel,
    'gru': GRUModel,
    'rnn': RNNModel,
}

# Fail here with a clear message rather than later with an AttributeError on
# None when the optimizer asks for model.parameters().
if MODEL_CHOICE not in MODEL_CLASSES:
    raise ValueError(
        'Unknown MODEL_CHOICE %r; expected one of %s'
        % (MODEL_CHOICE, sorted(MODEL_CLASSES)))

model = MODEL_CLASSES[MODEL_CHOICE](embedding_dim=EMBEDDING_DIM,
                                    character_set_size=character_set_size,
                                    n_layers=N_LAYERS,
                                    hidden_dim=HIDDEN_DIM,
                                    n_classes=n_languages)

In [655]:
# Adam over all model parameters, stepping at LEARNING_RATE.
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
# The models emit log-probabilities (log_softmax), which is the input NLLLoss expects.
loss_function = nn.NLLLoss()

In [656]:
# --- training loop ---
for epoch in range(N_EPOCHS):
    # Plain Python float: summing the loss tensors themselves would make the
    # running total a tensor that carries autograd history from every batch.
    total_loss = 0.0

    # Generally speaking, it's a good idea to shuffle your
    # datasets once every epoch.
    shuffle(trainset)

    # Sort by word length so that similar-length words share a minibatch and
    # need little padding. sorted() is stable, so the shuffle above still
    # randomises the order among words of equal length.
    # NOTE(review): batches are always visited shortest-to-longest; consider
    # shuffling the order of the batches as well.
    trainset = sorted(trainset, key=get_word_length)

    for i in range(0,len(trainset),BATCH_SIZE):
        minibatchwords = trainset[i:i+BATCH_SIZE]
        mb_x, mb_y = get_minibatch(minibatchwords, character_map, languages)

        optimizer.zero_grad()
        outputs = model(mb_x)
        loss = loss_function(outputs,mb_y)
        # .item() extracts the scalar value so only a number is accumulated.
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    print('epoch: %d, loss: %.4f' % ((epoch+1), total_loss))
    if ((epoch+1) % REPORT_EVERY) == 0:
        train_acc = evaluate(trainset,model,BATCH_SIZE,character_map,languages)
        dev_acc = evaluate(data['dev'],model,BATCH_SIZE,character_map,languages)
        print('epoch: %d, loss: %.4f, train acc: %.2f%%, dev acc: %.2f%%' % 
              (epoch+1, total_loss, train_acc, dev_acc))




epoch: 1, loss: 426.3727
epoch: 2, loss: 349.3762
epoch: 3, loss: 326.9194
epoch: 4, loss: 306.2962
epoch: 5, loss: 299.2069
epoch: 6, loss: 295.4335
epoch: 7, loss: 293.2036
epoch: 8, loss: 278.4649
epoch: 9, loss: 275.5501
epoch: 10, loss: 264.3175
epoch: 10, loss: 264.3175, train acc: 81.37%, dev acc: 72.17%
epoch: 11, loss: 265.7043
epoch: 12, loss: 267.8220
epoch: 13, loss: 264.4397
epoch: 14, loss: 262.4412
epoch: 15, loss: 268.5876
epoch: 16, loss: 262.6564
epoch: 17, loss: 260.0608
epoch: 18, loss: 256.4471
epoch: 19, loss: 244.5830
epoch: 20, loss: 258.0428
epoch: 20, loss: 258.0428, train acc: 82.82%, dev acc: 68.33%


In [657]:
# --- test ---    
# Score the trained model on the held-out test split and report the accuracy.
test_acc = evaluate(
    dataset=data['test'],
    model=model,
    eval_batch_size=BATCH_SIZE,
    character_map=character_map,
    languages=languages,
)
print('test acc: %.2f%%' % test_acc)



test acc: 70.25%
