In [1]:
# -*- coding: utf-8 -*-
"""
   Introduction to Deep Learning (LDA-T3114)
   Skeleton code for Assignment 3: Language Identification for Uralic Languages

   Hande Celikkanat & Miikka Silfverberg
"""

'\n   Introduction to Deep Learning (LDA-T3114)\n   Skeleton code for Assignment 3: Language Identification for Uralic Languages\n\n   Hande Celikkanat & Miikka Silfverberg\n'

In [144]:
from random import choice, random, shuffle

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [145]:
torch.set_num_threads(10)

In [146]:
from data import read_datasets, WORD_BOUNDARY, UNK, HISTORY_SIZE

from paths import data_dir

In [550]:
#--- hyperparameters ---
N_EPOCHS = 1500
LEARNING_RATE = 0.1
REPORT_EVERY = 100
VERBOSE = False
EMBEDDING_DIM=30
HIDDEN_DIM=50
BATCH_SIZE=100 # raise from 1 to 10
N_LAYERS=2

In [546]:
#--- model ---
class LanguageModel(nn.Module):
    def __init__(self, 
                 embedding_dim, 
                 history_size, 
                 hidden_dim, 
                 n_layers,
                 character_set_size):
        super(LanguageModel, self).__init__()
        self.history_size = history_size
        self.embedding_dim = embedding_dim
        self.embed = nn.Embedding(character_set_size,embedding_dim)
        # WRITE CODE HERE
        self.linear1 = nn.Linear(history_size*embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim,character_set_size)

    def forward(self, inputs):
        # WRITE CODE HERE
        input_x = inputs.shape[0]
        embeds = self.embed(inputs).view(input_x,120)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        out = F.log_softmax(out, dim = 1)
        return out
        

In [553]:
#--- auxilary functions ---
def get_ll(word_ex,model,history_size,character_map):
    with torch.no_grad():        
        char_tuples = word_ex['TUPLES']
        # WRITE CODE HERE 
        
        total = 0
        
        #------Case 2----
        inputs = torch.cat([i for i,j in char_tuples],dim=0)
        classes = torch.cat([j for i,j in char_tuples],dim=0)
        
        inputs = inputs.reshape(int(inputs.shape[0]/history_size),history_size)
        
        result = model(inputs)
        x = 0
        for i in result:
            #get an index of the goal
            goal_index = classes[x]
            
            #add a predicted goal_value to total
            total += i[goal_index]
            x+=1
        
        return total
    
        #for a single word
         #------Case 1----
        """
        for tup in char_tuples:
            result = model(tup[0])
            gold = tup[1].item()
            
            value = result[0][gold]
           
            total+=value
            
        return total
        """
    
def guess_language(word_ex,models,history_size,character_map):
    lls = [(lan,get_ll(word_ex,models[lan],history_size,character_map)) 
           for lan in models]
    #choose the language that has the highest value
    return max(lls,key=lambda x: x[1])[0]

def evaluate(dataset,models,HISTORY_SIZE,character_map):
    corr = 0
    for word_ex in dataset:
        sys_lan = guess_language(word_ex,
                                 models,
                                 HISTORY_SIZE,
                                 character_map)
        
        #if the guess is right then correct +1
        if sys_lan == word_ex['LANGUAGE']:
            corr += 1
    return corr * 100.0 / len(dataset)

In [548]:
#--- initialization ---
data, character_map, languages = read_datasets('uralic',data_dir)
# We initialize one language model for each language.
models = {lan:LanguageModel(EMBEDDING_DIM,
                            HISTORY_SIZE,
                            HIDDEN_DIM,
                            N_LAYERS,
                            len(character_map)) for lan in languages}
# Each language model requires its own optimizer.
optimizers = {lan:optim.SGD(models[lan].parameters(), LEARNING_RATE) 
              for lan in languages}
# We can use the same loss function for training all language models.
loss_function = nn.NLLLoss() 

In [551]:
#--- training ---
for epoch in range(N_EPOCHS):
    total_loss = 0
    for lan in data['training']:  
        trainset = data['training'][lan]
        optimizer = optimizers[lan]
        model = models[lan]
        # Generally speaking, it's a good idea to shuffle your
        # training sets once every epoch. This serves as an
        # approximation of drawing training examples at random
        # from the training set.
        shuffle(trainset)

        for i in range(0,int(len(trainset)/BATCH_SIZE),BATCH_SIZE):
            minibatchwords = trainset[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
            minibatch = [choice(word_ex['TUPLES']) 
                         for word_ex in minibatchwords]
            
            # WRITE CODE HERE
           
            #------CASE1-------, minibatch size = 1
            
            #predicting_letters = torch.tensor(minibatch[0][0],dtype=torch.long)
            #following_letter = torch.tensor(minibatch[0][1],dtype=torch.long)
            
            
            #-----CASE 2--------, minibatch size = 10
            
            #create a tensor size 10x4 and goal tensor 10x1
            inputvectors = [i for i,j in minibatch]
            classes = [j for i,j in minibatch]
            mb_x = torch.cat(inputvectors,dim=0).view(BATCH_SIZE,4)
            following_letter = torch.cat(classes,dim=0)
            
            #sanity check:
            #print(mb_x)
            #print(following_letter.shape)
            
            
            optimizer.zero_grad()
            
            log_probs = model(mb_x) #predict
            
            loss = loss_function(log_probs, following_letter) #count loss
            
            loss.backward()
            optimizer.step()

            total_loss += loss
    
    if ((epoch+1) % REPORT_EVERY) == 0:
        
        acc = evaluate(data['dev'],models,HISTORY_SIZE,character_map)
        print('epoch: %d, loss: %.4f, dev acc: %.2f%%' % 
              (epoch+1, total_loss, acc))
        


epoch: 100, loss: 9.6846, dev acc: 76.75%
epoch: 200, loss: 10.0040, dev acc: 75.83%
epoch: 300, loss: 9.8815, dev acc: 76.42%
epoch: 400, loss: 9.1840, dev acc: 76.92%
epoch: 500, loss: 10.0524, dev acc: 76.75%
epoch: 600, loss: 9.7608, dev acc: 76.75%
epoch: 700, loss: 9.3985, dev acc: 76.42%
epoch: 800, loss: 9.4772, dev acc: 76.58%
epoch: 900, loss: 9.5494, dev acc: 77.00%
epoch: 1000, loss: 8.6998, dev acc: 75.17%
epoch: 1100, loss: 9.0632, dev acc: 74.67%
epoch: 1200, loss: 9.7724, dev acc: 75.42%
epoch: 1300, loss: 8.5290, dev acc: 76.17%
epoch: 1400, loss: 8.8916, dev acc: 74.50%
epoch: 1500, loss: 8.2444, dev acc: 76.00%


In [552]:
acc = evaluate(data['test'],models,HISTORY_SIZE,character_map)        
print('test acc: %.2f%%' % (acc))

test acc: 75.08%


### Accuracy up to 75-76% with batch size 100, n_epochs = 1500, report_every = 100