## Loading files

### Setup

In [1]:
# !pip install pyconll torchtext livelossplot

In [2]:
# !curl https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-train.conllu -o data/perseus-conllu/grc_perseus-ud-train.conllu
# !curl https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-dev.conllu -o data/perseus-conllu/grc_perseus-ud-dev.conllu
# !curl https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-test.conllu -o data/perseus-conllu/grc_perseus-ud-test.conllu

### Parsing

In [1]:
import pyconll

In [2]:
def parse_into_list(body):
    data = []
    for sentence in body:
        sentence_words = []
        sentence_tags = []
        for token in sentence:
            sentence_words.append(token.form)
            sentence_tags.append(token.upos)

        if len(sentence_words) > 0:
            data.append((sentence_words, sentence_tags))
    
    return data

In [3]:
train_file = pyconll.load_from_file('data/perseus-conllu/grc_perseus-ud-train.conllu')
val_file = pyconll.load_from_file('data/perseus-conllu/grc_perseus-ud-dev.conllu')

train = parse_into_list(train_file)
val = parse_into_list(val_file)

len(train),len(val)

(11476, 1306)

In [4]:
word_to_ix = { '<UNK>': 0 }
for words, tags in train:
    for word in words:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
            
len(word_to_ix)

33238

In [5]:
tag_to_ix = {}
for sent, tags in train:
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

len(tag_to_ix)

14

In [6]:
# Preparing our vocabulary of characters
char_to_ix = {}
for word in word_to_ix.keys():
    for c in word:
        if c not in char_to_ix:
            char_to_ix[c] = len(char_to_ix)

print(char_to_ix)

{'<': 0, 'U': 1, 'N': 2, 'K': 3, '>': 4, 'ἐ': 5, 'ρ': 6, 'ᾷ': 7, 'μ': 8, 'ὲ': 9, 'ν': 10, 'ἁ': 11, 'γ': 12, 'ὸ': 13, 'ς': 14, 'ο': 15, 'ὐ': 16, 'α': 17, 'τ': 18, 'ῶ': 19, 'σ': 20, 'ι': 21, 'χ': 22, 'θ': 23, 'ό': 24, ',': 25, 'ἔ': 26, 'ω': 27, 'δ': 28, 'ῖ': 29, 'λ': 30, 'β': 31, 'ά': 32, 'ε': 33, 'υ': 34, '·': 35, 'ὄ': 36, '̓': 37, 'ἀ': 38, 'π': 39, 'ῦ': 40, 'ὼ': 41, 'κ': 42, 'ἡ': 43, 'ί': 44, 'ή': 45, 'ὰ': 46, 'ὶ': 47, 'Δ': 48, 'η': 49, 'ὥ': 50, 'ζ': 51, 'έ': 52, '.': 53, 'Ἱ': 54, 'ύ': 55, 'ῳ': 56, 'Ε': 57, 'Ἀ': 58, 'φ': 59, 'ὅ': 60, 'Π': 61, 'ἴ': 62, 'ὁ': 63, 'ὺ': 64, 'ἰ': 65, 'ᾶ': 66, 'ὴ': 67, 'ὀ': 68, 'ὔ': 69, 'Ἄ': 70, 'ἄ': 71, 'ἢ': 72, 'ῷ': 73, 'Ἔ': 74, 'ὢ': 75, 'ἶ': 76, 'ώ': 77, 'ὃ': 78, 'ὑ': 79, 'ὖ': 80, 'Κ': 81, 'ξ': 82, 'ᾠ': 83, 'Τ': 84, 'Ἑ': 85, 'ῆ': 86, 'ἠ': 87, 'ὒ': 88, 'ἒ': 89, 'ἂ': 90, 'ὕ': 91, 'ῃ': 92, 'ἱ': 93, 'ψ': 94, 'Β': 95, 'ἕ': 96, 'Μ': 97, 'Σ': 98, 'ᾀ': 99, 'ἃ': 100, 'Ῥ': 101, 'Ἴ': 102, 'ἅ': 103, 'Θ': 104, 'ϊ': 105, 'ἦ': 106, ';': 107, 'ὣ': 108, 'ᾳ': 109, 'ἥ': 110,

## Model setup

Based on https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils
import torch.autograd as autograd

from model.lstm_char import LSTMCharTagger

In [20]:
torch.manual_seed(1)

EMBEDDING_DIM = 100
HIDDEN_DIM = 1000

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [21]:
model = LSTMCharTagger(EMBEDDING_DIM, EMBEDDING_DIM, HIDDEN_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix), len(char_to_ix)).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [22]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] if w in to_ix else to_ix['<UNK>'] for w in seq]
    return autograd.Variable(torch.tensor(idxs, dtype=torch.long).to(device))

## Model training

In [23]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

%matplotlib inline

In [24]:
def prepare_words_tensor(word_to_ix, char_to_ix):
    """Convert words(keys) in the dictionary word_to_ix into
    tensors that contains character indexes
    
    Args:
        word_to_ix(dict): key value pairs with words as keys and their indexes as values
        char_to_ix(dict): key value pairs with characters as keys and their indexes as values
    
    Returns:
        dict: Contains keys as index of words and values the tensors of words
    """
    list_words_tensor = {}
    for word, idx in word_to_ix.items():
        list_words_tensor[idx] = prepare_sequence(word, char_to_ix)
    return list_words_tensor

In [25]:
# TODO: evaluate loss on separate validation dataset

train_loader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True, num_workers=0)
val_loader = torch.utils.data.DataLoader(val, batch_size=32, shuffle=True, num_workers=0)
train_losses = []

words_tensors = prepare_words_tensor(word_to_ix, char_to_ix)

for epoch in range(10):
    total_loss = 0

    model.train()
    for i, data in enumerate(train_loader):        
        model.zero_grad()
        
        for i in range(len(data[0])):
            sentence = data[0][i]
            tags = data[1][i]
            
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = prepare_sequence(tags, tag_to_ix)

            tag_scores = model(sentence_in, words_tensors)

            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss

    epoch_loss = total_loss / len(train)
    
    print('Epoch %d: %.4f' % (epoch, total_loss / len(train)))
    train_losses.append(total_loss / len(train))
    
    # Evaluate on validation dataset
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    for i, data in enumerate(val_loader):        
        model.zero_grad()
        
        for i in range(len(data[0])):
            sentence = data[0][i]
            tags = data[1][i]
            
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = prepare_sequence(tags, tag_to_ix)

            tag_scores = model(sentence_in, words_tensors)
            loss = loss_function(tag_scores, targets)

            val_loss += loss.item()
            _, predicted = tag_scores.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    
    print((100.*correct/total))

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

## Inference

In [86]:
ix_to_tag = {v: k for k, v in tag_to_ix.items()}
ix_to_tag

{0: 'VERB',
 1: 'ADV',
 2: 'ADJ',
 3: 'NOUN',
 4: 'PUNCT',
 5: 'CCONJ',
 6: 'ADP',
 7: 'DET',
 8: 'PRON',
 9: 'SCONJ',
 10: 'INTJ',
 11: 'NUM',
 12: 'X',
 13: 'PART'}

In [87]:
sentence = val[0][0] # first sentence in the validation dataset
targets = val[0][1]

with torch.no_grad():
    inputs = prepare_sequence(sentence, word_to_ix)
    token_scores = model(inputs)
    scores = [score.tolist() for score in token_scores]
    tag_ix = [score.index(max(score)) for score in scores]
    tags = [ix_to_tag[tag] if tag in ix_to_tag else '' for tag in tag_ix]

    for i, (word, tag) in enumerate(zip(sentence, tags)):
        print('%s = %s (should be %s)' % (word, tag, targets[i]))

τὰ = DET (should be DET)
γὰρ = PART (should be ADV)
πρὸ = ADP (should be ADP)
αὐτῶν = PRON (should be PRON)
καὶ = CCONJ (should be CCONJ)
τὰ = DET (should be DET)
ἔτι = ADV (should be ADV)
παλαίτερα = VERB (should be ADJ)
σαφῶς = ADV (should be ADV)
μὲν = PART (should be ADV)
εὑρεῖν = VERB (should be VERB)
διὰ = ADP (should be ADP)
χρόνου = NOUN (should be NOUN)
πλῆθος = NOUN (should be NOUN)
ἀδύνατα = ADJ (should be ADJ)
ἦν = VERB (should be VERB)
, = PUNCT (should be PUNCT)
ἐκ = ADP (should be ADP)
δὲ = ADV (should be CCONJ)
τεκμηρίων = VERB (should be NOUN)
ὧν = PRON (should be PRON)
ἐπὶ = ADP (should be ADP)
μακρότατον = PRON (should be ADJ)
σκοποῦντί = VERB (should be VERB)
μοι = PRON (should be PRON)
πιστεῦσαι = VERB (should be VERB)
ξυμβαίνει = VERB (should be VERB)
οὐ = ADV (should be ADV)
μεγάλα = ADJ (should be ADJ)
νομίζω = VERB (should be VERB)
γενέσθαι = VERB (should be VERB)
οὔτε = ADV (should be ADV)
κατὰ = ADP (should be ADP)
τοὺς = DET (should be DET)
πολέμους = NOUN (