## Loading files

### Setup

In [1]:
# !pip install pyconll torchtext livelossplot

In [2]:
# !curl https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-train.conllu -o data/perseus-conllu/grc_perseus-ud-train.conllu
# !curl https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-dev.conllu -o data/perseus-conllu/grc_perseus-ud-dev.conllu
# !curl https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-test.conllu -o data/perseus-conllu/grc_perseus-ud-test.conllu

### Parsing

In [1]:
import pyconll

In [2]:
def parse_into_list(body):
    data = []
    for sentence in body:
        sentence_words = []
        sentence_tags = []
        for token in sentence:
            sentence_words.append(token.form)
            sentence_tags.append(token.upos)

        if len(sentence_words) > 0:
            data.append((sentence_words, sentence_tags))
    
    return data

In [3]:
train_file = pyconll.load_from_file('data/perseus-conllu/grc_perseus-ud-train.conllu')
val_file = pyconll.load_from_file('data/perseus-conllu/grc_perseus-ud-dev.conllu')

train = parse_into_list(train_file)
val = parse_into_list(val_file)

len(train),len(val)

(11476, 1137)

In [18]:
word_to_ix = { '<UNK>': 0 }
char_to_ix = { '<UNK>': 0 }
for words, tags in train:
    for word in words:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
            
        for char in word:
            if char not in char_to_ix:
                char_to_ix[char] = len(char_to_ix)

            
len(word_to_ix),len(char_to_ix)

(33238, 186)

In [5]:
tag_to_ix = {}
for sent, tags in train:
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

len(tag_to_ix)

14

## Model setup

Based on https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils
import torch.autograd as autograd

from model.lstm_char import LSTMCharTagger

In [7]:
torch.manual_seed(1)

WORD_EMBEDDING_DIM = 5
CHAR_EMBEDDING_DIM = 6
CHAR_REPR_DIM = 3
HIDDEN_DIM = 6

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [19]:
def make_ixs(seq, to_ix):
    ixs = torch.tensor([to_ix[w] if w in to_ix else to_ix['<UNK>'] for w in seq])
    return ixs

In [20]:
model = LSTMCharTagger(WORD_EMBEDDING_DIM,
                   CHAR_EMBEDDING_DIM, CHAR_REPR_DIM,
                   HIDDEN_DIM,
                   len(word_to_ix), len(char_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Model training

In [21]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from tqdm import tqdm

%matplotlib inline

In [None]:
# TODO: evaluate loss on separate validation dataset

bs = 32
train_loader = torch.utils.data.DataLoader(train, batch_size=bs, shuffle=True, num_workers=0)
val_loader = torch.utils.data.DataLoader(val, batch_size=bs, shuffle=True, num_workers=0)
train_losses = []

for epoch in range(10):
    total_loss = 0

    model.train()
    for i, data in tqdm(enumerate(train_loader), total=len(train) / bs):                
        for i in range(len(data[0])):
            sentence = data[0][i]
            tags = data[1][i]
            
            word_characters_ixs = {}
            for word in sentence:
                word_ix = torch.tensor([word_to_ix[word]]) if word in word_to_ix else torch.tensor([word_to_ix['<UNK>']])
                char_ixs = make_ixs(word, char_to_ix)
                word_characters_ixs[word_ix] = char_ixs

            targets = make_ixs(tags, tag_to_ix)

            model.zero_grad()

            model.init_word_hidden()
            tag_scores = model(sentence, word_characters_ixs)

            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss

    epoch_loss = total_loss / len(train)
    
    print('Epoch %d: %.4f' % (epoch, total_loss / len(train)))
    train_losses.append(total_loss / len(train))
    
    # Evaluate on validation dataset
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    for i, data in tqdm(enumerate(val_loader), total=len(val) / bs):        
        model.zero_grad()
        
        for i in range(len(data[0])):
            sentence = data[0][i]
            tags = data[1][i]
                        
            word_characters_ixs = {}
            for word in sentence:
                word_ix = torch.tensor([word_to_ix[word]]) if word in word_to_ix else torch.tensor([word_to_ix['<UNK>']])
                char_ixs = make_ixs(word, char_to_ix)
                word_characters_ixs[word_ix] = char_ixs
            
            targets = make_ixs(tags, tag_to_ix)

            model.zero_grad()

            model.init_word_hidden()
            tag_scores = model(sentence, word_characters_ixs)
            loss = loss_function(tag_scores, targets)

            val_loss += loss.item()
            _, predicted = tag_scores.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    
    print((100.*correct/total))

100%|██████████| 359/358.625 [01:24<00:00,  4.25it/s]
  0%|          | 0/35.53125 [00:00<?, ?it/s]

Epoch 0: 0.2544


101%|██████████| 36/35.53125 [00:05<00:00,  6.04it/s]
  0%|          | 0/358.625 [00:00<?, ?it/s]

17.421124828532236


100%|██████████| 359/358.625 [01:22<00:00,  4.38it/s]
  3%|▎         | 1/35.53125 [00:00<00:04,  7.01it/s]

Epoch 1: 0.1896


101%|██████████| 36/35.53125 [00:06<00:00,  5.97it/s]
  0%|          | 1/358.625 [00:00<00:40,  8.85it/s]

26.622354381668707


100%|██████████| 359/358.625 [01:25<00:00,  4.22it/s]
  0%|          | 0/35.53125 [00:00<?, ?it/s]

Epoch 2: 0.1565


101%|██████████| 36/35.53125 [00:05<00:00,  6.25it/s]
  0%|          | 0/358.625 [00:00<?, ?it/s]

36.373448461953586


100%|██████████| 359/358.625 [01:24<00:00,  4.22it/s]
  0%|          | 0/35.53125 [00:00<?, ?it/s]

Epoch 3: 0.1365


101%|██████████| 36/35.53125 [00:06<00:00,  5.46it/s]
  0%|          | 0/358.625 [00:00<?, ?it/s]

38.829615260265115


100%|██████████| 359/358.625 [01:25<00:00,  4.19it/s]
  3%|▎         | 1/35.53125 [00:00<00:06,  5.11it/s]

Epoch 4: 0.1258


101%|██████████| 36/35.53125 [00:06<00:00,  5.97it/s]
  0%|          | 1/358.625 [00:00<01:03,  5.62it/s]

43.52286558859329


 28%|██▊       | 102/358.625 [00:24<01:10,  3.65it/s]

## Inference

In [23]:
ix_to_tag = {v: k for k, v in tag_to_ix.items()}
# ix_to_tag

In [27]:
sentence = val[0][0] # first sentence in the validation dataset
targets = val[0][1]

with torch.no_grad():
    word_characters_ixs = {}
    for word in sentence:
        word_ix = torch.tensor([word_to_ix[word]]) if word in word_to_ix else torch.tensor([word_to_ix['<UNK>']])
        char_ixs = make_ixs(word, char_to_ix)
        word_characters_ixs[word_ix] = char_ixs

    inputs = make_ixs(sentence, word_to_ix)
    token_scores = model(inputs, word_characters_ixs)
    scores = [score.tolist() for score in token_scores]
    tag_ix = [score.index(max(score)) for score in scores]
    tags = [ix_to_tag[tag] if tag in ix_to_tag else '' for tag in tag_ix]

    for i, (word, tag) in enumerate(zip(sentence, tags)):
        print('%s = %s (should be %s)' % (word, tag, targets[i]))

τὰ = NUM (should be DET)
γὰρ = NUM (should be ADV)
πρὸ = NUM (should be ADP)
αὐτῶν = NUM (should be PRON)
καὶ = NUM (should be CCONJ)
τὰ = NUM (should be DET)
ἔτι = NUM (should be ADV)
παλαίτερα = NUM (should be ADJ)
σαφῶς = NUM (should be ADV)
μὲν = NUM (should be ADV)
εὑρεῖν = NUM (should be VERB)
διὰ = NUM (should be ADP)
χρόνου = NUM (should be NOUN)
πλῆθος = PRON (should be NOUN)
ἀδύνατα = NUM (should be ADJ)
ἦν = NUM (should be VERB)
, = NUM (should be PUNCT)
ἐκ = PRON (should be ADP)
δὲ = PRON (should be CCONJ)
τεκμηρίων = NUM (should be NOUN)
ὧν = NUM (should be PRON)
ἐπὶ = NUM (should be ADP)
μακρότατον = NUM (should be ADJ)
σκοποῦντί = NUM (should be VERB)
μοι = NUM (should be PRON)
πιστεῦσαι = NUM (should be VERB)
ξυμβαίνει = X (should be VERB)
οὐ = NUM (should be ADV)
μεγάλα = NUM (should be ADJ)
νομίζω = NUM (should be VERB)
γενέσθαι = NUM (should be VERB)
οὔτε = NUM (should be ADV)
κατὰ = NUM (should be ADP)
τοὺς = NUM (should be DET)
πολέμους = NUM (should be NOUN)
οὔτε 