## Loading files

### Setup

In [2]:
# !curl https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-train.conllu -o data/perseus-conllu/grc_perseus-ud-train.conllu
# !curl https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-dev.conllu -o data/perseus-conllu/grc_perseus-ud-dev.conllu
# !curl https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-test.conllu -o data/perseus-conllu/grc_perseus-ud-test.conllu

In [1]:
!pip install pyconll torchtext livelossplot

Collecting pyconll
  Downloading https://files.pythonhosted.org/packages/2c/6e/c325d0db05ac1b8d45645de903e4ba691d419e861c915c3d4ebfcaf8ac25/pyconll-2.2.1-py3-none-any.whl
Collecting livelossplot
  Downloading https://files.pythonhosted.org/packages/7c/e4/a7884b57113dfe84d3565418820feae7a20964438beb1088b2b08820ad94/livelossplot-0.5.0-py3-none-any.whl
Installing collected packages: pyconll, livelossplot
Successfully installed livelossplot-0.5.0 pyconll-2.2.1


### Parsing

In [1]:
import pyconll

In [2]:
def parse_into_list(body):
    """Convert parsed CoNLL-U sentences into (words, tags) pairs.

    Args:
        body: iterable of sentences; each sentence is an iterable of tokens
            exposing ``form`` (surface word) and ``upos`` (universal POS tag).

    Returns:
        A list with one ``(forms, upos_tags)`` tuple per non-empty sentence,
        where both members are parallel lists. Empty sentences are dropped.
    """
    parsed = []
    for sentence in body:
        # Walk the sentence once, keeping each word aligned with its tag.
        pairs = [(token.form, token.upos) for token in sentence]
        if not pairs:
            continue

        forms, upos_tags = (list(column) for column in zip(*pairs))
        parsed.append((forms, upos_tags))

    return parsed

In [3]:
# Read the Perseus UD train/dev splits from disk, then flatten each one into a
# list of (words, tags) sentence pairs with parse_into_list.
train_file = pyconll.load_from_file('data/perseus-conllu/grc_perseus-ud-train.conllu')
val_file = pyconll.load_from_file('data/perseus-conllu/grc_perseus-ud-dev.conllu')

train, val = (parse_into_list(conll) for conll in (train_file, val_file))

# Number of sentences in each split.
len(train), len(val)

(11476, 1137)

In [4]:
# Build the word and character vocabularies from the training sentences.
# Index 0 is reserved for '<UNK>'; every other entry gets the next free index
# in order of first appearance.
word_to_ix = {'<UNK>': 0}
char_to_ix = {'<UNK>': 0}
for sentence_words, _sentence_tags in train:
    for token in sentence_words:
        # setdefault only inserts when the key is new, so existing indices are stable.
        word_to_ix.setdefault(token, len(word_to_ix))
        for symbol in token:
            char_to_ix.setdefault(symbol, len(char_to_ix))

# Vocabulary sizes (including the '<UNK>' entries).
len(word_to_ix), len(char_to_ix)

(33238, 186)

In [5]:
# Give every tag seen in the training data a class index, in order of first
# appearance. There is deliberately no '<UNK>' entry here.
tag_to_ix = {}
for _sentence_words, sentence_tags in train:
    for label in sentence_tags:
        tag_to_ix.setdefault(label, len(tag_to_ix))

# Number of distinct tags (output classes).
len(tag_to_ix)

14

## Model setup

Based on https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils
import torch.utils.data
import torch.autograd as autograd

from model.lstm_char import LSTMCharTagger

In [7]:
# Seed torch's RNG so runs are repeatable.
torch.manual_seed(1)

# Layer sizes handed to LSTMCharTagger when the model is built.
WORD_EMBEDDING_DIM, CHAR_EMBEDDING_DIM = 5, 6
CHAR_REPR_DIM, HIDDEN_DIM = 3, 6

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [8]:
def make_ixs(seq, to_ix):
    """Translate a sequence of items into a tensor of vocabulary indices.

    Args:
        seq: iterable of items to look up (e.g. the characters of a word, or
            the tags of a sentence).
        to_ix: mapping from item to integer index.

    Returns:
        A tensor holding one index per item, moved to the notebook's ``device``.
        Items missing from ``to_ix`` map to ``to_ix['<UNK>']``; that key is only
        looked up when an unknown item is actually encountered.
    """
    indices = []
    for item in seq:
        if item in to_ix:
            indices.append(to_ix[item])
        else:
            indices.append(to_ix['<UNK>'])
    return torch.tensor(indices).to(device)

In [9]:
# Instantiate the tagger from the dimension constants and the sizes of the
# word, character and tag vocabularies built earlier in the notebook.
model = LSTMCharTagger(WORD_EMBEDDING_DIM,
                   CHAR_EMBEDDING_DIM, CHAR_REPR_DIM,
                   HIDDEN_DIM,
                   len(word_to_ix), len(char_to_ix), len(tag_to_ix),
                   device)
# Presumably LSTMCharTagger is an nn.Module, so this moves its parameters to `device` — TODO confirm.
model.to(device)
# NOTE(review): NLLLoss expects log-probabilities; this assumes the model's
# forward pass ends in a log-softmax — verify in model/lstm_char.py.
loss_function = nn.NLLLoss()
# Adam over all model parameters with a learning rate of 0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Model training

In [10]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from tqdm import tqdm

%matplotlib inline

In [None]:
# Train the tagger one sentence at a time and, after every epoch, measure loss
# and tagging accuracy on the held-out validation split.

bs = 32


def _keep_sentences(batch):
    """Collate function that returns the sampled batch untouched.

    The dataset items are (words, tags) pairs of variable-length string lists.
    The default collate function zips such lists position-wise across the batch
    (or raises on unequal lengths), so sentences would no longer be intact.
    """
    return batch


def _encode_words(words):
    """Build the model input for one sentence.

    Returns a dict mapping a one-element word-index tensor ('<UNK>' for
    out-of-vocabulary words) to the tensor of that word's character indices.
    Torch tensors hash by object identity, so repeated words keep separate
    entries and insertion order follows the sentence.
    """
    word_characters_ixs = {}
    for word in words:
        word_ix = torch.tensor([word_to_ix.get(word, word_to_ix['<UNK>'])]).to(device)
        word_characters_ixs[word_ix] = make_ixs(word, char_to_ix)
    return word_characters_ixs


train_loader = torch.utils.data.DataLoader(train, batch_size=bs, shuffle=True, num_workers=0,
                                           collate_fn=_keep_sentences)
val_loader = torch.utils.data.DataLoader(val, batch_size=bs, shuffle=True, num_workers=0,
                                         collate_fn=_keep_sentences)
train_losses = []
val_losses = []

for epoch in range(20):
    total_loss = 0.0

    model.train()
    # tqdm takes its (integer) total from len(train_loader).
    for batch in tqdm(train_loader):
        for sentence, tags in batch:
            word_characters_ixs = _encode_words(sentence)
            targets = make_ixs(tags, tag_to_ix)

            model.zero_grad()

            model.init_word_hidden()
            tag_scores = model(word_characters_ixs)

            loss = loss_function(tag_scores, targets)
            loss.backward() # calculate gradients
            optimizer.step() # update the model parameters from the gradients

            # .item() detaches the value; summing the tensor itself would keep
            # every sentence's autograd graph alive for the whole epoch.
            total_loss += loss.item()

    # Mean loss per training sentence.
    epoch_loss = total_loss / len(train)

    print('Epoch %d: %.4f' % (epoch, epoch_loss))
    train_losses.append(epoch_loss)

    # Evaluate on the validation dataset (no gradient tracking needed).
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(val_loader):
            for sentence, tags in batch:
                word_characters_ixs = _encode_words(sentence)
                targets = make_ixs(tags, tag_to_ix)

                model.init_word_hidden()
                tag_scores = model(word_characters_ixs)
                loss = loss_function(tag_scores, targets)

                val_loss += loss.item()
                # Highest-scoring tag index for each token.
                predicted = tag_scores.argmax(dim=1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

    # Mean loss per validation sentence, kept alongside train_losses.
    val_losses.append(val_loss / len(val))

    # Token-level validation accuracy in percent.
    print()
    print((100.*correct/total))
    print()

  6%|▋         | 23/358.625 [00:05<01:34,  3.54it/s]

In [None]:
# import pickle
# with open('model.pickle', 'wb') as f:
#     pickle.dump(model, f)

## Inference

In [12]:
# Reverse lookup: class index -> tag string, used to decode model predictions.
ix_to_tag = dict((index, tag_name) for tag_name, index in tag_to_ix.items())
# ix_to_tag

In [13]:
# Tag one validation sentence with the trained model and print every predicted
# tag next to its gold tag.
sentence = val[0][0] # first sentence in the validation dataset
targets = val[0][1]

with torch.no_grad():
    # Map each word (as a one-element index tensor, '<UNK>' when unseen) to the
    # tensor of its character indices.
    word_characters_ixs = {}
    for word in sentence:
        word_ix = torch.tensor([word_to_ix[word]]).to(device) if word in word_to_ix else torch.tensor([word_to_ix['<UNK>']]).to(device)
        char_ixs = make_ixs(word, char_to_ix)
        word_characters_ixs[word_ix] = char_ixs

    inputs = make_ixs(sentence, word_to_ix)
    # NOTE(review): the training cell calls model(word_characters_ixs) with a
    # single argument and calls model.init_word_hidden() first; here the model
    # gets two arguments and no hidden-state reset. Confirm which form matches
    # LSTMCharTagger.forward.
    token_scores = model(inputs, word_characters_ixs)
    scores = [score.tolist() for score in token_scores]
    # Index of the highest score for each token.
    tag_ix = [score.index(max(score)) for score in scores]
    # Decode indices back to tag strings; unknown indices become ''.
    tags = [ix_to_tag[tag] if tag in ix_to_tag else '' for tag in tag_ix]

    for i, (word, tag) in enumerate(zip(sentence, tags)):
        print('%s = %s (should be %s)' % (word, tag, targets[i]))

τὰ = DET (should be DET)
γὰρ = PART (should be ADV)
πρὸ = ADP (should be ADP)
αὐτῶν = PRON (should be PRON)
καὶ = CCONJ (should be CCONJ)
τὰ = DET (should be DET)
ἔτι = ADV (should be ADV)
παλαίτερα = NOUN (should be ADJ)
σαφῶς = VERB (should be ADV)
μὲν = PART (should be ADV)
εὑρεῖν = ADJ (should be VERB)
διὰ = ADP (should be ADP)
χρόνου = NOUN (should be NOUN)
πλῆθος = NOUN (should be NOUN)
ἀδύνατα = VERB (should be ADJ)
ἦν = VERB (should be VERB)
, = PUNCT (should be PUNCT)
ἐκ = ADP (should be ADP)
δὲ = PART (should be CCONJ)
τεκμηρίων = NOUN (should be NOUN)
ὧν = PRON (should be PRON)
ἐπὶ = ADP (should be ADP)
μακρότατον = VERB (should be ADJ)
σκοποῦντί = ADJ (should be VERB)
μοι = PRON (should be PRON)
πιστεῦσαι = NOUN (should be VERB)
ξυμβαίνει = NOUN (should be VERB)
οὐ = ADV (should be ADV)
μεγάλα = ADJ (should be ADJ)
νομίζω = VERB (should be VERB)
γενέσθαι = VERB (should be VERB)
οὔτε = ADV (should be ADV)
κατὰ = ADP (should be ADP)
τοὺς = DET (should be DET)
πολέμους = NOUN 