In [1]:
import itertools
import logging
from tqdm import tqdm

from datamaestro import prepare_dataset
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import torch
import torch.nn.functional as F
from typing import List
import time
logging.basicConfig(level=logging.INFO)

ds = prepare_dataset('org.universaldependencies.french.gsd')


# Format de sortie décrit dans
# https://pypi.org/project/conllu/

class Vocabulary:
    """Permet de gérer un vocabulaire.

    En test, il est possible qu'un mot ne soit pas dans le
    vocabulaire : dans ce cas le token "__OOV__" est utilisé.
    Attention : il faut tenir compte de cela lors de l'apprentissage !

    Utilisation:

    - en train, utiliser v.get("blah", adding=True) pour que le mot soit ajouté
      automatiquement
    - en test, utiliser v["blah"] pour récupérer l'ID du mot (ou l'ID de OOV)
    """
    OOVID = 1
    PAD = 0

    def __init__(self, oov: bool):
        self.oov =  oov
        self.id2word = [ "PAD"]
        self.word2id = { "PAD" : Vocabulary.PAD}
        if oov:
            self.word2id["__OOV__"] = Vocabulary.OOVID
            self.id2word.append("__OOV__")

    def __getitem__(self, word: str):
        if self.oov:
            return self.word2id.get(word, Vocabulary.OOVID)
        return self.word2id[word]

    def get(self, word: str, adding=True):
        try:
            return self.word2id[word]
        except KeyError:
            if adding:
                wordid = len(self.id2word)
                self.word2id[word] = wordid
                self.id2word.append(word)
                return wordid
            if self.oov:
                return Vocabulary.OOVID
            raise

    def __len__(self):
        return len(self.id2word)

    def getword(self,idx: int):
        if idx < len(self):
            return self.id2word[idx]
        return None

    def getwords(self,idx: List[int]):
        return [self.getword(i) for i in idx]



class TaggingDataset():
    def __init__(self, data, words: Vocabulary, tags: Vocabulary, adding=True):
        self.sentences = []

        for s in data:
            self.sentences.append(([words.get(token["form"], adding) for token in s], [tags.get(token["upostag"], adding) for token in s]))
    def __len__(self):
        return len(self.sentences)
    def __getitem__(self, ix):
        return self.sentences[ix]


def collate(batch):
    """Collate using pad_sequence"""
    return tuple(pad_sequence([torch.LongTensor(b[j]) for b in batch]) for j in range(2))


logging.info("Loading datasets...")
words = Vocabulary(True)
tags = Vocabulary(False)
train_data = TaggingDataset(ds.train, words, tags, True)
dev_data = TaggingDataset(ds.validation, words, tags, True)
test_data = TaggingDataset(ds.test, words, tags, False)


logging.info("Vocabulary size: %d", len(words))


BATCH_SIZE=100

train_loader = DataLoader(train_data, collate_fn=collate, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_data, collate_fn=collate, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, collate_fn=collate, batch_size=BATCH_SIZE)

INFO:root:Loading datasets...
INFO:root:Vocabulary size: 42930


In [3]:
class LSTMTag(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super(LSTMTag, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, target_size)
        
    def forward(self, sentence):
        
        embeds = self.word_embeddings(sentence)
        lstm_out, (hidden_state, cell_state) = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        return tag_space

In [102]:
EMBEDDING_DIM = 32
HIDDEN_DIM = 32


model = LSTMTag(EMBEDDING_DIM, HIDDEN_DIM, len(words), len(tags))
criterion = nn.CrossEntropyLoss(ignore_index=0) # Ignore padding idx : 0
optimizer = optim.Adam(model.parameters(), lr=1e-3)


train_loss, train_accuracy, test_loss, test_accuracy= [], [], [], []
for epoch in range(50):
    # Train

    model.train()
    correct = 0; total = 0
    for sentence, tag in train_loader:
        b,t = target.shape

        # Compute Loss
        target = tag.permute(1,0)
        model.zero_grad()
        tag_scores = model(sentence).permute(1,2,0)
        l_train = criterion(tag_scores, target)
        l_train.backward()
        optimizer.step()

        # Compute accuracy
        predicted = tag_scores.max(1)[1]
        c = (target == predicted) 
        c[torch.where(target == 0)] = False
        correct += c.sum() # nb of correct prediction
        total += b*t - len(torch.where(target == 0)[0]) # nb of tags to predict
        
    train_acc = 100 * correct / float(total)
    train_accuracy.append(train_acc)
    train_loss.append(l_train)

    # Test

    model.eval()
    correct = 0 ; total = 0
    for sentence, tag in dev_loader:
        b,t = target.shape

        # Compute Loss
        target = tag.permute(1,0)
        tag_scores = model(sentence).permute(1,2,0)
        l_test = criterion(tag_scores, target)

        # Compute accuracy
        predicted = tag_scores.max(1)[1]
        c = (target == predicted) 
        c[torch.where(target == 0)] = False
        correct += c.sum() # nb of correct prediction
        total += b*t - len(torch.where(target == 0)[0]) # nb of tags to predict

    test_acc = 100 * correct / float(total)
    test_accuracy.append(test_acc)
    test_loss.append(l_test)


    print(f"epoch {epoch} : Train Loss {'%.2f'%l_train}, Train acc {'%.2f'%train_acc}, Test Loss {'%.2f'%l_test}, Test acc {'%.2f'%test_acc}")

epoch 0 : Train Loss 1.72, Train acc 32.17, Test Loss 1.70, Test acc 37.84
epoch 1 : Train Loss 1.04, Train acc 63.97, Test Loss 1.06, Test acc 52.21
epoch 2 : Train Loss 0.78, Train acc 74.57, Test Loss 0.80, Test acc 63.55
epoch 3 : Train Loss 0.69, Train acc 79.40, Test Loss 0.67, Test acc 61.73
epoch 4 : Train Loss 0.60, Train acc 82.16, Test Loss 0.58, Test acc 70.23
epoch 5 : Train Loss 0.49, Train acc 84.67, Test Loss 0.52, Test acc 69.84
epoch 6 : Train Loss 0.41, Train acc 86.61, Test Loss 0.47, Test acc 72.53
epoch 7 : Train Loss 0.39, Train acc 89.02, Test Loss 0.43, Test acc 59.73
epoch 8 : Train Loss 0.34, Train acc 89.89, Test Loss 0.40, Test acc 71.24
epoch 9 : Train Loss 0.39, Train acc 91.44, Test Loss 0.37, Test acc 65.69
epoch 10 : Train Loss 0.32, Train acc 91.97, Test Loss 0.34, Test acc 77.71
epoch 11 : Train Loss 0.24, Train acc 93.14, Test Loss 0.33, Test acc 71.27
epoch 12 : Train Loss 0.22, Train acc 93.85, Test Loss 0.31, Test acc 72.16
epoch 13 : Train Loss 

In [108]:
print(words.getwords(test_data[0][0]))
print(tags.getwords(test_data[0][0]))

['__OOV__', ',', 'un', 'film', 'sur', 'la', 'vie', 'de', 'Hughes', '.']


In [None]:
print("Training Finished!!!\nAgain testing on unknown data")
with torch.no_grad():
    for seq in [seq1, seq2]:
        inputs = prepare_sequence(seq, word_to_ix)
        tag_scores = model(inputs)
        _, indices = torch.max(tag_scores, 1)
        ret = []
        for i in range(len(indices)):
            for key, value in tag_to_ix.items():
                if indices[i] == value:
                    ret.append((seq[i], key))
        print(ret)