In [39]:
import itertools
import logging
from tqdm import tqdm

from datamaestro import prepare_dataset
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import torch
from typing import List
import time
logging.basicConfig(level=logging.INFO)

ds = prepare_dataset('org.universaldependencies.french.gsd')


# Format de sortie décrit dans
# https://pypi.org/project/conllu/

class Vocabulary:
    """Permet de gérer un vocabulaire.

    En test, il est possible qu'un mot ne soit pas dans le
    vocabulaire : dans ce cas le token "__OOV__" est utilisé.
    Attention : il faut tenir compte de cela lors de l'apprentissage !

    Utilisation:

    - en train, utiliser v.get("blah", adding=True) pour que le mot soit ajouté
      automatiquement
    - en test, utiliser v["blah"] pour récupérer l'ID du mot (ou l'ID de OOV)
    """
    OOVID = 1
    PAD = 0

    def __init__(self, oov: bool):
        self.oov =  oov
        self.id2word = [ "PAD"]
        self.word2id = { "PAD" : Vocabulary.PAD}
        if oov:
            self.word2id["__OOV__"] = Vocabulary.OOVID
            self.id2word.append("__OOV__")

    def __getitem__(self, word: str):
        if self.oov:
            return self.word2id.get(word, Vocabulary.OOVID)
        return self.word2id[word]

    def get(self, word: str, adding=True):
        try:
            return self.word2id[word]
        except KeyError:
            if adding:
                wordid = len(self.id2word)
                self.word2id[word] = wordid
                self.id2word.append(word)
                return wordid
            if self.oov:
                return Vocabulary.OOVID
            raise

    def __len__(self):
        return len(self.id2word)

    def getword(self,idx: int):
        if idx < len(self):
            return self.id2word[idx]
        return None

    def getwords(self,idx: List[int]):
        return [self.getword(i) for i in idx]



class TaggingDataset():
    def __init__(self, data, words: Vocabulary, tags: Vocabulary, adding=True):
        self.sentences = []

        for s in data:
            self.sentences.append(([words.get(token["form"], adding) for token in s], [tags.get(token["upostag"], adding) for token in s]))
    def __len__(self):
        return len(self.sentences)
    def __getitem__(self, ix):
        return self.sentences[ix]


def collate(batch):
    """Collate using pad_sequence"""
    return tuple(pad_sequence([torch.LongTensor(b[j]) for b in batch]) for j in range(2))


logging.info("Loading datasets...")
words = Vocabulary(True)
tags = Vocabulary(False)
train_data = TaggingDataset(ds.train, words, tags, True)
dev_data = TaggingDataset(ds.validation, words, tags, True)
test_data = TaggingDataset(ds.test, words, tags, False)


logging.info("Vocabulary size: %d", len(words))


BATCH_SIZE=100

train_loader = DataLoader(train_data, collate_fn=collate, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_data, collate_fn=collate, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, collate_fn=collate, batch_size=BATCH_SIZE)


INFO:root:Loading datasets...
INFO:root:Vocabulary size: 42930


In [46]:
class LSTMTag(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super(LSTMTag, self).__init__()
        
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        self.hidden2tag = nn.Linear(hidden_dim, target_size)
        
    def forward(self, sentence):
        
        embeds = self.word_embeddings(sentence)
        lstm_out, (hidden_state, cell_state) = self.lstm(embeds)
        import ipdb;ipdb.set_trace()
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        import ipdb;ipdb.set_trace()
        return tag_scores


In [47]:
EMBEDDING_DIM = 60
HIDDEN_DIM = 120


model = LSTMTag(EMBEDDING_DIM, HIDDEN_DIM, len(words), len(tags))
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)



for epoch in range(50):
    for sentence, tags in train_loader:
        model.zero_grad()
        
        tag_scores = model(sentence)
        
        loss = loss_function(tag_scores, tags)
        loss.backward()
        optimizer.step()
        print(f"epoch {epoch} : Loss {loss}")


> [0;32m<ipython-input-46-dbbaf23a7ecd>[0m(18)[0;36mforward[0;34m()[0m
[0;32m     17 [0;31m        [0;32mimport[0m [0mipdb[0m[0;34m;[0m[0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 18 [0;31m        [0mtag_space[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mhidden2tag[0m[0;34m([0m[0mlstm_out[0m[0;34m.[0m[0mview[0m[0;34m([0m[0mlen[0m[0;34m([0m[0msentence[0m[0;34m)[0m[0;34m,[0m [0;34m-[0m[0;36m1[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     19 [0;31m        [0mtag_scores[0m [0;34m=[0m [0mF[0m[0;34m.[0m[0mlog_softmax[0m[0;34m([0m[0mtag_space[0m[0;34m,[0m [0mdim[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
torch.Size([73, 100, 120])


BdbQuit: 

In [None]:
print("Training Finished!!!\nAgain testing on unknown data")
with torch.no_grad():
    for seq in [seq1, seq2]:
        inputs = prepare_sequence(seq, word_to_ix)
        tag_scores = model(inputs)
        _, indices = torch.max(tag_scores, 1)
        ret = []
        for i in range(len(indices)):
            for key, value in tag_to_ix.items():
                if indices[i] == value:
                    ret.append((seq[i], key))
        print(ret)

In [28]:
train_data.__getitem__(1)

([21, 22, 23, 24, 9, 18, 25, 26, 27, 14, 28, 29, 9, 30, 31, 27, 32, 20],
 [1, 2, 4, 5, 7, 1, 2, 11, 7, 1, 2, 10, 7, 1, 2, 7, 12, 10])

In [33]:
tags.getword(train_data.__getitem__(0)[1][1])

'œuvre'