# Reconnaissance d'entités nommées avec un Bi-LSTM (pytorch)

Avant toute chose, n'oubliez pas de choisir un environnement GPU dans Colab (`Exécution` $\rightarrow$ `Modifier le type d'exécution`)

Xavier Tannier

In [None]:
!pip install pytorch-lightning --quiet
!pip install torchmetrics --quiet

In [None]:
from os.path import isfile, isdir, join

import torch
from torch import nn
from torch import autograd
from torch import optim
from torch.utils.data import Dataset
import torch.nn.functional as F

import torchmetrics

import pytorch_lightning as pl

from tqdm.auto import tqdm

from collections import Counter
import codecs 

# Manual seed to ensure reproducibility
torch.manual_seed(1)

## Connexion à la source de données

In [None]:
# Colab & Drive libraries 
from google.colab import files
from googleapiclient.http import MediaIoBaseDownload
from google.colab import drive
# Mount Google drive. This will prompt for authorization.
drive.mount('/content/drive', force_remount=True)

In [None]:
# Paths of the CoNLL data splits on the mounted Google Drive
train_file = '/content/drive/My Drive/data/conll/eng/train.txt'
val_file = '/content/drive/My Drive/data/conll/eng/valid.txt'
test_file = '/content/drive/My Drive/data/conll/eng/test.txt'

# minimum frequency for a word to have its own embeddings
min_word_freq = 2
# Batch size
batch_size = 64

# how big is each word vector (if not preloaded)
embed_size = 50

# how many times to iterate over all samples
n_epochs = 15

# CPU workers
workers = 1

# Fail early and name the missing path. An explicit exception is used instead
# of `assert` because asserts are stripped when Python runs with -O and give
# no hint about which file is absent.
for data_file in (train_file, val_file, test_file):
    if not isfile(data_file):
        raise FileNotFoundError(f'Data file not found: {data_file}')

## Lecture des fichiers au format IOB

In [None]:
def read_words_tags(file, tag_ind, caseless=True):
    """
    Read raw data in the CoNLL 2003 format and return word and tag sequences.

    Each non-blank line holds one token as whitespace-separated columns; the
    word is the first column and the tag is the column at ``tag_ind``.
    Whitespace-only lines and ``-DOCSTART-`` lines close the current sentence
    and are never emitted as tokens.

    :param file: path of the file with raw data in the CoNLL 2003 format
    :param tag_ind: column index of tag (e.g. ``-1`` for the last column)
    :param caseless: lowercase words?
    :return: ``(words, tags)``: two parallel lists holding, for every
        sentence, its list of words and its list of tags
    :raises IndexError: if a token line has no column at ``tag_ind``
    """
    words = []
    tags = []
    temp_w = []
    temp_t = []

    # Stream the file line by line instead of loading it whole.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # `startswith` also catches a bare "-DOCSTART-" line without a
            # trailing newline (exactly 10 characters), which a strict
            # length test would let through as a regular token.
            is_boundary = line.isspace() or line.startswith('-DOCSTART-')
            if not is_boundary:
                feats = line.split()
                temp_w.append(feats[0].lower() if caseless else feats[0])
                temp_t.append(feats[tag_ind])
            elif temp_w:
                # A boundary closes the sentence being accumulated, if any.
                words.append(temp_w)
                tags.append(temp_t)
                temp_w = []
                temp_t = []

    # Last sentence: the file may not end with a blank line.
    if temp_w:
        words.append(temp_w)
        tags.append(temp_t)

    return words, tags


In [None]:
# Load the training split; tags are read from the last column of each line
train_tokens, train_tags = read_words_tags(train_file, tag_ind=-1)
# There must be exactly one tag sequence per token sequence
assert len(train_tags) == len(train_tokens)

## Comptage de mots

- Création d'un compteur de tokens
- Création d'un dictionnaire de tokens (token --> identifiant)
- Création d'un dictionnaire de labels (label --> identifiant)

In [None]:
# Token counter (left empty here; to be filled from the training tokens)
word_freq = Counter()
# Label dictionary (label --> identifier)
tag_map = {}
# Token dictionary (token --> identifier)
word_map = {}

## Encoder les phrases avec les identifiants des mots

avec un token `<end>` à la fin de chaque phrase

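`[['dunston', 'checks', 'in', '<end>']]` -> `[[4670, 4670, 185, 4669]]` (ici `dunston` et `checks` partagent l'identifiant `4670` : les mots trop rares, de fréquence inférieure à `min_word_freq`, sont vraisemblablement encodés par un même token « mot inconnu »)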

In [None]:
# Encode sentences into word maps with <end> at the end
# (placeholder: meant to receive one list of word identifiers per sentence)
train_word_inputs = []


## Encoder les listes de labels avec les identifiants des labels

avec un token `<end>` à la fin

In [None]:
# Encode tag sequences into label identifiers with <end> at the end
# (placeholder: meant to receive one list of label identifiers per sentence)
train_tag_inputs = []

## Padding

Le padding transforme les phrases de tailles différentes en une matrice dans laquelle les phrases plus courtes sont complétées par des `0`.

### Tailles et masques des phrases

- Création d'une liste contenant la longueur de chaque phrase
- Création des masques (pour application après le padding) : une matrice phrases/mots avec des `True` quand la case correspond à un vrai token dans la phrase, et `False` quand la phrase est terminée (padding).

In [None]:
# Length of each sentence (placeholder, to be filled in)
train_sent_lengths = []
# Sentence/word masks, to be applied after padding: True where the cell holds
# a real token, False where the sentence is over (padding).
# Placeholder, to be filled in.
train_masks = []


### Création des matrices avec padding

## Création du "`Dataset`"

Un `Dataset` est un objet *pytorch* qui permet d'itérer sur les objets du jeu de données (phrases).

In [None]:
class NERDataset(Dataset):
    """
    PyTorch Dataset serving, for a given index, everything known about one
    sentence: its words, its tags, its length and its mask.
    """

    def __init__(self, word_inputs, tag_inputs, sent_lengths, masks):
        """
        Store the four parallel, index-aligned collections.

        :param word_inputs: padded word sequences
        :param tag_inputs: padded tag sequences
        :param sent_lengths: word sequence lengths
        :param masks: masks
        """
        self.word_inputs, self.tag_inputs = word_inputs, tag_inputs
        self.sent_lengths, self.masks = sent_lengths, masks

        # The size is measured once, on the word sequences, at construction.
        self.data_size = len(word_inputs)

    def __len__(self):
        """Return the number of sentences recorded at construction time."""
        return self.data_size

    def __getitem__(self, i):
        """Return the tuple ``(words, tags, length, mask)`` stored at index ``i``."""
        columns = (self.word_inputs, self.tag_inputs,
                   self.sent_lengths, self.masks)
        return tuple(column[i] for column in columns)

## Création du `DataLoader`

Le `DataLoader` est l'objet qui enveloppe le `Dataset` dans un mécanisme permettant de livrer des **mini-batchs** de données au modèle lors de l'entraînement.

In [None]:
# NERDataset takes four index-aligned collections (words, tags, lengths, masks);
# omitting the masks raises a TypeError. `train_masks` is the boolean
# sentence/word mask matrix that the "sentence lengths and masks" step of this
# notebook is expected to build.
train_dataset = NERDataset(train_word_inputs, train_tag_inputs,
                           train_sent_lengths, train_masks)
# Deliver shuffled mini-batches of `batch_size` sentences to the model.
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size, shuffle=True,
                                           num_workers=workers, pin_memory=False)

## Le modèle

In [None]:
class BiLSTM(pl.LightningModule):
    """
    Sequence classification module: predicts one label per token.

    Pipeline: embedding -> bidirectional LSTM -> dropout -> linear classifier.
    Loss and metrics are computed on the positions selected by the batch
    masks only.

    NOTE(review): the `training_epoch_end` / `validation_epoch_end` hooks and
    the `torchmetrics.F1` / argument-less `torchmetrics.Accuracy()` names
    appear to exist only in older releases of pytorch-lightning / torchmetrics
    -- confirm against the versions actually installed.
    """
    def __init__(self, vocab_size, embed_size, label_number,
                 batch_size,
                 hidden_size=100, dropout=0.5):
        """
        :param vocab_size: number of entries of the embedding table
        :param embed_size: dimension of each word embedding
        :param label_number: number of labels (output size of the classifier)
        :param batch_size: stored on the module; not used by the layers
        :param hidden_size: hidden size of the LSTM, per direction
        :param dropout: dropout probability applied to the LSTM outputs
        """
        super().__init__()
        self.automatic_optimization = True
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True,
                            batch_first=True)
        self.dropout = nn.Dropout(dropout)
        # The two LSTM directions are concatenated, hence 2 * hidden_size.
        self.classif = nn.Linear(2*hidden_size, label_number)

        # NLLLoss expects log-probabilities: the steps apply log_softmax first.
        self.loss_fn= nn.NLLLoss(reduction='mean')

        self.train_metrics = torchmetrics.MetricCollection({
            'precision/train': torchmetrics.Precision(num_classes=label_number, average='macro'),
            'recall/train': torchmetrics.Recall(num_classes=label_number, average='macro'),
            'F1/train': torchmetrics.F1(num_classes=label_number, average='macro'),
            'accuracy/train': torchmetrics.Accuracy()
        })
        self.val_metrics = torchmetrics.MetricCollection({
            'precision/val': torchmetrics.Precision(num_classes=label_number, average='macro'),
            'recall/val': torchmetrics.Recall(num_classes=label_number, average='macro'),
            'F1/val': torchmetrics.F1(num_classes=label_number, average='macro'),
            'accuracy/val': torchmetrics.Accuracy()
        })
        self.hidden_size = hidden_size
        self.batch_size = batch_size

    def forward(self, x, lengths):
        """
        Compute unnormalised label scores for every token of the batch.

        :param x: batch of word identifiers, batch dimension first
        :param lengths: tensor holding the real length of each sequence of `x`
        :return: scores whose last dimension has size `label_number`; the time
            dimension is the one produced by `pad_packed_sequence`
        """
        h_embedding = self.embedding(x)
        # Packing lets the LSTM skip the padded positions.
        h_embedding = torch.nn.utils.rnn.pack_padded_sequence(h_embedding,
                                                                lengths.cpu().numpy(),
                                                                batch_first=True,
                                                                enforce_sorted=False)
        h_lstm, _ = self.lstm(h_embedding, None)
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(h_lstm, batch_first=True)

        return self.classif(self.dropout(output))

    @staticmethod
    def _repad(padded, lengths):
        """Pack then unpack `padded` exactly as `forward` does, so that it lines up with the model output."""
        packed = torch.nn.utils.rnn.pack_padded_sequence(padded,
                                                         lengths.cpu().numpy(),
                                                         batch_first=True,
                                                         enforce_sorted=False)
        repadded, _ = torch.nn.utils.rnn.pad_packed_sequence(packed, batch_first=True)
        return repadded

    def _masked_step(self, batch, metrics):
        """Run the model on `batch`, update `metrics` on the masked positions and return the loss."""
        x, y, lengths, masks = batch
        out = self(x, lengths)

        # Give masks and gold tags the same treatment as the model output.
        masks = self._repad(masks, lengths)
        y = self._repad(y, lengths)

        # Keep only the positions selected by the masks.
        masked_y = torch.masked_select(y, masks)
        masked_out = out[masks]
        score = F.log_softmax(masked_out, 1)
        loss = self.loss_fn(score, masked_y)

        _, preds = torch.max(score, 1)
        metrics(preds, masked_y)
        return loss

    def training_step(self, batch, batch_idx):
        """
        Define one iteration of the train loop (independent of `forward`).

        :param batch: tuple `(x, y, lengths, masks)`
        :param batch_idx: index of the batch (unused)
        :return: the loss on the masked positions
        """
        return self._masked_step(batch, self.train_metrics)

    def training_epoch_end(self, outs):
        """Log, print and reset the training metrics accumulated over the epoch."""
        m = self.train_metrics.compute()
        self.log_dict(m, on_step=False, on_epoch=True, prog_bar=True)
        print('train', m)
        self.train_metrics.reset()

    def validation_step(self, batch, batch_idx):
        """
        Define one iteration of the validation loop (independent of `forward`).

        :param batch: tuple `(x, y, lengths, masks)`
        :param batch_idx: index of the batch (unused)
        :return: the loss on the masked positions
        """
        return self._masked_step(batch, self.val_metrics)

    def validation_epoch_end(self, outs):
        """Log, print and reset the validation metrics accumulated over the epoch."""
        # log epoch metric
        m = self.val_metrics.compute()
        self.log_dict(m, on_step=False, on_epoch=True, prog_bar=True)
        print('val', m)
        self.val_metrics.reset()

    def configure_optimizers(self):
        """Return the optimizer: Adam over all parameters, learning rate 0.015."""
        optimizer = torch.optim.Adam(self.parameters(), lr=0.015)
        return optimizer

