#Requirements

In [None]:
!pip install polyglot
!pip install pyicu
!pip install pycld2
!pip install morfessor

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd 

import numpy as np 

from polyglot.text import Text
from polyglot.detect.base import logger as polyglot_logger
polyglot_logger.setLevel("ERROR")

from copy import copy

from random import shuffle

import re 

#Import the data 

In [None]:
URL = 'https://drive.google.com/file/d/1-6E4h5lH2AHRUBVJUtchXSggMRYr_-8F/view?usp=share_link'
path = 'https://drive.google.com/uc?export=download&id='+URL.split('/')[-2]

df = pd.read_csv(path).dropna()
df = df.drop_duplicates('text')
df

Unnamed: 0.1,Unnamed: 0,speaker,text
0,0,Roselyne Bachelot,"Roselyne Bachelot en campagne à Angoulême : "" ..."
1,1,,"C'est un métier et à ce jeu, elle a quelques k..."
2,2,Roselyne Bachelot,* Roselyne Bachelot en campagne à Angoulême : ...
3,3,,"\n\nC 'est un métier et à ce jeu, elle a quelq..."
4,4,Roselyne Bachelot,\n\n* La graine de l'expérience *\n\nUn style ...
...,...,...,...
785,126,Marjolaine Meynier-Millefert,\n\nPar Pauline SEIGNEUR - Aujourd'hui à 17:23...
786,127,,"\n\nHabituellement, le samedi matin est un mom..."
787,128,,"Présidentielle : à Beauvais, le ministre Franc..."
788,129,,Le ministre du Commerce extérieur et de l'Attr...


# Generating train and test sets



Tokenizing and aligning output IOB tags

In [None]:
df['tags'] = df[['speaker', 'text']].apply(lambda x: ['I' if word in set(re.findall('[\w]+',x.speaker)) else 'O' for word in Text(x.text).words], axis=1 )
df['tokenized_text'] = df['text'].apply(lambda x: Text(x).words)
df = df[df.tokenized_text.apply(lambda x: len(x) < 200)]

df.head(5)

Unnamed: 0.1,Unnamed: 0,speaker,text,tags,tokenized_text
0,0,Roselyne Bachelot,"Roselyne Bachelot en campagne à Angoulême : "" ...","[I, I, O, O, O, O, O, O, O, O, O, O, O]","[Roselyne, Bachelot, en, campagne, à, Angoulêm..."
1,1,,"C'est un métier et à ce jeu, elle a quelques k...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[C'est, un, métier, et, à, ce, jeu, ,, elle, a..."
2,2,Roselyne Bachelot,* Roselyne Bachelot en campagne à Angoulême : ...,"[O, I, I, O, O, O, O, O, O, O, O, O, O, O, O, ...","[*, Roselyne, Bachelot, en, campagne, à, Angou..."
4,4,Roselyne Bachelot,\n\n* La graine de l'expérience *\n\nUn style ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[*, La, graine, de, l'expérience, *, Un, style..."
5,5,"Roselyne Bachelot, Stanislas Guerini","\n\nLa politique étrangère, justement. Une béq...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[La, politique, étrangère, ,, justement, ., Un..."


In [None]:
# Shuffle
df = df.sample(frac=1)

test_length  = int(df.shape[0] * 0.15)

test         = df[:test_length]
dev          = df[test_length:2*test_length]
train        = df[2*test_length:]

print(test.shape)
print(dev.shape)
print(train.shape)

(103, 5)
(103, 5)
(482, 5)


#Tokenization and computation of input and output vocabulary

In [None]:
#making character vocabulary 
def vocabulary(df, input_vocab, padding='<pad>', unknown='<unk>'):
    #input_vocab is a boolean flag that tells if we extract input or output vocabulary
    #the two optional flags indicate that a padding and an unknown token 
    #have to be added to the vocabulary if their value is not None

    if input_vocab:
      sequences_list = df['tokenized_text'].to_list()
    else:
      sequences_list = df['tags'].to_list()

    # Creates the mapping from ids to symbols (a list)
    idx2sym = list({element.strip() for sequence in sequences_list for element in sequence})

    # Decides whether to include the unknown token
    if unknown:
      idx2sym.append(unknown)

    # Decides whether to include the padding token
    if padding:
      idx2sym.append(padding)

    # Creates the mapping from symbols to ids (a dictionary)
    sym2idx = {idx2sym[j]: j for j in range(len(idx2sym))}

    # Return the two vocabulary maps idx2sym and sym2idx as a couple
    return idx2sym, sym2idx   

In [None]:
# Tests
idx2sym, sym2idx  = vocabulary(df,True,padding=None,unknown=None)
print(sym2idx)

{'Jules': 0, 'débutant': 1, 'logement': 2, "L'outil": 3, 'prenait': 4, 'Bonapartiste': 5, "l'eau": 6, 'reprochant': 7, 'réinvention': 8, 'cycle': 9, 'adressé': 10, 'hégémonique': 11, 'Joie': 12, 'cause': 13, 'Fugit': 14, 'réussis': 15, "n'intéresse": 16, "l'initiateur": 17, 'conclusions': 18, 'entendu': 19, 'Certes': 20, 'prenez': 21, 'indépendance': 22, 'rédaction': 23, 'Bruno': 24, 'stations': 25, 'convoyés': 26, "d'année": 27, 'scandent': 28, "qu'Éric": 29, 'dérive': 30, 'Pourtant': 31, 'recueille': 32, 'Leduc': 33, 'différentes': 34, 'exister': 35, 'Joël': 36, 'elle': 37, 'Pompili': 38, '0,31': 39, 'traceurs': 40, 'Soucheyre': 41, 'opposition': 42, 'cassé': 43, 'dividendes': 44, '20,5': 45, 'armes': 46, 'autre': 47, 'Lecoufle': 48, 'Reste': 49, 'pratique': 50, 'Chacun': 51, 'conseil': 52, 'dévisse': 53, 'locataire': 54, 'consultations': 55, 'sang': 56, 'Rouède': 57, 'ou': 58, 'battus': 59, 'st-ouest': 60, 'remis': 61, '3': 62, 'insécurité': 63, 'têtes': 64, 'Travail': 65, 'décider'

Coding and decoding sequences 

In [None]:
def pad_sequence(sequence,pad_size,pad_token):
    #returns a list with additional pad tokens to match pad_size if needed
    return [sequence[i] if i<len(sequence) else pad_token for i in range(pad_size)]
   
def code_sequence(sequence,coding_map,unk_token=None):
    #takes a list of strings and returns a list of integers
    return [coding_map[symbol] if symbol in coding_map else coding_map[unk_token] for symbol in sequence]

def decode_sequence(sequence,decoding_map):
    #takes a list of integers and returns a list of strings 
    return [decoding_map[id] for id in sequence]

#Dataloader (batch generator)

In [None]:
def df_to_examples(df):
    """
    Returns a list of sentences and a list of tags.
    A sentence is a list of strings (tokens)
    """
    paragraphs = df['text'].apply(lambda x: Text(x).words).to_list()
    paragraphs = [[word.strip() for word in paragraph] for paragraph in paragraphs]

    tags = df['speaker'].str.split(',').to_list()
    tags = [[tag.strip() for tag in paragraph_tags] for paragraph_tags in tags]

    assert len(paragraphs) == len(tags)

    X = []
    Y = []
    for i in range(len(paragraphs)):
        while(tags[i]):
            X.append(paragraphs[i])
            Y.append([tags[i].pop()])

    return X, Y

In [None]:
class DataGenerator:

        def __init__(self, df, parentgenerator = None, pad_token='<pad>',unk_token='<unk>'):

              if parentgenerator is not None: #Reuse the encodings of the parent if specified
                  self.pad_token      = parentgenerator.pad_token
                  self.unk_token      = parentgenerator.unk_token
                  self.input_sym2idx  = parentgenerator.input_sym2idx 
                  self.input_idx2sym  = parentgenerator.input_idx2sym 
                  self.output_sym2idx = parentgenerator.output_sym2idx 
                  self.output_idx2sym = parentgenerator.output_idx2sym  
              else:                           #Creates new encodings
                  self.pad_token = pad_token
                  self.unk_token = unk_token
                  # Creates 4 encoding maps from datafile 
                  self.input_idx2sym, self.input_sym2idx  = vocabulary(df, 
                                                                       input_vocab=True)
                  self.output_idx2sym,self.output_sym2idx = vocabulary(df, 
                                                                       input_vocab=False,
                                                                       padding=None)

              # Stores the dataset with sentence structure (a list of lists of strings) in the following fields 
              self.Xtokens = df['tokenized_text'].to_list()
              self.Ytokens = df['tags'].to_list()
      

        def generate_batches(self,batch_size):

              #Batches are lists of lists
              
              assert(len(self.Xtokens) == len(self.Ytokens))
              
              N     = len(self.Xtokens)
              idxes = list(range(N))

              #If we don't shuffle the train set after each epoch, the model may not converge
              shuffle(idxes)
              #For efficiency reasons, it is nice to have batches with a similar nb of elements,
              #so we don't do useless computations over the padding token
              idxes.sort(key=lambda idx: len(self.Xtokens[idx]))

              #batch generation
              bstart = 0
              while bstart < N:
                 bend        = min(bstart+batch_size,N)
                 batch_idxes = idxes[bstart:bend] 
                 batch_len   = max(len(self.Xtokens[idx]) for idx in batch_idxes)              
              
                 seqX = [ pad_sequence(self.Xtokens[idx],batch_len,self.pad_token) for idx in batch_idxes]
                 seqX = [ code_sequence(seq,self.input_sym2idx,self.unk_token) for seq in seqX]

                 seqY = [ pad_sequence(self.Ytokens[idx], batch_len, self.pad_token) for idx in batch_idxes]
                 seqY = [ code_sequence(seq,self.output_sym2idx,unk_token=self.unk_token) for seq in seqY]
                 
                 assert(len(seqX) == len(seqY))
                 yield (seqX,seqY)
                 bstart += batch_size

#Model 
- LSTM encoding of paragraphs 
- IOB tagging

In [None]:
# Early stopping

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    
    def __init__(self, patience=7, verbose=True):
        """
        patience (int): How long to wait after last time validation loss improved.
                        Default: 7    
        verbose (bool): Whether to print a message when early stop is activated.
                        Default: True       
        """
        self.patience       = patience
        self.counter        = 0         # number of epochs since last improvement
        self.best_loss      = None
        self.best_accuracy  = None
        self.verbose        = verbose

    def __call__(self, val_loss, val_accuracy):
        """
        val_loss (float): Validation loss obtained in the current epoch
        val_accuracy (float): Validation accuracy obtained in the current epoch (used only for information, early stopping is based on loss)
        """
        if self.best_loss is None:  # if first iteration
            self.best_loss      = val_loss
            self.best_accuracy  = val_accuracy

        if val_loss < self.best_loss: # if improvement
            self.best_loss      = val_loss
            self.counter        = 0

        if val_accuracy > self.best_accuracy:
            self.best_accuracy  = val_accuracy  # just for information

        if val_loss > self.best_loss: # if no improvement
            self.counter += 1
            if self.counter >= self.patience:
              if self.verbose:
                print("\nEARLY STOP\n")
              return True

        return False

In [None]:
# Core part

class NERtagger(nn.Module):

      def __init__(self,traingenerator,embedding_size,hidden_size,device='cpu',
                   patience=7,bidirectional=True,verbose=True):
        super(NERtagger, self).__init__()        
        self.embedding_size    = embedding_size
        self.hidden_size       = hidden_size
        self.bidirectional     = bidirectional  # whether the lstm is bidirectional or not
        self.verbose           = verbose
        self.allocate_params(traingenerator,device) 
        self.early_stopping    = EarlyStopping(verbose=verbose,patience=patience)

      def load(self,filename):
        self.load_state_dict(torch.load(filename))

      def allocate_params(self,datagenerator,device):
        # Creates fields for nn Layers

        invocab_size   = len(datagenerator.input_idx2sym)
        pad_idx        = datagenerator.input_sym2idx[datagenerator.pad_token]
        outvocab_size  = len(datagenerator.output_idx2sym)
        
        self.embeddings = nn.Embedding(invocab_size,
                                      self.embedding_size,
                                      padding_idx=pad_idx).to(device)
        
        self.lstm = nn.LSTM(input_size=self.embedding_size,
                            hidden_size=self.hidden_size,
                            batch_first=True,
                            bidirectional=self.bidirectional).to(device) 
                            # batch_first so it takes input of shape (batch, seq, feature)
        
        size_X         = self.hidden_size*2 if self.bidirectional else self.hidden_size # this line is necessary so the bidirectional switch works
        self.output    = nn.Linear(size_X,outvocab_size).to(device)
 

      def forward(self,X):
        # X is of shape batch_size x sequence_length

        # Prediction steps:
        # embedded_X of shape batch_size x sequence_length x embedding_size
        embedded_X  = self.embeddings(X)

        # lstm_encodings of shape batch_size x sequence_length x lstm_hidden_size
        lstm_encodings, (_, _) = self.lstm(embedded_X) 

        # output of shape batch_size x outvocab_size
        return self.output(lstm_encodings) 


      def evaluate(self, datagenerator, batch_size = 64, device = 'cpu'): 
        '''evaluates the performance of the model on a test set. 
        calculates recall and precision, considering 'O' tags to be the negative case 
        and all i tags to be positice cases '''
        self.to(device)
        #self.eval()
        true_neg = 0 
        true_pos = 0 
        false_neg = 0 
        false_pos = 0 
        pad_idx   = datagenerator.input_sym2idx[datagenerator.pad_token]

        for batch_x, batch_y in datagenerator.generate_batches(batch_size): 
          with torch.no_grad():
            X_in    = torch.tensor(batch_x).to(device) #batch_size x seq_length
            #print(f'x_in: {X_in.shape}')
            y_gold  = torch.tensor(batch_y).to(device) #batch_size*seq_length
            #print('y gold: ', y_gold.shape) 
            y_gold  = y_gold.view(-1) #batch_size*seq_length
            #print('y gold: ', y_gold.shape) 
            Yhat    = self.forward(X_in) #batch_size x seq_length x outvocab size 
            #print(f'yhat: {Yhat.shape}')
            y_pred = torch.argmax(Yhat, dim = 2) #batch_size x seq_length x 1
            #print(f'ypred: {y_pred.shape}')
            y_pred = y_pred.view(-1)
            #print('y pred ', y_pred.shape)

            pad_mask   = (y_gold != pad_idx)
            pos_mask   = (y_pred != datagenerator.output_sym2idx['O'])
            neg_mask   = (y_pred == datagenerator.output_sym2idx['O'])


            true_pos  += torch.sum((y_pred==y_gold)*pos_mask*pad_mask)
            #print('true_pos: ', true_pos)
            true_neg  += torch.sum((y_pred == y_gold)*neg_mask*pad_mask)
            #print('true negL ', true_neg)
            false_pos += torch.sum((y_pred != y_gold)*pos_mask*pad_mask)
            #print('false pos: ', false_pos)
            false_neg += torch.sum((y_pred != y_gold)*neg_mask*pad_mask)
            #print('false neg: ', false_neg)

        precision = true_pos/(true_pos+false_pos)
        recall    = true_pos/(true_pos+false_neg)
        f_score   = 2*((precision*recall)/(precision+recall))
        return precision, recall, f_score 

        
      def train(self,traingenerator,validgenerator,epochs,batch_size,device='cpu'): 

        self.minloss = 10000000 # the min loss found so far on validation data
        optimizer = torch.optim.Adam(self.parameters())
        
        device    = torch.device(device)
        pad_index = traingenerator.input_sym2idx[traingenerator.pad_token]
        loss_fnc  = nn.CrossEntropyLoss(ignore_index=pad_index)

        for epoch in range(epochs):

          self.training = True  # Tells PyTorch we are in training mode (so dropout is activated).

          epoch_loss = []
          batch_accuracies = []
          for seqX,seqY in traingenerator.generate_batches(batch_size):

            optimizer.zero_grad()
            #input(f'x: {seqX}\ny:{seqY}')
            X = torch.LongTensor(seqX).to(device)
            # Y is of shape batch_size x seq_length
            Y = torch.LongTensor(seqY).to(device).view(-1)
            #input(f'y gold: {Y.shape}')

            # Yhat is of shape batch_size x seq_length x nb_classes
            
            Yhat = self.forward(X)
            #input(f'YHAT initial: {Yhat.shape}')
            num_inputs, seq_length, _ = Yhat.shape
            Yhat = Yhat.view(num_inputs*seq_length, -1)
            #input(f'YHAT reshaped: {Yhat.shape}')

            loss = loss_fnc(Yhat, Y)
            loss.backward()
            epoch_loss.append(loss.item())

            optimizer.step() # Updates the parameters

            #Accuracy computation
            mask    = (Y != pad_index)
            Yargmax = torch.argmax(Yhat,dim=1)
            correct = torch.sum((Yargmax == Y) * mask)
            total   = torch.sum(mask)
            batch_accuracies.append(float(correct)/float(total))

          if self.verbose:
            print(f"\nEND OF EPOCH {epoch}")
            print(f"[train] mean Loss = {sum(epoch_loss) / len(epoch_loss):.6f} | mean accurracy = {sum(batch_accuracies) / len(batch_accuracies):.6f}")

          # Evaluation
          # This will automatically save the model with minimum loss
          valid_loss, valid_accuracy = self.validate(validgenerator,
                                                     batch_size, 
                                                     device=device,
                                                     save_min_model=True)
          
          # Early stopping
          if self.early_stopping(valid_loss, valid_accuracy):
            return self.early_stopping.best_accuracy, epoch

        # The train function returns the best dev accuracy 
        # and the number of epochs it trained for.
        return self.early_stopping.best_accuracy, epoch


      def validate(self,datagenerator,batch_size,device='cpu',save_min_model=False):

          self.training = False # Tells PyTorch we are in evaluation/inference mode (so dropout is deactivated).
          
          batch_accuracies = []
          wrongs = []
          gold_is = []
          batch_losses      = []

          device = torch.device(device)
          pad_index = datagenerator.input_sym2idx[datagenerator.pad_token]
          loss_fnc  = nn.CrossEntropyLoss(ignore_index=pad_index)

          for (seqX,seqY) in datagenerator.generate_batches(batch_size):
                with torch.no_grad():   
                  X = torch.LongTensor(seqX).to(device)
                  Y = torch.LongTensor(seqY).to(device).view(-1)
                
                  Yhat = self.forward(X)
                  num_inputs, seq_length, _ = Yhat.shape
                  Yhat = Yhat.view(num_inputs*seq_length, -1)

                  loss = loss_fnc(Yhat,Y)
                  batch_losses.append(loss.item())

                  #Accurracy computation
                  mask    = (Y != pad_index)
                  Yargmax = torch.argmax(Yhat,dim=1)
                  correct = torch.sum((Yargmax == Y) * mask)
                  wrong = torch.sum((Yargmax != Y) * mask)
                  gold_i = torch.sum(Y==datagenerator.output_sym2idx['I'])
                  gold_is.append(gold_i)
                  total   = torch.sum(mask)
                  batch_accuracies.append(float(correct)/float(total))
                  wrongs.append(wrong)

          L = len(batch_losses)                  
          valid_loss = sum(batch_losses)/L
          valid_accuracy = sum(batch_accuracies)/L

          if save_min_model and valid_loss < self.minloss:
            self.minloss = valid_loss
            torch.save(self.state_dict(), 'tagger_params.pt')

          if self.verbose:
            print('[valid] mean Loss = %f | mean accurracy = %f'%(valid_loss,valid_accuracy))
            #print(f'wrongs : {sum(wrongs)}\n I tags: {sum(gold_is)}')

          return valid_loss, valid_accuracy # used for early stopping in train

# Training

In [None]:
embed_size = 64
hidden_size = 128
epochs = 50
batch_size = 64

trainset = DataGenerator(train)
validset = DataGenerator(dev,parentgenerator=trainset)
testset = DataGenerator(test,parentgenerator=trainset)
tagger   = NERtagger(trainset,embed_size,hidden_size,bidirectional=True,device='cuda',patience=7)
tagger.train(trainset,validset,epochs,batch_size,device='cuda') 
tagger.validate(testset,batch_size,device='cuda')

p, r, f = tagger.evaluate(testset)
print(f'precision: {p}, recall: {r}, f1: {f}')


END OF EPOCH 0
[train] mean Loss = 0.974677 | mean accurracy = 0.709328
[valid] mean Loss = 0.855888 | mean accurracy = 0.605528

END OF EPOCH 1
[train] mean Loss = 0.541308 | mean accurracy = 0.827558
[valid] mean Loss = 1.397289 | mean accurracy = 0.606834

END OF EPOCH 2
[train] mean Loss = 0.410139 | mean accurracy = 0.829120
[valid] mean Loss = 1.026457 | mean accurracy = 0.606834

END OF EPOCH 3
[train] mean Loss = 0.291146 | mean accurracy = 0.828809
[valid] mean Loss = 0.464194 | mean accurracy = 0.606916

END OF EPOCH 4
[train] mean Loss = 0.252387 | mean accurracy = 0.867933
[valid] mean Loss = 0.383736 | mean accurracy = 0.989495

END OF EPOCH 5
[train] mean Loss = 0.221011 | mean accurracy = 0.979195
[valid] mean Loss = 0.282479 | mean accurracy = 0.988140

END OF EPOCH 6
[train] mean Loss = 0.173502 | mean accurracy = 0.976362
[valid] mean Loss = 0.127033 | mean accurracy = 0.983771

END OF EPOCH 7
[train] mean Loss = 0.133710 | mean accurracy = 0.966672
[valid] mean Loss

# Statistics on the data

In [None]:
# Most frequent class baseline (accuracy)
test['tags'].apply(lambda x: x.count('O')/len(x)).mean()

0.9850698504758424

In [None]:
# Precision baseline
test['tags'].apply(lambda x: (x.count('I') + x.count('B'))/len(x)).mean()

0.0149301495241576