# Named Entity Recognition using Flair on CONLL-2003
## Experiment description
This notebook contains a ML fabric flow for Named Entity Recognition using the [flair NLP package](https://github.com/flairNLP/flair/)

##### Jupyter helpers:

In [1]:
%reload_ext autoreload
%autoreload

Define imports

In [18]:
import requests
from pathlib import Path
from typing import List

import pandas as pd
import flair
from flair.datasets import CONLL_03
from flair.data import Corpus, Sentence
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
import torch
from seqeval.metrics import f1_score, accuracy_score
from tqdm.notebook import tqdm

from src.data import DataLoader
from src.models import BaseModel
from src.data_processing import DataProcessor, EmptyProcessor
from src.experimentation import MlflowExperimentation
from src.evaluation import Evaluator, EvaluationMetrics
from src import ExperimentRunner


In [3]:
class ConllDataLoader(DataLoader):

    def __init__(self, dataset_name, dataset_version='1', local_data_path = '../data/processed/'):
        self.folds = ('eng.train', 'eng.testa', 'eng.testb')
        self.local_data_path = local_data_path
        super().__init__(dataset_name=dataset_name, dataset_version=dataset_version)

    def download_dataset(self) -> None:
        
        if self.dataset_name=="conll_03" and self.dataset_version == '1':
            
            for fold in self.folds:
                local_path = Path(self.local_data_path,self.dataset_name).resolve()
                
                if not local_path.exists():
                    local_path.mkdir(parents=True)

                dataset_file = Path(local_path, fold)
                if dataset_file.exists():
                    print("Dataset already exists, skipping download")
                    return

                dataset_path=f"https://raw.githubusercontent.com/glample/tagger/master/dataset/{fold}"
                response = requests.get(dataset_path)
                dataset_raw = response.text
                with open(dataset_file, "w") as f:
                    f.write(dataset_raw)
                print(f"Finished writing fold {fold} to {self.local_data_path}")

            print(f"Finished downloading dataset {self.dataset_name} version {self.dataset_version}")
        
        else:
            raise ValueError("Selected dataset was not found")

    def get_dataset(self) -> Corpus:
        try:
            return CONLL_03(base_path=self.local_data_path, in_memory=True)

        except FileNotFoundError:
            print(f"Dataset {self.dataset_name} with version {self.dataset_version} not found in data/raw")


## Load data
*replace MyDataLoader with your DataLoader implementation*

In [4]:
data_loader = ConllDataLoader(dataset_name = "conll_03")
data_loader.download_dataset()
corpus = data_loader.get_dataset()
corpus=corpus.downsample(0.05) # Just for example purposes
corpus

Dataset already exists, skipping download
2020-06-04 11:59:05,577 Reading data from ..\data\processed\conll_03
2020-06-04 11:59:05,580 Train: ..\data\processed\conll_03\eng.train
2020-06-04 11:59:05,581 Dev: ..\data\processed\conll_03\eng.testa
2020-06-04 11:59:05,582 Test: ..\data\processed\conll_03\eng.testb


<flair.datasets.sequence_labeling.CONLL_03 at 0x25a85ae2d08>

In [5]:
print(f"First sample in train sample:\n {corpus.train.dataset[0]}")

First sample in train sample:
 Sentence: "EU rejects German call to boycott British lamb ."   [− Tokens: 9  − Token-Labels: "EU <NNP/I-NP/S-ORG> rejects <VBZ/I-VP> German <JJ/I-NP/S-MISC> call <NN/I-NP> to <TO/I-VP> boycott <VB/I-VP> British <JJ/I-NP/S-MISC> lamb <NN/I-NP> . <.>"]


Define experimentation object, which will be used for logging the experiments parameters, metrics and artifacts
*Replace MlflowExperimentation if you use a different experimentation system*

In [6]:
experimentation = MlflowExperimentation()

Create model/logic:

In [7]:
flair.device = torch.device("cpu")

class FlairNERModel(BaseModel):

    def __init__(self, 
                 corpus: Corpus,
                 hidden_size: int =256, 
                 pooling: str = 'min', 
                 word_embeddings: str='glove',
                 train_with_dev: bool = True,
                 max_epochs: int = 10):
        self.tag_type = 'ner'
        self.tag_dictionary = None
        self.tagger = None
        self.embeddings = None

        self.hidden_size = hidden_size
        self.pooling = pooling
        self.word_embeddings = word_embeddings
        self.train_with_dev = train_with_dev
        self.max_epochs = max_epochs
        
        self.set_tagger_definition(corpus)
        
        hyper_params = self.get_hyper_params(hidden_size=hidden_size, 
                             pooling=pooling, 
                             word_embeddings=word_embeddings, 
                             train_with_dev=train_with_dev, 
                             max_epochs=max_epochs)
        
        super().__init__(**hyper_params)

    def get_hyper_params(self, **hyper_params):
        basic_params = {param_name: param_value 
                        for (param_name, param_value) in self.tagger.__dict__.items() 
                        if type(param_value) in (bool, float, int, str)}
        hyper_params.update(basic_params)
        return hyper_params
        
    def set_embeddings_definition(self) -> List[TokenEmbeddings]:
        """
        Sets the embedding layers used by this tagger
        """
        # initialize embeddings
        embedding_types: List[TokenEmbeddings] = [

        # Word embeddings (default = GloVe)
        WordEmbeddings(self.word_embeddings),

        # contextual string embeddings, forward
        PooledFlairEmbeddings('news-forward', pooling=self.pooling),

        # contextual string embeddings, backward
        PooledFlairEmbeddings('news-backward', pooling=self.pooling)
        ]
        self.embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    
    def set_tagger_definition(self, corpus:Corpus) -> SequenceTagger:
        """
        Returns the definition of the Flair SequenceTagger (the full model)
        :param corpus: Used only for setting the tag_dictionary
        """
        
        if not self.embeddings:
            self.set_embeddings_definition()
        self.tag_dictionary = corpus.make_tag_dictionary(tag_type=self.tag_type)
        
        tagger: SequenceTagger = SequenceTagger(hidden_size=self.hidden_size,
                                                embeddings=self.embeddings,
                                                tag_dictionary=self.tag_dictionary,
                                                tag_type=self.tag_type,
                                                use_crf=False)
        self.tagger = tagger
        
    def fit(self, corpus: Corpus) -> None:
        # initialize trainer
        trainer: ModelTrainer = ModelTrainer(self.tagger, corpus)

        trainer.train('models/taggers/flair-ner',
                      train_with_dev=self.train_with_dev,  
                      max_epochs=self.max_epochs)

    def predict(self, sentences):
        tagged_sentences = []
        for sentence in tqdm(sentences):
            self.tagger.predict(sentence)
            tagged_sentences.append(sentence)
        return tagged_sentences
            
model = FlairNERModel(corpus=corpus)

2020-06-04 11:59:15,116 | INFO : loading Word2VecKeyedVectors object from C:\Users\ommendel\.flair\embeddings\glove.gensim
2020-06-04 11:59:16,625 | INFO : loading vectors from C:\Users\ommendel\.flair\embeddings\glove.gensim.vectors.npy with mmap=None
2020-06-04 11:59:16,802 | INFO : setting ignored attribute vectors_norm to None
2020-06-04 11:59:16,803 | INFO : loaded C:\Users\ommendel\.flair\embeddings\glove.gensim
2020-06-04 11:59:18,669 | INFO : Created model FlairNERModel with hyperparams {'hidden_size': 256, 'pooling': 'min', 'word_embeddings': 'glove', 'train_with_dev': True, 'max_epochs': 10, 'training': True, 'use_rnn': True, 'use_crf': False, 'rnn_layers': 1, 'trained_epochs': 0, 'tag_type': 'ner', 'tagset_size': 20, 'beta': 1.0, 'nlayers': 1, 'use_dropout': 0.0, 'use_word_dropout': 0.05, 'use_locked_dropout': 0.5, 'pickle_module': 'pickle', 'reproject_to': 8292, 'relearn_embeddings': True, 'train_initial_hidden_state': False, 'bidirectional': True, 'rnn_type': 'LSTM'}


In [8]:
TRAIN=False

if TRAIN:
    model.fit(corpus)
else:
    # Simulate training has finished by downloading a pretrained model
    model.tagger = SequenceTagger.load('ner')

2020-06-04 11:59:18,682 loading file C:\Users\ommendel\.flair\models\en-ner-conll03-v0.4.pt


Example prediction

In [9]:
example_sentence = Sentence("In Penny Lane, there is a barber showing photographs")

model.predict([example_sentence])
for token in example_sentence.tokens:
    #print(token.__dict__)
    print(f" {token.text} | {token.get_tag('ner')}")

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


 In | O (1.0)
 Penny | B-LOC (0.9802)
 Lane, | E-LOC (0.999)
 there | O (1.0)
 is | O (1.0)
 a | O (1.0)
 barber | O (1.0)
 showing | O (1.0)
 photographs | O (1.0)


In [10]:
for sentence in [corpus.test[i] for i in range(5)]:
    [token.add_tag_label("gold_ner",token.get_tag("ner")) for token in sentence.tokens]
    [token.set_label("ner",value="O") for token in sentence.tokens]
#for token in test1:
    

for token in corpus.test[1]:
    print(token.get_tag("ner"))


O (1.0)
O (1.0)
O (1.0)
O (1.0)
O (1.0)
O (1.0)


In [11]:
corpus.test

<torch.utils.data.dataset.Subset at 0x25ab881f1c8>

In [None]:
for sentence in [corpus.test[i] for i in range(5)]:
    [token.add_tag_label("gold_ner",token.get_tag("ner")) for token in sentence.tokens]
    [token.set_label("ner",value="O") for token in sentence.tokens]

predictions = model.predict(corpus.test)

Define evaluation

In [30]:
class NEREvaluationMetrics:
    """
    This class holds the metrics calculated during the experiment run
    """
    def __init__(self,f1, accuracy):
        self.f1 = f1
        self.accuracy = accuracy

class NEREvaluator(Evaluator):
    """
    This class holds the logic for evaluating a prediction outcome
    """
    def evaluate(self, y_test, predicted_sentences) -> EvaluationMetrics:
        golds = []
        predicted = []
        print(y_test)
        print(predicted_sentences)
        for sentence in predicted_sentences:

            gold_tags = [token.get_tag('gold_ner').value for token in sentence.tokens]
            golds.append(gold_tags)
            predicted_tags = [token.get_tag('ner').value for token in sentence.tokens]
            predicted.append(predicted_tags)
        
        f1 = f1_score(golds, predicted)
        accuracy = accuracy_score(golds, predicted)
        return NEREvaluationMetrics(f1=f1,accuracy=accuracy)
        
evaluator = NEREvaluator()

Define experiment runner

In [31]:
class NERExperimentRunner(ExperimentRunner):
    
    def __init__(self,
                model,
                corpus,
                data_loader,
                log_experiment,
                experiment_logger,
                evaluator,
                experiment_name):
        self.corpus = corpus
        super().__init__(model=model,
                         data_loader=data_loader, 
                         log_experiment=log_experiment,
                         experiment_logger=experiment_logger,
                         evaluator=evaluator, 
                         experiment_name=experiment_name,
                         X_train=None, X_test=None)
    
    def predict(self):
        """
        Overwriting the ExperimentRunner predict to verify that tags are marked correctly before prediction
        """
        corpus = self.corpus
        # Copy gold NER to new label and assign O to all ner labels (to be populated during inference)
        for sentence in corpus.test:
            [token.add_tag_label("gold_ner",token.get_tag("ner")) for token in sentence.tokens]
            [token.set_label("ner",value="O") for token in sentence.tokens]

        self._predictions = self.model.predict(corpus.test)


experiment_runner = NERExperimentRunner(
    model=model,
    corpus=corpus,
    data_loader=data_loader,
    log_experiment=True,
    experiment_logger=experimentation,
    evaluator=evaluator,
    experiment_name="Experiment",
)

results = experiment_runner.evaluate()
print(results)


2020-06-04 15:15:41,999 | INFO : Starting experiment: Experiment...
2020-06-04 15:15:42,000 | INFO : Connecting to MlflowExperimentation
Logging package in C:\Users\ommendel\OneDrive - Microsoft\Projects\MLExperimentationFramework\samples\ner_sample\src
None
[]


ZeroDivisionError: division by zero