# Evaluation: BertNer and FlairNer

## Import packages, models, and test set

In [65]:
# import packages
from danlp.models import load_bert_ner_model, load_flair_ner_model
from danlp.datasets import DDT
import time
from flair.data import Sentence, Token

In [66]:
# Load the two DaNLP NER models under comparison via their loader helpers.
# The cell output shows the flair loader reading a cached model file from disk.
bert = load_bert_ner_model()
flair = load_flair_ner_model()

2021-01-07 02:27:00,947 loading file /Users/jorgentaule/.danlp/flair.ner.pt


In [67]:
# Get the DDT data, split into a training set, a validation set, and a test set.
ddt = DDT()
# NOTE(review): the positional True presumably requests the predefined splits —
# confirm against the DDT.load_as_simple_ner documentation.
train, valid, test = ddt.load_as_simple_ner(True)

In [None]:
# Unpack the test set into its observations (the sentences) and its targets
# (one list of NER tags per sentence, e.g. 'O', 'B-PER', 'I-PER').
sentences, categories = test

## Time models and get their test set predictions

In [69]:
def get_bert_predictions():
    """Predict every test sentence with the BERT model and time the whole pass.

    Reads the notebook-level ``bert`` model and ``sentences`` list.

    Returns:
        tuple: ``(predictions, time_spent)``. ``predictions`` holds, for each
        sentence, element ``[1]`` of ``bert.predict(sentence)``;
        ``time_spent`` is the elapsed wall-clock time in seconds as measured
        with ``time.time()``.
    """
    t0 = time.time()

    # Sentences are predicted one at a time. Only element [1] of each result
    # is kept — presumably the tag sequence; confirm against the DaNLP docs.
    predictions = [bert.predict(tokens)[1] for tokens in sentences]

    elapsed = time.time() - t0
    return predictions, elapsed

In [70]:
# Run BERT over the test set; keep both the predictions and the elapsed time.
bert_preds, bert_time_spent = get_bert_predictions()

In [75]:
# Report the total BERT time and the average per test sentence (both in seconds).
print(f'Time: {bert_time_spent}, time per sentence: {bert_time_spent/len(bert_preds)}')

Time: 63.273518085479736, time per sentence: 0.11198852758491988


In [72]:
def get_flair_predictions():
    """Predict every test sentence with the flair model and time the whole pass.

    Reads the notebook-level ``flair`` model and ``sentences`` list. The timed
    span covers building the flair ``Sentence`` objects, the single
    ``flair.predict`` call over all of them, and reading the tags back out.

    Returns:
        tuple: ``(predictions, time_spent)``. ``predictions`` holds, for each
        sentence, the list of ``'ner'`` tag values of its tokens;
        ``time_spent`` is the elapsed wall-clock time in seconds as measured
        with ``time.time()``.
    """
    t0 = time.time()

    def as_flair_sentence(words):
        # Add the tokens one by one so the existing tokenization is kept as is.
        built = Sentence()
        for word in words:
            built.add_token(Token(word))
        return built

    flair_sentences = [as_flair_sentence(words) for words in sentences]

    # All sentences are handed to the model in one call; the tags are read
    # back from the same Sentence objects afterwards.
    flair.predict(flair_sentences)

    predictions = [
        [tok.tags['ner'].value for tok in sent]
        for sent in flair_sentences
    ]

    elapsed = time.time() - t0
    return predictions, elapsed

In [73]:
# Run flair over the test set; keep both the predictions and the elapsed time.
flair_preds, flair_time_spent = get_flair_predictions()

In [76]:
# Report the total flair time and the average per test sentence (both in seconds).
print(f'Time: {flair_time_spent}, time per sentence: {flair_time_spent/len(flair_preds)}')

Time: 28.956867933273315, time per sentence: 0.051251093687209405


## Accuracy, precision, recall, and F1-score

### Some definitions

- Accuracy = (TP + TN) / (TP + FP + FN + TN)
- Precision = TP / (TP + FP)
- Recall = TP / (TP + FN)
- F1 Score = 2 * (Recall * Precision) / (Recall + Precision)

(https://blog.exsilio.com/all/accuracy-precision-recall-f1-score-interpretation-of-performance-measures/)

### Which situations are there?

- Look at all the predictions together, treating every tag the same way.
- Look at PER, ORG, LOC, and MISC separately.
- Look at all the predictions together, while setting MISC=O.

## Look at sentences where the models make wrong predictions (new notebook?)

In [77]:
# Inspect the gold NER tags of the test set.
# NOTE(review): this displays the entire nested list; a slice such as
# categories[:5] would keep the output short.
categories

[['O',
  'O',
  'B-MISC',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-MISC', 'O', 'O', 'B-MISC', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O'],
 ['O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [None]:
def is_misc(ent: str):
    """Tell whether a tag ends in 'MISC'.

    Args:
        ent: A tag string such as 'O', 'B-PER' or 'B-MISC'.

    Returns:
        bool: True when ``ent`` has at least four characters and its last
        four are exactly 'MISC'; False otherwise.
    """
    return len(ent) >= 4 and ent[-4:] == 'MISC'


def remove_miscs(se: list):
    """Return a copy of the nested tag lists with every MISC tag replaced by 'O'.

    Args:
        se: A list of tag lists (one inner list per sentence).

    Returns:
        list: A new list of new inner lists. Tags for which ``is_misc`` is
        true become 'O'; all other tags are kept unchanged. The input is not
        modified.
    """
    cleaned = []
    for tags in se:
        cleaned.append(['O' if is_misc(tag) else tag for tag in tags])
    return cleaned