# Evaluation: BertNer and FlairNer

## Import packages, models, and test set

In [69]:
# general
import numpy as np
import time

# models
from danlp.models import load_bert_ner_model, load_flair_ner_model

# dataset
from danlp.datasets import DDT

# utils
from flair.data import Sentence, Token
from copy import deepcopy
from sklearn.metrics import confusion_matrix, classification_report

In [70]:
# load models
bert = load_bert_ner_model()
flair = load_flair_ner_model()

2021-01-08 13:15:27,016 loading file /Users/jorgentaule/.danlp/flair.ner.pt


In [71]:
# get data (splitted into a training set, a validation set, and a test set)
ddt = DDT()
train, valid, test = ddt.load_as_simple_ner(True)

In [72]:
# divide the observations and the targets of the testset into new variables
sentences, categories = test

## Time models and get their test set predictions

In [73]:
def get_bert_predictions():
    start = time.time()
    
    predictions = []
    
    for sentence in sentences:
        predictions.append(bert.predict(sentence)[1])
    
    time_spent = time.time()-start
    
    return predictions, time_spent

In [74]:
bert_preds, bert_time_spent = get_bert_predictions()

ERROR: Could not find file <ipython-input-73-0ce36fe57670>
NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.


In [7]:
print(f'Time: {bert_time_spent}, time per sentence: {bert_time_spent/len(bert_preds)}')

Time: 58.782347202301025, time per sentence: 0.1040395525704443


In [8]:
def get_flair_predictions():
    start = time.time()
    
    predictions = []
    
    flair_sentences = []
    for sentence in sentences:
        flair_sentence = Sentence()
        for token in sentence:
            flair_sentence.add_token(Token(token))
        flair_sentences.append(flair_sentence)
    flair.predict(flair_sentences)
    
    for s in flair_sentences:
        predicted_categories = []
        for t in s:
            predicted_categories.append(t.tags['ner'].value)
        predictions.append(predicted_categories)
    
    time_spent = time.time()-start
    
    return predictions, time_spent

In [9]:
flair_preds, flair_time_spent = get_flair_predictions()

  word_embedding, device=flair.device, dtype=torch.float


In [10]:
print(f'Time: {flair_time_spent}, time per sentence: {flair_time_spent/len(flair_preds)}')

Time: 29.312140941619873, time per sentence: 0.05187989547189358


## Accuracy, precision, recall og f1-score

Remove MISC from the dataset, as this is not predicted.

In [11]:
for i, sentence in enumerate(categories):
    for j, token in enumerate(sentence):
        if token=='I-MISC' or token=='B-MISC':
            categories[i][j] = 'O'

### Which situations are there?

- Look at all the predictions similarly.
- Only look at PER, ORG, LOC separately.

In [12]:
def compute_metrics(token, token_prediction, metric_dict):
    # count all incorrect predictions
    if token!=token_prediction:
        metric_dict['total']['incorrect'] += 1
        if token!='O':
            metric_dict[token[2:]]['incorrect'] += 1
    
    # both are 'O': add to correct
    if token=='O' and token_prediction=='O':
        metric_dict['total']['correct'] += 1
    
    # token is 'O', prediction is something else: HYPOTHESIZE
    elif token=='O' and token_prediction!='O':
        metric_dict['total']['hypothesize'] += 1
        metric_dict[token_prediction[2:]]['hypothesize'] += 1
        
    # token is not 'O', prediction is 'O': MISSED
    elif token!='O' and token_prediction=='O':
        metric_dict['total']['missed'] += 1
        metric_dict[token[2:]]['missed'] += 1
    
    # token is not 'O', predictions is not 'O'
    elif token!='O' and token_prediction!='O':
        if token==token_prediction:
            metric_dict['total']['correct'] += 1
            metric_dict['total']['correct-not-O'] += 1
            metric_dict[token[2:]]['correct'] += 1
            metric_dict[token[2:]]['correct-not-O'] += 1
        elif token[:2]==token_prediction[:2]:
            metric_dict['total']['correct-type-only'] += 1
            metric_dict[token[2:]]['correct-type-only'] += 1
            
    else:
        print('An error has occured.')
        print(token)
        print(token_prediction)
    
    return metric_dict

In [13]:
category_counter = {'O': 0, 'not-O': 0,
                    'B-PER': 0, 'I-PER': 0, 
                    'B-LOC': 0, 'I-LOC': 0, 
                    'B-ORG': 0, 'I-ORG': 0}

flair_counter = deepcopy(category_counter)
bert_counter = deepcopy(category_counter)

metrics = {
    'correct': 0,          # exactly the same
    'correct-not-O': 0,    # exactly the same and not 'O'
    'incorrect': 0,        # not exactly the same
    'missed': 0,           # token is not 'O', but 'O' is predicted
    'hypothesize': 0,      # token is 'O', but something else is predicted
    'correct-type-only': 0 # type not 'O' and is correct, disregarding IOB
}

flair_total_metrics = {
    'total': deepcopy(metrics),
    'PER': deepcopy(metrics),
    'LOC': deepcopy(metrics),
    'ORG': deepcopy(metrics)
}

bert_total_metrics = deepcopy(flair_total_metrics)

for i, sentence in enumerate(categories):
    for j, token in enumerate(sentence):
        
        category_counter[token] += 1
        if token!='O':
            category_counter['not-O'] += 1
            
        flair_counter[flair_preds[i][j]] += 1
        if flair_preds[i][j]!='O':
            flair_counter['not-O'] += 1
            
        bert_counter[bert_preds[i][j]] += 1
        if bert_preds[i][j]!='O':
            bert_counter['not-O'] += 1
        
        flair_total_metrics = compute_metrics(token, flair_preds[i][j], flair_total_metrics)
        bert_total_metrics = compute_metrics(token, bert_preds[i][j], bert_total_metrics)    

In [14]:
flair_total_metrics

{'total': {'correct': 9886,
  'correct-not-O': 515,
  'incorrect': 137,
  'missed': 91,
  'hypothesize': 12,
  'correct-type-only': 32},
 'PER': {'correct': 308,
  'correct-not-O': 308,
  'incorrect': 10,
  'missed': 8,
  'hypothesize': 1,
  'correct-type-only': 2},
 'LOC': {'correct': 84,
  'correct-not-O': 84,
  'incorrect': 17,
  'missed': 12,
  'hypothesize': 4,
  'correct-type-only': 4},
 'ORG': {'correct': 123,
  'correct-not-O': 123,
  'incorrect': 98,
  'missed': 71,
  'hypothesize': 7,
  'correct-type-only': 26}}

In [15]:
bert_total_metrics

{'total': {'correct': 9896,
  'correct-not-O': 535,
  'incorrect': 127,
  'missed': 53,
  'hypothesize': 22,
  'correct-type-only': 51},
 'PER': {'correct': 306,
  'correct-not-O': 306,
  'incorrect': 12,
  'missed': 10,
  'hypothesize': 2,
  'correct-type-only': 2},
 'LOC': {'correct': 90,
  'correct-not-O': 90,
  'incorrect': 11,
  'missed': 10,
  'hypothesize': 4,
  'correct-type-only': 1},
 'ORG': {'correct': 139,
  'correct-not-O': 139,
  'incorrect': 82,
  'missed': 33,
  'hypothesize': 16,
  'correct-type-only': 48}}

In [16]:
category_counter

{'O': 9383,
 'not-O': 640,
 'B-PER': 180,
 'I-PER': 138,
 'B-LOC': 96,
 'I-LOC': 5,
 'B-ORG': 161,
 'I-ORG': 60}

In [17]:
bert_counter

{'O': 9414,
 'not-O': 609,
 'B-PER': 181,
 'I-PER': 141,
 'B-LOC': 108,
 'I-LOC': 21,
 'B-ORG': 123,
 'I-ORG': 35}

In [18]:
flair_counter

{'O': 9462,
 'not-O': 561,
 'B-PER': 185,
 'I-PER': 143,
 'B-LOC': 95,
 'I-LOC': 2,
 'B-ORG': 90,
 'I-ORG': 46}

### Some definitions

- Accuracy = (TP+TN)/(TP+FP+FN+TN)
- Precision = TP/(TP+FP)
- Recall = TP/(TP+FN)
- F1 Score = 2 * (Recall * Precision) / (Recall + Precision)

(https://blog.exsilio.com/all/accuracy-precision-recall-f1-score-interpretation-of-performance-measures/)

In [19]:
flair_preds_flattened = [item for sublist in flair_preds for item in sublist]
bert_preds_flattened = [item for sublist in bert_preds for item in sublist]
categories_flattened = [item for sublist in categories for item in sublist]

In [38]:
labels = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']

bert_cm = np.array(confusion_matrix(categories_flattened, bert_preds_flattened, labels=labels))
flair_cm = np.array(confusion_matrix(categories_flattened, flair_preds_flattened, labels=labels))

In [39]:
print(labels)
print(bert_cm)

['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']
[[9361    2    0    2    2   14    2]
 [  10  168    0    1    0    1    0]
 [   0    0  138    0    0    0    0]
 [   9    0    0   86    0    1    0]
 [   1    0    0    0    4    0    0]
 [  25   11    0   19    0  106    0]
 [   8    0    3    0   15    1   33]]


In [40]:
print(labels)
print(flair_cm)

['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']
[[9371    1    0    4    0    5    2]
 [   8  170    0    1    0    1    0]
 [   0    0  138    0    0    0    0]
 [  10    1    0   82    0    2    1]
 [   2    0    0    0    2    0    1]
 [  59   13    0    8    0   81    0]
 [  12    0    5    0    0    1   42]]


In [41]:
types = {
    'O': 0,
    'B-PER': 0, 'I-PER': 0, 
    'B-LOC': 0, 'I-LOC': 0, 
    'B-ORG': 0, 'I-ORG': 0
}

In [42]:
bert_acc = sum([bert_cm[i][i] for i in range(len(bert_cm))])/sum([item for sublist in bert_cm for item in sublist])
flair_acc = sum([flair_cm[i][i] for i in range(len(flair_cm))])/sum([item for sublist in flair_cm for item in sublist])

In [43]:
bert_acc, flair_acc

(0.9873291429711664, 0.9863314376933054)

In [44]:
bert_prec = deepcopy(types)
flair_prec = deepcopy(types)

In [54]:
for i, key in enumerate(types.keys()):
    bert_prec[key] = bert_cm[i][i]/sum(bert_cm.T[i])
    flair_prec[key] = flair_cm[i][i]/sum(flair_cm.T[i])

In [55]:
bert_prec

{'O': 0.9943700871043127,
 'B-PER': 0.9281767955801105,
 'I-PER': 0.9787234042553191,
 'B-LOC': 0.7962962962962963,
 'I-LOC': 0.19047619047619047,
 'B-ORG': 0.8617886178861789,
 'I-ORG': 0.9428571428571428}

In [56]:
flair_prec

{'O': 0.9903825829634326,
 'B-PER': 0.918918918918919,
 'I-PER': 0.965034965034965,
 'B-LOC': 0.8631578947368421,
 'I-LOC': 1.0,
 'B-ORG': 0.9,
 'I-ORG': 0.9130434782608695}

In [57]:
bert_recall = deepcopy(types)
flair_recall = deepcopy(types)

In [58]:
for i, key in enumerate(types.keys()):
    bert_recall[key] = bert_cm[i][i]/sum(bert_cm[i])
    flair_recall[key] = flair_cm[i][i]/sum(flair_cm[i])

In [59]:
bert_recall

{'O': 0.9976553341148886,
 'B-PER': 0.9333333333333333,
 'I-PER': 1.0,
 'B-LOC': 0.8958333333333334,
 'I-LOC': 0.8,
 'B-ORG': 0.6583850931677019,
 'I-ORG': 0.55}

In [60]:
flair_recall

{'O': 0.9987210913353938,
 'B-PER': 0.9444444444444444,
 'I-PER': 1.0,
 'B-LOC': 0.8541666666666666,
 'I-LOC': 0.4,
 'B-ORG': 0.5031055900621118,
 'I-ORG': 0.7}

In [64]:
bert_rep = classification_report(categories_flattened, bert_preds_flattened, labels=labels)
flair_rep = classification_report(categories_flattened, flair_preds_flattened, labels=labels)

In [66]:
print(bert_rep)

              precision    recall  f1-score   support

           O       0.99      1.00      1.00      9383
       B-PER       0.93      0.93      0.93       180
       I-PER       0.98      1.00      0.99       138
       B-LOC       0.80      0.90      0.84        96
       I-LOC       0.19      0.80      0.31         5
       B-ORG       0.86      0.66      0.75       161
       I-ORG       0.94      0.55      0.69        60

    accuracy                           0.99     10023
   macro avg       0.81      0.83      0.79     10023
weighted avg       0.99      0.99      0.99     10023



In [67]:
print(flair_rep)

              precision    recall  f1-score   support

           O       0.99      1.00      0.99      9383
       B-PER       0.92      0.94      0.93       180
       I-PER       0.97      1.00      0.98       138
       B-LOC       0.86      0.85      0.86        96
       I-LOC       1.00      0.40      0.57         5
       B-ORG       0.90      0.50      0.65       161
       I-ORG       0.91      0.70      0.79        60

    accuracy                           0.99     10023
   macro avg       0.94      0.77      0.83     10023
weighted avg       0.99      0.99      0.98     10023



## Space complexity

In [68]:
!pip install -U memory_profiler

Collecting memory_profiler
  Downloading memory_profiler-0.58.0.tar.gz (36 kB)
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25ldone
[?25h  Created wheel for memory-profiler: filename=memory_profiler-0.58.0-py3-none-any.whl size=29147 sha256=a3119a589005368a0c7d809c2494be950d1e9a8d9449431dd7c5ddd55a88f5b8
  Stored in directory: /Users/jorgentaule/Library/Caches/pip/wheels/56/19/d5/8cad06661aec65a04a0d6785b1a5ad035cb645b1772a4a0882
Successfully built memory-profiler
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.58.0
You should consider upgrading via the '/Users/jorgentaule/.pyenv/versions/anaconda3-5.3.1/bin/python -m pip install --upgrade pip' command.[0m


## Look at sentences where the models do wrong predictions (new notebook?)