# Evaluation: BertNer and FlairNer

## Import packages, models, and test set

In [1]:
# general
import numpy as np
import time

# models
from danlp.models import load_bert_ner_model, load_flair_ner_model

# dataset
from danlp.datasets import DDT

# utils
from flair.data import Sentence, Token
from copy import deepcopy
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Load the two DaNLP NER models compared in this notebook.
# Both loaders are imported from danlp.models in the import cell.
bert = load_bert_ner_model()
flair = load_flair_ner_model()

2021-01-12 12:28:45,599 loading file /Users/jorgentaule/.danlp/flair.ner.pt


In [3]:
# Get the data; the loader's result is unpacked into a training set,
# a validation set, and a test set.
ddt = DDT()
# NOTE(review): the positional True presumably selects the predefined splits —
# confirm against the DDT.load_as_simple_ner signature.
train, valid, test = ddt.load_as_simple_ner(True)

In [4]:
# Split the test set into its observations (`sentences`) and its targets
# (`categories`); both are indexed in parallel, sentence by sentence, below.
sentences, categories = test

## Time models and get their test set predictions

In [5]:
def get_bert_predictions():
    """Run the BERT model over every test sentence and time the full pass.

    Reads the module-level ``bert`` model and ``sentences`` list.

    Returns:
        tuple: ``(predictions, time_spent)`` where ``predictions`` holds, for
        each sentence, element 1 of whatever ``bert.predict`` returns, and
        ``time_spent`` is the elapsed time in seconds measured with
        ``time.time()``.
    """
    started_at = time.time()

    # One predict call per sentence; only index 1 of each result is kept
    # (presumably the tag sequence — confirm against the BertNer API).
    tag_sequences = [bert.predict(tokens)[1] for tokens in sentences]

    elapsed = time.time() - started_at
    return tag_sequences, elapsed

In [6]:
# Run BERT over the test sentences; keeps both the predictions and the timing.
bert_preds, bert_time_spent = get_bert_predictions()

In [7]:
# Total elapsed seconds and the average per sentence
# (bert_preds has one entry per sentence).
print(f'Time: {bert_time_spent}, time per sentence: {bert_time_spent/len(bert_preds)}')

Time: 70.5121967792511, time per sentence: 0.12480034828186035


In [8]:
def get_flair_predictions():
    """Run the Flair model over the whole test set in one batch and time it.

    Reads the module-level ``flair`` model and ``sentences`` list. The timed
    span covers building the flair ``Sentence`` objects, the single
    ``flair.predict`` call, and reading the tags back out.

    Returns:
        tuple: ``(predictions, time_spent)`` where ``predictions`` is a list
        with, per sentence, the ``'ner'`` tag value of each token, and
        ``time_spent`` is the elapsed time in seconds measured with
        ``time.time()``.
    """
    started_at = time.time()

    def _to_flair(tokens):
        # Add the tokens one by one to an empty Sentence so the existing
        # tokenisation of the test set is used as-is.
        wrapped = Sentence()
        for word in tokens:
            wrapped.add_token(Token(word))
        return wrapped

    batch = [_to_flair(tokens) for tokens in sentences]

    # The return value is not used; tags are read from the tokens afterwards.
    flair.predict(batch)

    tag_sequences = [
        [tok.tags['ner'].value for tok in tagged]
        for tagged in batch
    ]

    elapsed = time.time() - started_at
    return tag_sequences, elapsed

In [9]:
# Run Flair over the test sentences; keeps both the predictions and the timing.
flair_preds, flair_time_spent = get_flair_predictions()

  word_embedding, device=flair.device, dtype=torch.float


In [10]:
# Total elapsed seconds and the average per sentence
# (flair_preds has one entry per sentence).
print(f'Time: {flair_time_spent}, time per sentence: {flair_time_spent/len(flair_preds)}')

Time: 39.77071785926819, time per sentence: 0.07039065107835078


## Accuracy, precision, recall and F1-score

Relabel the MISC tags (`B-MISC`, `I-MISC`) in the test-set targets as `O`, since the models do not predict MISC.

In [11]:
# Fold the gold MISC tags into 'O', in place. `sentence` is the same list
# object as categories[i], so writing through it updates `categories`.
for i, sentence in enumerate(categories):
    for j, token in enumerate(sentence):
        if token in ('I-MISC', 'B-MISC'):
            sentence[j] = 'O'

### Which situations are there?

- Look at all the predictions similarly.
- Only look at PER, ORG, LOC separately.

### Some definitions

- Accuracy = (TP+TN)/(TP+FP+FN+TN)
- Precision = TP/(TP+FP)
- Recall = TP/(TP+FN)
- F1 Score = 2 * (Recall * Precision) / (Recall + Precision)

In [12]:
# Collapse the per-sentence tag lists into one flat tag sequence per source,
# keeping sentence order and token order, so they can be compared token by token.
flair_preds_flattened = [tag for sentence_tags in flair_preds for tag in sentence_tags]
bert_preds_flattened = [tag for sentence_tags in bert_preds for tag in sentence_tags]
categories_flattened = [tag for sentence_tags in categories for tag in sentence_tags]

In [14]:
# Label set and row order for the reports; MISC is intentionally absent.
labels = [
    'O',
    'B-PER', 'I-PER',
    'B-LOC', 'I-LOC',
    'B-ORG', 'I-ORG',
]

# Same gold sequence and label set for both models, so the reports line up
# row for row.
bert_rep, flair_rep = (
    classification_report(y_true=categories_flattened, y_pred=model_preds, labels=labels)
    for model_preds in (bert_preds_flattened, flair_preds_flattened)
)

In [15]:
# Per-label precision / recall / F1 for BERT on the flattened test tokens.
print(bert_rep)

              precision    recall  f1-score   support

           O       0.99      1.00      1.00      9383
       B-PER       0.93      0.93      0.93       180
       I-PER       0.98      1.00      0.99       138
       B-LOC       0.80      0.90      0.84        96
       I-LOC       0.19      0.80      0.31         5
       B-ORG       0.86      0.66      0.75       161
       I-ORG       0.94      0.55      0.69        60

    accuracy                           0.99     10023
   macro avg       0.81      0.83      0.79     10023
weighted avg       0.99      0.99      0.99     10023



In [16]:
# Per-label precision / recall / F1 for Flair on the flattened test tokens.
print(flair_rep)

              precision    recall  f1-score   support

           O       0.99      1.00      0.99      9383
       B-PER       0.92      0.94      0.93       180
       I-PER       0.97      1.00      0.98       138
       B-LOC       0.86      0.85      0.86        96
       I-LOC       1.00      0.40      0.57         5
       B-ORG       0.90      0.50      0.65       161
       I-ORG       0.91      0.70      0.79        60

    accuracy                           0.99     10023
   macro avg       0.94      0.77      0.83     10023
weighted avg       0.99      0.99      0.98     10023



## Output model mistakes

In [50]:
def faulty_sentences(preds):
    """Find the sentences whose predicted tag list differs from the gold one.

    Compares ``preds[i]`` with the module-level ``categories[i]`` for every
    index of ``categories``.

    Args:
        preds: per-sentence predictions, indexable in parallel with
            ``categories``.

    Returns:
        tuple: ``(fault_indices, fault_pred_sents)`` — the indices of the
        mismatching sentences and, in the same order, their predictions.
    """
    mismatches = [
        (idx, preds[idx])
        for idx, gold in enumerate(categories)
        if gold != preds[idx]
    ]

    fault_indices = [idx for idx, _ in mismatches]
    fault_pred_sents = [predicted for _, predicted in mismatches]

    return fault_indices, fault_pred_sents

# For each model: the indices of the test sentences it got wrong (any token
# differing from the gold tags) and the corresponding predicted tag lists.
flair_fault_indices, flair_fault_pred_sents = faulty_sentences(flair_preds)
bert_fault_indices, bert_fault_pred_sents = faulty_sentences(bert_preds)

In [56]:
# Number of sentences BERT got wrong vs. total number of test sentences.
len(bert_fault_indices), len(categories)

(73, 565)

In [62]:
# Show the predicted tags of the first sentence BERT got wrong, comma-separated.
separator = ','
print(separator.join(bert_fault_pred_sents[0]))

O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [53]:
# Sanity check: each stored faulty prediction was selected because it differs
# from its gold sentence, so every printed line is expected to be False.
for position, sentence_index in enumerate(bert_fault_indices):
    print(categories[sentence_index] == bert_fault_pred_sents[position])

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


## Old

### Counting number of instances

In [17]:
def compute_metrics(token, token_prediction, metric_dict):
    """Update the error counters for one (gold tag, predicted tag) pair.

    Tags are either ``'O'`` or IOB tags such as ``'B-PER'``; the entity type
    is taken as the tag with its first two characters stripped.

    Args:
        token: the gold tag.
        token_prediction: the predicted tag.
        metric_dict: nested counters, keyed first by ``'total'`` or an entity
            type and then by metric name (``'correct'``, ``'correct-not-O'``,
            ``'incorrect'``, ``'missed'``, ``'hypothesize'``,
            ``'correct-type-only'``). Updated in place.

    Returns:
        The same ``metric_dict`` object that was passed in.

    Raises:
        KeyError: if a non-'O' tag has an entity type that is not a key of
            ``metric_dict``.
    """
    total = metric_dict['total']

    # Every inexact prediction counts as incorrect, overall and (when the gold
    # tag is an entity) for the gold entity type.
    if token != token_prediction:
        total['incorrect'] += 1
        if token != 'O':
            metric_dict[token[2:]]['incorrect'] += 1

    if token == 'O':
        if token_prediction == 'O':
            # both are 'O': correct
            total['correct'] += 1
        else:
            # gold is 'O' but an entity was predicted: HYPOTHESIZE
            total['hypothesize'] += 1
            metric_dict[token_prediction[2:]]['hypothesize'] += 1
    elif token_prediction == 'O':
        # gold is an entity but 'O' was predicted: MISSED
        total['missed'] += 1
        metric_dict[token[2:]]['missed'] += 1
    elif token == token_prediction:
        # exact match on an entity tag
        gold_type = metric_dict[token[2:]]
        total['correct'] += 1
        total['correct-not-O'] += 1
        gold_type['correct'] += 1
        gold_type['correct-not-O'] += 1
    elif token[2:] == token_prediction[2:]:
        # Same entity type, different IOB prefix (e.g. B-PER vs I-PER).
        # The comparison must be on the type suffix; comparing the two-char
        # prefix would instead count pairs like B-PER vs B-LOC.
        total['correct-type-only'] += 1
        metric_dict[token[2:]]['correct-type-only'] += 1

    # The branches above cover every combination of 'O' / non-'O' tags, so no
    # fallback branch is needed.
    return metric_dict

In [18]:
# Tag frequency counters: one for the gold tags and one per model.
# 'not-O' aggregates every tag other than 'O'.
category_counter = {
    'O': 0, 'not-O': 0,
    'B-PER': 0, 'I-PER': 0,
    'B-LOC': 0, 'I-LOC': 0,
    'B-ORG': 0, 'I-ORG': 0,
}
flair_counter = deepcopy(category_counter)
bert_counter = deepcopy(category_counter)

# Template of the per-group error counters maintained by compute_metrics.
metrics = {
    'correct': 0,          # exactly the same
    'correct-not-O': 0,    # exactly the same and not 'O'
    'incorrect': 0,        # not exactly the same
    'missed': 0,           # token is not 'O', but 'O' is predicted
    'hypothesize': 0,      # token is 'O', but something else is predicted
    'correct-type-only': 0 # type not 'O' and is correct, disregarding IOB
}

# One independent copy of the template overall and per entity type.
flair_total_metrics = {
    group: deepcopy(metrics)
    for group in ('total', 'PER', 'LOC', 'ORG')
}
bert_total_metrics = deepcopy(flair_total_metrics)

# Walk the gold tags and the two prediction sets in parallel, token by token.
for i, sentence in enumerate(categories):
    for j, token in enumerate(sentence):

        category_counter[token] += 1
        category_counter['not-O'] += int(token != 'O')

        flair_counter[flair_preds[i][j]] += 1
        flair_counter['not-O'] += int(flair_preds[i][j] != 'O')

        bert_counter[bert_preds[i][j]] += 1
        bert_counter['not-O'] += int(bert_preds[i][j] != 'O')

        flair_total_metrics = compute_metrics(token, flair_preds[i][j], flair_total_metrics)
        bert_total_metrics = compute_metrics(token, bert_preds[i][j], bert_total_metrics)

### Confusion matrix, precision and recall computations

In [19]:
# One confusion matrix per model over the flattened tokens, with rows and
# columns ordered as in `labels`.
bert_cm, flair_cm = (
    np.array(confusion_matrix(y_true=categories_flattened, y_pred=model_preds, labels=labels))
    for model_preds in (bert_preds_flattened, flair_preds_flattened)
)

In [20]:
# Zero-initialised template with one slot per label, used as the starting
# point for the per-label metric dicts. The key order matters: the metric
# loops pair the i-th key with row/column i of the confusion matrices.
types = dict.fromkeys(
    ('O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG'),
    0,
)

In [21]:
# Accuracy = correctly tagged tokens (the matrix diagonal) / all tokens.
bert_acc = np.trace(bert_cm) / bert_cm.sum()
flair_acc = np.trace(flair_cm) / flair_cm.sum()

In [22]:
# Per-label precision holders, each starting as an independent copy of the
# all-zero `types` template (its values are plain ints, so a shallow copy
# is equivalent to a deep one).
bert_prec = dict(types)
flair_prec = dict(types)

In [23]:
# Precision per label: diagonal cell divided by the total of column i.
# Relies on the key order of `types` matching the `labels` order used to
# build the confusion matrices.
for i, key in enumerate(types):
    bert_prec[key] = bert_cm[i, i] / bert_cm[:, i].sum()
    flair_prec[key] = flair_cm[i, i] / flair_cm[:, i].sum()

In [24]:
# Per-label recall holders, each starting as an independent copy of the
# all-zero `types` template (its values are plain ints, so a shallow copy
# is equivalent to a deep one).
bert_recall = dict(types)
flair_recall = dict(types)

In [25]:
# Recall per label: diagonal cell divided by the total of row i.
# Relies on the key order of `types` matching the `labels` order used to
# build the confusion matrices.
for i, key in enumerate(types):
    bert_recall[key] = bert_cm[i, i] / bert_cm[i].sum()
    flair_recall[key] = flair_cm[i, i] / flair_cm[i].sum()

In [27]:
# How often Flair predicted each tag on the test set ('not-O' = any entity tag).
flair_counter

{'O': 9462,
 'not-O': 561,
 'B-PER': 185,
 'I-PER': 143,
 'B-LOC': 95,
 'I-LOC': 2,
 'B-ORG': 90,
 'I-ORG': 46}