# <span style="font-family:Courier New; color:#CCCCCC">**Dutch Named Entity Recognition CRF**</span>

In [None]:
# Installs eli5 into the kernel's environment through the %pip magic.
# NOTE(review): eli5 is not imported in the cells visible here - confirm it is still needed,
# and pin a version (eli5==<version>) so a fresh run resolves the same package.
%pip install eli5

## <span style="font-family:Courier New; color:#336666">**Load Data and Imports**</span>

In [10]:
import os

from preprocessing import convert_BIO
from ner_evaluation import *
from feature_getter import Feature_getter
import pycrfsuite
from collections import Counter

import nltk
nltk.download('conll2002')
from nltk.corpus import conll2002

# Load the three Dutch CoNLL-2002 files as IOB-tagged sentences, in the order
# train / validation (testa) / test (testb).
ned_train, ned_val, ned_test = (
    conll2002.iob_sents(fileid)
    for fileid in ('ned.train', 'ned.testa', 'ned.testb')
)

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\jerez\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


## <span style="font-family:Courier New; color:#336666">**Preprocessing Data**</span>

In [2]:
# Run every split through the project's convert_BIO helper (train, validation, test order).
ned_train_BIO, ned_val_BIO, ned_test_BIO = map(
    convert_BIO, (ned_train, ned_val, ned_test)
)

# For the two evaluation splits, separate each sentence into parallel lists:
# element 0 of every item goes to X_*, element 1 goes to y_*.
X_val_BIO = [[pair[0] for pair in sentence] for sentence in ned_val_BIO]
y_val_BIO = [[pair[1] for pair in sentence] for sentence in ned_val_BIO]
X_test_BIO = [[pair[0] for pair in sentence] for sentence in ned_test_BIO]
y_test_BIO = [[pair[1] for pair in sentence] for sentence in ned_test_BIO]

In [None]:
# Disabled exploration snippet: the code below is wrapped in a string literal, so none of it runs.
# If re-enabled it would collect every training token labelled 'B-PER' and print the distinct ones.
'''names =  []
for sent in ned_train_BIO:
    for token, label in sent:
        if label == 'B-PER':
            names.append(token)
r = Counter(names)
print(r.keys())'''

## <span style="font-family:Courier New; color:#336666">**Train Classifier**</span>

In [14]:
# Location of the trained baseline model file (written by train and then used by the tagger).
BASELINE_MODEL_PATH = 'models/ned_baseline_BIO.tagger'

# Create the output folder first: on a fresh checkout it may not exist yet, and training
# cannot save the model file into a missing directory.
os.makedirs(os.path.dirname(BASELINE_MODEL_PATH), exist_ok=True)

# Baseline tagger: no custom feature function is passed, so CRFTagger uses its defaults.
model = nltk.tag.CRFTagger()
model.train(ned_train_BIO, BASELINE_MODEL_PATH)

In [15]:
# Tag every test sentence with the trained CRF. The result is reused later for both
# the per-token classification report and the entity-level metrics.
pred_ned_BIO = model.tag_sents(X_test_BIO)


In [16]:
# Fetch the trained model's introspection object from the tagger held in `model`.
# NOTE(review): this reaches through the private `_tagger` attribute (presumably the underlying
# pycrfsuite tagger - confirm), so it may break if the wrapper's internals change.
info = model._tagger.info()

def print_state_features(state_features):
    """Print one line per state feature as ``weight label attribute``.

    Args:
        state_features: iterable of ``((attr, label), weight)`` items, e.g. the
            list returned by ``Counter.most_common``.

    Each line shows the weight with three decimals, the label left-aligned in a
    six-character column, and then the attribute name.
    """
    for feature_key, weight in state_features:
        attr, label = feature_key
        print("%0.3f %-6s %s" % (weight, label, attr))

# Wrap the learned state-feature weights in a Counter once and reuse it for both rankings.
state_feature_weights = Counter(info.state_features)
strongest_positive = state_feature_weights.most_common(20)
strongest_negative = state_feature_weights.most_common()[-20:]  # tail of the full descending ranking

print("Top positive:")
print_state_features(strongest_positive)

print("\nTop negative:")
print_state_features(strongest_negative)

Top positive:
8.391 O      PUNCTUATION
4.917 O      SUF_n
4.892 O      SUF_t
4.850 O      SUF_e
4.548 O      SUF_E
4.544 O      SUF_k
4.496 O      WORD_U
4.492 O      SUF_p
4.450 O      SUF_m
4.441 O      SUF_f
4.416 O      SUF_g
4.284 O      SUF_d
4.256 O      SUF_s
4.081 O      SUF_r
4.034 O      HAS_NUM
4.001 O      WORD_.
3.968 O      SUF_l
3.907 O      WORD_Ik
3.872 O      WORD_Algemeen
3.833 I-ORG  WORD_Morgen

Top negative:
-1.242 B-ORG  SUF_g
-1.248 O      WORD_graf
-1.256 O      WORD_eredienst
-1.323 B-ORG  SUF_e
-1.335 I-MISC SUF_i
-1.338 O      SUF_ck
-1.392 O      WORD_the
-1.400 B-ORG  SUF_n
-1.404 B-ORG  SUF_.
-1.446 B-PER  SUF_w
-1.462 O      SUF_adt
-1.568 B-PER  HAS_NUM
-1.576 I-ORG  WORD_De
-1.592 B-ORG  SUF_k
-1.592 O      SUF_our
-1.643 B-LOC  HAS_NUM
-1.749 O      WORD_der
-1.788 O      WORD_den
-2.936 O      WORD_&
-6.022 O      CAPITALIZATION


In [17]:
# Keep only the predicted tag of each (token, tag) pair so the shape matches y_test_BIO.
y_pred_BIO = [[tag for _token, tag in sentence] for sentence in pred_ned_BIO]

# Per-label precision / recall / F1 on the test split.
token_report = bio_classification_report(y_test_BIO, y_pred_BIO)
print(token_report)

              precision    recall  f1-score   support

       B-LOC       0.75      0.69      0.72       774
       I-LOC       0.44      0.37      0.40        49
      B-MISC       0.80      0.60      0.68      1187
      I-MISC       0.27      0.33      0.30       410
       B-ORG       0.78      0.60      0.68       882
       I-ORG       0.60      0.57      0.58       551
       B-PER       0.67      0.73      0.70      1098
       I-PER       0.76      0.91      0.83       807

   micro avg       0.69      0.66      0.67      5758
   macro avg       0.63      0.60      0.61      5758
weighted avg       0.70      0.66      0.67      5758
 samples avg       0.05      0.05      0.05      5758



In [18]:
# compute_metrics returns a pair; only its first element (the overall counts and scores) is shown here.
# The second element goes to a throwaway name rather than `_`: in IPython `_` is the
# last-output shortcut, and assigning to it stops IPython from updating it.
results, _unused_metrics = compute_metrics(ned_test_BIO, pred_ned_BIO)
results

{'correct': 2452,
 'incorrect': 715,
 'partial': 48,
 'missed': 711,
 'spurious': 281,
 'possible': 3926,
 'actual': 3496,
 'precision': 0.701,
 'recall': 0.625,
 'F1-score': 0.661}