# Named Entity Recognition using python-crfsuite

In [None]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

print(sklearn.__version__)

## Let's use CoNLL 2003 data to build a NER system
The CoNLL 2002 corpus processing functions are available in NLTK. We use these functions to preprocess the CoNLL 2003 dataset.

In [None]:
nltk.corpus.conll2002.fileids()

## 1. Training data
The CoNLL 2003 dataset contains a list of English sentences with Named Entities annotated. It uses IOB2 encoding. The CoNLL 2003 data also provides POS tags.

In [None]:
# Load the train and test splits as lists of sentences through NLTK's
# conll2002 corpus reader, using the file ids 'eng.train' and 'eng.testb'.
# NOTE(review): presumably these files have to be placed in the NLTK
# conll2002 corpus directory beforehand -- confirm.
train_sents = list(nltk.corpus.conll2002.iob_sents('eng.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('eng.testb'))

In [None]:
train_sents[0]

## Feature Extraction
Next, define some features. In this example we use word identity, word suffix, word shape and word POS tag; also, some information from nearby words is used. This makes a simple baseline, but you certainly can add and remove some features to get (much?) better results - experiment with it.

In [None]:
def word2features(sent, i):
    """Return the list of string features describing token ``i`` of ``sent``.

    Each item of ``sent`` is indexed as ``(word, postag, ...)``. The result
    holds, in order:

    * features of the token itself: a constant 'bias', the lower-cased word,
      its last three and last two characters, upper/title/digit flags, the
      POS tag and the first two characters of the POS tag;
    * the same kind of features for the previous token (prefixed '-1:'),
      or the marker 'BOS' when ``i`` is not greater than 0;
    * the same kind of features for the next token (prefixed '+1:'),
      or the marker 'EOS' when ``i`` is not below the last index.
    """
    token, tag = sent[i][0], sent[i][1]
    result = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'postag=' + tag,
        'postag[:2]=' + tag[:2],
    ]

    # Left context: nothing precedes the first token, so mark it instead.
    if i <= 0:
        result.append('BOS')
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        result += [
            '-1:word.lower=' + prev_token.lower(),
            '-1:word.istitle=%s' % prev_token.istitle(),
            '-1:word.isupper=%s' % prev_token.isupper(),
            '-1:postag=' + prev_tag,
            '-1:postag[:2]=' + prev_tag[:2],
        ]

    # Right context: nothing follows the last token, so mark it instead.
    if i >= len(sent) - 1:
        result.append('EOS')
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        result += [
            '+1:word.lower=' + next_token.lower(),
            '+1:word.istitle=%s' % next_token.istitle(),
            '+1:word.isupper=%s' % next_token.isupper(),
            '+1:postag=' + next_tag,
            '+1:postag[:2]=' + next_tag[:2],
        ]

    return result

In [None]:
def sent2features(sent):
    """Return one ``word2features`` result per position in ``sent``, in order."""
    positions = range(len(sent))
    return [word2features(sent, position) for position in positions]

In [None]:
def sent2labels(sent):
    """Return the third element (the label) of every 3-item entry in ``sent``."""
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels

In [None]:
def sent2tokens(sent):
    """Return the first element (the token) of every 3-item entry in ``sent``."""
    words = []
    for word, _pos, _tag in sent:
        words.append(word)
    return words

This is what word2features extracts:

In [None]:
sent2features(train_sents[0])[0]

Extract the features from the data:

In [None]:
%%time
# One feature sequence and one label sequence per training sentence;
# X_train[k] and y_train[k] describe the same sentence.
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

In [None]:
# Same extraction for the test sentences; X_test[k] and y_test[k] describe
# the same sentence.
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

## Train the model
To train the model, we create a pycrfsuite.Trainer, load the training data and call the 'train' method. First, create the pycrfsuite.Trainer and load the training data into CRFsuite:

In [None]:
trainer = pycrfsuite.Trainer(verbose=False)

In [None]:
# Hand every (feature sequence, label sequence) pair to the trainer,
# one sentence at a time.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

Set training parameters. We will use L-BFGS training algorithm (it is default) with Elastic Net (L1 + L2) regularization.

In [None]:
# Override selected training parameters; trainer.params() lists the
# parameter names available for the current algorithm.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier: cap training at 50 iterations

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

Possible parameters for the default training algorithm:

In [None]:
trainer.params()

Train the model:

In [None]:
# Run training; the argument is the model file path that tagger.open()
# reads later in this notebook.
# NOTE(review): the file name says "conll2002-esp" although the data loaded
# above is 'eng.train'. If it is renamed, the path given to tagger.open()
# must be changed to match.
trainer.train('conll2002-esp.crfsuite')

We can also get information about the final state of the model by looking at the trainer's logparser. If we had tagged our input data using the optional group argument of append, and had used the optional holdout argument of train, there would be information about the trainer's performance on the holdout set as well.

In [None]:
trainer.logparser.last_iteration

We can also get this information for every step using trainer.logparser.iterations

In [None]:
print(len(trainer.logparser.iterations), trainer.logparser.iterations[-1])

## Make predictions
To use the trained model, create a pycrfsuite.Tagger, open the model and use the "tag" method:

In [None]:
# Create a tagger and open the model file; the path must be the same one
# that was passed to trainer.train().
tagger = pycrfsuite.Tagger()
tagger.open('conll2002-esp.crfsuite')

# First test sentence, kept as a small worked example.
example_sent = test_sents[0]

In [None]:
print(' '.join(sent2tokens(example_sent)), end='\n\n')

In [None]:
print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

## Evaluate the model

In [None]:
def bio_classification_report(y_true, y_pred):
    """
    Build a classification report for BIO-encoded label sequences.

    ``y_true`` and ``y_pred`` are iterables of label sequences. Both are
    flattened to token level and binarized with a ``LabelBinarizer`` fitted
    on the true labels. The report covers every fitted class except "O",
    ordered by entity type first and by B-/I- prefix second.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    binarizer = LabelBinarizer()
    flat_true = list(chain.from_iterable(y_true))
    flat_pred = list(chain.from_iterable(y_pred))
    # Fit on the gold labels only; predictions reuse the same class columns.
    true_matrix = binarizer.fit_transform(flat_true)
    pred_matrix = binarizer.transform(flat_pred)

    def entity_then_prefix(tag):
        # 'B-PER' -> ['PER', 'B'], so tags of the same entity sort together.
        return tag.split('-', 1)[::-1]

    reported_tags = sorted(set(binarizer.classes_) - {'O'}, key=entity_then_prefix)

    # Map every class name to its column in the binarized matrices.
    column_of = {}
    for column, name in enumerate(binarizer.classes_):
        column_of[name] = column

    return classification_report(
        true_matrix,
        pred_matrix,
        labels=[column_of[name] for name in reported_tags],
        target_names=reported_tags,
    )


Predict entity labels for all sentences in our testing set ('testb' data):

In [None]:
# Tag every test feature sequence; y_pred[k] lines up with y_test[k].
y_pred = [tagger.tag(xseq) for xseq in X_test]

Check the result.

In [None]:
print(bio_classification_report(y_test, y_pred))

## Let's check what the classifier learned

In [None]:
from collections import Counter
info = tagger.info()

In [None]:
def print_transitions(trans_features):
    """Print one aligned "from -> to weight" line per transition.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs; lines are printed in iteration order.
    """
    row_format = "%-6s -> %-7s %0.6f"
    for labels, score in trans_features:
        source, target = labels
        print(row_format % (source, target, score))

In [None]:
print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

In [None]:
print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Check the state features:

In [None]:
def print_state_features(state_features):
    """Print one aligned "weight label attribute" line per state feature.

    ``state_features`` is an iterable of ``((attr, label), weight)`` pairs;
    lines are printed in iteration order.
    """
    row_format = "%0.6f %-6s %s"
    for key, score in state_features:
        attribute, tag = key
        print(row_format % (score, tag, attribute))

In [None]:
print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

In [None]:
print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])