# Named Entity Recognition - Conditional Random Fields

## Import

In [1]:
cd ..

/mnt/Data/repo/nlp


In [2]:
import gzip
import json
import pickle

import numpy as np
import sklearn_crfsuite
from sklearn_crfsuite import metrics

from algorithms.ner_crf import sent2features, sent2labels

In [13]:
SEED = 42

## Load Data

In [4]:
with gzip.open('datasets/ner/data.json.gz', mode='rt', encoding='utf-8') as fp:
    data = json.load(fp)

In [5]:
data[0]

[[['Thousands', 'NNS'], 'O'],
 [['of', 'IN'], 'O'],
 [['demonstrators', 'NNS'], 'O'],
 [['have', 'VBP'], 'O'],
 [['marched', 'VBN'], 'O'],
 [['through', 'IN'], 'O'],
 [['London', 'NNP'], 'B-geo'],
 [['to', 'TO'], 'O'],
 [['protest', 'VB'], 'O'],
 [['the', 'DT'], 'O'],
 [['war', 'NN'], 'O'],
 [['in', 'IN'], 'O'],
 [['Iraq', 'NNP'], 'B-geo'],
 [['and', 'CC'], 'O'],
 [['demand', 'VB'], 'O'],
 [['the', 'DT'], 'O'],
 [['withdrawal', 'NN'], 'O'],
 [['of', 'IN'], 'O'],
 [['British', 'JJ'], 'B-gpe'],
 [['troops', 'NNS'], 'O'],
 [['from', 'IN'], 'O'],
 [['that', 'DT'], 'O'],
 [['country', 'NN'], 'O'],
 [['.', '.'], 'O']]

In [6]:
labeled_texts = []

for text in data:
    temp = []
    for word in text:
        temp.append((word[0][0], word[1]))
    labeled_texts.append(temp)

In [8]:
labeled_texts[0]

[('Thousands', 'O'),
 ('of', 'O'),
 ('demonstrators', 'O'),
 ('have', 'O'),
 ('marched', 'O'),
 ('through', 'O'),
 ('London', 'B-geo'),
 ('to', 'O'),
 ('protest', 'O'),
 ('the', 'O'),
 ('war', 'O'),
 ('in', 'O'),
 ('Iraq', 'B-geo'),
 ('and', 'O'),
 ('demand', 'O'),
 ('the', 'O'),
 ('withdrawal', 'O'),
 ('of', 'O'),
 ('British', 'B-gpe'),
 ('troops', 'O'),
 ('from', 'O'),
 ('that', 'O'),
 ('country', 'O'),
 ('.', 'O')]

## Feature extraction functions

In [9]:
sent2features(labeled_texts[0])[3]

{'+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:word.lower()': 'marched',
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:word.lower()': 'demonstrators',
 '-2:word.istitle()': False,
 '-2:word.isupper()': False,
 '-2:word.lower()': 'of',
 'bias': 1.0,
 'word.isdigit()': False,
 'word.istitle()': False,
 'word.isupper()': False,
 'word.lower()': 'have',
 'word_length': 4}

## Train, test split

In [10]:
texts_length = len(labeled_texts)
texts_length

62010

In [11]:
train_share = .75
train_size = int(texts_length * train_share)
train_size

46507

In [14]:
np.random.seed(SEED)
train_indices = np.random.choice(texts_length, size=train_size, replace=False)

In [15]:
X_train = [sent2features(labeled_texts[i]) for i in train_indices]
y_train = [sent2labels(labeled_texts[i]) for i in train_indices]

X_test = [sent2features(labeled_texts[i]) for i in range(texts_length) if i not in train_indices]
y_test = [sent2labels(labeled_texts[i]) for i in range(texts_length) if i not in train_indices]

## Train

In [16]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=300, 
    all_possible_transitions=True,
    verbose=True
)
crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 46507/46507 [00:09<00:00, 4985.54it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 183073
Seconds required: 1.519

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 300
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=2.99  loss=1375912.83 active=181496 feature_norm=1.00
Iter 2   time=1.52  loss=1321711.13 active=174592 feature_norm=0.94
Iter 3   time=3.00  loss=1118300.38 active=171217 feature_norm=0.94
Iter 4   time=1.50  loss=973376.08 active=169631 feature_norm=1.26
Iter 5   time=3.04  loss=880417.44 active=170174 feature_norm=2.20
Iter 6   time=3.10  loss=749928.65 active=171999 feature_norm=2.27
Iter 7   time=1.53  loss=698928.58 active=172368 feature_norm=2.87
Iter 8   time=3.06  loss=674832.37 active=182545 feature_norm=3.25
Iter 9   time=1.53  loss=650271.99 active=182534 feature_no

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=300,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True)

## Evaluation

In [17]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-gpe',
 'B-tim',
 'B-per',
 'I-per',
 'B-geo',
 'B-org',
 'I-org',
 'I-geo',
 'I-tim',
 'I-gpe',
 'B-nat',
 'B-art',
 'B-eve',
 'I-eve',
 'I-art',
 'I-nat']

In [18]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.84741542204009912

In [19]:
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

             precision    recall  f1-score   support

      B-art      0.463     0.152     0.229       125
      I-art      0.500     0.231     0.316       104
      B-eve      0.583     0.389     0.467        90
      I-eve      0.467     0.189     0.269        74
      B-geo      0.858     0.906     0.881     12080
      I-geo      0.823     0.796     0.809      2428
      B-gpe      0.967     0.933     0.949      4883
      I-gpe      0.868     0.623     0.725        53
      B-nat      0.659     0.453     0.537        64
      I-nat      0.688     0.524     0.595        21
      B-org      0.807     0.725     0.764      6653
      I-org      0.788     0.804     0.796      5502
      B-per      0.852     0.813     0.832      5441
      I-per      0.847     0.886     0.866      5550
      B-tim      0.923     0.881     0.902      6580
      I-tim      0.827     0.761     0.792      2076

avg / total      0.855     0.842     0.847     51724



## Classifier performance

In [20]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-art  -> I-art   7.799578
B-nat  -> I-nat   6.688903
I-art  -> I-art   6.196308
I-gpe  -> I-gpe   6.021471
B-eve  -> I-eve   5.996310
I-eve  -> I-eve   5.820601
I-nat  -> I-nat   5.469920
B-per  -> I-per   4.951632
B-geo  -> I-geo   4.410065
B-gpe  -> I-gpe   4.378625
I-tim  -> I-tim   4.315825
B-tim  -> I-tim   3.960423
B-org  -> I-org   3.893750
O      -> O       3.864032
I-geo  -> I-geo   3.842113
I-org  -> I-org   3.631559
I-per  -> I-per   3.305204
O      -> B-per   1.606649
O      -> B-tim   1.394839
O      -> B-art   1.287362

Top unlikely transitions:
B-gpe  -> I-geo   -4.545596
B-org  -> I-per   -4.600611
I-org  -> I-geo   -4.640364
B-tim  -> I-org   -4.657284
I-org  -> I-tim   -4.818423
I-tim  -> B-tim   -5.066340
B-geo  -> B-geo   -5.222233
I-org  -> I-per   -5.398635
B-org  -> B-org   -5.678400
B-geo  -> I-org   -5.714928
O      -> I-per   -5.740573
B-gpe  -> I-org   -5.893992
I-per  -> B-per   -6.216838
B-gpe  -> B-gpe   -6.245837
B-tim  -> B-tim  

In [21]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
13.899187 B-org    word.lower():al-qaida
11.673425 O        word.lower():a
11.398022 B-tim    word.lower():monday
11.393706 B-tim    word.lower():tuesday
11.391254 B-tim    word.lower():wednesday
11.356989 B-tim    word.lower():thursday
11.356911 O        word.lower():i
11.193764 B-tim    word.lower():friday
10.944232 B-tim    word.lower():sunday
10.800134 B-tim    word.lower():saturday
10.349381 O        word.lower():h5n1
9.661956 B-gpe    word.lower():nepal
9.186212 B-gpe    word.lower():israeli
9.175654 B-gpe    word.lower():arabs
9.162647 B-gpe    word.lower():iraqi
9.134573 B-gpe    word.lower():afghan
8.997418 B-per    word.lower():vice
8.815316 B-per    word.lower():al-zarqawi
8.716122 O        word.lower():last
8.696344 B-gpe    word.lower():iranian
8.629663 B-gpe    word.lower():niger
8.415910 O        BOS
8.340910 B-tim    word.lower():today
8.111946 B-tim    word.lower():1990s
8.070171 B-org    word.lower():taleban
8.005969 B-art    word.lower():spaceshipone
7.

## Save Model

In [22]:
with gzip.open('models/ner/crf.pkl.gz', mode='wb') as fp:
    pickle.dump(crf, fp)

# Test

In [23]:
from algorithms.ner_crf import NerCrf

In [24]:
labeler = NerCrf('models/ner/crf.pkl.gz')

In [25]:
text = (
    'Mr. Puigdemont has appeared in public in Brussels with several colleagues'
    'after declaring independence from Spain on October 27.'
)

In [26]:
labeler.predict(text)

[('Mr', 'B-per'),
 ('Puigdemont', 'I-per'),
 ('has', 'O'),
 ('appeared', 'O'),
 ('in', 'O'),
 ('public', 'O'),
 ('in', 'O'),
 ('Brussels', 'B-geo'),
 ('with', 'O'),
 ('several', 'O'),
 ('colleaguesafter', 'O'),
 ('declaring', 'O'),
 ('independence', 'O'),
 ('from', 'O'),
 ('Spain', 'B-geo'),
 ('on', 'O'),
 ('October', 'B-tim'),
 ('27', 'I-tim')]