# Named Entity Recognition - Conditional Random Fields

## Import

In [2]:
cd ..

/home/vitaliy/Documents/repo/nlp


In [54]:
import gzip
import json
import pickle

import numpy as np
import sklearn_crfsuite
from sklearn_crfsuite import metrics

from algorithms.ner_crf import sent2features, sent2labels

## Load Data

In [3]:
# Read the gzip-compressed corpus as UTF-8 text and parse the JSON payload in one go.
# NOTE(review): the path has no `.gz` suffix although it is opened with gzip - confirm
# that the file on disk really is compressed.
with gzip.open('datasets/ner/data.json', mode='rt', encoding='utf-8') as corpus_file:
    data = json.loads(corpus_file.read())

In [4]:
# Peek at the first record: each token is stored as [[word, POS tag], entity label].
data[0]

[[['Thousands', 'NNS'], 'O'],
 [['of', 'IN'], 'O'],
 [['demonstrators', 'NNS'], 'O'],
 [['have', 'VBP'], 'O'],
 [['marched', 'VBN'], 'O'],
 [['through', 'IN'], 'O'],
 [['London', 'NNP'], 'B-geo'],
 [['to', 'TO'], 'O'],
 [['protest', 'VB'], 'O'],
 [['the', 'DT'], 'O'],
 [['war', 'NN'], 'O'],
 [['in', 'IN'], 'O'],
 [['Iraq', 'NNP'], 'B-geo'],
 [['and', 'CC'], 'O'],
 [['demand', 'VB'], 'O'],
 [['the', 'DT'], 'O'],
 [['withdrawal', 'NN'], 'O'],
 [['of', 'IN'], 'O'],
 [['British', 'JJ'], 'B-gpe'],
 [['troops', 'NNS'], 'O'],
 [['from', 'IN'], 'O'],
 [['that', 'DT'], 'O'],
 [['country', 'NN'], 'O'],
 [['.', '.'], 'O']]

In [7]:
# Reduce every token from [[word, POS tag], entity label] to a (word, entity label)
# tuple, keeping the sentence grouping; the POS tag is dropped.
labeled_texts = [
    [(token[0][0], token[1]) for token in sentence]
    for sentence in data
]

In [8]:
# Sanity check: the first sentence now holds plain (word, entity label) tuples.
labeled_texts[0]

[('Thousands', 'O'),
 ('of', 'O'),
 ('demonstrators', 'O'),
 ('have', 'O'),
 ('marched', 'O'),
 ('through', 'O'),
 ('London', 'B-geo'),
 ('to', 'O'),
 ('protest', 'O'),
 ('the', 'O'),
 ('war', 'O'),
 ('in', 'O'),
 ('Iraq', 'B-geo'),
 ('and', 'O'),
 ('demand', 'O'),
 ('the', 'O'),
 ('withdrawal', 'O'),
 ('of', 'O'),
 ('British', 'B-gpe'),
 ('troops', 'O'),
 ('from', 'O'),
 ('that', 'O'),
 ('country', 'O'),
 ('.', 'O')]

## Feature extraction functions

In [11]:
# Show the feature dict produced for the token at index 3 of the first sentence.
sent2features(labeled_texts[0])[3]

{'+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:word.lower()': 'marched',
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:word.lower()': 'demonstrators',
 '-2:word.istitle()': False,
 '-2:word.isupper()': False,
 '-2:word.lower()': 'of',
 'bias': 1.0,
 'word.isdigit()': False,
 'word.istitle()': False,
 'word.isupper()': False,
 'word.lower()': 'have',
 'word_length': 4}

## Train, test split

In [22]:
# Number of labelled sentences available for the train/test split.
texts_length = len(labeled_texts)
texts_length

62010

In [28]:
# Fraction of the sentences used for training; the rest is held out for testing.
train_share = 0.75
# int() truncates, so the training set never exceeds the requested share.
train_size = int(train_share * texts_length)
train_size

46507

In [29]:
# Draw the training indices from a dedicated, seeded generator so that the split
# (and every metric derived from it) is identical on each Restart & Run All.
# replace=False samples without replacement, so no sentence is selected twice.
split_rng = np.random.RandomState(42)
train_indices = split_rng.choice(texts_length, size=train_size, replace=False)

In [34]:
# `i not in <numpy array>` scans the whole array on every lookup, which made the
# test-set filter quadratic (and it was evaluated twice). A set gives constant-time
# membership tests, and the held-out indices are computed only once.
train_index_set = set(np.asarray(train_indices).tolist())
# Every sentence not drawn for training, in ascending index order.
test_indices = [i for i in range(texts_length) if i not in train_index_set]

X_train = [sent2features(labeled_texts[i]) for i in train_indices]
y_train = [sent2labels(labeled_texts[i]) for i in train_indices]

X_test = [sent2features(labeled_texts[i]) for i in test_indices]
y_test = [sent2labels(labeled_texts[i]) for i in test_indices]

# The two parts must cover the corpus exactly once.
assert len(X_train) + len(X_test) == texts_length

In [35]:
# Feature dicts of the first training sentence (one dict per token).
X_train[0]

[{'+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'government',
  'BOS': True,
  'bias': 1.0,
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': False,
  'word.lower()': 'the',
  'word_length': 3},
 {'+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'approved',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:word.lower()': 'the',
  'bias': 1.0,
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': False,
  'word.lower()': 'government',
  'word_length': 10},
 {'+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'two',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:word.lower()': 'government',
  '-2:word.istitle()': True,
  '-2:word.isupper()': False,
  '-2:word.lower()': 'the',
  'bias': 1.0,
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': 'approved',
  'word_length': 8},
 {'+1

In [36]:
# Entity labels of the first training sentence (one label per token).
y_train[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-tim',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-tim',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-org',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

## Train

In [37]:
# Training configuration for the linear-chain CRF.
# c1 and c2 are the L1 and L2 regularisation coefficients (per the sklearn-crfsuite docs).
# verbose=True makes CRFsuite print its per-iteration training log.
crf_params = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 200,
    'all_possible_transitions': True,
    'verbose': True,
}
crf = sklearn_crfsuite.CRF(**crf_params)
# Kept as the last expression so the fitted estimator's repr is displayed.
crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 46507/46507 [00:10<00:00, 4510.80it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 183213
Seconds required: 1.654

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=3.62  loss=1368882.80 active=181615 feature_norm=1.00
Iter 2   time=1.85  loss=1315108.69 active=174930 feature_norm=0.94
Iter 3   time=3.70  loss=1111802.94 active=171318 feature_norm=0.95
Iter 4   time=1.85  loss=968330.12 active=169832 feature_norm=1.26
Iter 5   time=3.70  loss=874595.63 active=170230 feature_norm=2.20
Iter 6   time=3.70  loss=746253.87 active=172120 feature_norm=2.28
Iter 7   time=1.85  loss=695858.54 active=172435 feature_norm=2.86
Iter 8   time=3.69  loss=670263.15 active=182658 feature_norm=3.25
Iter 9   time=1.85  loss=646173.13 active=182638 feature_no

Iter 162 time=1.95  loss=46163.13 active=75444 feature_norm=304.90
Iter 163 time=1.94  loss=46156.06 active=75431 feature_norm=304.92
Iter 164 time=1.86  loss=46148.87 active=75425 feature_norm=304.96
Iter 165 time=1.86  loss=46141.92 active=75382 feature_norm=304.97
Iter 166 time=1.88  loss=46132.87 active=75373 feature_norm=305.02
Iter 167 time=1.88  loss=46126.58 active=75368 feature_norm=305.03
Iter 168 time=1.94  loss=46116.80 active=75358 feature_norm=305.08
Iter 169 time=2.02  loss=46110.37 active=75354 feature_norm=305.09
Iter 170 time=1.95  loss=46100.88 active=75326 feature_norm=305.13
Iter 171 time=1.90  loss=46095.05 active=75296 feature_norm=305.14
Iter 172 time=1.90  loss=46085.28 active=75301 feature_norm=305.17
Iter 173 time=1.89  loss=46079.07 active=75278 feature_norm=305.17
Iter 174 time=1.89  loss=46070.06 active=75264 feature_norm=305.21
Iter 175 time=1.88  loss=46063.80 active=75238 feature_norm=305.21
Iter 176 time=1.86  loss=46055.25 active=75226 feature_norm=30

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=200,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True)

## Evaluation

In [38]:
# Labels to evaluate on: every class the model learned except the 'O' tag,
# so the scores below reflect entity tags only.
labels = list(crf.classes_)
labels.pop(labels.index('O'))
labels

['B-tim',
 'B-org',
 'B-geo',
 'I-geo',
 'I-org',
 'B-per',
 'I-per',
 'B-gpe',
 'I-tim',
 'I-gpe',
 'B-eve',
 'I-eve',
 'B-art',
 'I-art',
 'B-nat',
 'I-nat']

In [39]:
# Predict tag sequences for the held-out sentences, then score them with a
# weighted-average F1 restricted to the entity labels (the 'O' tag is excluded).
y_pred = crf.predict(X_test)
weighted_f1 = metrics.flat_f1_score(y_test, y_pred, labels=labels, average='weighted')
weighted_f1

0.84606456543431818

In [62]:
# Reuse the predictions already computed for the F1 score instead of decoding the
# whole test set a second time; `t` is kept so later references to it still work.
t = y_pred

In [65]:
# Inspect the feature dicts of the first test sentence.
X_test[:1]

[[{'+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:word.lower()': 'party',
   'BOS': True,
   'bias': 1.0,
   'word.isdigit()': False,
   'word.istitle()': True,
   'word.isupper()': False,
   'word.lower()': 'the',
   'word_length': 3},
  {'+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:word.lower()': 'is',
   '-1:word.istitle()': True,
   '-1:word.isupper()': False,
   '-1:word.lower()': 'the',
   'bias': 1.0,
   'word.isdigit()': False,
   'word.istitle()': False,
   'word.isupper()': False,
   'word.lower()': 'party',
   'word_length': 5},
  {'+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:word.lower()': 'divided',
   '-1:word.istitle()': False,
   '-1:word.isupper()': False,
   '-1:word.lower()': 'party',
   '-2:word.istitle()': True,
   '-2:word.isupper()': False,
   '-2:word.lower()': 'the',
   'bias': 1.0,
   'word.isdigit()': False,
   'word.istitle()': False,
   'word.isupper()': False,
   'word.lower()': 'is',
   'word_le

In [41]:
# Order the labels by entity type first and by their B/I prefix second, so each
# entity's B- row is directly followed by its I- row in the report.
sorted_labels = list(labels)
sorted_labels.sort(key=lambda tag: (tag[1:], tag[0]))

report = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
)
print(report)

             precision    recall  f1-score   support

      B-art      0.340     0.119     0.177       134
      I-art      0.300     0.147     0.197       102
      B-eve      0.540     0.362     0.433        94
      I-eve      0.292     0.189     0.230        74
      B-geo      0.862     0.905     0.883     12377
      I-geo      0.806     0.800     0.803      2350
      B-gpe      0.973     0.933     0.953      5199
      I-gpe      0.884     0.528     0.661        72
      B-nat      0.703     0.481     0.571        54
      I-nat      0.800     0.571     0.667        14
      B-org      0.800     0.724     0.760      6635
      I-org      0.788     0.796     0.792      5418
      B-per      0.847     0.814     0.830      5547
      I-per      0.841     0.887     0.864      5661
      B-tim      0.923     0.874     0.898      6653
      I-tim      0.835     0.744     0.787      2259

avg / total      0.854     0.840     0.846     52643



## Classifier performance

In [44]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned line per label-to-label transition weight.

    Args:
        trans_features: iterable of ``((label_from, label_to), weight)`` pairs.

    Each line has the form ``label_from -> label_to weight`` with the weight
    shown to six decimal places.
    """
    for transition, weight in trans_features:
        source_label, target_label = transition
        print("%-6s -> %-7s %0.6f" % (source_label, target_label, weight))

# Wrap the learned transition weights in a Counter once; most_common() orders
# the entries from the highest weight to the lowest.
transition_weights = Counter(crf.transition_features_)

print("Top likely transitions:")
print_transitions(transition_weights.most_common(20))

# The tail of the full ranking holds the 20 lowest-weighted transitions.
print("\nTop unlikely transitions:")
print_transitions(transition_weights.most_common()[-20:])

Top likely transitions:
B-art  -> I-art   6.406446
B-nat  -> I-nat   6.316445
I-nat  -> I-nat   5.734051
I-art  -> I-art   5.618790
I-eve  -> I-eve   5.397647
I-gpe  -> I-gpe   4.984686
B-gpe  -> I-gpe   4.813652
B-eve  -> I-eve   4.790266
B-per  -> I-per   4.218022
I-tim  -> I-tim   4.171658
B-geo  -> I-geo   3.757398
O      -> O       3.671554
I-geo  -> I-geo   3.589052
B-tim  -> I-tim   3.520231
I-org  -> I-org   3.327588
B-org  -> I-org   3.092358
I-per  -> I-per   2.550598
O      -> B-tim   1.795916
O      -> B-eve   1.373165
O      -> B-per   1.195057

Top unlikely transitions:
I-org  -> I-geo   -4.759526
B-per  -> I-org   -4.885403
B-org  -> I-geo   -4.977081
B-tim  -> I-org   -5.325729
B-geo  -> I-per   -5.362560
I-org  -> I-per   -5.721334
B-org  -> I-per   -5.757149
B-geo  -> B-geo   -5.829179
O      -> I-per   -5.896453
B-org  -> B-org   -5.997359
I-org  -> B-org   -6.060812
B-gpe  -> I-org   -6.117887
B-gpe  -> B-gpe   -6.363654
B-tim  -> B-tim   -6.402795
B-geo  -> I-org  

In [45]:
def print_state_features(state_features):
    """Print one aligned line per (attribute, label) state-feature weight.

    Args:
        state_features: iterable of ``((attr, label), weight)`` pairs.

    Each line has the form ``weight label attr`` with the weight shown to
    six decimal places.
    """
    for feature, weight in state_features:
        attr_name, tag = feature
        print("%0.6f %-8s %s" % (weight, tag, attr_name))

# Wrap the learned state-feature weights in a Counter once; most_common() orders
# the entries from the highest weight to the lowest.
state_weights = Counter(crf.state_features_)

print("Top positive:")
print_state_features(state_weights.most_common(30))

# The tail of the full ranking holds the 30 lowest-weighted state features.
print("\nTop negative:")
print_state_features(state_weights.most_common()[-30:])

Top positive:
13.672950 B-org    word.lower():al-qaida
11.935420 O        word.lower():a
11.430654 B-tim    word.lower():thursday
11.352922 O        word.lower():i
11.319374 B-tim    word.lower():tuesday
11.254931 B-tim    word.lower():friday
11.186154 B-tim    word.lower():wednesday
11.167234 B-tim    word.lower():monday
10.825521 B-tim    word.lower():sunday
10.799703 B-tim    word.lower():saturday
10.269927 O        word.lower():h5n1
9.791987 B-gpe    word.lower():arabs
9.429518 B-per    word.lower():vice
9.377613 B-gpe    word.lower():iraqi
9.294318 B-gpe    word.lower():nepal
9.201875 O        word.lower():last
9.174137 B-gpe    word.lower():afghan
8.701398 B-gpe    word.lower():iranian
8.684571 O        BOS
8.610892 B-tim    word.lower():one-year
8.539483 B-org    word.lower():conocophillips
8.504920 B-per    word.lower():al-zarqawi
8.486828 B-gpe    word.lower():israeli
8.268152 B-tim    word.lower():today
8.252255 B-per    word.lower():mr.
8.236853 B-org    word.lower():taleban

## Save Model

In [55]:
# Persist the fitted CRF as a gzip-compressed pickle.
# Pickles can execute arbitrary code when loaded, so only load this file from a trusted source.
with gzip.open('models/ner/crf.pkl.gz', mode='wb') as model_file:
    pickle.dump(crf, model_file)

## Test

In [66]:
from algorithms.ner_crf import NerCrf

In [67]:
# Build the project's NerCrf wrapper from the model file path used in the save step.
# NOTE(review): presumably it unpickles the CRF from that path - verify in algorithms.ner_crf.
labeler = NerCrf('models/ner/crf.pkl.gz')

In [68]:
# Raw, untokenised example sentence for an end-to-end check of the labeler.
text = 'Mr. Puigdemont has appeared in public in Brussels with several colleagues after declaring independence from Spain on October 27.'

In [70]:
# Tag the raw sentence; the result is a list of (token, entity label) pairs.
# NOTE(review): the output shows 'Mr' without its period and no final '.', so the
# wrapper's tokenisation appears to drop punctuation - confirm this matches training.
labeler.predict(text)

[('Mr', 'B-per'),
 ('Puigdemont', 'I-per'),
 ('has', 'O'),
 ('appeared', 'O'),
 ('in', 'O'),
 ('public', 'O'),
 ('in', 'O'),
 ('Brussels', 'B-geo'),
 ('with', 'O'),
 ('several', 'O'),
 ('colleagues', 'O'),
 ('after', 'O'),
 ('declaring', 'O'),
 ('independence', 'O'),
 ('from', 'O'),
 ('Spain', 'B-geo'),
 ('on', 'O'),
 ('October', 'B-tim'),
 ('27', 'I-tim')]