Python CRF 库（sklearn-crfsuite）：
https://github.com/TeamHG-Memex/sklearn-crfsuite
https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html
http://www.albertauyeung.com/post/python-sequence-labelling-with-crf/


CRF++ 使用起来感觉要麻烦一点：
https://taku910.github.io/crfpp/#install
https://blog.csdn.net/u010626937/article/details/78414292
https://blog.csdn.net/Gransand/article/details/80284436

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to every figure drawn after this cell.
plt.style.use('ggplot')

In [2]:
from itertools import chain
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
# sklearn.cross_validation and sklearn.grid_search were deprecated in
# scikit-learn 0.18 and removed in 0.20; cross_val_score and
# RandomizedSearchCV are now provided by sklearn.model_selection.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics



In [3]:
# NOTE(review): nltk is already imported in the imports cell; this repeat is harmless.
import nltk
# List the file ids exposed by the CoNLL-2002 corpus reader.
nltk.corpus.conll2002.fileids()

['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']

In [4]:
%%time
# Materialise the 'esp.train' and 'esp.testb' files as lists of IOB-tagged
# sentences; the lists are reused by the feature-extraction cell below.
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

CPU times: user 2.05 s, sys: 137 ms, total: 2.19 s
Wall time: 2.22 s


In [5]:
# Peek at the first training sentence to see the shape of one item.
train_sents[0]

[('Melbourne', 'NP', 'B-LOC'),
 ('(', 'Fpa', 'O'),
 ('Australia', 'NP', 'B-LOC'),
 (')', 'Fpt', 'O'),
 (',', 'Fc', 'O'),
 ('25', 'Z', 'O'),
 ('may', 'NC', 'O'),
 ('(', 'Fpa', 'O'),
 ('EFE', 'NC', 'B-ORG'),
 (')', 'Fpt', 'O'),
 ('.', 'Fp', 'O')]

In [6]:
def word2features(sent, i):
    """Build the feature dict for the item at position ``i`` of ``sent``.

    Args:
        sent: Sequence of items where index 0 of each item is the word
            string and index 1 is its POS-tag string.
        i: Position of the item to describe.

    Returns:
        dict mapping feature names to values. It always holds the features
        of the current word; it additionally holds ``-1:``-prefixed features
        of the previous word (or ``'BOS'`` when there is none) and
        ``+1:``-prefixed features of the next word (or ``'EOS'`` when there
        is none).
    """
    token = sent[i][0]
    tag = sent[i][1]

    # Features of the current word itself.
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context: flag the beginning of the sentence, otherwise describe
    # the previous word.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context: flag the end of the sentence, otherwise describe the
    # next word.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats


def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the third element (the label) of every 3-item entry in ``sent``."""
    collected = []
    for _token, _postag, label in sent:
        collected.append(label)
    return collected

def sent2tokens(sent):
    """Return the first element (the token) of every 3-item entry in ``sent``."""
    collected = []
    for token, _postag, _label in sent:
        collected.append(token)
    return collected

In [7]:
# Show the feature dict produced for the first word of the first training sentence.
sent2features(train_sents[0])[0]

{'+1:postag': 'Fpa',
 '+1:postag[:2]': 'Fp',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:word.lower()': '(',
 'BOS': True,
 'bias': 1.0,
 'postag': 'NP',
 'postag[:2]': 'NP',
 'word.isdigit()': False,
 'word.istitle()': True,
 'word.isupper()': False,
 'word.lower()': 'melbourne',
 'word[-2:]': 'ne',
 'word[-3:]': 'rne'}

In [8]:
%%time
# One list of feature dicts (X) and one list of labels (y) per sentence,
# built separately for the training and the test sentences.
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 1.04 s, sys: 227 ms, total: 1.27 s
Wall time: 1.31 s


In [9]:
%%time
# Fit a CRF with fixed hyper-parameters on the training sentences.
# NOTE(review): per the sklearn-crfsuite docs, c1 and c2 are the L1 and L2
# regularisation coefficients, and all_possible_transitions also generates
# transition features for label pairs not seen in training — confirm
# against the installed version.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 31.4 s, sys: 640 ms, total: 32 s
Wall time: 32.9 s


In [10]:
# Keep every label the model knows except 'O', so the metrics below are
# computed over the B-/I- tags only. Filtering (rather than list.remove)
# does not raise if 'O' happens to be missing from crf.classes_.
labels = [label for label in crf.classes_ if label != 'O']
labels

['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']

In [11]:
# Predict on the test set and report the weighted F1, restricted to
# `labels` (i.e. without the 'O' tag).
y_pred = crf.predict(X_test)
f1_options = dict(average='weighted', labels=labels)
metrics.flat_f1_score(y_test, y_pred, **f1_options)

0.7964686316443963

In [12]:
# Group B and I results: order tags by everything after the first
# character and then by that first character, so each B-XXX row is
# directly followed by its I-XXX row in the report.
def _entity_then_prefix(tag):
    return tag[1:], tag[0]

sorted_labels = list(labels)
sorted_labels.sort(key=_entity_then_prefix)

report = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
)
print(report)

             precision    recall  f1-score   support

      B-LOC      0.810     0.784     0.797      1084
      I-LOC      0.690     0.637     0.662       325
     B-MISC      0.731     0.569     0.640       339
     I-MISC      0.699     0.589     0.639       557
      B-ORG      0.807     0.832     0.820      1400
      I-ORG      0.852     0.786     0.818      1104
      B-PER      0.850     0.884     0.867       735
      I-PER      0.893     0.943     0.917       634

avg / total      0.809     0.787     0.796      6178



In [13]:
# NOTE(review): this cell repeats the prediction and the per-label report of
# the two previous cells; `crf` is not refitted in between in the visible
# notebook, so the printed table is identical. Kept as-is.
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

      B-LOC      0.810     0.784     0.797      1084
      I-LOC      0.690     0.637     0.662       325
     B-MISC      0.731     0.569     0.640       339
     I-MISC      0.699     0.589     0.639       557
      B-ORG      0.807     0.832     0.820      1400
      I-ORG      0.852     0.786     0.818      1104
      B-PER      0.850     0.884     0.867       735
      I-PER      0.893     0.943     0.917       634

avg / total      0.809     0.787     0.796      6178



In [14]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` line per transition.

    Args:
        trans_features: Iterable of ``((label_from, label_to), weight)``
            pairs.
    """
    for transition, weight in trans_features:
        label_from, label_to = transition
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

# Build the weight ranking once and reuse it for both listings instead of
# constructing a second Counter from the same mapping.
transition_weights = Counter(crf.transition_features_)

print("Top likely transitions:")
print_transitions(transition_weights.most_common(20))

print("\nTop unlikely transitions:")
# most_common() is sorted by descending weight, so the tail holds the
# 20 lowest-weight transitions.
print_transitions(transition_weights.most_common()[-20:])

Top likely transitions:
B-ORG  -> I-ORG   7.500912
I-ORG  -> I-ORG   7.206322
B-MISC -> I-MISC  6.833142
I-MISC -> I-MISC  6.753222
B-PER  -> I-PER   6.404557
B-LOC  -> I-LOC   5.696274
I-LOC  -> I-LOC   4.877422
I-PER  -> I-PER   4.709231
O      -> O       3.784430
O      -> B-ORG   2.754974
O      -> B-PER   2.549453
O      -> B-LOC   1.846099
O      -> B-MISC  1.804584
B-LOC  -> B-LOC   0.578393
B-ORG  -> O       0.325175
I-PER  -> B-LOC   0.300667
B-MISC -> B-ORG   0.298525
B-ORG  -> B-LOC   0.266688
B-LOC  -> B-PER   -0.046324
B-MISC -> O       -0.143646

Top unlikely transitions:
I-LOC  -> B-MISC  -1.976574
I-MISC -> I-PER   -2.008671
B-ORG  -> B-ORG   -2.107974
I-ORG  -> B-LOC   -2.199630
I-MISC -> B-LOC   -2.240108
I-ORG  -> I-PER   -2.272384
B-PER  -> B-MISC  -2.325289
I-PER  -> I-LOC   -2.455352
I-ORG  -> B-MISC  -2.486495
I-PER  -> B-ORG   -2.512129
I-ORG  -> I-LOC   -2.536158
I-MISC -> I-LOC   -2.557052
B-ORG  -> B-MISC  -2.581202
B-PER  -> B-PER   -2.825385
I-PER  -> B-MIS

In [15]:
def print_state_features(state_features):
    """Print one ``weight label attribute`` line per state feature.

    Args:
        state_features: Iterable of ``((attr, label), weight)`` pairs.
    """
    for feature, weight in state_features:
        attr, label = feature
        print("%0.6f %-8s %s" % (weight, label, attr))

# Build the weight ranking once and reuse it for both listings instead of
# constructing a second Counter from the same mapping.
state_weights = Counter(crf.state_features_)

print("Top positive:")
print_state_features(state_weights.most_common(30))

print("\nTop negative:")
# most_common() is sorted by descending weight, so the tail holds the
# 30 lowest-weight state features.
print_state_features(state_weights.most_common()[-30:])

Top positive:
9.810583 B-ORG    word.lower():efe-cantabria
8.587255 B-ORG    word.lower():psoe-progresistas
6.026318 I-ORG    -1:word.lower():l
4.902771 B-ORG    word.lower():xfera
4.896558 B-LOC    -1:word.lower():cantabria
4.867565 O        BOS
4.810829 B-LOC    word.lower():líbano
4.760313 B-ORG    word.lower():telefónica
4.723549 B-MISC   word.lower():justicia
4.674730 B-ORG    word[-2:]:-e
4.597672 B-MISC   word.lower():competencia
4.582394 O        word.lower():r.
4.582394 O        word[-3:]:R.
4.545455 B-MISC   word.lower():diversia
4.409233 B-ORG    word.lower():petrobras
4.277603 B-ORG    word.lower():coag-extremadura
4.261705 B-PER    -1:word.lower():según
4.229368 I-LOC    -1:word.lower():calle
4.223481 B-ORG    word.isupper()
4.189817 B-ORG    word.lower():esquerra
4.188726 B-PER    word.lower():valedor
4.156011 O        word.lower():b
4.156011 O        word[-3:]:B
4.156011 O        word[-2:]:B
4.150794 B-ORG    word.lower():terra
4.121745 B-ORG    -1:word.lower():distancia