In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import sklearn_crfsuite
from itertools import chain
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV
import re
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics



In [3]:
%%time
def get_sents(path):
    """Read a tab-separated tagged corpus and cut it into short sequences.

    The file at ``path`` is read as UTF-8, one token per line. Each line is
    stripped and split on tabs; the first two fields become a ``(token, tag)``
    pair and lines with fewer than two fields (e.g. blank lines) are skipped.
    A sequence is closed right after a token that starts with one of the
    delimiters ``, . ; ， 。 ；`` (the delimiter itself stays in the sequence).
    Tokens left over at end of file form a final sequence.

    Args:
        path: Path of the corpus file, e.g. '../data/train.txt' or
            '../data/dev.txt'.

    Returns:
        A list of sequences, each a non-empty list of ``(token, tag)`` tuples.
    """
    # '.' is escaped so it matches a literal full stop instead of any character.
    split_pattern = re.compile(r',|\.|;|，|。|；')
    sentences = []
    sentence = []
    with open(path, 'r', encoding='utf8') as f:
        # Iterate the file lazily instead of loading every line with readlines().
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                continue
            word = (fields[0], fields[1])
            sentence.append(word)
            if split_pattern.match(word[0]):
                # Hand the finished sequence over and start a fresh list.
                sentences.append(sentence)
                sentence = []
    if sentence:
        sentences.append(sentence)
    return sentences
# Load the training corpus and print how many sequences get_sents produced from it.
train_sentences = get_sents('../data/train.txt')
print(len(train_sentences))

251768
CPU times: user 6.79 s, sys: 608 ms, total: 7.4 s
Wall time: 7.6 s


In [4]:
train_sentences[0]

[('当', 'O'),
 ('希', 'O'),
 ('望', 'O'),
 ('工', 'O'),
 ('程', 'O'),
 ('救', 'O'),
 ('助', 'O'),
 ('的', 'O'),
 ('百', 'O'),
 ('万', 'O'),
 ('儿', 'O'),
 ('童', 'O'),
 ('成', 'O'),
 ('长', 'O'),
 ('起', 'O'),
 ('来', 'O'),
 ('，', 'O')]

In [5]:
[label for token,label in train_sentences[0]]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [6]:
%%time
# Reuse the already-loaded training sequences under a shorter name and load
# the dev file, which serves as the test split in the cells below.
train_sents = train_sentences
test_sents = get_sents('../data/dev.txt')

CPU times: user 264 ms, sys: 30.8 ms, total: 295 ms
Wall time: 302 ms


In [7]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Only element 0 (the token text) of each item in ``sent`` is read, so the
    function works both for labelled ``(token, tag)`` pairs and for unlabelled
    one-element items; the tag is never used as a feature. The corpus carries
    no part-of-speech annotation, so no POS features are produced.

    Args:
        sent: Sequence of items whose first element is the token string.
        i: Index of the token to describe.

    Returns:
        A dict with a constant bias, the lower-cased token and its
        upper/title/digit flags, the same lower/title/upper features for the
        previous and next token when they exist, and ``'BOS'`` / ``'EOS'``
        markers when the token is first / last in the sequence.
    """
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }

    if i > 0:
        prev_word = sent[i-1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
    else:
        # No left neighbour: mark beginning of sequence.
        features['BOS'] = True

    if i < len(sent)-1:
        next_word = sent[i+1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
    else:
        # No right neighbour: mark end of sequence.
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return one feature dict per position of ``sent`` (one sentence = one sequence)."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the tag of every ``(token, tag)`` pair in ``sent``, in order."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the token of every ``(token, tag)`` pair in ``sent``, in order."""
    tokens = []
    for token, _tag in sent:
        tokens.append(token)
    return tokens

In [8]:
# Note how Chinese differs from English/Spanish here: first, the text would
# need word segmentation; second, suffix features such as word[-2:] are meaningless.
sent2features(train_sents[0])[0]

{'bias': 1.0,
 'word.lower()': '当',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'BOS': True,
 '+1:word.lower()': '希',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False}

In [9]:
%%time
# Convert every sequence into its list of feature dicts (X) and the parallel
# list of tags (y), for the training split and for the dev/test split.
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 14.8 s, sys: 2.51 s, total: 17.3 s
Wall time: 18.6 s


In [10]:
%%time
# Configure and train the CRF on the training sequences.
# Per the sklearn-crfsuite API docs: c1 / c2 are the L1 / L2 regularisation
# coefficients, and all_possible_transitions=True also generates transition
# features for label pairs that never occur in the training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 5min 2s, sys: 11.1 s, total: 5min 13s
Wall time: 5min 29s


In [12]:
# Keep only the entity tags for evaluation, preserving the order of crf.classes_.
# 'O' is dropped so it does not dominate the averaged scores; 'OO' is not a
# BIO tag (presumably a labelling typo in the corpus -- TODO confirm and clean
# the data). Filtering, unlike list.remove(), does not raise ValueError when
# one of these tags is absent from the model.
ignored_tags = ('O', 'OO')
labels = [tag for tag in crf.classes_ if tag not in ignored_tags]
labels

['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

In [13]:
y_pred = crf.predict(X_test)
# The inputs must be 2-D lists (one list of tags per sequence).
# The weighted F1 is restricted to the tags listed in `labels`.
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.7584794834915617

In [15]:
len(y_pred)

10754

In [17]:
y_pred[:10]

[['B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O

In [16]:
len(y_test)

10754

In [13]:
# Group the B- and I- tags of each entity type together: sort by everything
# after the first character (e.g. '-LOC') and then by the first character
# itself, so 'B-LOC' is listed right before 'I-LOC'.
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

      B-LOC      0.887     0.729     0.800      2877
      I-LOC      0.837     0.705     0.765      4394
      B-ORG      0.826     0.497     0.620      1331
      I-ORG      0.855     0.587     0.696      5670
      B-PER      0.927     0.644     0.760      1973
      I-PER      0.821     0.899     0.858      3851

avg / total      0.854     0.693     0.758     20096



In [14]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` line per transition.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    items, e.g. the output of ``Counter.most_common``.
    """
    for transition, weight in trans_features:
        source_label, target_label = transition
        row = "%-6s -> %-7s %0.6f" % (source_label, target_label, weight)
        print(row)

# Wrap the learned label-to-label weights in a Counter once and reuse it for
# both the highest-weighted and the lowest-weighted listing.
transition_weights = Counter(crf.transition_features_)

print("Top likely transitions:")
print_transitions(transition_weights.most_common(20))

print("\nTop unlikely transitions:")
print_transitions(transition_weights.most_common()[-20:])

Top likely transitions:
B-ORG  -> I-ORG   6.693297
I-ORG  -> I-ORG   5.917964
I-PER  -> I-PER   4.503271
B-PER  -> I-PER   4.424285
B-LOC  -> I-LOC   3.558887
I-LOC  -> I-LOC   3.147341
O      -> O       2.915145
I-PER  -> O       0.291362
O      -> B-PER   0.222525
O      -> OO      -0.009825
OO     -> B-LOC   -0.011809
OO     -> I-ORG   -0.056277
I-LOC  -> O       -0.085696
I-PER  -> B-PER   -0.092894
O      -> B-LOC   -0.099661
I-ORG  -> OO      -0.186779
OO     -> B-ORG   -0.210731
B-LOC  -> OO      -0.226664
B-LOC  -> B-LOC   -0.248156
O      -> B-ORG   -0.305862

Top unlikely transitions:
I-LOC  -> B-ORG   -3.568171
B-ORG  -> B-PER   -3.631431
I-ORG  -> B-ORG   -3.632806
I-ORG  -> I-LOC   -3.685865
O      -> I-PER   -3.881603
B-LOC  -> B-ORG   -4.392521
O      -> I-LOC   -4.430602
B-ORG  -> B-LOC   -4.602847
B-ORG  -> I-PER   -4.648572
B-PER  -> I-ORG   -4.869667
I-ORG  -> I-PER   -5.236413
O      -> I-ORG   -5.244606
B-LOC  -> B-PER   -5.305368
B-ORG  -> I-LOC   -5.520174
B-PER 

In [15]:
def print_state_features(state_features):
    """Print one ``weight label attribute`` line per state feature.

    ``state_features`` is an iterable of ``((attr, label), weight)`` items,
    e.g. the output of ``Counter.most_common``.
    """
    for feature_key, weight in state_features:
        attr_name, tag = feature_key
        row = "%0.6f %-8s %s" % (weight, tag, attr_name)
        print(row)

# Wrap the learned (attribute, label) weights in a Counter once and reuse it
# for both listings.
state_weights = Counter(crf.state_features_)

print("Top positive:")
print_state_features(state_weights.most_common(30))

print("\nTop negative:")
print_state_features(state_weights.most_common()[-30:])
# The results look quite reasonable: 寇 is indeed seen mostly as a surname,
# and 禺 is indeed seen mostly in given names.

Top positive:
10.012017 B-PER    word.lower():寇
9.400738 I-PER    word.lower():禺
9.329036 O        word.lower():、
8.872628 B-PER    word.lower():赵
8.669026 B-PER    word.lower():薛
8.159837 I-PER    word.lower():赵
8.038186 O        word.lower():：
7.970054 B-PER    word.lower():袁
7.968682 B-PER    word.lower():褚
7.889821 O        word.lower():在
7.671005 B-PER    word.lower():崔
7.267791 O        word.lower():到
7.266307 I-LOC    word.lower():圳
7.239389 O        EOS
7.220608 B-PER    word.lower():靳
7.177016 O        +1:word.lower():锅
7.141145 B-LOC    word.lower():淮
7.077142 B-LOC    -1:word.lower():℃
7.001757 O        word.lower():讯
6.832341 O        word.lower():说
6.810339 O        word.lower():是
6.627558 B-PER    word.lower():邱
6.602011 B-PER    word.lower():董
6.568463 O        word.lower():将
6.411261 B-PER    word.lower():翟
6.353531 B-PER    word.lower():滕
6.289114 O        word.lower():给
6.258655 O        word.lower():访
6.227652 B-LOC    word.lower():秘
6.214056 O        word.lower():等
