In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import sklearn_crfsuite
from itertools import chain
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV
import re
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics



In [2]:
%%time
def get_sents(path):
    #'../data/train.txt';'../data/dev.txt'
    sentences = []
    sentence = []
    cnt = 0
    split_pattern = re.compile(r',|\.|;|，|。|；') #.要转义，不然表示的是通配符
    with open(path,'r',encoding = 'utf8') as f:
        for line in f.readlines():#每行为一个字符和其tag，中间用tab隔开
            line = line.strip().split('\t')
            if(not line or len(line) < 2): continue
            if line[1] == 'OO': # fix
                line[1] = 'O'
            word = (line[0],line[1])
            if split_pattern.match(word[0]):
                sentence.append(word)
                sentences.append(sentence.copy())
                sentence.clear()
            else:
                sentence.append(word)
        if(len(sentence)):
            sentences.append(sentence.copy())
            sentence.clear()
    return sentences
train_sentences = get_sents('../data/train.txt')
print(len(train_sentences))

251768
CPU times: user 6.89 s, sys: 541 ms, total: 7.43 s
Wall time: 8.07 s


In [3]:
train_sentences[0]

[('当', 'O'),
 ('希', 'O'),
 ('望', 'O'),
 ('工', 'O'),
 ('程', 'O'),
 ('救', 'O'),
 ('助', 'O'),
 ('的', 'O'),
 ('百', 'O'),
 ('万', 'O'),
 ('儿', 'O'),
 ('童', 'O'),
 ('成', 'O'),
 ('长', 'O'),
 ('起', 'O'),
 ('来', 'O'),
 ('，', 'O')]

In [4]:
[label for token,label in train_sentences[0]]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [5]:
%%time
train_sents = train_sentences
test_sents = get_sents('../data/dev.txt')

CPU times: user 243 ms, sys: 24.4 ms, total: 268 ms
Wall time: 271 ms


In [6]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
#         'word[-3:]': word[-3:],
#         'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
#         'postag': postag, #原始数据中没有词性标记
#         'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
#         postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
#             '-1:postag': postag1,
#             '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
#         postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
#             '+1:postag': postag1,
#             '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent): #一个句子对应了一个序列
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token,label in sent]
#     return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token,label in sent]

In [7]:
sent2features(train_sents[0])[0] #注意中文和英文/spanish的区别，一是需要切词，二是word[-2:]这种是没意义的

{'bias': 1.0,
 'word.lower()': '当',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'BOS': True,
 '+1:word.lower()': '希',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False}

In [8]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 14.5 s, sys: 2.54 s, total: 17 s
Wall time: 18.2 s


In [9]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 5min 1s, sys: 11 s, total: 5min 12s
Wall time: 5min 25s


In [10]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

In [11]:
y_pred = crf.predict(X_test)
#输入要求是二维list
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.7542816896014515

In [12]:
len(y_pred)

10754

In [13]:
y_pred[:2]

[['B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'I-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']]

In [14]:
len(y_test)

10754

In [15]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

      B-LOC      0.887     0.724     0.797      2877
      I-LOC      0.830     0.699     0.759      4394
      B-ORG      0.832     0.505     0.628      1331
      I-ORG      0.845     0.579     0.687      5670
      B-PER      0.927     0.642     0.758      1973
      I-PER      0.818     0.901     0.857      3851

avg / total      0.850     0.689     0.754     20096



In [16]:
from seqeval.metrics import classification_report
from functools import reduce
y_test = reduce(lambda x,y : x + y, y_test)
y_pred = reduce(lambda x,y : x + y, y_pred)
print(classification_report(y_test,y_pred,digits=4))

             precision    recall  f1-score   support

        ORG     0.7649    0.4643    0.5778      1331
        PER     0.8105    0.5636    0.6649      1973
        LOC     0.7969    0.6628    0.7237      2877

avg / total     0.7943    0.5884    0.6735      6181



In [17]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-PER  -> I-PER   4.708168
B-LOC  -> I-LOC   3.938230
I-ORG  -> I-ORG   3.917188
B-ORG  -> I-ORG   3.657933
O      -> O       3.648308
I-PER  -> I-PER   3.501100
I-LOC  -> I-LOC   3.035867
O      -> B-LOC   0.233900
O      -> B-ORG   0.036867
I-LOC  -> O       0.033424
B-LOC  -> B-LOC   0.024545
I-PER  -> O       -0.192546
O      -> B-PER   -0.692180
I-ORG  -> O       -0.696039
I-LOC  -> B-LOC   -0.860370
B-LOC  -> O       -0.891950
B-PER  -> O       -1.157294
I-PER  -> B-PER   -2.262832
B-ORG  -> B-ORG   -2.305008
B-PER  -> B-PER   -2.675551

Top unlikely transitions:
I-LOC  -> B-ORG   -3.894457
I-PER  -> I-LOC   -3.948705
O      -> I-LOC   -3.991779
I-LOC  -> B-PER   -4.272581
B-LOC  -> B-ORG   -4.322920
I-ORG  -> I-PER   -4.992102
B-ORG  -> B-LOC   -5.182726
B-PER  -> B-ORG   -5.300838
B-ORG  -> I-PER   -5.524905
B-ORG  -> B-PER   -5.533414
B-PER  -> B-LOC   -5.567242
B-ORG  -> I-LOC   -6.002072
B-PER  -> I-ORG   -6.166633
I-PER  -> I-ORG   -6.215160
B-LOC  -

In [18]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])
#结果挺合理，寇确实是姓比较多
#禺确实是名比较多

Top positive:
10.414564 O        word.lower():、
9.880271 B-PER    word.lower():寇
9.001572 I-PER    word.lower():禺
8.232928 B-PER    word.lower():薛
8.101207 O        word.lower():：
8.098005 B-PER    word.lower():袁
8.025911 I-LOC    word.lower():圳
7.675100 O        word.lower():是
7.607252 O        word.lower():到
7.533269 B-PER    word.lower():赵
7.517758 B-PER    word.lower():崔
7.473806 O        word.lower():讯
7.407550 B-PER    word.lower():褚
7.330991 B-PER    word.lower():靳
7.010629 O        word.lower():说
6.885585 O        +1:word.lower():锅
6.765853 O        EOS
6.760879 O        word.lower():等
6.759627 B-PER    word.lower():邱
6.694659 O        word.lower():将
6.676578 O        word.lower():在
6.437327 B-LOC    word.lower():淮
6.381003 B-PER    word.lower():翟
6.319936 B-LOC    -1:word.lower():℃
6.194711 O        word.lower():／
6.134503 O        word.lower():还
6.083107 B-PER    word.lower():尧
6.072706 B-PER    word.lower():俞
6.062680 O        word.lower():副
6.024941 O        word.lower():与


In [19]:
import re
from pyhanlp import *

def get_sents(path):
    #'../data/train.txt';'../data/dev.txt'
    sentences = []
    sentence = []
    cnt = 0
    split_pattern = re.compile(r',|\.|;|，|。|；|\?|\!|\.\.\.\.\.\.|……')
    with open(path,'r',encoding = 'utf8') as f:
        for line in f.readlines():#每行为一个字符和其tag，中间用tab隔开
            line = line.strip().split('\t')
            if(not line or len(line) < 2): continue
            if line[1] == 'OO':
                line[1] = 'O'
            word_unit = [line[0],line[1]]
            if split_pattern.match(word_unit[0]):
                sentence.append(word_unit)
                sent = ''.join((word_unit[0] for word_unit in sentence))
                nature_list = []
                for term in HanLP.segment(sent):
                    for i in range(len(term.word)):# 分词
                        nature = '{}'.format(term.nature)
                        nature_list.append(nature)
                for idx,word_unit in enumerate(sentence):
                    word_unit.insert(1,nature_list[idx]) # insert损失一些性能
                sentences.append(sentence.copy())
                sentence.clear()
            else:
                sentence.append(word_unit)
        if(len(sentence)):
            sent = ''.join((word_unit[0] for word_unit in sentence))
            nature_list = []
            for term in HanLP.segment(sent):
                for i in range(len(term.word)):  # 分词
                    nature = '{}'.format(term.nature)
                    nature_list.append(nature)
            for idx, word_unit in enumerate(sentence):
                word_unit.insert(1, nature_list[idx])  # insert损失一些性能
            sentences.append(sentence.copy())
            sentence.clear()
    return sentences

In [20]:
%%time
train_sentences = get_sents('../data/train.txt')
print(len(train_sentences))

251768
CPU times: user 4min 18s, sys: 11.2 s, total: 4min 29s
Wall time: 4min 35s


In [21]:
print(train_sentences[0])

[['当', 'p', 'O'], ['希', 'nz', 'O'], ['望', 'nz', 'O'], ['工', 'nz', 'O'], ['程', 'nz', 'O'], ['救', 'vn', 'O'], ['助', 'vn', 'O'], ['的', 'ude1', 'O'], ['百', 'm', 'O'], ['万', 'm', 'O'], ['儿', 'n', 'O'], ['童', 'n', 'O'], ['成', 'vi', 'O'], ['长', 'vi', 'O'], ['起', 'vf', 'O'], ['来', 'vf', 'O'], ['，', 'w', 'O']]


In [22]:
%%time
train_sents = train_sentences
test_sents = get_sents('../data/dev.txt')

CPU times: user 10.6 s, sys: 594 ms, total: 11.2 s
Wall time: 11.2 s


In [23]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import sklearn_crfsuite
from itertools import chain
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [24]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(), #对于中文数据集来说，这两条用处不大
        'word.isdigit()': word.isdigit(),
        'postag': postag, 
#         'postag[:2]': postag[:2], #感觉这条特征没意义
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.isupper()': word1.isupper(),
             '-1:word.isdigit()': word.isdigit(),
            '-1:postag': postag1,
#             '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
#             '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent): #一个句子对应了一个序列
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
#     return [label for token,label in sent]
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token,label in sent]

In [25]:
sent2features(train_sents[0])[0] #注意中文和英文/spanish的区别，一是需要切词，二是word[-2:]这种是没意义的

{'bias': 1.0,
 'word.lower()': '当',
 'word.isupper()': False,
 'word.isdigit()': False,
 'postag': 'p',
 'BOS': True,
 '+1:word.lower()': '希',
 '+1:word.isupper()': False,
 '+1:postag': 'nz'}

In [26]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 16.3 s, sys: 11.3 s, total: 27.6 s
Wall time: 38.2 s


In [27]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 5min 7s, sys: 14.1 s, total: 5min 22s
Wall time: 5min 30s


In [28]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

In [29]:
y_pred = crf.predict(X_test)
#输入要求是二维list
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.7943372374877178

In [30]:
y_pred[:2]

[['B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']]

In [31]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

      B-LOC      0.921     0.754     0.829      2877
      I-LOC      0.883     0.734     0.801      4394
      B-ORG      0.889     0.566     0.691      1331
      I-ORG      0.884     0.653     0.751      5670
      B-PER      0.947     0.646     0.768      1973
      I-PER      0.836     0.913     0.873      3851

avg / total      0.886     0.729     0.794     20096



In [32]:
from seqeval.metrics import classification_report
from functools import reduce
y_test = reduce(lambda x,y : x + y, y_test)
y_pred = reduce(lambda x,y : x + y, y_pred)
print(classification_report(y_test,y_pred,digits=4))
# 加了词性，f1-score 从0.754->0.794
# 0.6735 ->0.7170

             precision    recall  f1-score   support

        ORG     0.8406    0.5349    0.6538      1331
        PER     0.8376    0.5753    0.6821      1973
        LOC     0.8457    0.7070    0.7702      2877

avg / total     0.8420    0.6279    0.7170      6181



In [33]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
I-ORG  -> I-ORG   3.794039
B-ORG  -> I-ORG   3.626199
I-LOC  -> I-LOC   3.303950
I-PER  -> I-PER   3.264526
B-LOC  -> I-LOC   3.166419
O      -> O       2.766636
B-PER  -> I-PER   2.664200
I-LOC  -> B-LOC   1.801765
I-PER  -> B-PER   1.744497
B-LOC  -> B-LOC   0.878877
I-PER  -> O       0.724236
O      -> B-PER   0.550293
O      -> B-LOC   0.227626
O      -> B-ORG   0.164311
I-LOC  -> O       -0.114932
B-ORG  -> B-ORG   -0.479315
I-PER  -> B-LOC   -0.853423
I-ORG  -> B-LOC   -0.896773
I-ORG  -> O       -1.071338
I-LOC  -> B-ORG   -1.266796

Top unlikely transitions:
B-LOC  -> B-ORG   -3.233270
B-ORG  -> O       -3.572566
I-ORG  -> I-LOC   -3.770497
B-ORG  -> B-LOC   -3.922222
B-PER  -> B-LOC   -4.460793
B-PER  -> B-ORG   -4.512799
I-PER  -> I-ORG   -4.718796
B-LOC  -> B-PER   -4.797871
O      -> I-LOC   -4.988522
I-LOC  -> I-PER   -5.016383
B-ORG  -> I-LOC   -5.906916
B-PER  -> I-ORG   -5.977107
O      -> I-PER   -6.174754
B-ORG  -> I-PER   -6.492344
B-PER  -> I

In [34]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
7.581093 B-PER    word.lower():赵
7.119993 B-PER    word.lower():袁
6.996876 B-PER    word.lower():寇
6.991663 B-PER    word.lower():薛
6.683290 B-PER    word.lower():滕
6.383685 I-PER    word.lower():赵
6.107862 O        word.lower():在
5.971466 O        word.lower():到
5.888545 O        word.lower():、
5.887745 O        word.lower():杯
5.825564 B-PER    word.lower():靳
5.700890 O        postag:ude1
5.630110 B-PER    -1:word.lower():号
5.501566 B-PER    word.lower():崔
5.499171 B-LOC    word.lower():淮
5.461032 O        word.lower():说
5.387137 O        word.lower():对
5.353412 B-PER    +1:word.lower():奶
5.325446 I-PER    -1:word.lower():扈
5.300552 O        EOS
5.292643 I-PER    word.lower():冯
5.162670 B-LOC    word.lower():漯
5.142041 B-PER    word.lower():尧
5.122140 B-LOC    -1:word.lower():℃
5.109856 B-PER    word.lower():褚
5.109544 B-PER    +1:word.lower():娘
5.061371 O        word.lower():副
5.053542 I-LOC    -1:word.lower():喀
5.033499 O        postag:udeng
5.008554 B-PER    word.lowe

结果挺合理，赵、寇、薛 确实是姓比较多     
禺 确实是名比较多