In [67]:
# -*- coding: utf-8 -*-
## 探索不同特征组合对性能的影响
#Data analysis
import pandas as pd
import numpy as np
#Data visualisation
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font_scale=1)
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
#Modeling
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn_crfsuite import CRF, scorers, metrics
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import classification_report, make_scorer
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
import scipy.stats
import eli5

In [68]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        # 'bias': 1.0,
        'word.lower()': word.lower(),   #小写
        # 'word[0]': word[0],         #首字母
        # 'word[0:2]': word[0:2],         #前两个字母
        # 'word[0:3]': word[0:3],         #前三个字母
        'word[-3:]': word[-3:],         #后三个字符
        'word[-2:]': word[-2:],         #后两个字符
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),#检查首字符是否大些，且其他字母为小写
        'word.isdigit()': word.isdigit(),
        # 'word.@': True if word[0]=='@' else False,
        # 'word.#': True if word[0]=='#' else False
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            # '-1:word.isdigit()': word1.isdigit(),
            # '-1:word[-3:]': word1[-3:],         #后三个字符
            # '-1:word[-2:]': word1[-2:],         #后两个字符
        })
        # if i>1:
        #     word2 = sent[i-2][0]
        #     features.update({
        #         '-2:word.lower()': word2.lower(),
        #         '-2:word.istitle()': word2.istitle(),
        #         '-2:word.isupper()': word2.isupper(),
        #         # '-1:word.isdigit()': word1.isdigit(),
        #         # '-1:word[-3:]': word1[-3:],         #后三个字符
        #         # '-1:word[-2:]': word1[-2:],         #后两个字符
        #     })

    else:
        #如果是句子中的第一个单词，额外增加一个特征BOS
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            # '+1:word.isdigit()': word1.isdigit(),
            # '+1:word[-3:]': word1[-3:],         #后三个字符
            # '+1:word[-2:]': word1[-2:],         #后两个字符
        })
        # if i < len(sent)-2:
        #     word2 = sent[i+2][0]
        #     features.update({
        #         '+2:word.lower()': word2.lower(),
        #         '+2:word.istitle()': word2.istitle(),
        #         '+2:word.isupper()': word2.isupper(),
        #         # '+1:word.isdigit()': word1.isdigit(),
        #         # '+1:word[-3:]': word1[-3:],         #后三个字符
        #         # '+1:word[-2:]': word1[-2:],         #后两个字符
        #     })
    else:
        #如果是句子中的最后一个单词，额外增加一个特征EOS
        features['EOS'] = True
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, label in sent]
def anns_to_df(path):
    count=1
    word=[]
    tag=[]
    sentence = []
    s = []
    s_id = []
    with open(path, "r") as f:
        for line in f.readlines():
            line = line.strip('\n')  #去掉列表中每一个元素的换行符
            if line!='':
                word.append(line.split()[0])
                s.append(line.split()[0])
                tag.append(line.split()[1])
                s_id.append('Sentence: {}'.format(count))
            else:
                count+=1
                sentence.append(' '.join(s))
                s= []
    dic = {'Sentence #':s_id,'Word':word,'Tag':tag}
    data=pd.DataFrame(dic)#将字典转换成为数据框
    return data
def extract_sentences_from_df(data):
    agg_func = lambda s: [(w, t) for w, t in zip(s['Word'].values.tolist(),
                                                    s['Tag'].values.tolist())]
    grouped_df = data.groupby('Sentence #').apply(agg_func)
    sentences = [s for s in grouped_df]
    return sentences

In [69]:
train_df = anns_to_df('../../datasets/clinical_ner/train-6000.anns')
valid_df = anns_to_df('../../datasets/clinical_ner/valid-2000.anns')
test_df = anns_to_df('../../datasets/clinical_ner/test-2000.anns')
cross_df = anns_to_df('../../datasets/clinical_ner/cross-8000.anns')
train_sentences = extract_sentences_from_df(train_df)
valid_sentences = extract_sentences_from_df(valid_df)
test_sentences = extract_sentences_from_df(test_df)
cross_sentences = extract_sentences_from_df(cross_df)
X_train = np.array([sent2features(s) for s in train_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
X_valid = np.array([sent2features(s) for s in valid_sentences])
y_valid = np.array([sent2labels(s) for s in valid_sentences])
X_test = np.array([sent2features(s) for s in test_sentences])
y_test = np.array([sent2labels(s) for s in test_sentences])
X_cross = np.array([sent2features(s) for s in cross_sentences])
y_cross = np.array([sent2labels(s) for s in cross_sentences])

In [73]:
# 训练CRF
crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                           c1= 0.01,
                           c2=0.01,
                           max_iterations=200,
                           all_possible_transitions=False,
                           verbose=True)
## 模型训练
crf.fit(X_train, y_train)
# crf.fit(X_valid, y_valid)
# crf.fit(X_test, y_test)
## 模型评估
y_pred = crf.predict(X_test)
# y_pred = crf.predict(X_train)
labels = list(crf.classes_)
labels.remove('O')
print(flat_classification_report(y_test, y_pred, labels=labels))

loading training data to CRFsuite: 100%|██████████| 6000/6000 [00:01<00:00, 3030.30it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 96797
Seconds required: 0.547

L-BFGS optimization
c1: 0.010000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.78  loss=589896.58 active=96735 feature_norm=1.00
Iter 2   time=2.30  loss=326553.82 active=96743 feature_norm=11.83
Iter 3   time=0.39  loss=288133.11 active=91011 feature_norm=10.53
Iter 4   time=4.52  loss=148042.08 active=93175 feature_norm=6.03
Iter 5   time=2.60  loss=112660.86 active=90968 feature_norm=4.63
Iter 6   time=0.76  loss=102959.98 active=89722 feature_norm=6.89
Iter 7   time=0.38  loss=95574.07 active=87563 feature_norm=11.27
Iter 8   time=0.39  loss=81714.85 active=94707 feature_norm=10.82
Iter 9   time=0.37  loss=79028.91 active=95429 feature_norm=12.19
Ite



                   precision    recall  f1-score   support

        B-Symptom       0.69      0.58      0.63       487
        E-Symptom       0.71      0.60      0.65       487
        M-Symptom       0.51      0.43      0.47       363
       B-Location       0.68      0.53      0.59        99
       E-Location       0.75      0.58      0.65        99
        S-Disease       0.82      0.77      0.79       330
        S-Symptom       0.85      0.80      0.83       787
   B-Organization       0.65      0.55      0.59       207
   E-Organization       0.69      0.57      0.62       207
B-Vaccine-related       0.94      0.87      0.91       291
E-Vaccine-related       0.96      0.89      0.92       291
         S-Person       0.83      0.46      0.59       531
         B-Person       0.72      0.46      0.56       115
         E-Person       0.76      0.49      0.59       115
   S-Organization       0.69      0.41      0.51       316
       S-Location       0.83      0.61      0.71       

In [74]:
# Cross validation
#First we select all the tags that are relevant for us i.e. remove the 'O' tag from the list.
labels = list(crf.classes_)
labels = list(filter(lambda a: a != 'O', labels))
print(labels)
crf = CRF(
    algorithm='lbfgs',
    max_iterations=200,
    all_possible_transitions=False
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

['B-Symptom', 'E-Symptom', 'M-Symptom', 'B-Location', 'E-Location', 'S-Disease', 'S-Symptom', 'B-Organization', 'E-Organization', 'B-Vaccine-related', 'E-Vaccine-related', 'S-Person', 'B-Person', 'E-Person', 'S-Organization', 'S-Location', 'B-Drug', 'E-Drug', 'B-Disease', 'E-Disease', 'S-Drug', 'M-Organization', 'M-Vaccine-related', 'M-Location', 'M-Disease', 'S-Vaccine-related', 'M-Drug', 'M-Person']
Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  3.8min


KeyboardInterrupt: 

In [1]:
#Lets check the best estimated parameters and CV score
print('Best parameters:', rs.best_params_)
print('Best CV score:', rs.best_score_)
print('Model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
#We sort the tags a bit so that they appear in an orderly fashion in the classification report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0]))
#Now we create the model again using the best estimators
crf3 = rs.best_estimator_
y_pred = crf3.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3))

NameError: name 'rs' is not defined

In [13]:
crf = CRF(algorithm='lbfgs',
                           c1=0.01,
                           c2=0.001,
                           max_iterations=200,
                           all_possible_transitions=True,
                           verbose=False)
pred = cross_val_predict(estimator=crf, X=X_cross, y=y_cross, cv=3)
#模型评估
report = flat_classification_report(y_pred=pred, y_true=y_cross)
print(report)



                   precision    recall  f1-score   support

        B-Disease       0.60      0.52      0.55       357
           B-Drug       0.63      0.39      0.48        44
       B-Location       0.71      0.47      0.57       167
   B-Organization       0.65      0.29      0.40       175
         B-Person       0.66      0.34      0.45       123
        B-Symptom       0.71      0.61      0.65      1737
B-Vaccine-related       0.77      0.68      0.72       259
        E-Disease       0.62      0.54      0.58       357
           E-Drug       0.50      0.30      0.37        44
       E-Location       0.74      0.49      0.59       167
   E-Organization       0.66      0.30      0.42       175
         E-Person       0.60      0.32      0.41       123
        E-Symptom       0.75      0.65      0.70      1737
E-Vaccine-related       0.86      0.76      0.81       259
        M-Disease       0.39      0.31      0.34        59
           M-Drug       0.08      0.08      0.08       

In [16]:
crf.fit(X_cross,y_cross)
#去掉O之后的评估结果
labels = list(crf.classes_)
labels.remove('O')
report = flat_classification_report(y_pred=pred, y_true=y_cross,labels=labels)
print(report)



                   precision    recall  f1-score   support

        B-Disease       0.60      0.52      0.55       357
        E-Disease       0.62      0.54      0.58       357
        S-Disease       0.86      0.81      0.83      1078
       B-Location       0.71      0.47      0.57       167
       M-Location       0.48      0.23      0.31        53
       E-Location       0.74      0.49      0.59       167
        S-Symptom       0.84      0.83      0.83      3041
B-Vaccine-related       0.77      0.68      0.72       259
E-Vaccine-related       0.86      0.76      0.81       259
        B-Symptom       0.71      0.61      0.65      1737
        E-Symptom       0.75      0.65      0.70      1737
           S-Drug       0.80      0.42      0.55       126
           B-Drug       0.63      0.39      0.48        44
           E-Drug       0.50      0.30      0.37        44
   S-Organization       0.66      0.27      0.39       368
        M-Symptom       0.65      0.50      0.56      1

In [33]:
#将中文标点符号转换为英文标点符号
#https://blog.csdn.net/nanbei2463776506/article/details/82967140
def C_trans_to_E(string):
    E_pun = u',.!?[]()<>"\'\''
    C_pun = u'，。！？【】（）《》“‘’'
    #ord返回ASCII码对应的int
    #zip将合并为列表，元素为元祖，元祖为对应位置所有元素依次的集合，如这种形式[(',','，')...]
    #s生成对应字典
    table= {
    ord(f):ord(t) for f,t in zip(C_pun,E_pun)}
    #将字符传对应转换
    return string.translate(table)

#移除汉子和特殊字符
def removeChnAndCharacter(str1):
    C_pun = u'，。！？【】（）《》“‘'
    strTmp = ''

    if not isinstance(str1,str):
        return strTmp

    for i in range(len(str1)):
        #中文字符范围
        #https://blog.csdn.net/qq_22520587/article/details/62454354
        if str1[i] >= u'\u4e00' and str1[i] <= u'\u9fa5' \
                or str1[i] >= u'\u3300' and str1[i] <= u'\u33FF' \
                or str1[i] >= u'\u3200' and str1[i] <= u'\u32FF' \
                or str1[i] >= u'\u2700' and str1[i] <= u'\u27BF' \
                or str1[i] >= u'\u2600' and str1[i] <= u'\u26FF' \
                or str1[i] >= u'\uFE10' and str1[i] <= u'\uFE1F' \
                or str1[i] >= u'\u2E80' and str1[i] <= u'\u2EFF' \
                or str1[i] >= u'\u3000' and str1[i] <= u'\u303F' \
                or str1[i] >= u'\u31C0' and str1[i] <= u'\u31EF' \
                or str1[i] >= u'\u2FF0' and str1[i] <= u'\u2FFF' \
                or str1[i] >= u'\u3100' and str1[i] <= u'\u312F' \
                or str1[i] >= u'\u21A0' and str1[i] <= u'\u31BF' \
                :
            pass
        else:
            if str1[i] in C_pun:
                st = C_trans_to_E(str1[i])
            else:
                st = str1[i]
            strTmp += st

    return strTmp
import re
regexp = re.compile(r'[\u2019]')
for index,item in enumerate(valid_sentences):
    for e in item:
        # en = C_trans_to_E(e[0])
        if e[0] == '\u2019':
            print(e[0])
            print(e)
            print(item)
            print(index)
        # if regexp.search(e[0]):
        #     print('matched')
        #     print(e[0])


can't
