In [1]:
import re
import numpy as np
import pandas as pd

In [2]:
data_path = "D:\\data\\nlp\\命名实体识别\\BosonNLP_NER_6C\\BosonNLP_NER_6C.txt"

In [3]:
with open(data_path, "r", encoding="utf-8") as f:
    data = f.read()

In [4]:
data_list = data.split("\n")


In [5]:
test_data = data_list[0]
test_data

'{{product_name:浙江在线杭州}}{{time:4月25日}}讯（记者{{person_name: 施宇翔}} 通讯员 {{person_name:方英}}）毒贩很“时髦”，用{{product_name:微信}}交易毒品。没料想警方也很“潮”，将计就计，一举将其擒获。记者从{{org_name:杭州江干区公安分局}}了解到，经过一个多月的侦查工作，{{org_name:江干区禁毒专案组}}抓获吸贩毒人员5名，缴获“冰毒”400余克，毒资30000余元，扣押汽车一辆。{{location:黑龙江}}籍男子{{person_name:钱某}}长期落脚于宾馆、单身公寓，经常变换住址。他有一辆车，经常半夜驾车来往于{{location:杭州主城区}}的各大宾馆和单身公寓，并且常要活动到{{time:凌晨6、7点钟}}，{{time:白天}}则在家里呼呼大睡。{{person_name:钱某}}不寻常的特征，引起了警方注意。禁毒大队通过侦查，发现{{person_name:钱某}}实际上是在向落脚于宾馆和单身公寓的吸毒人员贩送“冰毒”。'

In [34]:
import nltk
from sklearn_crfsuite import CRF
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [12]:
nltk.download("conll2002")

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\neo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\conll2002.zip.


True

In [13]:
nltk.corpus.conll2002.fileids()

['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']

In [15]:
%%time
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

Wall time: 2.61 s


In [17]:
def get_label_list(d_content, d_split):
    raw_data = d_split[0]
    inx = 0
    label_list = ["o" for _ in range(len(raw_data))]
    for d in d_split[1:]:
        label_value = d_content[inx].split(":")[0]
        label_key = d_content[inx].split(":")[1]
        
        if len(label_key) == 1:
            label_list.append(label_value+"-s")
        else:
            label_list.append(label_value+"-b")
            for x in label_key[1:-1]:
                label_list.append(label_value+"-m")
            label_list.append(label_value+"-e")
        for _ in d:
            label_list.append("o")
        raw_data += label_key+d
        inx += 1
    return label_list, raw_data

In [18]:
observed_list = []
status_list = []
for dat in data.split("\n"):
    d_split = re.split(r"{{.+?}}", dat)
    d_content = re.findall(r"{{(.+?)}}", dat)
    label_list, raw_data = get_label_list(d_content, d_split)
    
    observed_list.append(raw_data)
    status_list.append(label_list) 

In [25]:
def word2features(sent, i):
    word = sent[i]
    
    features = {
        'bias': 1.0,
        'word_lower()': word.lower()
    }
    if i > 0:
        word1 = sent[i-1]
        features.update({
            '-1:word.lower()': word1.lower()
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1]
        features.update({
            '+1:word.lower()': word1.lower()
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for label in sent]

In [26]:
X_train = [sent2features(s) for s in observed_list]
y_train = [sent2labels(s) for s in status_list]

X_test = [sent2features(s) for s in observed_list]
y_test = [sent2labels(s) for s in status_list]

In [30]:
%%time
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 2min 17s




CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [31]:
y_pred = crf.predict(X_test)

In [35]:
labels = ["time-s", "time-b", "time-m", "time-e", 
         "location-s", "location-b", "location-m", "location-e", 
         "person_name-s", "person_name-b", "person_name-m", "person_name-e", 
         "org_name-s", "org_name-b", "org_name-m", "org_name-e", 
         "company_name-s", "company_name-b", "company_name-m", "company_name-e", 
         "product_name-s", "product_name-b", "product_name-m", "product_name-e", 
          "o"]

In [36]:
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9695126077720243