In [8]:
import re
import numpy as np
import pandas as pd

In [3]:
data_path = "D:\\data\\nlp\\命名实体识别\\BosonNLP_NER_6C\\BosonNLP_NER_6C.txt"

In [5]:
with open(data_path, "r", encoding="utf-8") as f:
    data = f.read()

In [6]:
data_list = data.split("\n")

In [10]:
test_data = data_list[0]
test_data

'{{product_name:浙江在线杭州}}{{time:4月25日}}讯（记者{{person_name: 施宇翔}} 通讯员 {{person_name:方英}}）毒贩很“时髦”，用{{product_name:微信}}交易毒品。没料想警方也很“潮”，将计就计，一举将其擒获。记者从{{org_name:杭州江干区公安分局}}了解到，经过一个多月的侦查工作，{{org_name:江干区禁毒专案组}}抓获吸贩毒人员5名，缴获“冰毒”400余克，毒资30000余元，扣押汽车一辆。{{location:黑龙江}}籍男子{{person_name:钱某}}长期落脚于宾馆、单身公寓，经常变换住址。他有一辆车，经常半夜驾车来往于{{location:杭州主城区}}的各大宾馆和单身公寓，并且常要活动到{{time:凌晨6、7点钟}}，{{time:白天}}则在家里呼呼大睡。{{person_name:钱某}}不寻常的特征，引起了警方注意。禁毒大队通过侦查，发现{{person_name:钱某}}实际上是在向落脚于宾馆和单身公寓的吸毒人员贩送“冰毒”。'

In [14]:
def get_label_list(d_content, d_split):
    raw_data = d_split[0]
    inx = 0
    label_list = ["o" for _ in range(len(raw_data))]
    for d in d_split[1:]:
        label_value = d_content[inx].split(":")[0]
        label_key = d_content[inx].split(":")[1]
        
        if len(label_key) == 1:
            label_list.append(label_value+"-s")
        else:
            label_list.append(label_value+"-b")
            for x in label_key[1:-1]:
                label_list.append(label_value+"-m")
            label_list.append(label_value+"-e")
        for _ in d:
            label_list.append("o")
        raw_data += label_key+d
        inx += 1
    return label_list, raw_data

# 数据预处理

## https://www.jianshu.com/p/191d1e21f7ed

## markdown 语法解析

In [15]:
observed_list = []
status_list = []
for dat in data.split("\n"):
    d_split = re.split(r"{{.+?}}", dat)
    d_content = re.findall(r"{{(.+?)}}", dat)
    label_list, raw_data = get_label_list(d_content, d_split)
    
    observed_list.append(raw_data)
    status_list.append(label_list)        

In [17]:
observed_list[0]

'浙江在线杭州4月25日讯（记者 施宇翔 通讯员 方英）毒贩很“时髦”，用微信交易毒品。没料想警方也很“潮”，将计就计，一举将其擒获。记者从杭州江干区公安分局了解到，经过一个多月的侦查工作，江干区禁毒专案组抓获吸贩毒人员5名，缴获“冰毒”400余克，毒资30000余元，扣押汽车一辆。黑龙江籍男子钱某长期落脚于宾馆、单身公寓，经常变换住址。他有一辆车，经常半夜驾车来往于杭州主城区的各大宾馆和单身公寓，并且常要活动到凌晨6、7点钟，白天则在家里呼呼大睡。钱某不寻常的特征，引起了警方注意。禁毒大队通过侦查，发现钱某实际上是在向落脚于宾馆和单身公寓的吸毒人员贩送“冰毒”。'

In [20]:
# 检查字符数是否对其
for i, x in enumerate(observed_list):
    if len(status_list[i]) != len(x):
        print(i)
        break
        

In [33]:
label = ["time-s", "time-b", "time-m", "time-e", 
         "location-s", "location-b", "location-m", "location-e", 
         "person_name-s", "person_name-b", "person_name-m", "person_name-e", 
         "org_name-s", "org_name-b", "org_name-m", "org_name-e", 
         "company_name-s", "company_name-b", "company_name-m", "company_name-e", 
         "product_name-s", "product_name-b", "product_name-m", "product_name-e", 
          "o"]

In [22]:
observer_set = set()

In [23]:
for olist in observed_list:
    for o in olist:
        observer_set.add(o)

In [31]:
observer_slist = list(observer_set)

In [34]:
n_label = len(label)
n_observer = len(observer_set)

In [35]:
transferring_matrix  = np.zeros((n_label, n_label))
trainsmit_matrix = np.zeros((n_observer, n_label))

In [36]:
init_transferring = np.zeros((n_label, 1))

In [37]:
for i, olist in enumerate(status_list):
    for j, o in enumerate(olist):
        label_id = label.index(o)
        o_id = observer_slist.index(observed_list[i][j])
        
        trainsmit_matrix[o_id][label_id] += 1

In [44]:
for i, olist in enumerate(status_list):
    first_o = olist[0]
    first_o_id = label.index(first_o)
    
    init_transferring[first_o_id][0] += 1
    for x in olist[1:]:
        x_id = label.index(x)
        transferring_matrix[first_o_id][x_id] += 1
        
        first_o_id = x_id

In [47]:
transferring_matrix_sum = transferring_matrix.sum(axis=1)
transferring_matrix_sum = transferring_matrix_sum.reshape((n_label, 1))

In [48]:
trainsmit_matrix_sum = trainsmit_matrix.sum(axis=1)
trainsmit_matrix_sum = trainsmit_matrix_sum.reshape((n_observer, 1))

In [49]:
transferring_matrix = transferring_matrix/transferring_matrix_sum
trainsmit_matrix = trainsmit_matrix/trainsmit_matrix_sum

In [55]:
init_transferring = init_transferring/init_transferring.sum()

In [52]:
test_sentence = observed_list[0]

In [54]:
start_s_id = observer_slist.index(test_sentence[0])


In [57]:
v1 = trainsmit_matrix[start_s_id]
v1 = v1.reshape((n_label, 1))
v1 = v1 * init_transferring

In [61]:
path = [[] for _ in range(n_label)]
for s in test_sentence[1:]:
    
    s_id = observer_slist.index(s)
    sv = trainsmit_matrix[s_id]
    sv = sv.reshape((n_label, 1))
    
    new_path = [[] for _ in range(n_label)]
    v_matrix = v1 *  transferring_matrix
    argmax_path = v_matrix.argmax(axis=0)
    max_value = v_matrix.max(axis=0)
    
    max_value = max_value.reshape((n_label, 1))
    v1 = max_value * sv
    v1 = v1/v1.sum()
    
    for i, p in enumerate(argmax_path):
        new_path[i] = path[p] + [p]
    
    path = new_path

In [67]:
seg_status = [label[i] for i in path[24]]

In [68]:
seg_status

['o',
 'o',
 'o',
 'o',
 'location-b',
 'location-e',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'location-b',
 'location-e',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o',
 'o'

In [69]:
test_data

'{{product_name:浙江在线杭州}}{{time:4月25日}}讯（记者{{person_name: 施宇翔}} 通讯员 {{person_name:方英}}）毒贩很“时髦”，用{{product_name:微信}}交易毒品。没料想警方也很“潮”，将计就计，一举将其擒获。记者从{{org_name:杭州江干区公安分局}}了解到，经过一个多月的侦查工作，{{org_name:江干区禁毒专案组}}抓获吸贩毒人员5名，缴获“冰毒”400余克，毒资30000余元，扣押汽车一辆。{{location:黑龙江}}籍男子{{person_name:钱某}}长期落脚于宾馆、单身公寓，经常变换住址。他有一辆车，经常半夜驾车来往于{{location:杭州主城区}}的各大宾馆和单身公寓，并且常要活动到{{time:凌晨6、7点钟}}，{{time:白天}}则在家里呼呼大睡。{{person_name:钱某}}不寻常的特征，引起了警方注意。禁毒大队通过侦查，发现{{person_name:钱某}}实际上是在向落脚于宾馆和单身公寓的吸毒人员贩送“冰毒”。'