In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import tensorflow as tf
tf.__version__

'2.1.0'

In [2]:
import keras
keras.__version__

Using TensorFlow backend.


'2.3.1'

In [3]:
import sys
sys.path.insert(0,'/notebook/.custom/TF2.1.0_JUPYTER2_gpu/pylib/Python3')
import bert4keras
bert4keras.__version__

'0.8.3'

In [4]:
#! -*- coding: utf-8 -*-
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open, ViterbiDecoder, to_array
from bert4keras.layers import ConditionalRandomField
from keras.layers import Dense
from keras.models import Model
from tqdm import tqdm



In [5]:

maxlen = 250
epochs = 10
batch_size = 16
bert_layers = 12
learing_rate = 1e-5  # bert_layers越小，学习率应该要越大
crf_lr_multiplier = 1000  # 必要时扩大CRF层的学习率

# bert配置
config_path = '/notebook/comp/rba/roberta_base/bert_config.json'
checkpoint_path = '/notebook/comp/rba/roberta_base/bert_model.ckpt'
dict_path = '/notebook/comp/rba/roberta_base/vocab.txt'



def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        f = f.read()
        for l in f.split('\n\n'):
            if not l:
                continue
            d, last_flag = [], ''
            for c in l.split('\n'):
                try:
                    char, this_flag = c.split(' ')
                except:
                    print(c)
                    continue
                if this_flag == 'O' and last_flag == 'O':
                    d[-1][0] += char
                elif this_flag == 'O' and last_flag != 'O':
                    d.append([char, 'O'])
                elif this_flag[:1] == 'B':
                    d.append([char, this_flag[2:]])
                else:
                    d[-1][0] += char
                last_flag = this_flag
            D.append(d)
    return D


# 标注数据
train_data = load_data('./round1_train/data/train.txt')
valid_data = load_data('./round1_train/data/val.txt')











In [6]:
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# 类别映射

labels = ['SYMPTOM',
 'DRUG_EFFICACY',
 'PERSON_GROUP',
 'SYNDROME',
 'DRUG_TASTE',
 'DISEASE',
 'DRUG_DOSAGE',
 'DRUG_INGREDIENT',
 'FOOD_GROUP',
 'DISEASE_GROUP',
 'DRUG',
 'FOOD',
 'DRUG_GROUP']

id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels) * 2 + 1


class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, item in self.sample(random):
            token_ids, labels = [tokenizer._token_start_id], [0]
            for w, l in item:
                w_token_ids = tokenizer.encode(w)[0][1:-1]
                if len(token_ids) + len(w_token_ids) < maxlen:
                    token_ids += w_token_ids
                    if l == 'O':
                        labels += [0] * len(w_token_ids)
                    else:
                        B = label2id[l] * 2 + 1
                        I = label2id[l] * 2 + 2
                        labels += ([B] + [I] * (len(w_token_ids) - 1))
                else:
                    break
            token_ids += [tokenizer._token_end_id]
            labels += [0]
            segment_ids = [0] * len(token_ids)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []



In [7]:
model = build_transformer_model(
    config_path,
    checkpoint_path,
)

output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1)
output = model.get_layer(output_layer).output
output = Dense(num_labels)(output)
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
output = CRF(output)

model = Model(model.input, output)
model.summary()

model.compile(
    loss=CRF.sparse_loss,
    optimizer=Adam(learing_rate),
    metrics=[CRF.sparse_accuracy]
)



Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 768)    16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        Input-Segment[0][0]              
____________________________________________________________________________________________

In [8]:

class NamedEntityRecognizer(ViterbiDecoder):
    """命名实体识别器
    """
    def recognize(self, text):
        tokens = tokenizer.tokenize(text)
        mapping = tokenizer.rematch(text, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        token_ids, segment_ids = to_array([token_ids], [segment_ids])
        nodes = model.predict([token_ids, segment_ids])[0]
        labels = self.decode(nodes)
        entities, starting = [], False
        for i, label in enumerate(labels):
            if label > 0:
                if label % 2 == 1:
                    starting = True
                    entities.append([[i], id2label[(label - 1) // 2]])
                elif starting:
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False

        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]


NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])


def evaluate(data):
    """评测函数
    """
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for d in tqdm(data):
        text = ''.join([i[0] for i in d])
        R = set(NER.recognize(text)) # 预测
        T = set([tuple(i) for i in d if i[1] != 'O']) #真实
        X += len(R & T) 
        Y += len(R) 
        Z += len(T)
    precision, recall =  X / Y, X / Z
    f1 = 2*precision*recall/(precision+recall)
    return f1, precision, recall


class Evaluator(keras.callbacks.Callback):
    def __init__(self,valid_data):
        self.best_val_f1 = 0
        self.valid_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        trans = K.eval(CRF.trans)
        NER.trans = trans
#         print(NER.trans)
        f1, precision, recall = evaluate(self.valid_data)
        # 保存最优
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            model.save_weights('./best_model_epoch_10.weights')
        print(
            'valid:  f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
            (f1, precision, recall, self.best_val_f1)
        )



evaluator = Evaluator(valid_data)
train_generator = data_generator(train_data, batch_size)

model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=epochs,
    callbacks=[evaluator]
)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10


100%|██████████| 404/404 [00:16<00:00, 24.54it/s]


valid:  f1: 0.62616, precision: 0.72350, recall: 0.55191, best f1: 0.62616

Epoch 2/10


100%|██████████| 404/404 [00:12<00:00, 33.13it/s]


valid:  f1: 0.71905, precision: 0.67353, recall: 0.77118, best f1: 0.71905

Epoch 3/10


100%|██████████| 404/404 [00:12<00:00, 32.34it/s]


valid:  f1: 0.67680, precision: 0.71301, recall: 0.64409, best f1: 0.71905

Epoch 4/10


100%|██████████| 404/404 [00:13<00:00, 30.70it/s]


valid:  f1: 0.73505, precision: 0.67490, recall: 0.80698, best f1: 0.73505

Epoch 5/10


100%|██████████| 404/404 [00:12<00:00, 33.19it/s]


valid:  f1: 0.73211, precision: 0.69247, recall: 0.77655, best f1: 0.73505

Epoch 6/10


100%|██████████| 404/404 [00:12<00:00, 32.82it/s]


valid:  f1: 0.73113, precision: 0.68676, recall: 0.78162, best f1: 0.73505

Epoch 7/10


100%|██████████| 404/404 [00:11<00:00, 33.83it/s]


valid:  f1: 0.71723, precision: 0.68922, recall: 0.74761, best f1: 0.73505

Epoch 8/10


100%|██████████| 404/404 [00:12<00:00, 32.24it/s]


valid:  f1: 0.73481, precision: 0.66756, recall: 0.81712, best f1: 0.73505

Epoch 9/10


100%|██████████| 404/404 [00:12<00:00, 31.77it/s]


valid:  f1: 0.73467, precision: 0.67508, recall: 0.80579, best f1: 0.73505

Epoch 10/10


100%|██████████| 404/404 [00:12<00:00, 33.42it/s]

valid:  f1: 0.72974, precision: 0.68826, recall: 0.77655, best f1: 0.73505






<keras.callbacks.callbacks.History at 0x7f5ff05610f0>

## 验证集

In [9]:
def _cut(sentence):
    """
    将一段文本切分成多个句子
    :param sentence:
    :return:
    """
    new_sentence = []
    sen = []
    for i in sentence:
        if i in ['。', '！', '？', '?'] and len(sen) != 0:
            sen.append(i)
            new_sentence.append("".join(sen))
            sen = []
            continue
        sen.append(i)

    if len(new_sentence) <= 1: # 一句话超过max_seq_length且没有句号的，用","分割，再长的不考虑了。
        new_sentence = []
        sen = []
        for i in sentence:
            if i.split(' ')[0] in ['，', ','] and len(sen) != 0:
                sen.append(i)
                new_sentence.append("".join(sen))
                sen = []
                continue
            sen.append(i)
    if len(sen) > 0:  # 若最后一句话无结尾标点，则加入这句话
        new_sentence.append("".join(sen))
    return new_sentence

def cut_test_set(text_list,len_treshold):
    cut_text_list = []
    cut_index_list = []
    for text in text_list:

        temp_cut_text_list = []
        text_agg = ''
        if len(text) < len_treshold:
            temp_cut_text_list.append(text)
        else:
            sentence_list = _cut(text)  # 一条数据被切分成多句话
            for sentence in sentence_list:
                if len(text_agg) + len(sentence) < len_treshold:
                    text_agg += sentence
                else:
                    temp_cut_text_list.append(text_agg)
                    text_agg = sentence
            temp_cut_text_list.append(text_agg)  # 加上最后一个句子

        cut_index_list.append(len(temp_cut_text_list))
        cut_text_list += temp_cut_text_list

    return cut_text_list, cut_index_list


In [10]:
class NamedEntityRecognizer(ViterbiDecoder):
    """命名实体识别器
    """
    def recognize(self, text):
        tokens = tokenizer.tokenize(text)
        mapping = tokenizer.rematch(text, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        nodes = model.predict([[token_ids], [segment_ids]])[0]
        labels = self.decode(nodes)
        entities, starting = [], False
        
        for i, label in enumerate(labels):
            if label > 0:
                if label % 2 == 1:
                    starting = True
                    entities.append([[i], id2label[(label - 1) // 2]])
                elif starting:
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False

        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]


In [11]:
NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])

In [12]:
def test_predict(data, NER_):
    test_ner =[]
    
    for text in tqdm(data):
        cut_text_list, cut_index_list = cut_test_set([text],300)
        posit = 0
        item_ner = []
        index =1
        for str_ in cut_text_list:
            aaaa  = NER_.recognize(str_)
            for tn in aaaa:
                ans = {}
                ans["label_type"] = tn[1]
                ans['overlap'] = "T" + str(index)
                
                ans["start_pos"] = text.find(tn[0],posit)
                ans["end_pos"] = ans["start_pos"] + len(tn[0])
                posit = ans["end_pos"]
                ans["res"] = tn[0]
                item_ner.append(ans)
                index +=1
        test_ner.append(item_ner)
    
    return test_ner

In [13]:
import glob 
import codecs
X, Y, Z = 1e-10, 1e-10, 1e-10
val_data_flist = glob.glob('./round1_train/val_data/*.txt')
data_dir = './round1_train/val_data/'
for file in val_data_flist:
    if file.find(".ann") == -1 and file.find(".txt") == -1:
        continue
    file_name = file.split('/')[-1].split('.')[0]
    r_ann_path = os.path.join(data_dir, "%s.ann" % file_name)
    r_txt_path = os.path.join(data_dir, "%s.txt" % file_name)

    R = []
    with codecs.open(r_txt_path, "r", encoding="utf-8") as f:
        line = f.readlines()
        aa = test_predict(line, NER)
        for line in aa[0]:
            lines = line['label_type']+ " "+str(line['start_pos'])+' ' +str(line['end_pos'])+ "\t" +line['res']
            R.append(lines)    
    T = []
    with codecs.open(r_ann_path, "r", encoding="utf-8") as f:
        for line in f:
            lines = line.strip('\n').split('\t')[1] + '\t' + line.strip('\n').split('\t')[2]
            T.append(lines)
    R = set(R)
    T = set(T)
    X += len(R & T) 
    Y += len(R) 
    Z += len(T)
precision, recall =  X / Y, X / Z
f1 = 2*precision*recall/(precision+recall)

100%|██████████| 1/1 [00:00<00:00, 25.41it/s]
100%|██████████| 1/1 [00:00<00:00, 30.71it/s]
100%|██████████| 1/1 [00:00<00:00, 14.56it/s]
100%|██████████| 1/1 [00:00<00:00, 16.15it/s]
100%|██████████| 1/1 [00:00<00:00, 27.20it/s]
100%|██████████| 1/1 [00:00<00:00, 29.54it/s]
100%|██████████| 1/1 [00:00<00:00, 28.93it/s]
100%|██████████| 1/1 [00:00<00:00, 13.46it/s]
100%|██████████| 1/1 [00:00<00:00, 13.19it/s]
100%|██████████| 1/1 [00:00<00:00, 11.61it/s]
100%|██████████| 1/1 [00:00<00:00, 26.91it/s]
100%|██████████| 1/1 [00:00<00:00, 31.34it/s]
100%|██████████| 1/1 [00:00<00:00, 23.70it/s]
100%|██████████| 1/1 [00:00<00:00, 35.80it/s]
100%|██████████| 1/1 [00:00<00:00, 24.90it/s]
100%|██████████| 1/1 [00:00<00:00, 29.04it/s]
100%|██████████| 1/1 [00:00<00:00, 33.14it/s]
100%|██████████| 1/1 [00:00<00:00, 29.31it/s]
100%|██████████| 1/1 [00:00<00:00, 14.15it/s]
100%|██████████| 1/1 [00:00<00:00, 16.84it/s]
100%|██████████| 1/1 [00:00<00:00, 37.25it/s]
100%|██████████| 1/1 [00:00<00:00,

In [14]:
f1,precision,recall

(0.6983720035892911, 0.6562274150807117, 0.7463013698630206)

## 测试集

In [15]:
class NamedEntityRecognizer(ViterbiDecoder):
    """命名实体识别器
    """
    def recognize(self, text):
        tokens = tokenizer.tokenize(text)
        mapping = tokenizer.rematch(text, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        nodes = model.predict([[token_ids], [segment_ids]])[0]
        labels = self.decode(nodes)
        entities, starting = [], False
        
        for i, label in enumerate(labels):
            if label > 0:
                if label % 2 == 1:
                    starting = True
                    entities.append([[i], id2label[(label - 1) // 2]])
                elif starting:
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False

        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]


In [16]:
NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])

In [17]:
def test_predict(data, NER_):
    test_ner =[]
    
    for text in tqdm(data):
        cut_text_list, cut_index_list = cut_test_set([text],maxlen)
        posit = 0
        item_ner = []
        index =1
        for str_ in cut_text_list:
            ner_res  = NER_.recognize(str_)
            for tn in ner_res:
                ans = {}
                ans["label_type"] = tn[1]
                ans['overlap'] = "T" + str(index)
                
                ans["start_pos"] = text.find(tn[0],posit)
                ans["end_pos"] = ans["start_pos"] + len(tn[0])
                posit = ans["end_pos"]
                ans["res"] = tn[0]
                item_ner.append(ans)
                index +=1
        test_ner.append(item_ner)
    
    return test_ner

In [18]:
import os
import codecs

In [19]:
test_files = os.listdir("./round1_test/chusai_xuanshou/")

In [20]:
for file in test_files:
    with codecs.open("./round1_test/chusai_xuanshou/"+file, "r", encoding="utf-8") as f:
        line = f.readlines()
        aa = test_predict(line, NER)
    with codecs.open("./round1_test/submission_4/"+file.split('.')[0]+".ann", "w", encoding="utf-8") as ff:
        for line in aa[0]:
            lines = line['overlap'] + "\t" +line['label_type']+ " "+str(line['start_pos'])+' ' +str(line['end_pos'])+ "\t" +line['res']
            ff.write(lines+"\n")
        ff.close()

100%|██████████| 1/1 [00:00<00:00, 25.32it/s]
100%|██████████| 1/1 [00:00<00:00, 12.59it/s]
100%|██████████| 1/1 [00:00<00:00,  9.94it/s]
100%|██████████| 1/1 [00:00<00:00, 15.76it/s]
100%|██████████| 1/1 [00:00<00:00, 21.02it/s]
100%|██████████| 1/1 [00:00<00:00, 31.30it/s]
100%|██████████| 1/1 [00:00<00:00,  7.86it/s]
100%|██████████| 1/1 [00:00<00:00, 26.05it/s]
100%|██████████| 1/1 [00:00<00:00, 15.13it/s]
100%|██████████| 1/1 [00:00<00:00, 29.86it/s]
100%|██████████| 1/1 [00:00<00:00, 34.75it/s]
100%|██████████| 1/1 [00:00<00:00, 28.07it/s]
100%|██████████| 1/1 [00:00<00:00, 16.06it/s]
100%|██████████| 1/1 [00:00<00:00, 27.15it/s]
100%|██████████| 1/1 [00:00<00:00, 36.00it/s]
100%|██████████| 1/1 [00:00<00:00, 13.04it/s]
100%|██████████| 1/1 [00:00<00:00, 16.11it/s]
100%|██████████| 1/1 [00:00<00:00, 35.34it/s]
100%|██████████| 1/1 [00:00<00:00, 29.92it/s]
100%|██████████| 1/1 [00:00<00:00, 16.63it/s]
100%|██████████| 1/1 [00:00<00:00,  9.43it/s]
100%|██████████| 1/1 [00:00<00:00,