In [1]:
import random
import numpy as np

def read_file(file_path: str) -> [str]:
    with open(file_path, 'r') as f:
        texts = f.read().split('\n')
    return texts

def msra_preprocessing(split_rate: float = 0.8,
                       ignore_exist: bool = False) -> None:
    path_train = 'data/MSRA/train.txt'
    path_test = 'data/MSRA/test.txt'

    print("正在对MSRA数据进行预处理......")
    path_train1 = 'data/MSRA/train1.txt'
    texts = read_file(path_train1)
    random.shuffle(texts)
    
    split_index = int(len(texts) * split_rate)
    train_texts = texts[:split_index]
    test_texts = texts[split_index:]

    test_ = text_map(test_texts)
    train_ = text_map(train_texts)

    with open(path_train, 'w') as f:
        f.write("".join(train_))
    with open(path_test, 'w') as f:
        f.write("".join(test_))
    print("MSRA数据进行预处理完成 ---- OK!")

unk_flag = '[UNK]'
pad_flag = '[PAD]'
cls_flag = '[CLS]'
sep_flag = '[SEP]'

# 获取 word to index 词典
def get_w2i(vocab_path='data/vocab/vocab.txt'):
    w2i = {}
    with open(vocab_path, 'r') as f:
        while True:
            text = f.readline()
            if not text:
                break
            text = text.strip()
            if text and len(text) > 0:
                w2i[text] = len(w2i) + 1
    return w2i

# 文本映射
def text_map(texts: [str]) -> [str]:
    """
    文本映射处理
    处理好的数据格式:
       ['需 O'
        '要 O'
        '大 B-ORG'
        '连 I-ORG'
        '海 I-ORG'
        '富 I-ORG'
        '集 I-ORG'
        '团 I-ORG']

    :param texts:  例如 中/B_nt 共/M_nt 中/M_nt 央/E_nt 总/O  的文本
    :return: [str] 处理好的数据
    """
    mapping = {
        'nr': 'PER',
        'ns': 'LOC',
        'nt': 'ORG'
    }
    deal_texts = []
    for line in texts:
        sub_line = str(line).split(' ')
        for item in sub_line:
            item_list = str(item).split('/')
            if len(item_list) == 2:
                a = item_list[0]
                b = item_list[1]
                if b in mapping:
                    flag = mapping[b]
                    for i, char in enumerate(a):
                        if i == 0:
                            deal_texts.append(f"{char} B-{flag}\n")
                        else:
                            deal_texts.append(f"{char} I-{flag}\n")
                else:
                    for char in a:
                        deal_texts.append(f"{char} O\n")
        deal_texts.append('\n')
    return deal_texts

# 获取 tag to index 词典
def get_tag2index():
    return {"O": 0,
            "B-PER": 1, "I-PER": 2,
            "B-LOC": 3, "I-LOC": 4,
            "B-ORG": 5, "I-ORG": 6
            }
class DataProcess(object):
    def __init__(self,
                 max_len=100,
                 ):
        """
        数据处理
        :param max_len: 句子最长的长度，默认为保留100
        :param data_type: 数据类型，当前支持四种数据类型
        """
        self.w2i = get_w2i()  # word to index
        self.tag2index = get_tag2index()  # tag to index
        self.vocab_size = len(self.w2i)
        self.tag_size = len(self.tag2index)
        self.unk_flag = unk_flag
        self.pad_flag = pad_flag
        self.max_len = max_len

        self.unk_index = self.w2i.get(unk_flag, 101)
        self.pad_index = self.w2i.get(pad_flag, 1)
        self.cls_index = self.w2i.get(cls_flag, 102)
        self.sep_index = self.w2i.get(sep_flag, 103)
        msra_preprocessing()

    def get_data(self, one_hot: bool = True) -> ([], [], [], []):
        """
        获取数据，包括训练、测试数据中的数据和标签
        :param one_hot:
        :return:
        """
        # 拼接地址
        path_train = 'data/MSRA/train.txt'
        path_test = 'data/MSRA/test.txt'
        # 读取数据
        train_data, train_label = self.__text_to_indexs(path_train)
        test_data, test_label = self.__text_to_indexs(path_test)

        # 进行 one-hot处理
        if one_hot:
            def label_to_one_hot(index: []) -> []:
                data = []
                for line in index:
                    data_line = []
                    for i, index in enumerate(line):
                        line_line = [0]*self.tag_size
                        line_line[index] = 1
                        data_line.append(line_line)
                    data.append(data_line)
                return np.array(data)
            train_label = label_to_one_hot(index=train_label)
            test_label = label_to_one_hot(index=test_label)
        else:
            train_label = np.expand_dims(train_label, 2)
            test_label = np.expand_dims(test_label, 2)
        return train_data, train_label, test_data, test_label

    def num2tag(self):
        return dict(zip(self.tag2index.values(), self.tag2index.keys()))

    def i2w(self):
        return dict(zip(self.w2i.values(), self.w2i.keys()))

    # texts 转化为 index序列
    def __text_to_indexs(self, file_path: str) -> ([], []):
        data, label = [], []
        with open(file_path, 'r') as f:
            line_data,  line_label = [], []
            for line in f:
                if line != '\n':
                    w, t = line.split()
                    char_index = self.w2i.get(w, self.w2i[self.unk_flag])
                    tag_index = self.tag2index.get(t, 0)
                    line_data.append(char_index)
                    line_label.append(tag_index)
                else:
                    if len(line_data) < self.max_len:
                        pad_num = self.max_len - len(line_data)
                        line_data = [self.pad_index]*pad_num + line_data
                        line_label = [0]*pad_num + line_label
                    else:
                        line_data = line_data[:self.max_len]
                        line_label = line_label[:self.max_len]
                    data.append(line_data)
                    label.append(line_label)
                    line_data, line_label = [], []
        return np.array(data), np.array(label)

In [2]:
dp = DataProcess()
train_data, train_label, test_data, test_label = dp.get_data(one_hot=True)

正在对MSRA数据进行预处理......
MSRA数据进行预处理完成 ---- OK!


In [6]:

from keras.models import  Model
from keras.layers import Embedding, Dense, Dropout, Input
from keras.layers import Conv1D
from keras_contrib.layers import CRF

class IDCNNCRF(object):
    def __init__(self,
                 vocab_size: int,  # 词的数量(词表的大小)
                 n_class: int,  # 分类的类别(本demo中包括小类别定义了7个类别)
                 max_len: int = 100,  # 最长的句子最长长度
                 embedding_dim: int = 128,  # 词向量编码长度
                 drop_rate: float = 0.5,  # dropout比例
                 ):
        self.vocab_size = vocab_size
        self.n_class = n_class
        self.max_len = max_len
        self.embedding_dim = embedding_dim
        self.drop_rate = drop_rate
        pass

    def create_model(self):
        """
        本网络的机构采用的是，
           Embedding
           直接进行2个常规一维卷积操作
           接上一个空洞卷积操作
           连接全连接层
           最后连接CRF层

        kernel_size 采用2、3、4

        cnn  特征层数: 64、128、128
        """

        inputs = Input(shape=(self.max_len,))
        x = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim)(inputs)
        x = Conv1D(filters=64,
                   kernel_size=3,
                   activation='relu',
                   padding='same',
                   dilation_rate=1)(x)
        x = Conv1D(filters=128,
                   kernel_size=3,
                   activation='relu',
                   padding='same',
                   dilation_rate=1)(x)
        x = Conv1D(filters=128,
                   kernel_size=3,
                   activation='relu',
                   padding='same',
                   dilation_rate=2)(x)
        x = Dropout(self.drop_rate)(x)
        x = Dense(self.n_class)(x)
        self.crf = CRF(self.n_class, sparse_target=False)
        x = self.crf(x)
        self.model = Model(inputs=inputs, outputs=x)
        self.model.summary()
        self.compile()
        return self.model

    def compile(self):
        self.model.compile('adam',
                           loss=self.crf.loss_function,
                           metrics=[self.crf.accuracy])


In [7]:
model = IDCNNCRF(dp.vocab_size, dp.tag_size)

In [8]:
model.create_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 128)          2704256   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 64)           24640     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 100, 128)          24704     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 100, 128)          49280     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
dense_2 (Dense)              (None, 100, 7)            903       
__________

<keras.engine.training.Model at 0x14067c438>

In [9]:
model.model.fit(train_data, train_label, batch_size=128, epochs=2,
              validation_data=[test_data, test_label])


Train on 37092 samples, validate on 9273 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x15c61d4a8>