In [102]:
import re
from itertools import chain
import pandas as pd
import numpy as np
import pickle

In [103]:
# Read the raw tagged corpus. The context manager closes the file handle as
# soon as the content is in memory instead of leaving it to the garbage collector.
with open('data/data.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [104]:
# Split the corpus into sentences at every tagged punctuation token; the
# punctuation character and its /b /e /m /s tag are discarded by the split.
sentences = re.compile('[，。！？、‘’“”]/[bems]').split(text)
sentences[:2]

['人/b  们/e  常/s  说/s  生/b  活/e  是/s  一/s  部/s  教/b  科/m  书/e  ',
 '  而/s  血/s  与/s  火/s  的/s  战/b  争/e  更/s  是/s  不/b  可/m  多/m  得/e  的/s  教/b  科/m  书/e  ']

In [105]:
# Drop sentences that are empty or contain only whitespace
sentences = [sentence for sentence in sentences if sentence.strip()]

In [106]:
# Trim leading and trailing whitespace from every remaining sentence
sentences = [sentence.strip() for sentence in sentences]

In [107]:
# Convert every sentence into two parallel NumPy arrays: its characters and
# the tag attached to each character.
words, labels = [], []
print('Start creating words and labels...')
for sentence in sentences:
    # Every token has the form "<char>/<tag>"; findall returns (char, tag) pairs.
    groups = re.findall('(.)/(.)', sentence)
    if not groups:
        # A fragment without any tagged token would give a 1-D empty array,
        # and the column slicing below would raise IndexError, so skip it.
        continue
    arrays = np.asarray(groups)
    words.append(arrays[:, 0])
    labels.append(arrays[:, 1])
print('Words Length', len(words), 'Labels Length', len(labels))
if words:
    # Only show examples when at least one sentence was parsed.
    print('Words Example', words[0])
    print('Labels Example', labels[0])

Start creating words and labels...
Words Length 321533 Labels Length 321533
Words Example ['人' '们' '常' '说' '生' '活' '是' '一' '部' '教' '科' '书']
Labels Example ['b' 'e' 's' 's' 'b' 'e' 's' 's' 's' 'b' 'm' 'e']


In [108]:
# Flatten the per-sentence character arrays into one long list of characters
all_words = list(chain.from_iterable(words))

In [109]:
# Wrap the flat character list in a pandas Series (the next cell counts
# occurrences on it)
all_words_sr = pd.Series(all_words)

In [110]:
# Count how often each distinct character occurs. The resulting index holds
# the unique characters (value_counts sorts by descending count by default).
all_words_counts = all_words_sr.value_counts()

In [111]:
# The index of the counts is the vocabulary: one entry per distinct character
all_words_set = all_words_counts.index

In [112]:
# Assign ids 1..N in vocabulary order; id 0 stays free because x_transform
# pads short sentences with 0
all_words_ids = range(1, len(all_words_set) + 1)

In [113]:
# Lookup tables in both directions, stored as pandas Series:
# character -> id and id -> character
word2id = pd.Series(all_words_ids, index=all_words_set)
id2word = pd.Series(all_words_set, index=all_words_ids)

In [114]:
# Tag vocabulary and ids. 's', 'b', 'm', 'e' are the tags that appear in the
# corpus; 'x' gets id 0, the value y_transform uses to pad short sentences.
tags_set = ['x', 's', 'b', 'm', 'e']
tags_ids = range(len(tags_set))

In [115]:
# Lookup tables in both directions, stored as pandas Series:
# tag -> id and id -> tag
tag2id = pd.Series(tags_ids, index=tags_set)
id2tag = pd.Series(tags_set, index=tag2id)

In [116]:
max_length = 32

def x_transform(words):
    """Encode a sequence of characters as exactly ``max_length`` integer ids.

    Characters are looked up in the module-level ``word2id`` Series. Sequences
    longer than ``max_length`` are truncated; shorter ones are right-padded
    with 0. Characters missing from the vocabulary are mapped to 0 as well,
    so unseen text (for example at prediction time) no longer fails the lookup.

    :param words: sequence of single characters (list, NumPy array or str)
    :return: list of ``max_length`` ints
    """
    # Truncate before the lookup so characters that would be discarded anyway
    # are never looked up.
    kept = list(words[:max_length])
    # reindex (unlike word2id[kept]) tolerates labels absent from the index;
    # fill_value=0 keeps the integer dtype. It requires word2id's index to be unique.
    ids = word2id.reindex(kept, fill_value=0).tolist()
    # Right-pad with 0 up to the fixed length
    ids.extend([0] * (max_length - len(ids)))
    return ids

In [117]:
def y_transform(tags):
    """Encode a tag sequence as exactly ``max_length`` integer ids.

    Tags are looked up in the module-level ``tag2id`` Series. The result is
    cut to ``max_length`` entries and right-padded with 0 when shorter.

    :param tags: sequence of tag labels present in ``tag2id``
    :return: list of ``max_length`` ids
    """
    encoded = list(tag2id[tags])
    # Keep at most max_length ids (extra ones are discarded) ...
    clipped = encoded[:max_length]
    # ... then fill the remainder with the padding id 0
    return clipped + [0] * (max_length - len(clipped))

In [118]:
print('Starting transform...')
# Encode every sentence: characters -> padded id list, tags -> padded id list
data_x = [x_transform(sentence_words) for sentence_words in words]
data_y = [y_transform(sentence_tags) for sentence_tags in labels]

Starting transform...


In [119]:
# Sanity check: number of encoded sentences and the first encoded example
print('Data X Length', len(data_x), 'Data Y Length', len(data_y))
print('Data X Example', data_x[0])
print('Data Y Example', data_y[0])

Data X Length 321533 Data Y Length 321533
Data X Example [8, 43, 320, 88, 36, 198, 7, 2, 41, 163, 124, 245, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Data Y Example [2, 4, 1, 1, 2, 4, 1, 1, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [120]:
# Stack the equal-length (padded) id lists into NumPy arrays
data_x = np.asarray(data_x)
data_y = np.asarray(data_y)

In [121]:

from os import makedirs
from os.path import exists, join

path = 'data/'

# exist_ok=True makes directory creation idempotent and removes the
# check-then-create race of "if not exists(path): makedirs(path)".
makedirs(path, exist_ok=True)

print('Starting pickle to file...')
with open(join(path, 'data.pkl'), 'wb') as f:
    # The objects are written back-to-back into one stream, so a reader must
    # call pickle.load in exactly this order.
    for obj in (data_x, data_y, word2id, id2word, tag2id, id2tag):
        pickle.dump(obj, f)
print('Pickle finished')


Starting pickle to file...
Pickle finished


In [122]:
import argparse
import tensorflow as tf
import pickle
import math
import numpy as np
from sklearn.model_selection import train_test_split
from os.path import join

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from gensim.models import word2vec
from gensim.models import KeyedVectors

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.utils import to_categorical
from keras.models import load_model
from keras import backend as K#返回当前后端
from keras.models import Sequential,Model
from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.layers import Embedding,LSTM,Layer,initializers,regularizers,constraints,Input,Dropout,concatenate,BatchNormalization
from keras.layers import Dense,Bidirectional,Concatenate,Multiply,Maximum,Subtract,Lambda,dot,Flatten,Reshape,TimeDistributed



In [123]:
def load_data(path):
    """
    Read the six objects stored back-to-back in a pickle file.

    NOTE: pickle.load can execute arbitrary code; only use this on files
    produced by this notebook.

    :param path: path of the pickle file
    :return: tuple (data_x, data_y, word2id, id2word, tag2id, id2tag), in the
        order the objects were dumped
    """
    with open(path, 'rb') as stream:
        # Each pickle.load call consumes the next object from the stream
        return tuple(pickle.load(stream) for _ in range(6))


def get_data(data_x, data_y):
    """
    Split the encoded data into train, dev and test parts.

    The data is split 80/20 into train+dev and test, then the first part is
    split 80/20 again into train and dev. Both splits use random_state=40.

    :param data_x: encoded inputs, one row per sentence
    :param data_y: encoded tags, aligned with data_x
    :return: train_x, train_y, dev_x, dev_y, test_x, test_y
    """
    print('Data X Length', len(data_x), 'Data Y Length', len(data_y))
    for caption, sample in (('Data X Example', data_x[0]), ('Data Y Example', data_y[0])):
        print(caption, sample)

    split_options = {'test_size': 0.2, 'random_state': 40}
    # First carve out the test set, then split the remainder into train and dev
    train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, **split_options)
    train_x, dev_x, train_y, dev_y = train_test_split(train_x, train_y, **split_options)

    report = (
        ('Train X Shape', train_x, 'Train Y Shape', train_y),
        ('Dev X Shape', dev_x, 'Dev Y Shape', dev_y),
        ('Test X Shape', test_x, 'Test Y Shape', test_y),
    )
    for x_caption, part_x, y_caption, part_y in report:
        print(x_caption, part_x.shape, y_caption, part_y.shape)
    return train_x, train_y, dev_x, dev_y, test_x, test_y

In [124]:
def create_bi_LSTM_mdoel(embedding_matrix, embedding_size=100, max_sentence_length=32,
                         lstm_units=64, num_tags=5):
    """
    Build and compile a bidirectional-LSTM sequence tagger.

    Architecture: Embedding -> Bidirectional LSTM (forward and backward outputs
    summed) -> per-timestep softmax over the tag set.

    :param embedding_matrix: 2-D array (vocabulary size, embedding_size) used
        as the initial, trainable embedding weights
    :param embedding_size: width of each embedding vector; must equal the
        second dimension of embedding_matrix
    :param max_sentence_length: fixed number of timesteps per input
    :param lstm_units: number of units of each LSTM direction
    :param num_tags: number of output classes per timestep
    :return: compiled Keras model
    :raises ValueError: if embedding_matrix is not 2-D or its width differs
        from embedding_size
    """
    # Fail early with a clear message instead of a shape error deep inside Keras
    if np.ndim(embedding_matrix) != 2 or np.shape(embedding_matrix)[1] != embedding_size:
        raise ValueError(
            'embedding_matrix must have shape (vocab_size, %d), got %s'
            % (embedding_size, np.shape(embedding_matrix),)
        )

    # Model input: one row of max_sentence_length character ids
    input_layer = Input(shape=(max_sentence_length,), dtype='int32')

    # NOTE(review): id 0 (padding) is not masked, so padded positions take part
    # in the loss and in the reported accuracy.
    embedding_layer = Embedding(
        input_dim=len(embedding_matrix),
        output_dim=embedding_size,
        weights=[embedding_matrix],
        trainable=True,
        input_length=max_sentence_length
    )(input_layer)

    blstm = Bidirectional(LSTM(lstm_units, return_sequences=True), merge_mode='sum')(embedding_layer)
    output = TimeDistributed(Dense(num_tags, activation='softmax'))(blstm)
    # Keras 2 keyword names; the Keras 1 spellings input=/output= only work
    # through a deprecation shim that emits a warning.
    model = Model(inputs=input_layer, outputs=output, name="bi_lstm_token")

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])
    model.summary()
    return model

In [125]:
# Configuration parameters
# max_length: timesteps per sentence; the preprocessing cell pads/truncates to the same value (32)
# RANOD_SEED (sic): seed for NumPy's global random generator only
# nepoch: with a single epoch the EarlyStopping(patience=4) callback in train() cannot trigger
max_length = 32
RANOD_SEED = 42
np.random.seed(RANOD_SEED)
nepoch = 1
batch_size = 1024
embedding_size = 100

# File the best model (by validation accuracy) is written to during training
model_checkpoint_path = 'save_model.h5' 

In [126]:

def train():
    """
    Train the Bi-LSTM tagger on the pickled dataset.

    Loads data/data.pkl, splits it into train/dev/test, one-hot encodes the
    tag ids, builds the model with randomly initialised embeddings and fits it.
    The best model according to validation accuracy is saved to
    ``model_checkpoint_path``. The test split is held out and not used here.

    :return: None
    """
    # Load data
    data_x, data_y, word2id, id2word, tag2id, id2tag = load_data("data/data.pkl")
    # Split data
    train_x, train_y, dev_x, dev_y, test_x, test_y = get_data(data_x, data_y)

    # Must match the number of output classes of the model (its default is 5)
    num_tags = 5

    def one_hot(tag_ids):
        # One vectorised call instead of one call per sentence. The reshape
        # restores (samples, timesteps, classes) in case to_categorical
        # returns a flattened 2-D result.
        return to_categorical(tag_ids, num_tags).reshape((-1, max_length, num_tags))

    train_y = one_hot(train_y)
    dev_y = one_hot(dev_y)

    # Size the embedding from the vocabulary that was just loaded, not from a
    # global left behind by the preprocessing cells; +1 accounts for id 0.
    embedding_matrix = np.random.randn(len(word2id) + 1, embedding_size)

    model = create_bi_LSTM_mdoel(
        embedding_matrix,
        embedding_size=embedding_size,
        max_sentence_length=max_length,
    )

    # NOTE(review): newer Keras releases report this metric as 'val_accuracy';
    # confirm the name against the installed version.
    monitored_metric = 'val_acc'
    callbacks = [
        EarlyStopping(
            monitor=monitored_metric,  # quantity to watch: 'acc', 'val_acc', 'loss' or 'val_loss'
            min_delta=0.005,           # smallest change that counts as an improvement
            patience=4,                # stop after this many epochs without improvement
            verbose=1,                 # log when training is stopped
            mode='max'                 # 'max': stop when the monitored value no longer rises
        ),
        ModelCheckpoint(
            model_checkpoint_path,
            monitor=monitored_metric,
            save_best_only=True,
            save_weights_only=False,
            verbose=1,
            mode="max"
        ),
    ]

    model.fit(
        x=train_x,
        y=train_y,
        validation_data=(dev_x, dev_y),
        batch_size=batch_size,
        epochs=nepoch,
        verbose=1,
        callbacks=callbacks,
    )

In [127]:
# Run training; ModelCheckpoint inside train() writes the best model to model_checkpoint_path
train()

Data X Length 321533 Data Y Length 321533
Data X Example [  8  43 320  88  36 198   7   2  41 163 124 245   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
Data Y Example [2 4 1 1 2 4 1 1 1 2 3 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Train X Shape (205780, 32) Train Y Shape (205780, 32)
Dev X Shape (51446, 32) Dev Y Shape (51446, 32)
Test X Shape (64307, 32) Test Y Shape (64307, 32)


  app.launch_new_instance()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 32)                0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 32, 100)           515900    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 32, 64)            84480     
_________________________________________________________________
time_distributed_7 (TimeDist (None, 32, 5)             325       
Total params: 600,705
Trainable params: 600,705
Non-trainable params: 0
_________________________________________________________________
Train on 205780 samples, validate on 51446 samples
Epoch 1/1

Epoch 00001: val_acc improved from -inf to 0.90119, saving model to save_model.h5


In [128]:
def cut(text):
    """
    Predict a segmentation tag for every character of ``text``.

    Rebuilds the network, loads the weights saved in ``model_checkpoint_path``,
    encodes ``text`` with ``x_transform`` and takes the most probable tag at
    each position. Only the positions that hold real characters are decoded;
    characters beyond the model's fixed input length are ignored.

    Prints the raw probabilities and the predicted tags.

    :param text: string to tag
    :return: list of predicted tags, one per character that fits the input length
    """
    # Placeholder weights with the right shape; load_weights overwrites them.
    # The vocabulary size comes from word2id, the same table x_transform uses.
    embedding_matrix = np.zeros((len(word2id) + 1, embedding_size))
    model = create_bi_LSTM_mdoel(embedding_matrix)
    model.load_weights(model_checkpoint_path)

    chars = list(text)
    data_x = np.array([x_transform(chars)])
    y_test_p = model.predict(data_x, verbose=0)
    print(y_test_p)

    tags_set = ['x', 's', 'b', 'm', 'e']
    # Positions past the text are padding, so their predictions are not decoded
    n_chars = min(len(chars), data_x.shape[1])
    # argmax returns the first maximum, like the strict ">" scan it replaces
    best_ids = np.argmax(y_test_p[0][:n_chars], axis=-1)
    last_result = [tags_set[tag_id] for tag_id in best_ids]
    print(last_result)
    return last_result
    

In [129]:
# Tag a short sample sentence with the trained model
cut("中国人民万岁")

  app.launch_new_instance()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 32)                0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 32, 100)           515900    
_________________________________________________________________
bidirectional_8 (Bidirection (None, 32, 64)            84480     
_________________________________________________________________
time_distributed_8 (TimeDist (None, 32, 5)             325       
Total params: 600,705
Trainable params: 600,705
Non-trainable params: 0
_________________________________________________________________
[[[2.40861485e-03 1.55661225e-01 6.66465342e-01 1.31809250e-01
   4.36555780e-02]
  [1.38541916e-03 2.71124132e-02 2.06591696e-01 2.97193259e-01
   4.67717260e-01]
  [1.49074697e-03 9.14590061e-02 3.95683050e-01 4.31627184e-01
   7.97400028e-02]
  [3.06123588e-03 1.32