In [1]:
import gensim
import os
from gensim.models.word2vec import Word2Vec, PathLineSentences
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

class OneHot(object):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.__onehot_encodeder = OneHotEncoder()

    def encode(self, target_list):
        integer_encoded = self.label_encoder.fit_transform(np.array(target_list))
        integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
        self.__onehot_encodeder = self.__onehot_encodeder.fit_transform(integer_encoded)
        return self.__onehot_encodeder.toarray()

    def encode_label(self, target_list):
        integer_encoded = self.label_encoder.fit_transform(np.array(target_list))
        return integer_encoded

    def decode(self, encoder_list):
        return self.label_encoder.inverse_transform(np.argmax(np.array(encoder_list), axis=1))


def read_file_to_corpus(folder):
    corpus = []
    for filename in os.listdir(folder):
        with open(os.path.join(folder, filename), encoding="utf-8") as f:
            for line in f:
                corpus.append(line.split())
    return corpus



def get_vec_model(model_path):
    vec_model = gensim.models.Word2Vec.load(model_path)
    return vec_model


def get_train_list(source_folder, target_folder):
    source_string = []
    target_string = []
    for filename in os.listdir(source_folder):
        target_file_name = "targetH_" + "_".join(filename.split("_")[1:])
        if os.path.exists(os.path.join(target_folder, target_file_name)):
            with open(os.path.join(source_folder, filename), encoding="utf-8") as source:
                with open(os.path.join(target_folder, target_file_name), encoding="utf-8") as target:
                    for source_line in source:
                        for target_line in target:
                            if len(source_line.split()) == len(target_line.split()):
                                source_string.append(source_line.split())
                                target_string.append(target_line.split())
    return source_string, target_string

In [2]:
def get_train_feature(source_string, vec_model, max_sequence=1000):
    index2word_set = set(vec_model.wv.index2word)
    row_vector_list = []
    for source_line in source_string:
        i = 0
        row_vector = []
        for source_word in source_line:
            if i < max_sequence:
                if source_word in index2word_set:
                    row_vector= np.append(row_vector, vec_model[source_word])
                else:
                    row_vector = np.append(row_vector, np.zeros(vec_model.trainables.layer1_size, dtype='float32'))
            i += 1
        if len(source_line) < max_sequence:
            row_vector = np.append(row_vector,
                                   np.zeros((vec_model.trainables.layer1_size * (max_sequence - len(source_line)),),
                                            dtype='float32'))
        row_vector_list.append(row_vector)
    return np.matrix(row_vector_list)


In [3]:
def get_target_label(target_string,max_sequence=1000):
    onehot_model = OneHot()
    for i in range(0, len(target_string)):
        if len(target_string[i]) < max_sequence:
            target_string[i] = target_string[i].extend(["O"]*(max_sequence - len(target_string[i])))
            if target_string[i] is None:
                target_string[i] = ["O"]*max_sequence
        else:
            if target_string[i] is None:
                target_string[i] = ["O"]*max_sequence
            else:
                target_string[i] = target_string[i][0:max_sequence]
    num_rows = len(target_string)
    flat_list = [item for sublist in target_string for item in sublist]
    print(flat_list)
    target_vector = onehot_model.encode_label(flat_list)
    target_vector = target_vector.reshape(-1, max_sequence)
    return target_vector, onehot_model


In [4]:
import gensim
import os
from gensim.models.word2vec import Word2Vec, PathLineSentences
import numpy as np
def get_train_list(source_folder, target_folder):
    source_string = []
    target_string = []
    for filename in os.listdir(source_folder):
        target_file_name = "targetH_" + "_".join(filename.split("_")[1:])
        if os.path.exists(os.path.join(target_folder, target_file_name)):
            with open(os.path.join(source_folder, filename), 'r', encoding="utf-8") as source:
                with open(os.path.join(target_folder, target_file_name), 'r', encoding="utf-8") as target:
                    for source_line, target_line in zip(source.readlines(), target.readlines()):
                        s_line = source_line.split()
                        t_line = target_line.split()
                        if len(s_line) == len(t_line):
                            source_string.append(s_line)
                            target_string.append(t_line)
    print('源数据读取完毕，共' + str(len(source_string)) + '行')
    return source_string, target_string


In [5]:
def get_vec_from_corpus(corpus, size=128, min_count=2, save_path=os.path.join("./data/ner_word2vec_model")):
    vec_model = Word2Vec(sentences=corpus, size=size, min_count=min_count)
    vec_model.save(save_path)
    return vec_model


In [6]:
def lstm_crf(X):
    embedding_size = 128
    unit_num = 128
    dropout_rate = None
    output_size = 3
    batch_size = 1
    seq_length = 10
    lr = 0.001

    cell_forward = tf.nn.rnn_cell.BasicLSTMCell(unit_num)
    cell_backward = tf.nn.rnn_cell.BasicLSTMCell(unit_num)
    input_bi_lstm = tf.reshape(X, [batch_size, seq_length, embedding_size])
    bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(cell_forward,
                                    cell_backward, input_bi_lstm, dtype=tf.float32)

    bi_output = tf.concat(bi_outputs, axis=2)

    W = tf.get_variable("projection_w", [2 * unit_num, output_size])
    b = tf.get_variable("projection_b", [output_size])
    x_reshape = tf.reshape(bi_output, [-1, 2 * unit_num])
    projection = tf.matmul(x_reshape, W) + b
    outputs = tf.reshape(projection, [batch_size, seq_length, output_size])
    return outputs


In [7]:
def predict(predcit_feature, model_path):
    embedding_size = 128
    unit_num = 128
    dropout_rate = None
    output_size = 3
    batch_size = 1
    seq_length = 10
    lr = 0.001
    
    X = tf.placeholder(tf.float32, shape=[batch_size, seq_length * embedding_size])
    pred = lstm_crf(X)
    saver = tf.train.Saver(tf.global_variables())
    predict_label=[]
    with tf.Session() as sess:
        #参数恢复
        module_file = tf.train.latest_checkpoint(model_path)
        saver.restore(sess, module_file)
        for step in range(len(predcit_feature)):
            print(step)
            prob = sess.run(pred, feed_dict={X:predcit_feature[step]})
            predict=prob.reshape((-1)).reshape(-1,3)
            predict_label.append(predict)
    return predict_label


In [8]:
source_string, target_string = get_train_list("./data/source","./data/target")
vec_model = get_vec_from_corpus(source_string, min_count=1)
target_vector, onehot_model = get_target_label(target_string, max_sequence=10)
feature = get_train_feature(source_string, vec_model, max_sequence=10)

源数据读取完毕，共2行
['O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


  # Remove the CWD from sys.path while we load stuff.


In [9]:
len(feature), np.array(feature).shape

(2, (2, 1280))

In [10]:
tf.reset_default_graph()
predict_result = predict(feature, "./model/")

INFO:tensorflow:Restoring parameters from ./model/bilstm-400
0
1


In [11]:
len(predict_result), np.array(predict_result).shape

AttributeError: 'list' object has no attribute 'shape'

In [55]:
predict_result

[array([[  2.7013447,   8.644368 ,  -8.853404 ],
        [  6.632493 ,   1.5225788,  -6.3457103],
        [  1.134716 ,   3.4497828,  -6.0142875],
        [-12.048701 ,  12.990964 ,  -9.226457 ],
        [-16.25568  ,  15.564315 ,  -9.794048 ],
        [-17.570583 ,  16.039436 ,  -9.477436 ],
        [-18.120975 ,  16.190962 ,  -9.253028 ],
        [-18.397783 ,  16.258446 ,  -9.117073 ],
        [-18.551426 ,  16.286884 ,  -9.025663 ],
        [-18.628977 ,  16.285948 ,  -8.963935 ]], dtype=float32),
 array([[  2.7013447,   8.644368 ,  -8.853404 ],
        [  6.632493 ,   1.5225788,  -6.3457103],
        [  1.134716 ,   3.4497828,  -6.0142875],
        [-12.048701 ,  12.990964 ,  -9.226457 ],
        [-16.25568  ,  15.564315 ,  -9.794048 ],
        [-17.570583 ,  16.039436 ,  -9.477436 ],
        [-18.120975 ,  16.190962 ,  -9.253028 ],
        [-18.397783 ,  16.258446 ,  -9.117073 ],
        [-18.551426 ,  16.286884 ,  -9.025663 ],
        [-18.628977 ,  16.285948 ,  -8.963935 ]], dt

In [56]:
for line in predict_result:
    print(line.shape)
    print(np.argmax(np.array(line), axis=1))
    predict_label = onehot_model.decode(line)
    print(predict_label)

(10, 3)
[1 0 1 1 1 1 1 1 1 1]
['O' 'B-ORG' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O']
(10, 3)
[1 0 1 1 1 1 1 1 1 1]
['O' 'B-ORG' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O']
