## 数据分析工具包

In [1]:
import jieba
import thulac

import pickle
import json

import re

import numpy as np
import pandas as pd

from collections import defaultdict

from gensim.models import word2vec
from gensim.models import KeyedVectors

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.utils import to_categorical
from keras.models import load_model

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 训练测试数据的文件及路径

In [10]:
# train data file paths
train_data_path = 'data/atec_nlp_sim_train.csv'                 #训练数据
train_add_data_path = 'data/atec_nlp_sim_train_add.csv'         #添加训练数据
train_all_path = 'data/processed/train_all_data/train_all.csv'  #合并后的训练数据

#训练数据的分析的外部数据
stop_words_path = 'data/stop_words.txt'                      #停用词路径
tokenize_dict_path = 'data/dict_all.txt'                     #jieba分词新自定义字典
spelling_corrections_path = 'data/spelling_corrections.json' #纠错及部分同义词替换规则的文件
doubt_words_path = 'data/doubt_words.txt'                    #计算两个语句中的疑问词的相似度的疑问词相似的规则文件

#根目录
base_path = 'data/processed/'

#预处理之后的数据保存路径
processed_data_path = 'data/processed/train_all_data/train_all_processed.csv'    #预处理后的训练数据
char_texts_path = "data/processed/char_data/char_texts.pickle"                   #Word2Vec用的char字
char_index_path = "data/processed/char_data/char_index.pickle"                   #embedding_wv用的char字

# 词向量路径
train_all_wordvec_path = "data/processed/word2vec_bigram/train_all_data.bigram"           #全部数据训练的词向量
train_char_all_wordvec_path = "data/processed/word2vec_bigram/train_char_all_data.bigram" #全部数据训练的词向量
zhihu_wordvec_path = "sgns.zhihu.bigram"                                                  #知乎词向量


#嵌入层矩阵路径
embedding_matrix_path = "data/processed/embedding_matrix/embedding_matrix.pickle"            #word2vec的词向量矩阵
char_embedding_matrix_path = "data/processed/embedding_matrix/char_embedding_matrix.pickle"  #char的word2vec的字向量矩阵
s1_train_ids_pad_path = "data/processed/embedding_matrix/s1_train_ids_pad.pickle"            #s1的文章矩阵（多退少补）
s2_train_ids_pad_path = "data/processed/embedding_matrix/s2_train_ids_pad.pickle"            #s2的文章矩阵
y_train_path = "data/processed/embedding_matrix/y_train.pickle"                              #训练集的标签

#NLP特征存储路径
sentece_length_diff_feature_path = "data/processed/nlp_feature/sentece_length_diff_feature.pickle"
edit_distance_feature_path = "data/processed/nlp_feature/edit_distance_feature.pickle"
common_substring_feature_path = "data/processed/nlp_feature/common_subsequence_feature.pickle"
common_subsequence_feature_path = "data/processed/nlp_feature/common_substring_feature.pickle"
ngram_feature_path = "data/processed/nlp_feature/ngram_feature.pickle"
sentence_diff_same_feature_path = "data/processed/nlp_feature/sentence_diff_same_feature.pickle"
doubt_sim_feature_path = "data/processed/nlp_feature/doubt_sim_feature.pickle"
sentence_exist_topic_feature_path = "data/processed/nlp_feature/sentence_exist_topic_feature.pickle"
word_embedding_sim_feature_path = "data/processed/nlp_feature/word_embedding_sim_feature.pickle" 


## 读取合并训练数据

* 对训练集和测试集预处理（分词、去除停用词、修改错误拼写、替换脱敏*），提取训练集字典

In [3]:
train_data_df = pd.read_csv(train_data_path, sep='\t', header=None,names=["index", "s1", "s2", "label"])
train_add_data_df = pd.read_csv(train_add_data_path, sep='\t', header=None, names=["index", "s1", "s2", "label"])

In [4]:
train_data_df.head()

Unnamed: 0,index,s1,s2,label
0,1,﻿怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0
3,4,如何得知关闭借呗,想永久关闭借呗,0
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0


In [5]:
train_add_data_df.head()

Unnamed: 0,index,s1,s2,label
0,1,为何我无法申请开通花呗信用卡收款,支付宝开通信用卡花呗收款不符合条件怎么回事,1
1,2,花呗分期付款会影响使用吗,花呗分期有什么影响吗,0
2,3,为什么我花呗没有临时额度,花呗没有临时额度怎么可以负,0
3,4,能不能开花呗老兄,花呗逾期了还能开通,0
4,5,我的怎么开通花呗收钱,这个花呗是个什么啥？我没开通 我怎么有账单,0


In [6]:
print("train_data_df:",train_data_df.shape)
print("train_add_data_df:",train_add_data_df.shape)

frames = [train_data_df, train_add_data_df]
train_all = pd.concat(frames)
print(train_all.shape)

train_all.reset_index(drop=True, inplace=True)
train_all.head()

train_data_df: (39346, 4)
train_add_data_df: (63131, 4)
(102477, 4)


Unnamed: 0,index,s1,s2,label
0,1,﻿怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0
3,4,如何得知关闭借呗,想永久关闭借呗,0
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0


## 保存合并数据

In [7]:
train_all.to_csv(train_all_path,columns = ["index", "s1", "s2", "label"], index = False)

## 文本预处理

In [8]:
def load_stopwordslist(filepath):
    """
    加载停用词
    :param filepath:停用词文件路径
    :return:
    """
    with open(filepath,"r",encoding="utf-8") as file:
        stop_words = [line.strip() for line in file]
        return stop_words
    
def load_spelling_corrections(filepath):
    """
    加载拼写修改词
    :param filepath:替换词文件路径
    :return:
    """
    with open(filepath,"r",encoding="utf-8") as file:
        spelling_corrections = json.load(file)
        return spelling_corrections

def transform_other_word(str_text,reg_dict):
    """
    替换词
    :param str_text:待替换的句子
    :param reg_dict:替换词字典
    :return:
    """
    for token_str,replac_str in reg_dict.items():
        str_text = str_text.replace(token_str, replac_str)
    return str_text

def seg_sentence(sentence,stop_words):
    """
    对句子进行分词
    :param sentence:句子，停用词
    """
    sentence_seged = jieba.cut(sentence.strip())
    out_str = ""
    for word in sentence_seged:
        if word not in stop_words:#去除停用词
            if word != " ":
                out_str += word
                out_str += " "
    return out_str



## 用于训练Word2Vec的word预料
* 替换错别字
* 替换脱敏词
* 去除停用词
* jieba分词
* 生成字DataFrame

In [9]:
#################### 文本的清理工作 ####################
import copy
def preprocessing_word(data_df):
    """
    :param data_df:需要处理的数据集
    :param fname:
    :return:
    """
    data_processed = copy.deepcopy(data_df)

    # 加载停用词
    stopwords = load_stopwordslist(stop_words_path)
    
    # 加载拼写错误替换词
    spelling_corrections = load_spelling_corrections(spelling_corrections_path)

    re_object = re.compile(r'\*+') #去除句子中的脱敏数字***，替换成一
    
    for index, row in data_df.iterrows():
            
        # 分别遍历每行的两个句子，并进行分词处理
        for col_name in ["s1", "s2"]:
            # 替换掉脱敏的数字
            re_str = re_object.subn(u"十一",row[col_name])
            
            # 纠正一些词
            spell_corr_str = transform_other_word(re_str[0],spelling_corrections)
            
            # 分词
            seg_str = seg_sentence(spell_corr_str, stopwords)
            
            #分词之后的DataFrame
            data_processed.at[index, col_name] = seg_str   
    print(data_processed.head())
    return data_processed

## 用于训练Word2Vec的Char预料
* 替换错别字
* 替换脱敏词
* 生成字矩阵

In [10]:

def preprocessing_char(data_df):
    """
    :param data_df:需要处理的数据集
    :param fname:
    :return:
    """
    # 加载停用词
    stopwords = load_stopwordslist(stop_words_path)
    
    # 加载拼写错误替换词
    spelling_corrections = load_spelling_corrections(spelling_corrections_path)

    re_object = re.compile(r'\*+') #去除句子中的脱敏数字***，替换成一
    
    texts = []
    char_vocabs = {}
    char_index = 1
    
    for index, row in data_df.iterrows(): 
        # 分别遍历每行的两个句子，并进行分词处理
        for col_name in ["s1", "s2"]:
            # 替换掉脱敏的数字
            re_str = re_object.subn(u"十一",row[col_name])
            
            # 纠正一些词
            spell_corr_str = transform_other_word(re_str[0],spelling_corrections)
            spell_corr_str = list(spell_corr_str)
            
            for char in spell_corr_str:
                if char not in char_vocabs and char not in stopwords and not char.strip()==u"":
                    char_vocabs[char] = char_index
                    char_index = char_index + 1
            texts.extend(spell_corr_str)
    print(texts[0:20])               
    return texts,char_vocabs

In [11]:
#加载自定义新词
jieba.load_userdict(tokenize_dict_path)

#预处理后的训练数据
train_all_processed = preprocessing_word(train_all)

#char语料
texts_processed,char_index = preprocessing_char(train_all)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\admin\AppData\Local\Temp\jieba.cache
Loading model cost 0.797 seconds.
Prefix dict has been built succesfully.


   index                s1                                   s2  label
0      1  ﻿ 怎么 更换 花呗 手机号码   花呗 是 以前 手机号码 怎么 更换 成 现在 支付宝 号码 手机号       1
1      2     开不了 花呗 这样 完事                          真的 就是 花呗 付款       0
2      3    花呗 冻结 以后 能 开通                       条件 可以 开通 花呗 借款       0
3      4       如何 得知 关 借呗                              永久 关 借呗       0
4      5         花呗 扫码 付钱                        二维码 扫描 可以 用花呗       0
['\ufeff', '怎', '么', '更', '换', '花', '呗', '手', '机', '号', '码', '我', '的', '花', '呗', '是', '以', '前', '的', '手']


## 保存预处理数据

In [12]:
train_all_processed.to_csv(processed_data_path,columns = ["index", "s1", "s2", "label"], index = False)

with open(char_texts_path, 'wb') as file:
    pickle.dump(texts_processed, file)
    
with open(char_index_path, 'wb') as file:
    pickle.dump(char_index, file)

## 加载并查看预处理数据

In [13]:
import pickle
import pandas as pd

#合并后的原始输出
train_all = pd.read_csv(train_all_path)

#合并后处理过的数据
train_all_processed = pd.read_csv(processed_data_path)

#char语料
with open(char_texts_path, 'rb') as file:
    texts_processed = pickle.load(file)
    
#char index（字-编号）
with open(char_index_path, 'rb') as file:
    char_index = pickle.load(file)

In [14]:
train_all.head()

Unnamed: 0,index,s1,s2,label
0,1,﻿怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0
3,4,如何得知关闭借呗,想永久关闭借呗,0
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0


In [15]:
train_all_processed.head()

Unnamed: 0,index,s1,s2,label
0,1,﻿ 怎么 更换 花呗 手机号码,花呗 是 以前 手机号码 怎么 更换 成 现在 支付宝 号码 手机号,1
1,2,开不了 花呗 这样 完事,真的 就是 花呗 付款,0
2,3,花呗 冻结 以后 能 开通,条件 可以 开通 花呗 借款,0
3,4,如何 得知 关 借呗,永久 关 借呗,0
4,5,花呗 扫码 付钱,二维码 扫描 可以 用花呗,0


## 训练Word2Vec词向量

In [16]:
from gensim.models import word2vec
from gensim.models import KeyedVectors

#处理后的语料训练词向量
def pre_train_w2v(train_all_processed,binary = False):

    texts = []
    texts_s1_train = [line.strip().split(" ") for line in train_all_processed['s1'].tolist()]
    texts_s2_train = [line.strip().split(" ") for line in train_all_processed['s2'].tolist()]
    
    texts.extend(texts_s1_train)
    texts.extend(texts_s2_train)
    print(texts[0:5])
    model = word2vec.Word2Vec(sentences=texts,size=300,window=2,min_count=3,workers=-1)
    #保存词向量
    model.wv.save_word2vec_format(train_all_wordvec_path,binary=binary,fvocab=None)


In [17]:
#字训练字向量
def pre_train_char_w2v(texts,binary = False):

    model = word2vec.Word2Vec(sentences=texts,size=300,window=3,min_count=3,workers=-1)
    #保存字向量
    model.wv.save_word2vec_format(fname=train_char_all_wordvec_path,binary=binary,fvocab=None)


In [18]:
pre_train_w2v(train_all_processed,binary = False)
pre_train_char_w2v(texts_processed,binary = False)

[['\ufeff', '怎么', '更换', '花呗', '手机号码'], ['开不了', '花呗', '这样', '完事'], ['花呗', '冻结', '以后', '能', '开通'], ['如何', '得知', '关', '借呗'], ['花呗', '扫码', '付钱']]


In [19]:
#训练样本总共有条数 102477*2
train_all_processed.shape

(102477, 4)

In [20]:
#训练样本总共2721763字
len(texts_processed)

2721763

## 生成嵌入矩阵

* word_embedding_matrix

In [21]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.utils import to_categorical

#param
embedding_size = 300
max_sentence_length = 20
max_word_length = 25
max_vovab_size = 100000

#################### 文本的Embeding工作 ####################
def process_save_embedding_wv(train_all_processed,type = 2):
    """
    :param type: 词向量的选择：1，知乎，2，训练集 3 知乎+训练集
    """
    w2v_path = zhihu_wordvec_path
    if type == 2:
        w2v_path = train_all_wordvec_path
        
    #文本处理    
    tokenizer = Tokenizer(
        num_words=max_vovab_size,
        split=' ',
        lower=False,
        char_level=False,
        filters=''
    )

    texts = []

    texts_s1_train = train_all_processed['s1'].tolist()
    texts_s2_train = train_all_processed['s2'].tolist()
    
    #保存标签
    y_train = train_all_processed["label"].tolist()
    with open(y_train_path, 'wb') as file:
        pickle.dump(y_train, file)
    

    texts.extend(texts_s1_train)
    texts.extend(texts_s2_train)

    # 生成各个词对应的index列表
    tokenizer.fit_on_texts(texts)
    # 将文章以index表示
    s1_train_ids = tokenizer.texts_to_sequences(texts_s1_train)
    s2_train_ids = tokenizer.texts_to_sequences(texts_s2_train)
    
    #将文章以矩阵的形式（长度多退少补）保存
    s1_train_ids_pad = sequence.pad_sequences(s1_train_ids,maxlen=max_sentence_length)
    s2_train_ids_pad = sequence.pad_sequences(s2_train_ids,maxlen=max_sentence_length)

    with open(s1_train_ids_pad_path, 'wb') as file:
        pickle.dump(s1_train_ids_pad, file)
    with open(s2_train_ids_pad_path, 'wb') as file:
        pickle.dump(s2_train_ids_pad, file)

    #词序列(word_index：key:词，value:索引（编号）)
    word_index_dict = tokenizer.word_index

    # 训练集的词汇表的词向量矩阵,行数为最大值+1,形式为：index->vec
    embedding_matrix = 1 * np.random.randn(len(word_index_dict) + 1, embedding_size)
    embedding_matrix[0] = np.random.randn(embedding_size)

    # 加载预训练的词向量w2v
    print ('load w2v_model...')
    w2v_model = KeyedVectors.load_word2vec_format(w2v_path, binary=False)
    print ('finish w2v_model...')

    if type == 3:
        w2v_path2 = train_all_wordvec_path
        w2v_model2 = KeyedVectors.load_word2vec_format(w2v_path2, binary=False)
    count = 0
    for word,index in word_index_dict.items():
        if word in w2v_model.vocab:
            embedding_matrix[index] = w2v_model.word_vec(word)
            count = count +1
        else:
            if type == 3:
                if word in w2v_model2.vocab:
                    embedding_matrix[index] = w2v_model2.word_vec(word)
                    count = count + 1
    
    #总共有n个词，在模型里有m个词
    print('total {}, word in model have {}'.format(len(word_index_dict),count))

    with open(embedding_matrix_path, 'wb') as file:
        pickle.dump(embedding_matrix, file)
    

In [22]:
process_save_embedding_wv(train_all_processed,type=2)

load w2v_model...
finish w2v_model...
total 13188, word in model have 5160


* 总共有13181个词，在word2vec模型中总共有5155个词。其他的词频低于min_count

* char_embedding_matrix

In [23]:
def process_save_char_embedding_wv(train_all,char_index):
    """
    :param type: 词向量-训练集
    """
    w2v_path = train_char_all_wordvec_path
    w2v_char_model = KeyedVectors.load_word2vec_format(w2v_path, binary=False)
    
    embedding_char_matrix = 1 * np.random.randn((len(char_index) + 1), embedding_size)
    embedding_char_matrix[0] = np.random.randn(embedding_size)
    
    count = 0
    for char,index in char_index.items():
        if char in w2v_char_model.vocab:
            embedding_char_matrix[index] = w2v_char_model.word_vec(char)
            count = count + 1
    #总共有n个字，在模型里有m个词(Word2Vec会把低频词过滤掉，默认是5个，可通过min_count设置)
    print('total {}, word in model have {}'.format(len(char_index),count))

    with open(char_embedding_matrix_path, 'wb') as file:
        pickle.dump(embedding_char_matrix, file)

In [24]:
process_save_char_embedding_wv(train_all,char_index)

total 2081, word in model have 1426


* 总共有2081个字，在word2vec模型中总共有1426个词。其他的字频低于min_count

## 深度学习模型

## 深度学习特征提取

* 创建孪生LSTM模型

In [46]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.utils import to_categorical
from keras.models import load_model

from keras.models import Sequential,Model
from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.layers import Embedding,LSTM,Layer,initializers,regularizers,constraints,Input,Dropout,concatenate,BatchNormalization,Dense,Bidirectional,Concatenate,Multiply,Maximum,Subtract,Lambda,dot,Flatten,Reshape

from keras import backend as K             #返回当前后端
from sklearn.model_selection import KFold

import numpy as np

import gc

* 模型辅助类

In [None]:

class ConsDist(Layer):
    """
    自定义定义曼哈顿距离计算层，继承Layer层，必须实现三个父类方法
    build,call,comput_output_shape
    """
    def __init__(self, **kwargs):
        self.res = None  # 表示相似度
        # self.match_vector = None
        super(ConsDist, self).__init__(**kwargs)

    def build(self, input_shape):
        """Creates the layer weights.
              # Arguments
                  input_shape: Keras tensor (future input to layer)
                      or list/tuple of Keras tensors to reference
                      for weight shape computations.
              """
        super(ConsDist, self).build(input_shape)

    def call(self, inputs, **kwargs):
        """This is where the layer's logic lives.
         # Arguments
             inputs: Input tensor, or list/tuple of input tensors.
             **kwargs: Additional keyword arguments.
         # Returns
             A tensor or list/tuple of tensors.
         """
        # 计算曼哈顿距离,因为输入计算曼哈顿距离的有两个Input层分别为inputs[0]和inputs[1]
        # lstm model
        self.res = K.sum(inputs[0] * inputs[1],axis=1,keepdims=True)/(K.sum(inputs[0]**2,axis=1,keepdims=True) * K.sum(inputs[1]**2,axis=1,keepdims=True))
        return self.res
    def compute_output_shape(self, input_shape):
        """Computes the output shape of the layer.
               Assumes that the layer will be built
               to match that input shape provided.
               # Arguments
                   input_shape: Shape tuple (tuple of integers)
                       or list of shape tuples (one per output tensor of the layer).
                       Shape tuples can include None for free dimensions,
                       instead of an integer.

               # Returns
                   An input shape tuple.
               """
        return K.int_shape(self.res)



In [17]:

def precision(y_true, y_pred):
    y_t = y_true
    y_p = y_pred

    true_positives = K.sum(K.round(K.clip(y_t * y_p, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_p, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    #y_t = K.cast(K.argmax(y_true,axis=1),dtype='float32')
    #y_p = K.cast(K.argmax(y_pred,axis=1),dtype='float32')
    y_t = y_true
    y_p = y_pred

    true_positives = K.sum(K.round(K.clip(y_t * y_p, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_t, 0, 1)))

    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def fbeta_score(y_t, y_p, beta=1):
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # If there are no true positives, fix the F score at 0 like sklearn.
    if K.sum(K.round(K.clip(y_t, 0, 1))) == 0:
        return 0
    p = precision(y_t, y_p)
    r = recall(y_t, y_p)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score

def contrastive_loss(y_true,y_pred):
    """
    定义孪生网络的代价函数，对比代价函数,每个样本的误差为L=(1 - y) * d + y * max((margin - d),0) 其中margin为相似度的阈值默认为1
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    :param y_true:1表示两个样本相似，0表示不匹配,y
    :param y_pred:表示相似度d，范围是(0,1)
    :return:
    """
    margin = 0.8
    return K.mean((1-y_true) * y_pred + y_true * K.maximum((margin - y_pred),0))


## 抽取Attention特征

In [None]:
class AttentionLayer1(Layer):
    def __init__(self, **kwargs):
        # self.res = None  # 表示相似度
        self.match_vector = None
        super(AttentionLayer1, self).__init__(**kwargs)

    def build(self, input_shape):
        """Creates the layer weights.
              # Arguments
                  input_shape: Keras tensor (future input to layer)
                      or list/tuple of Keras tensors to reference
                      for weight shape computations.
              """
        super(AttentionLayer1, self).build(input_shape)

    def call(self, inputs, **kwargs):
        """This is where the layer's logic lives.
         # Arguments
             inputs: Input tensor, or list/tuple of input tensors.
             **kwargs: Additional keyword arguments.
         # Returns
             A tensor or list/tuple of tensors.
         """
        encode_s1 = inputs[0]
        encode_s2 = inputs[1]
        sentence_differerce = encode_s1 - encode_s2
        sentece_product = encode_s1 * encode_s2
        self.match_vector = K.concatenate([encode_s1,sentence_differerce,sentece_product,encode_s2],1)
        #
        return self.match_vector

    def compute_output_shape(self, input_shape):
        """Computes the output shape of the layer.
               Assumes that the layer will be built
               to match that input shape provided.
               # Arguments
                   input_shape: Shape tuple (tuple of integers)
                       or list of shape tuples (one per output tensor of the layer).
                       Shape tuples can include None for free dimensions,
                       instead of an integer.

               # Returns
                   An input shape tuple.
               """
        return K.int_shape(self.match_vector)

In [11]:
def create_siamese_lstm_attention_model(embedding_matrix,model_param,embedding_size = 300,max_sentence_length = 20):

    # step 1 定义孪生网络的公共层
    X = Sequential()
    
    embedding_layer = Embedding(
        input_dim=len(embedding_matrix,),     #input_dim：词向量矩阵的维度
        output_dim=embedding_size,            #output_dim:词向量的长度
        weights=[embedding_matrix],           #weights：词向量矩阵
        trainable=True,                       #trainable：是否冻结嵌入层   
        input_length=max_sentence_length      #input_length：句子的最大长度
    )
    
    # 一般来说return_sequences为true时，需要使用attention
    lstm_layer = LSTM(
        units=model_param['lstm_units']  #定义LSTM的输出维度
        ,return_sequences=False
    )
    
    # attention_layer = AttentionLayer()
    X.add(embedding_layer)
    X.add(lstm_layer)
    # X.add(attention_layer)

    #share_model为孪生网络的共同拥有的层
    share_model = X

    # step 2 模型是多输入的结构，定义两个句子的输入
    left_input = Input(shape=(max_sentence_length,), dtype='int32')
    right_input = Input(shape=(max_sentence_length,), dtype='int32')

    # Step3定义两个输入合并后的模型层
    s1_net = share_model(left_input)
    s2_net = share_model(right_input)

    matching_layer = AttentionLayer1()([s1_net,s2_net])

    merge_model = Dense(model_param['num_dense'])(matching_layer)#num_dense：128
    merge_model = Dropout(model_param['desen_dropout_rate'])(merge_model)#desen_dropout_rate：0.75
    merge_model = BatchNormalization()(merge_model)

    # Step4 定义输出层
    output_layer = Dense(1,activation='sigmoid')(merge_model)

    model = Model(inputs=[left_input, right_input],outputs=[output_layer], name="simaese_lstm_attention")
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=["accuracy",fbeta_score,precision,recall])
    return model

In [12]:

def extract_feature_siamese_lstm_attention():
    
    #特征名
    feature_name = 'dl_siamese_lstm_attention'
    
    RANOD_SEED = 42
    np.random.seed(RANOD_SEED)
    nepoch = 40
    num_folds = 5
    batch_size = 512
    
    # 前期参数设置
    
    embedding_matrix_file_path = embedding_matrix_path

    # 加载Embeding矩阵
    with open(embedding_matrix_file_path, 'rb') as file:
        embedding_matrix = pickle.load(file)

    #加载输入数据
    with open(s1_train_ids_pad_path, 'rb') as file:
        X_train_s1 = pickle.load(file)
        
    with open(s2_train_ids_pad_path, 'rb') as file:
        X_train_s2 = pickle.load(file)

    #标签
    with open(y_train_path, 'rb') as file:
        y_train = pickle.load(file)
        
    #定义model param
    model_param = {
        'lstm_units':50,
        'lstm_dropout_rate':0.,
        'lstm_re_dropout_rate':0.,
        'desen_dropout_rate':0.75,
        'num_dense':128
    }
    
    model_checkpoint_path = base_path + 'fold-checkpoint-'+feature_name + '.h5'
    
    kfold = StratifiedKFold(
        n_splits=num_folds,
        shuffle=True,
        random_state=RANOD_SEED
    )
    
    # 存放最后预测结果
    y_train_oofp = np.zeros((len(y_train),1),dtype='float32')
    
    #将标签独热向量处理
    labels = to_categorical(y_train, 2)

    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_s1,y_train)):

        # 选出需要添加的样本
        train_true_mask = y_train[ix_train] == 1                    #选出训练集的正样本标签的索引
        X_train_true_s1 = X_train_s1[ix_train][train_true_mask]     #选出训练集的正样本s1训练样本
        X_train_true_s2 = X_train_s2[ix_train][train_true_mask]     #选出训练集的正样本s2训练样本
        y_train_true = y_train[ix_train][train_true_mask]           #选出训练集的正样本标签

        # 进行添加
        X_add_train_fold_s1 = np.vstack([X_train_s1[ix_train],X_train_true_s2])#上下合并
        X_add_train_fold_s2 = np.vstack([X_train_s2[ix_train],X_train_true_s1])#
        y_add_train_fold = np.concatenate([y_train[ix_train],y_train_true])

        val_true_mask = y_train[ix_val]==1
        X_val_true_s1 = X_train_s1[ix_val][val_true_mask]
        X_val_true_s2 = X_train_s2[ix_val][val_true_mask]
        y_val_true = y_train[ix_val][val_true_mask]

        # 进行添加
        X_add_val_fold_s1 = np.vstack([X_train_s1[ix_val], X_val_true_s2])
        X_add_val_fold_s2 = np.vstack([X_train_s2[ix_val], X_val_true_s1])
        y_add_val_fold = np.concatenate([y_train[ix_val], y_val_true])

        print ('start train fold {} of {} ......'.format((fold_num + 1), 5))
        
        # 创建模型
        model = create_siamese_lstm_attention_model(embedding_matrix, model_param)
        
        # 训练模型
        model_checkpoint_path = base_path + 'dl_siamese_lstm_attention_model{}.h5'.format(fold_num)
        
        model.fit(x=[X_add_train_fold_s1,X_add_train_fold_s2],y=y_add_train_fold,
                      validation_data=([X_add_val_fold_s1,X_add_val_fold_s2],y_add_val_fold),
                      batch_size=batch_size,
                      epochs=nepoch,
                      verbose=1,
                      class_weight={0: 1, 1: 2},
                      callbacks=[
                          EarlyStopping(
                              monitor='val_loss',
                              min_delta=0.005,
                              patience=5,
                              verbose=1,
                              mode='auto'
                          ),
                          ModelCheckpoint(
                              model_checkpoint_path,
                              monitor='val_loss',
                              save_best_only=True,
                              save_weights_only=False,
                              verbose=1
                          )]
                  )
        
        #加载最优的模型参数
        model.load_weights(model_checkpoint_path)
        y_train_oofp[ix_val] = predict(model,X_train_s1[ix_val],X_train_s2[ix_val])
        
        K.clear_session()
        gc.collect()

    model_path =  base_path+'dl_siamese_lstm_attention_model0.h5'
    model0 = load_model(model_path,custom_objects={'AttentionLayer1': AttentionLayer1, 'fbeta_score': fbeta_score, 
                                                   'precision': precision, 'recall': recall})

    y_test_oofp = model0.predict(X_test_s1, X_test_s2)

In [None]:
extract_feature_siamese_lstm_attention()

## 抽取曼哈顿距离特征

In [13]:
import pickle
import pandas as pd
from sklearn.model_selection import StratifiedKFold

from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.utils import to_categorical
from keras.models import load_model

from keras.models import Sequential,Model
from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.layers import Embedding,LSTM,Layer,initializers,regularizers,constraints,Input,Dropout,concatenate,BatchNormalization,Dense,Bidirectional,Concatenate,Multiply,Maximum,Subtract,Lambda,dot,Flatten,Reshape

from keras import backend as K             #返回当前后端
from sklearn.model_selection import KFold

import numpy as np

import gc
class ManDist(Layer):
    """
    自定义定义曼哈顿距离计算层，继承Layer层，必须实现三个父类方法
    build,call,comput_output_shape
    """
    def __init__(self, **kwargs):
        self.res = None  # 表示相似度
        super(ManDist, self).__init__(**kwargs)

    def build(self, input_shape):
        """Creates the layer weights.
              # Arguments
                  input_shape: Keras tensor (future input to layer)
                      or list/tuple of Keras tensors to reference
                      for weight shape computations.
        """
        super(ManDist, self).build(input_shape)

    def call(self, inputs, **kwargs):
        """This is where the layer's logic lives.
         # Arguments
             inputs: Input tensor, or list/tuple of input tensors.
             **kwargs: Additional keyword arguments.
         # Returns
             A tensor or list/tuple of tensors.
         """
        # 计算曼哈顿距离,因为输入计算曼哈顿距离的有两个Input层分别为inputs[0]和inputs[1]
        self.res  = K.exp(- K.sum(K.abs(inputs[0]-inputs[1]),axis = 1,keepdims = True))
        return self.res

    def compute_output_shape(self, input_shape):
        """Computes the output shape of the layer.
               Assumes that the layer will be built
               to match that input shape provided.
               # Arguments
                   input_shape: Shape tuple (tuple of integers)
                       or list of shape tuples (one per output tensor of the layer).
                       Shape tuples can include None for free dimensions,
                       instead of an integer.

               # Returns
                   An input shape tuple.
               """
        return K.int_shape(self.res)

In [14]:
def create_siamese_lstm_ManDistance_model(embedding_matrix,model_param,embedding_size = 300,max_sentence_length = 20):

    # 定义孪生网络的公共层
    X = Sequential()
    embedding_layer = Embedding(
        input_dim=len(embedding_matrix,),
        output_dim=embedding_size,
        weights=[embedding_matrix],
        trainable=True,
        input_length=max_sentence_length
    )

    lstm_layer = LSTM(
        units=model_param['lstm_units'],
        dropout=model_param['lstm_dropout_rate'],
        recurrent_dropout=model_param['lstm_re_dropout_rate']
        ,return_sequences=False
    )

    X.add(embedding_layer)
    X.add(lstm_layer)

    #share_model为孪生网络的共同拥有的层
    share_model = X

    # 模型是多输入的结构，定义两个句子的输入
    left_input = Input(shape=(max_sentence_length,), dtype='int32')
    right_input = Input(shape=(max_sentence_length,), dtype='int32')

    # 定义两个输入合并后的模型层
    s1_net = share_model(left_input)
    s2_net = share_model(right_input)

    # 定义输出层
    man_layer = ManDist()([s1_net,s2_net])
    
    out_put_layer = Dense(2, activation='softmax')(man_layer)
    # out_put_layer = Dense(1,activation='sigmoid')(man_layer)
    
    model = Model(inputs=[left_input, right_input],outputs=[out_put_layer], name="simaese_lstm_manDist")
    model.compile(loss= 'categorical_crossentropy',optimizer='adam',metrics=["accuracy",fbeta_score,precision,recall])
    model.summary()
    return model

In [18]:
#    feature_name = 'dl_siamese_lstm_manDist'
def extract_feature_siamese_lstm_manDist():
    # 前期参数设置
    embedding_matrix_file_path = embedding_matrix_path

    RANOD_SEED = 42
    np.random.seed(RANOD_SEED)
    nepoch = 30
    num_folds = 5
    batch_size = 512

    # 加载Embeding矩阵
    with open(embedding_matrix_file_path, 'rb') as file:
        embedding_matrix = pickle.load(file)
        
    #加载输入数据
    with open(s1_train_ids_pad_path, 'rb') as file:
        X_train_s1 = pickle.load(file)
        
    with open(s2_train_ids_pad_path, 'rb') as file:
        X_train_s2 = pickle.load(file)

    #标签
    with open(y_train_path, 'rb') as file:
        y_train = pickle.load(file)
    
    y_train = np.array(y_train)

    #定义model param
    model_param = {
        'lstm_units':50,
        'lstm_dropout_rate':0.,
        'lstm_re_dropout_rate':0.,
        'desen_dropout_rate':0.75,
        'num_dense':128
    }

    kfold = StratifiedKFold(
        n_splits=num_folds,
        shuffle=True,
        random_state=RANOD_SEED
    )
    # 存放最后预测结果
    y_train_oofp = np.zeros((len(y_train),2),dtype='float32')

    label = to_categorical(y_train, 2)
    
    
    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_s1,y_train)):

        #对于正例的语句对样本数量少的问题，通过将正例的样本语句对进行顺序调换，形成新的正例样本对。
        
        # 提取训练集中的正样本和标签
        train_true_mask = y_train[ix_train] == 1                    #选出训练集的正样本标签的索引
        X_train_true_s1 = X_train_s1[ix_train][train_true_mask]     #选出s1训练样本的 正样本 
        X_train_true_s2 = X_train_s2[ix_train][train_true_mask]     #选出s2训练样本的 正样本
        y_train_true = label[ix_train][train_true_mask]             #选出训练集的正样本标签
          
        # 将训练集 和 训练集的正样本（s1和s2调换位置）
        X_add_train_fold_s1 = np.vstack([X_train_s1[ix_train],X_train_true_s2])#合并训练集s1 和 训练集s2的正样本
        X_add_train_fold_s2 = np.vstack([X_train_s2[ix_train],X_train_true_s1])#合并训练集s2 和 训练集s1的正样本
        y_add_train_fold = np.concatenate([label[ix_train],y_train_true])      #合并训练集标签标签 和 训练集的正样本标签

        # 选出验证集中的正样本和标签
        val_true_mask = y_train[ix_val]==1                          #选出验证集的正样本标签的索引
        X_val_true_s1 = X_train_s1[ix_val][val_true_mask]           #选出s1验证样本的 正样本
        X_val_true_s2 = X_train_s2[ix_val][val_true_mask]           #选出s2验证样本的 正样本
        y_val_true = label[ix_val][val_true_mask]                   #选出训验证的正样本标签

        # 将验证集 和 验证集的正样本（s1和s2调换位置）
        X_add_val_fold_s1 = np.vstack([X_train_s1[ix_val], X_val_true_s2])#合并验证集s1 和 验证集s2的正样本
        X_add_val_fold_s2 = np.vstack([X_train_s2[ix_val], X_val_true_s1])#合并验证集s2 和 验证集s1的正样本
        y_add_val_fold = np.concatenate([label[ix_val], y_val_true])      #合并验证集标签标签 和 验证集的正样本标签

        #打印训练的是5折的第几折
        print ('start train fold {} of {} ......'.format((fold_num + 1), 5))
        
        # 创建模型
        model = create_siamese_lstm_ManDistance_model(embedding_matrix, model_param)
        
        # 训练模型
        model_checkpoint_path = base_path + 'dl_siamese_lstm_manDist_model{}.h5'.format(fold_num)
        
        
        model.fit(x=[X_add_train_fold_s1,X_add_train_fold_s2],y=y_add_train_fold,
                      validation_data=([X_add_val_fold_s1,X_add_val_fold_s2],y_add_val_fold),
                      batch_size=batch_size,
                      epochs=nepoch,
                      verbose=1,
                      class_weight={0: 1, 1: 2},
                      callbacks=[
                          EarlyStopping(
                              monitor='val_loss',  #监控的方式：’acc’,’val_acc’,’loss’,’val_loss’
                              min_delta=0.005,     #增大或者减小的阈值，只有只有大于这个部分才算作improvement
                              patience=5,          #连续n次没有提升
                              verbose=1,           #信息展示模式
                              mode='auto'          #‘auto’，‘min’，‘max’之一，在min模式下，如果检测值停止下降则中止训练。在max模式下，当检测值不再上升则停止训练。
                          ),
                          ModelCheckpoint(
                              model_checkpoint_path,
                              monitor='val_loss',
                              save_best_only=True,
                              save_weights_only=False,
                              verbose=1
                          )]
                  )
        
        model.load_weights(model_checkpoint_path)
        
        y_train_oofp[ix_val] = predict(model,X_train_s1[ix_val],X_train_s2[ix_val])
        K.clear_session()
        
        del X_add_train_fold_s1
        del X_add_train_fold_s2
        del X_add_val_fold_s1
        del X_add_val_fold_s2
        del y_add_train_fold
        del y_add_val_fold
        gc.collect()

    # save feature

    model_path = base_path + 'dl_siamese_lstm_manDist_model0.h5'
    
    model0 = load_model(model_path,custom_objects={'ManDist': ManDist, 'fbeta_score': fbeta_score, 
                                                   'precision': precision, 'recall': recall})
    
    y_test_oofp = predict(model0,X_test_s1,X_test_s2)


In [19]:
extract_feature_siamese_lstm_manDist()

start train fold 1 of 5 ......
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 50)           4024800     input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
man_dist_2 (ManDist)            (None, 1)            0           sequential_2[

NameError: name 'predict' is not defined

## 抽取dssm特征

In [None]:

class AttentionLayer(Layer):
    def __init__(self,step_dim,W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0

        super(AttentionLayer,self).__init__(**kwargs)#用于调用父类(超类)的一个方法。

    #设置self.supports_masking = True后需要复写该方法
    def compute_mask(self, inputs, mask=None):
        return None

    #参数设置，必须实现
    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    # input (None,sentence_length,embedding_size)
    def call(self, x, mask = None):
        # 计算输出
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        # print weigthted_input.shape
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim


    def get_config(self):
        config = {'step_dim': self.step_dim}
        base_config = super(AttentionLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
def create_siamese_lstm_dssm_mdoel(embedding_matrix,embedding_word_matrix,model_param,embedding_size = 300,max_sentence_length = 20,max_word_length=25):
    # 第一部分
    # step 1 定义复杂模型的输入
    num_conv2d_layers = 1
    filters_2d = [6, 12]
    kernel_size_2d = [[3, 3], [3, 3]]
    mpool_size_2d = [[2, 2], [2, 2]]
    left_input = Input(shape=(max_sentence_length,), dtype='int32')
    right_input = Input(shape=(max_sentence_length,), dtype='int32')

    # 定义需要使用的网络层
    embedding_layer1 = Embedding(
        input_dim=len(embedding_matrix, ),
        output_dim=embedding_size,
        weights=[embedding_matrix],
        trainable=True,
        input_length=max_sentence_length
    )
    att_layer1 = AttentionLayer(20)
    bi_lstm_layer =Bidirectional(LSTM(model_param['lstm_units']))
    lstm_layer1 = LSTM(model_param['lstm_units'],return_sequences=True)#return_sequences：返回全部time step 的 hidden state值
    lstm_layer2 = LSTM(model_param['lstm_units'])

    # 组合模型结构,两个输入添加Embeding层
    s1 = embedding_layer1(left_input)
    s2 = embedding_layer1(right_input)

    # 在Embeding层上添加双向LSTM层
    s1_bi = bi_lstm_layer(s1)
    s2_bi = bi_lstm_layer(s2)

    # 另在Embeding层上添加双层LSTM层
    s1_lstm_lstm = lstm_layer2(lstm_layer1(s1))
    s2_lstm_lstm = lstm_layer2(lstm_layer1(s2))

    s1_lstm = lstm_layer1(s1)
    s2_lstm = lstm_layer1(s2)

    cnn_input_layer = dot([s1_lstm,s2_lstm],axes=-1)
    cnn_input_layer_dot = Reshape((20,20,-1))(cnn_input_layer)
    layer_conv1 = Conv2D(filters=8,kernel_size=3,padding='same',activation='relu')(cnn_input_layer_dot)
    z = MaxPooling2D(pool_size=(2,2))(layer_conv1)

    for i in range(num_conv2d_layers):
        z = Conv2D(filters=filters_2d[i], kernel_size=kernel_size_2d[i], padding='same', activation='relu')(z)
        z = MaxPooling2D(pool_size=(mpool_size_2d[i][0], mpool_size_2d[i][1]))(z)

    pool1_flat = Flatten()(z)
    # # print pool1_flat
    pool1_flat_drop = Dropout(rate=0.1)(pool1_flat)
    ccn1 = Dense(32, activation='relu')(pool1_flat_drop)
    ccn2 = Dense(16, activation='relu')(ccn1)

    # 另在Embeding层上添加attention层
    s1_att = att_layer1(s1)
    s2_att = att_layer1(s2)

    # 组合在Embeding层上添加attention层和在Embeding层上添加双向LSTM层
    s1_last = Concatenate(axis=1)([s1_att,s1_bi])
    s2_last = Concatenate(axis=1)([s2_att,s2_bi])

    cos_layer = ConsDist()([s1_last,s2_last])
    man_layer = ManDist()([s1_last,s2_last])
    # 第二部分
    left_w_input = Input(shape=(max_word_length,), dtype='int32')
    right_w_input = Input(shape=(max_word_length,), dtype='int32')

    # 定义需要使用的网络层
    embedding_layer2 = Embedding(
        input_dim=len(embedding_word_matrix, ),
        output_dim=embedding_size,
        weights=[embedding_word_matrix],
        trainable=True,
        input_length=max_word_length
    )
    lstm_word_bi_layer = Bidirectional(LSTM(6))
    att_layer2 = AttentionLayer(25)

    s1_words = embedding_layer2(left_w_input)
    s2_words = embedding_layer2(right_w_input)

    s1_words_bi = lstm_word_bi_layer(s1_words)
    s2_words_bi = lstm_word_bi_layer(s2_words)

    s1_words_att = att_layer2(s1_words)
    s2_words_att = att_layer2(s2_words)

    s1_words_last = Concatenate(axis=1)([s1_words_att,s1_words_bi])
    s2_words_last = Concatenate(axis=1)([s2_words_att,s2_words_bi])
    cos_layer1 = ConsDist()([s1_words_last,s2_words_last])
    man_layer1 = ManDist()([s1_words_last,s2_words_last])


    # 第三部分，前两部分模型组合
    s1_s2_mul = Multiply()([s1_last,s2_last])
    s1_s2_sub = Lambda(lambda x: K.abs(x))(Subtract()([s1_last,s2_last]))
    s1_s2_maxium = Maximum()([Multiply()([s1_last,s1_last]),Multiply()([s2_last,s2_last])])
    s1_s2_sub1 = Lambda(lambda x: K.abs(x))(Subtract()([s1_lstm_lstm,s2_lstm_lstm]))


    s1_words_s2_words_mul = Multiply()([s1_words_last,s2_words_last])
    s1_words_s2_words_sub = Lambda(lambda x: K.abs(x))(Subtract()([s1_words_last,s2_words_last]))
    s1_words_s2_words_maxium = Maximum()([Multiply()([s1_words_last,s1_words_last]),Multiply()([s2_words_last,s2_words_last])])

    last_list_layer = Concatenate(axis=1)([s1_s2_mul,s1_s2_sub,s1_s2_sub1,s1_s2_maxium,s1_words_s2_words_mul,s1_words_s2_words_sub,s1_words_s2_words_maxium])
    last_list_layer = Dropout(0.05)(last_list_layer)
    # Dense 层
    dense_layer1 = Dense(32,activation='relu')(last_list_layer)
    dense_layer2 = Dense(48,activation='sigmoid')(last_list_layer)

    output_layer = Concatenate(axis=1)([dense_layer1,dense_layer2,cos_layer,man_layer,cos_layer1,man_layer1,ccn2])
    # Step4 定义输出层
    output_layer = Dense(1, activation='sigmoid')(output_layer)

    model = Model(inputs=[left_input,right_input,left_w_input,right_w_input],outputs=[output_layer], name="simaese_lstm_attention")
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=["accuracy", fbeta_score, precision, recall])
    return model

In [None]:
def extract_feature_siamese_lstm_dssm():
    # 前期参数设置
    embedding_matrix_file_path = 'train_all_w2v_embedding_matrix.pickle'
    embedding_char_matrix_file_path = 'train_all_char_embedding_matrix.pickle'
    feature_name = 'dl_siamese_lstm_dssm'
    RANOD_SEED = 42
    np.random.seed(RANOD_SEED)
    nepoch = 30
    num_folds = 5
    batch_size = 512

    # 加载Embeding矩阵
    embedding_matrix = project.load(project.aux_dir + embedding_matrix_file_path)
    char_embedding_matrix =  project.load(project.aux_dir + embedding_char_matrix_file_path)

    # 加载输入数据
    X_train_s1 = project.load(project.preprocessed_data_dir + 's1_train_ids_pad.pickle')
    X_train_s2 = project.load(project.preprocessed_data_dir + 's2_train_ids_pad.pickle')

    print X_train_s2.shape

    X_test_s1 = project.load(project.preprocessed_data_dir + 's1_test_ids_pad.pickle')
    X_test_s2 = project.load(project.preprocessed_data_dir + 's2_test_ids_pad.pickle')

    X_char_train_s1 = project.load(project.preprocessed_data_dir + 's1_train_char_ids_pad.pickle')
    X_char_train_s2 = project.load(project.preprocessed_data_dir + 's2_train_char_ids_pad.pickle')

    X_char_test_s1 = project.load(project.preprocessed_data_dir + 's1_test_char_ids_pad.pickle')
    X_char_test_s2 = project.load(project.preprocessed_data_dir + 's2_test_char_ids_pad.pickle')

    # y_0.6_train.pickle 存储的为list
    y_train = np.array(project.load(project.features_dir + 'y_0.6_train.pickle'))
    y_val = np.array(project.load(project.features_dir + 'y_0.4_test.pickle'))

    # train_y = to_categorical(y_train, 2)
    # val_y = to_categorical(y_val,2)
    # 定义model param
    model_param = {
        'lstm_units': 50,
        'lstm_dropout_rate': 0.,
        'lstm_re_dropout_rate': 0.,
        'desen_dropout_rate': 0.75,
        'num_dense': 128
    }
    kfold = StratifiedKFold(
        n_splits=num_folds,
        shuffle=True,
        random_state=RANOD_SEED
    )
    # 存放最后预测结果
    # y_train_oofp = np.zeros_like(y_train,dtype='float64')

    y_train_oofp = np.zeros((len(y_train), 1), dtype='float64')

    y_test_oofp = np.zeros((len(X_test_s1), 1), dtype='float64')

    for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_s1, y_train)):
        # 选出需要添加的样本
        train_true_mask = y_train[ix_train] == 1
        X_train_true_s1 = X_train_s1[ix_train][train_true_mask]
        X_train_true_s2 = X_train_s2[ix_train][train_true_mask]
        y_train_true = y_train[ix_train][train_true_mask]

        # 进行添加
        X_add_train_fold_s1 = np.vstack([X_train_s1[ix_train], X_train_true_s2])
        X_add_train_fold_s2 = np.vstack([X_train_s2[ix_train], X_train_true_s1])
        y_add_train_fold = np.concatenate([y_train[ix_train], y_train_true])



        X_train_true_s1_char = X_char_train_s1[ix_train][train_true_mask]
        X_train_true_s2_char = X_char_train_s2[ix_train][train_true_mask]

        # 进行添加
        X_add_train_fold_s1_char = np.vstack([X_char_train_s1[ix_train], X_train_true_s2_char])
        X_add_train_fold_s2_char = np.vstack([X_char_train_s2[ix_train], X_train_true_s1_char])

        #   验证部分
        val_true_mask = y_train[ix_val] == 1
        X_val_true_s1 = X_train_s1[ix_val][val_true_mask]
        X_val_true_s2 = X_train_s2[ix_val][val_true_mask]
        y_val_true = y_train[ix_val][val_true_mask]

        # 进行添加
        X_add_val_fold_s1 = np.vstack([X_train_s1[ix_val], X_val_true_s2])
        X_add_val_fold_s2 = np.vstack([X_train_s2[ix_val], X_val_true_s1])
        y_add_val_fold = np.concatenate([y_train[ix_val], y_val_true])

        X_val_true_s1_char = X_char_train_s1[ix_val][val_true_mask]
        X_val_true_s2_char = X_char_train_s2[ix_val][val_true_mask]

        X_add_val_fold_s1_char = np.vstack([X_char_train_s1[ix_val], X_val_true_s2_char])
        X_add_val_fold_s2_char = np.vstack([X_char_train_s2[ix_val], X_val_true_s1_char])

        print 'start train fold {} of {} ......'.format((fold_num + 1), 5)
        # 创建模型
        model = create_siamese_lstm_dssm_mdoel(embedding_matrix,char_embedding_matrix, model_param)
        # 训练模型
        model_checkpoint_path = project.trained_model_dir + 'dl_siamese_lstm_dssm_model{}.h5'.format(fold_num)
        model.fit(x=[X_add_train_fold_s1, X_add_train_fold_s2,X_add_train_fold_s1_char,X_add_train_fold_s2_char], y=y_add_train_fold,
                  validation_data=([X_add_val_fold_s1, X_add_val_fold_s2,X_add_val_fold_s1_char,X_add_val_fold_s2_char], y_add_val_fold),
                  batch_size=batch_size,
                  epochs=nepoch,
                  class_weight={0:1,1:2},
                  verbose=1,
                  callbacks=[
                      EarlyStopping(
                          monitor='val_loss',
                          min_delta=0.001,
                          patience=3,
                          verbose=1,
                          mode='auto'
                      ),
                      ModelCheckpoint(
                          model_checkpoint_path,
                          monitor='val_loss',
                          save_best_only=True,
                          save_weights_only=False,
                          verbose=1
                      )]
                  )
        model.load_weights(model_checkpoint_path)
        y_train_oofp[ix_val] = predict1(model, X_train_s1[ix_val], X_train_s2[ix_val],X_char_train_s1[ix_val],X_char_train_s2[ix_val])
        K.clear_session()
        del X_add_train_fold_s1
        del X_add_train_fold_s2
        del X_add_val_fold_s1
        del X_add_val_fold_s2
        del y_add_train_fold
        del y_add_val_fold
        gc.collect()

    model_path = project.trained_model_dir + 'dl_siamese_lstm_dssm_model0.h5'
    model0 = load_model(model_path,
                        custom_objects={'AttentionLayer': AttentionLayer,'ManDist': ManDist,'ConsDist':ConsDist, 'fbeta_score': fbeta_score,
                                        'precision': precision,
                                        'recall': recall})
    y_test_oofp = predict1(model0, X_test_s1, X_test_s2,X_char_test_s1,X_char_test_s2)
    col_names = ['{}_{}'.format(feature_name, index) for index in range(1)]
    after_extract_feature_save_data(y_train_oofp, y_test_oofp, col_names, feature_name)

# NLP特征提取

* 抽取两个句子长度之差(归一化)

In [25]:

def extract_sentece_length_diff(train_all):
    """
    长度差特征
    """ 
    feature_train = np.zeros((train_all.shape[0],1),dtype='float32')

    # 计算两个句子的长度差
    def get_length_diff(s1, s2):
        return 1 - (abs(len(s1) - len(s2)) / float(max(len(s1), len(s2))))

    for index,row in train_all.iterrows():
        s1 = row['s1'].strip().split(' ')
        s2 = row['s2'].strip().split(' ')
        diff = get_length_diff(s1,s2)
        feature_train[index] = round(diff,5)

    return feature_train

In [26]:
sentece_length_diff_feature = extract_sentece_length_diff(train_all_processed)
with open(sentece_length_diff_feature_path, 'wb') as file:
    pickle.dump(sentece_length_diff_feature, file)

* 抽取两个句子编辑距离(归一化)

In [27]:
def extract_edit_distance(train_all):
    """
    编辑距离特征
    """ 
    feature_train = np.zeros((train_all.shape[0], 1), dtype='float32')

    # 计算编辑距离
    def get_edit_distance(rawq1, rawq2):
        #构建DP矩阵
        m, n = len(rawq1) + 1, len(rawq2) + 1
        matrix = [[0] * n for i in range(m)]
        matrix[0][0] = 0
        for i in range(1, m):
            matrix[i][0] = matrix[i - 1][0] + 1
        for j in range(1, n):
            matrix[0][j] = matrix[0][j - 1] + 1
        cost = 0
        for i in range(1, m):
            for j in range(1, n):
                if rawq1[i - 1] == rawq2[j - 1]:
                    cost = 0
                else:
                    cost = 1
                matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + cost)
        return 1 - (matrix[m - 1][n - 1] / float(max(len(rawq1), len(rawq2))))

    for index,row in train_all.iterrows():
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        edit_distance = get_edit_distance(s1,s2)
        feature_train[index] = round(edit_distance,5)
        
    return feature_train

In [28]:
edit_distance_feature = extract_edit_distance(train_all_processed)
with open(edit_distance_feature_path, 'wb') as file:
    pickle.dump(edit_distance_feature, file)

* 抽取公共子串特征

In [2]:
def extract_longest_common_substring(train_all):
    """
    公共子串特征
    """ 
    feature_train = np.zeros((train_all.shape[0], 1), dtype='float32')
    
    # 计算最长公共子串
    def get_common_substring_len(rawq1, rawq2):
        #构建DP矩阵
        m, n = len(rawq1) + 1, len(rawq2) + 1
        matrix = [[0] * n for i in range(m)]
        longest_num = 0
        for i in range(1, m):
            for j in range(1, n):
                if rawq1[i - 1] == rawq2[j - 1]:
                    matrix[i][j] = matrix[i-1][j-1] + 1
                    if matrix[i][j] > longest_num:
                        longest_num = matrix[i][j]
                    else:
                        matrix[i][j] = 0
        return longest_num / float(min(len(rawq1), len(rawq2)))
    for index,row in train_all.iterrows():
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        common_substring_len = get_common_substring_len(s1,s2)
        feature_train[index] = round(common_substring_len,5)
        
    return feature_train

In [None]:
common_substring_feature = extract_longest_common_substring(train_all_processed)
with open(common_substring_feature_path, 'wb') as file:
    pickle.dump(common_substring_feature, file)

* 抽取公共子序列特征

In [1]:
def extract_longest_common_subsequence(train_all):
    """
    公共子序列特征
    """ 
    feature_train = np.zeros((train_all.shape[0], 1), dtype='float32')
    
    # 计算最长公共子序列
    def get_common_subsequence_len(rawq1, rawq2):
        #构建DP矩阵
        m, n = len(rawq1) + 1, len(rawq2) + 1
        matrix = [[0] * n for i in range(m)]
        for i in range(1, m):
            for j in range(1, n):
                if rawq1[i - 1] == rawq2[j - 1]:
                    matrix[i][j] = matrix[i-1][j-1] + 1
                else:
                    matrix[i][j] = max(matrix[i-1][j],matrix[i][j-1])
        return matrix[m-1][n-1] / float(min(len(rawq1), len(rawq2)))
    for index,row in train_all.iterrows():
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        common_subsequence_len = get_common_subsequence_len(s1,s2)
        feature_train[index] = round(common_subsequence_len,5)
        
    return feature_train

In [None]:
common_subsequence_feature = extract_longest_common_subsequence(train_all_processed)
with open(common_subsequence_feature_path, 'wb') as file:
    pickle.dump(common_subsequence_feature, file)

* 抽取n-gram特征，计算两个句子n-gram下的差异

In [29]:
def extract_ngram(train_all,max_ngram = 3):
    '''
    提取ngram特征
    '''
    feature_train = np.zeros((train_all.shape[0], max_ngram), dtype='float32')

    # 提取n-gram词汇
    def get_ngram(rawq, ngram_value):
        result = []
        for i in range(len(rawq)):
            if i + ngram_value < len(rawq) + 1:
                result.append(rawq[i:i + ngram_value])
        return result

    #提取两个句子词的差异（归一化）
    def get_ngram_sim(q1_ngram, q2_ngram):
        q1_dict = {}
        q2_dict = {}
        
        #统计q1_ngram中个词汇的个数
        for token in q1_ngram:
            if token not in q1_dict:
                q1_dict[token] = 1
            else:
                q1_dict[token] = q1_dict[token] + 1
        #q1_ngram总词汇数
        q1_count = np.sum([value for key, value in q1_dict.items()])

        #统计q2_ngram中个词汇的个数
        for token in q2_ngram:
            if token not in q2_dict:
                q2_dict[token] = 1
            else:
                q2_dict[token] = q2_dict[token] + 1
        #q2_ngram总词汇数
        q2_count = np.sum([value for key, value in q2_dict.items()])

        # ngram1有但是ngram2没有
        q1_count_only = np.sum([value for key, value in q1_dict.items() if key not in q2_dict])
        # ngram2有但是ngram1没有
        q2_count_only = np.sum([value for key, value in q2_dict.items() if key not in q1_dict])
        # ngram1和ngram2都有的话，计算value的差值
        q1_q2_count = np.sum([abs(value - q2_dict[key]) for key, value in q1_dict.items() if key in q2_dict])
        # ngram1和ngram2的总值
        all_count = q1_count + q2_count

        return (1 - float(q1_count_only + q2_count_only + q1_q2_count) / (float(all_count) + 0.00000001))

    for ngram_value in range(max_ngram):
        for index, row in train_all.iterrows():
            s1 = row['s1'].strip()
            s2 = row['s2'].strip()
            ngram1 = get_ngram(s1, ngram_value + 1)
            ngram2 = get_ngram(s2, ngram_value + 1)
            ngram_sim = get_ngram_sim(ngram1, ngram2)
            feature_train[index,ngram_value] = round(ngram_sim,5)

    return feature_train

In [30]:
ngram_feature = extract_ngram(train_all_processed)
with open(ngram_feature_path, 'wb') as file:
    pickle.dump(ngram_feature, file)

* 抽取两个句子的 相同字的长度/较长句子长度、相同字的长度/较短句子长度、相同字的长度/两句子平均长度、句子1中独有字的长度/句子1长度、句子2中独有字的长度/句子2长度、两个句子的杰卡德距离

In [31]:
def extract_sentence_diff_same(train_all):
    '''
    两个句子的相同和不同的词特征
    '''
    col_num = 6
    feature_train = np.zeros((train_all.shape[0],col_num),dtype='float64')

    #统计两个句子的相同和不同
    def get_word_diff(q1, q2):
        set1 = set(q1.split(" "))
        set2 = set(q2.split(" "))
        
        #两个句子相同词的长度
        same_word_len = len(set1 & set2)
        
        #仅句子1中有的词汇个数
        unique_word1_len = len(set1 - set2)
        
        #仅句子2中有的词汇个数
        unique_word2_len = len(set2 - set1)
        
        #句子1中词汇个数
        word1_len = len(set1)
        
        #句子2中词汇个数
        word2_len = len(set2)
        
        #两句子的平均长度
        avg_len = (word1_len + word2_len) / 2.0
        
        #两个句子中较长的长度
        max_len = max(word1_len, word2_len)
        
        #两个句子中较短的长度
        min_len = min(word1_len, word2_len)
        
        #两个句子的杰卡德距离
        jaccard_sim = same_word_len / float(len(set1 | set2))

        return same_word_len / float(max_len), same_word_len / float(min_len), same_word_len / float(avg_len), \
               unique_word1_len / float(word1_len), unique_word2_len /float(word2_len), jaccard_sim

    for index,row in train_all.iterrows():
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        features = tuple()
        features = get_word_diff(s1,s2)
        for col_index,feature in enumerate(features):
            feature_train[index,col_index] = round(feature,5)

    return feature_train

In [32]:
sentence_diff_same_feature = extract_sentence_diff_same(train_all_processed)
with open(sentence_diff_same_feature_path, 'wb') as file:
    pickle.dump(sentence_diff_same_feature, file)

* 抽取疑问词相同的比例

In [33]:
def extract_doubt_sim(train_all):
    '''
    抽取疑问词相同的比例
    '''
    feature_train = np.zeros((train_all.shape[0], 1), dtype='float32')
    
    with open(doubt_words_path,"r",encoding="utf-8") as file:
        doubt_words = [line.strip() for line in file]
        
    # 获取疑问词相同的比例
    def get_doubt_sim(q1, q2, doubt_words):
        q1_doubt_words = set(q1.split(" ")) & set(doubt_words)
        q2_doubt_words = set(q2.split(" ")) & set(doubt_words)
        return len(q1_doubt_words & q2_doubt_words) / float(len(q1_doubt_words | q2_doubt_words) + 1)

    for index,row in train_all.iterrows():
        # 因为doubt_words词表加载出来的是Unicode，所以需要将s1,s2解码成Unicode
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        doubt_sim = get_doubt_sim(s1,s2,doubt_words)
        feature_train[index] = round(doubt_sim,5)

    return feature_train


In [34]:
doubt_sim_feature = extract_doubt_sim(train_all_processed)
with open(doubt_sim_feature_path, 'wb') as file:
    pickle.dump(doubt_sim_feature, file)

* 抽取两个句子中是否同时存在蚂蚁花呗或者蚂蚁借呗的特征,同时包含花呗为1，同时包含借呗为1，否则为0

In [35]:
def extract_sentence_exist_topic(train_all):
    """
    抽取两个句子中是否同时存在蚂蚁花呗或者蚂蚁借呗的特征,同时包含花呗为1，同时包含借呗为1，否则为0
    """
    with open(doubt_words_path,"r",encoding="utf-8") as file:
        doubt_words = [line.strip() for line in file]
        
    feature_train = np.zeros((train_all.shape[0], 2), dtype='float32')

    def get_exist_same_topic(rawq1,rawq2):
        hua_flag = 0.
        jie_flag = 0.
        if '花呗' in rawq1 and '花呗' in rawq2:
            hua_flag = 1.

        if '借呗' in rawq1 and '借呗' in rawq2:
            jie_flag = 1.

        return hua_flag,jie_flag

    for index,row in train_all.iterrows():
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        hua_flag, jie_flag = get_exist_same_topic(s1,s2)
        feature_train[index,0] = hua_flag
        feature_train[index,1] = jie_flag

    return feature_train


In [36]:
sentence_exist_topic_feature = extract_sentence_exist_topic(train_all_processed)
with open(sentence_exist_topic_feature_path, 'wb') as file:
    pickle.dump(sentence_exist_topic_feature, file)

* 提取句子的词向量组合的相似度

In [38]:
def extract_word_embedding_sim(train_all,w2v_model_path = train_all_wordvec_path):
    '''
    提取句子的词向量组合的相似度
    w2v_model_path为词向量文件
    :return:
    '''
    #定义提取特征的空间
    feature_train = np.zeros((train_all.shape[0], 1), dtype='float32')

    train_all_w2v_model = KeyedVectors.load_word2vec_format(w2v_model_path, binary=False)

    # 得到句子的词向量组合（tfidf）
    def get_sen_vec(q, train_all_w2v_model, tfidf_dict, tfidf_flag=True):
        sen_vec = 0
        for word in q.split(' '):
            if word in train_all_w2v_model.vocab:
                word_vec = train_all_w2v_model.word_vec(word)
                word_tfidf = tfidf_dict.get(word, None)

                if tfidf_flag == True:
                    #tfidf有效，词向量*tfidf权重=句子向量
                    sen_vec += word_vec * word_tfidf
                else:
                    #句子向量
                    sen_vec += word_vec
        sen_vec = sen_vec / np.sqrt(np.sum(np.power(sen_vec, 2)) + 0.000001)
        return sen_vec

    def get_sentece_embedding_sim(q1, q2, train_all_w2v_model, tfidf_dict, tfidf_flag=True):
        # 得到两个问句的词向量组合
        q1_sec = get_sen_vec(q1, train_all_w2v_model, tfidf_dict, tfidf_flag)
        q2_sec = get_sen_vec(q2, train_all_w2v_model, tfidf_dict, tfidf_flag)

        # 曼哈顿距离
        # manhattan_distance = np.sum(np.abs(np.subtract(q1_sec, q2_sec)))

        # 欧式距离
        # enclidean_distance = np.sqrt(np.sum(np.power((q1_sec - q2_sec),2)))

        # 余弦相似度
        molecular = np.sum(np.multiply(q1_sec, q2_sec))
        denominator = np.sqrt(np.sum(np.power(q1_sec, 2))) * np.sqrt(np.sum(np.power(q2_sec, 2)))
        cos_sim = molecular / (denominator + 0.000001)

        # 闵可夫斯基距离
        # minkowski_distance = np.power(np.sum(np.power(np.abs(np.subtract(q1_sec, q2_sec)), 3)), 0.333333)

        # return manhattan_distance, enclidean_distance, cos_sim, minkowski_distance
        return cos_sim

    for index,row in train_all.iterrows():
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        sentece_embedding_sim = get_sentece_embedding_sim(s1,s2,train_all_w2v_model,{},False)
        feature_train[index] = round(sentece_embedding_sim,5)
    
    return feature_train

In [39]:
word_embedding_sim_feature = extract_word_embedding_sim(train_all_processed)
with open(word_embedding_sim_feature_path, 'wb') as file:
    pickle.dump(word_embedding_sim_feature, file)

## 提取全部特征

In [40]:
#两个句子长度之差
with open(sentece_length_diff_feature_path, 'rb') as file:
    sentece_length_diff_feature = pickle.load(file)

#两个句子编辑距离
with open(edit_distance_feature_path, 'rb') as file:
    edit_distance_feature = pickle.load(file)

#两个句子公共字符串长度
with open(common_substring_feature_path, 'rb') as file:
    common_substring_feature = pickle.load(file)    
    
#两个句子n-gram下的差异
with open(ngram_feature_path, 'rb') as file:
    ngram_feature = pickle.load(file)
    
#抽取两个句子的相同字的长度/较长句子长度、相同字的长度/较短句子长度、相同字的长度/两句子平均长度
#句子1中独有字的长度/句子1长度、句子2中独有字的长度/句子2长度、两个句子的杰卡德距离
with open(sentence_diff_same_feature_path, 'rb') as file:
    sentence_diff_same_feature = pickle.load(file)

#疑问词相同的比例
with open(doubt_sim_feature_path, 'rb') as file:
    doubt_sim_feature = pickle.load(file)

#两个句子中是否同时存在蚂蚁花呗或者蚂蚁借呗的特征,同时包含花呗为1，同时包含借呗为1，否则为0
with open(sentence_exist_topic_feature_path, 'rb') as file:
    sentence_exist_topic_feature = pickle.load(file)
    
#提取句子的词向量组合的相似度
with open(word_embedding_sim_feature_path, 'rb') as file:
    word_embedding_sim_feature = pickle.load(file)

## 特征组合

In [63]:
#合并特征
X = np.concatenate([sentece_length_diff_feature,
                            edit_distance_feature,
                            ngram_feature,
                            sentence_diff_same_feature,
                            doubt_sim_feature,
                            sentence_exist_topic_feature,
                            word_embedding_sim_feature],
                            axis = 1)
#标签
with open(y_train_path, 'rb') as file:
    y = pickle.load(file)

In [64]:
X.shape

(102477, 15)

In [73]:
y = np.array(y)
np.array(y).shape

(102477,)

In [74]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)


## Stacking模型

In [75]:
#################### Stacking 模型的融合 ####################
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold as KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

class StackingBaseClassifier(object):

    def train(self, x_train, y_train, x_val=None, y_val=None):
        pass

    def predict(self, model, x_test):
        pass

    def get_model_out(self, x_train, y_train, x_test, n_fold=5):
        n_train = x_train.shape[0]
        n_test = x_test.shape[0]

        train_oofp = np.zeros((n_train,))  # 存储每个fold的预测结果
        test_oofp = np.zeros((n_test, n_fold))  # 存储对测试集预测结果

        kfold = KFold(n_splits=n_fold, random_state=44, shuffle=True)

        for index, (ix_train, ix_val) in enumerate(kfold.split(x_train,y_train)):
            print ('{} fold of {} start train and predict...'.format(index, n_fold))
            X_fold_train = x_train[ix_train]
            y_fold_train = y_train[ix_train]

            X_fold_val = x_train[ix_val]
            y_fold_val = y_train[ix_val]

            model = self.train(X_fold_train, y_fold_train, X_fold_val, y_fold_val)

            #以4折做为训练数据训练的模型，预测剩下1折的验证数据，生成第一层的训练的输出数据
            train_oofp[ix_val] = self.predict(model, X_fold_val)
            
            #以4折生成的模型预测测试数据，生成第一层的测试集的输出数据
            test_oofp[:, index] = self.predict(model, x_test)
            
        #第一层的测试集输出数据
        test_oofp_mean = np.mean(test_oofp, axis=1)
        return train_oofp, test_oofp_mean

class GussianNBClassifier(StackingBaseClassifier):
    def __init__(self):
        # 参数设置
        pass

    def train(self, x_train, y_train, x_val, y_val):
        print ('use GaussianNB train model...')
        gnb = GaussianNB()
        gnb.fit(x_train, y_train)
        return gnb

    def predict(self, model, x_test):
        print ('use GaussianNB model test... ')
        return model.predict(x_test)

class RFClassifer(StackingBaseClassifier):
    def train(self, x_train, y_train, x_val, y_val):
        print ('use RandomForest train model...')
        clf = RandomForestClassifier(n_estimators=25,
                                     max_depth=4,
                                     class_weight={
                                         0: 1,
                                         1: 4
                                     }
                                     )
        clf.fit(x_train, y_train)
        return clf

    def predict(self, model, x_test):
        print ('use RandomForest test...')
        return model.predict(x_test)

class LogisicClassifier(StackingBaseClassifier):
    def train(self, x_train, y_train, x_val=None, y_val=None):
        print ('use LogisticRegression train model...')
        lr = LogisticRegression(class_weight={0: 1, 1: 4})
        lr.fit(x_train, y_train)
        return lr
    def predict(self, model, x_test):
        print ('use LogisticRegression test...')
        return model.predict(x_test)

class DecisionClassifier(StackingBaseClassifier):
    def train(self, x_train, y_train, x_val=None, y_val=None):
        print ('use DecisionClassifier train model...')
        dt = DecisionTreeClassifier(class_weight={0: 1, 1: 4},max_depth=5)
        dt.fit(x_train, y_train)
        return dt
    def predict(self, model, x_test):
        print ('use DecisionClassifier test...')
        return model.predict(x_test)


In [76]:
#get_model_out:获取训练集的第一层输出，测试集的第一层输出

gnb_cls = GussianNBClassifier()
gnb_oop_train,gnb_oofp_val = gnb_cls.get_model_out(X_train,y_train,X_test)

rf_cls = RFClassifer()
rf_oop_train, rf_oofp_val = rf_cls.get_model_out(X_train, y_train, X_test)

lg_cls = LogisicClassifier()
lg_oop_train, lg_oofp_val = lg_cls.get_model_out(X_train, y_train, X_test)

dt_cls = DecisionClassifier()
dt_oop_train, dt_oofp_val = dt_cls.get_model_out(X_train, y_train, X_test)

# 构造输入
input_train = [gnb_oop_train,rf_oop_train,lg_oop_train,dt_oop_train]

input_test = [gnb_oofp_val,rf_oofp_val,lg_oofp_val,dt_oofp_val]

stacked_train = np.concatenate([data.reshape(-1,1) for data in input_train],axis=1)

stacked_test = np.concatenate([data.reshape(-1,1) for data in input_test],axis=1)


# stacking 第二层模型训练

second_model = DecisionTreeClassifier(max_depth=3,class_weight={0: 1, 1: 4})
second_model.fit(stacked_train,y_train)

y_test_p = second_model.predict(stacked_test)

for index,pre in enumerate(y_test_p):
    if pre >=0.5:
        y_test_p[index] = 1
    else:
        y_test_p[index] = 0

print (accuracy_score(y_test,y_test_p))
print (f1_score(y_test,y_test_p))

0 fold of 5 start train and predict...
use GaussianNB train model...
use GaussianNB model test... 
use GaussianNB model test... 
1 fold of 5 start train and predict...
use GaussianNB train model...
use GaussianNB model test... 
use GaussianNB model test... 
2 fold of 5 start train and predict...
use GaussianNB train model...
use GaussianNB model test... 
use GaussianNB model test... 
3 fold of 5 start train and predict...
use GaussianNB train model...
use GaussianNB model test... 
use GaussianNB model test... 
4 fold of 5 start train and predict...
use GaussianNB train model...
use GaussianNB model test... 
use GaussianNB model test... 
0 fold of 5 start train and predict...
use RandomForest train model...
use RandomForest test...
use RandomForest test...
1 fold of 5 start train and predict...
use RandomForest train model...
use RandomForest test...
use RandomForest test...
2 fold of 5 start train and predict...
use RandomForest train model...
use RandomForest test...
use RandomForest 