In [1]:
import numpy as np
import pandas as pd
import pickle
import jieba
import json
import re

In [2]:
train_data_path = 'data/atec_nlp_sim_train.csv'  # 训练数据
train_add_data_path = 'data/atec_nlp_sim_train_add.csv'  # 添加训练数据
stop_words_path = 'data/stop_words.txt'  # 停用词路径
tokenize_dict_path = 'data/dict_all.txt'  # jieba分词新自定义字典
spelling_corrections_path = 'data/spelling_corrections.json'

In [3]:
train_data_df = pd.read_csv(train_data_path, sep='\t', header=None,names=["index", "s1", "s2", "label"])
train_add_data_df = pd.read_csv(train_add_data_path, sep='\t', header=None, names=["index", "s1", "s2", "label"])
train_all = pd.concat([train_data_df, train_add_data_df])

In [4]:
train_all = train_all.sample(frac=1).reset_index(drop=True)  

In [5]:
train_all.head()

Unnamed: 0,index,s1,s2,label
0,1,﻿怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0
3,4,如何得知关闭借呗,想永久关闭借呗,0
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0


### 分词及处理

In [6]:
jieba.load_userdict(tokenize_dict_path)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zxq\AppData\Local\Temp\jieba.cache
Loading model cost 0.851 seconds.
Prefix dict has been built succesfully.


In [7]:
# 停用词表
stopwords = [line.strip() for line in open(stop_words_path, 'r', encoding='utf-8').readlines()]

In [8]:
# 拼错词替换表
with open(spelling_corrections_path,"r",encoding="utf-8") as file:
    spelling_corrections = json.load(file)

In [9]:
def transform_other_word(str_text,reg_dict):
    """
    替换词
    :param str_text:待替换的句子
    :param reg_dict:替换词字典
    :return:
    """
    for token_str,replac_str in reg_dict.items():
        str_text = str_text.replace(token_str, replac_str)
    return str_text

In [14]:
def seg_sentence(sentence, stop_words):
    """
    对句子进行分词
    :param sentence:句子，停用词
    """
    sentence_seged = jieba.cut(sentence.strip())
    word_list = [i for i in sentence_seged if i not in stop_words and i != ' ']
    return " ".join(word_list)

In [15]:
def preprocessing_word(s1_train, s2_train, stopwords, spelling_corrections):

    # 去除句子中的脱敏数字***，替换成一
    re_object = re.compile(r'\*+')

    s1_all = []
    s2_all = []
    all = []

    for s1_,s2_ in zip(s1_train, s2_train):
        s1 = re_object.sub(u"十一", s1_)
        s2 = re_object.sub(u"十一", s2_)
        spell_corr_s1 = transform_other_word(s1, spelling_corrections)
        spell_corr_s2 = transform_other_word(s2, spelling_corrections)

        # 分词
        seg_s1 = seg_sentence(spell_corr_s1, stopwords)
        seg_s2 = seg_sentence(spell_corr_s2, stopwords)

        all.extend(seg_s1)
        all.extend(seg_s2)
        s1_all.append(seg_s1)
        s2_all.append(seg_s2)
    source_list = []
    # source_list = list(set(all))
    source_list.append('<UNK>')
    source_list.append('<PAD>')
    source_list.extend(list(set(all)))
    word2id = {}
    id2word = {}
    for index, char in enumerate(source_list):
        word2id[char] = index
        id2word[index] = char

    return s1_all, s2_all, word2id, id2word

In [16]:
s1_train = train_all["s1"].tolist()
s2_train = train_all["s2"].tolist()
y_train = train_all["label"].tolist()

In [17]:
s1_word_all, s2_word_all, word2id, id2word = preprocessing_word(s1_train, s2_train, stopwords, spelling_corrections)

In [18]:
def make_word2id(data, word2id):
    data2id = []
    for word_list in data:
        id_list = [word2id.get(i) if word2id.get(i) is not None else word2id.get('<PAD>') for i in word_list]
        data2id.append(id_list)
    return data2id

In [19]:
def all_data_set(s1_all, s2_all, word2id, y_train, max_l=15):
    pad = word2id['<PAD>']
    all_data = []
    s1_data_id = make_word2id(s1_all, word2id)
    s2_data_id = make_word2id(s2_all, word2id)
    s1_all_new = []
    s2_all_new = []
    y = []
    for i in range(len(s1_data_id)):
        if len(s1_data_id[i]) > max_l:
            s1_set = s1_data_id[i][:max_l]
        else:
            s1_set = np.concatenate((s1_data_id[i], np.tile(pad, max_l - len(s1_data_id[i]))), axis=0)
        if len(s2_data_id[i]) > max_l:
            s2_set = s2_data_id[i][:max_l]
        else:
            s2_set = np.concatenate((s2_data_id[i], np.tile(pad, max_l - len(s2_data_id[i]))), axis=0)
        y_set = [1,0] if y_train[i] == 0 else [0,1]
        s1_all_new.append(s1_set)
        s2_all_new.append(s2_set)
        y.append(y_set)
    return s1_all_new, s2_all_new, y

In [20]:
s1_word_id_all, s2_word_id_all, y_set = all_data_set(s1_word_all, s2_word_all, word2id, y_train, max_l=15)

In [21]:
train_all["s1"] = s1_word_all

In [22]:
train_all["s2"] = s2_word_all

In [23]:
train_all["s1_word_id"] = s1_word_id_all

In [24]:
train_all["s2_word_id"] = s2_word_id_all

In [25]:
train_all["y_set"] = y_set

In [27]:
train_all.head()

Unnamed: 0,index,s1,s2,label,s1_word_id,s2_word_id,y_set
0,1,﻿ 怎么 更换 花呗 手机号码,花呗 是 以前 手机号码 怎么 更换 成 现在 支付宝 号码 手机号,1,"[2106, 1421, 2057, 511, 1421, 1521, 908, 1421,...","[245, 563, 1421, 1280, 1421, 849, 1929, 1421, ...","[0, 1]"
1,2,开不了 花呗 这样 完事,真的 就是 花呗 付款,0,"[771, 1642, 1732, 1421, 245, 563, 1421, 42, 12...","[613, 1407, 1421, 760, 1280, 1421, 245, 563, 1...","[1, 0]"
2,3,花呗 冻结 以后 能 开通,条件 可以 开通 花呗 借款,0,"[245, 563, 1421, 883, 724, 1421, 849, 1436, 14...","[335, 312, 1421, 2005, 849, 1421, 771, 181, 14...","[1, 0]"
3,4,如何 得知 关 借呗,永久 关 借呗,0,"[939, 517, 1421, 158, 463, 1421, 922, 1421, 20...","[1868, 385, 1421, 922, 1421, 2088, 563, 1, 1, ...","[1, 0]"
4,5,花呗 扫码 付钱,二维码 扫描 可以 用花呗,0,"[245, 563, 1421, 938, 753, 1421, 50, 1542, 1, ...","[861, 615, 753, 1421, 938, 186, 1421, 2005, 84...","[1, 0]"


In [38]:
# 词向量路径
train_all_wordvec_path = "data/train_all_data.bigram"           #全部数据训练的词向量
train_char_all_wordvec_path = "data/train_char_all_data.bigram" #全部数据训练的词向量
zhihu_wordvec_path = "data/sgns.zhihu.bigram"                   #知乎词向量
doubt_words_path = 'data/doubt_words.txt'     

In [30]:
#字符特征提取

#抽取两个句子长度之差(归一化)
def extract_sentece_length_diff(train_all):
    """
    长度差特征
    """
    feature_train = np.zeros((train_all.shape[0],1),dtype='float32')

    # 计算两个句子的长度差
    def get_length_diff(s1, s2):
        return 1 - (abs(len(s1) - len(s2)) / float(max(len(s1), len(s2))))

    for index,row in train_all.iterrows():
        s1 = row['s1'].strip().split(' ')
        s2 = row['s2'].strip().split(' ')
        diff = get_length_diff(s1,s2)
        feature_train[index] = round(diff,5)

    return feature_train

#抽取两个句子编辑距离(归一化)
def extract_edit_distance(train_all):
    """
    编辑距离特征
    """
    feature_train = np.zeros((train_all.shape[0], 1), dtype='float32')

    # 计算编辑距离
    def get_edit_distance(rawq1, rawq2):
        #构建DP矩阵
        m, n = len(rawq1) + 1, len(rawq2) + 1
        matrix = [[0] * n for i in range(m)]
        matrix[0][0] = 0
        for i in range(1, m):
            matrix[i][0] = matrix[i - 1][0] + 1
        for j in range(1, n):
            matrix[0][j] = matrix[0][j - 1] + 1
        cost = 0
        for i in range(1, m):
            for j in range(1, n):
                if rawq1[i - 1] == rawq2[j - 1]:
                    cost = 0
                else:
                    cost = 1
                matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + cost)
        return 1 - (matrix[m - 1][n - 1] / float(max(len(rawq1), len(rawq2))))

    for index,row in train_all.iterrows():
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        edit_distance = get_edit_distance(s1,s2)
        feature_train[index] = round(edit_distance,5)

    return feature_train

#抽取公共子串特征
def extract_longest_common_substring(train_all):
    """
    公共子串特征
    """
    feature_train = np.zeros((train_all.shape[0], 1), dtype='float32')

    # 计算最长公共子串
    def get_common_substring_len(rawq1, rawq2):
        #构建DP矩阵
        m, n = len(rawq1) + 1, len(rawq2) + 1
        matrix = [[0] * n for i in range(m)]
        longest_num = 0
        for i in range(1, m):
            for j in range(1, n):
                if rawq1[i - 1] == rawq2[j - 1]:
                    matrix[i][j] = matrix[i-1][j-1] + 1
                    if matrix[i][j] > longest_num:
                        longest_num = matrix[i][j]
                    else:
                        matrix[i][j] = 0
        return longest_num / float(min(len(rawq1), len(rawq2)))
    for index,row in train_all.iterrows():
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        common_substring_len = get_common_substring_len(s1,s2)
        feature_train[index] = round(common_substring_len,5)

    return feature_train

#抽取公共子序列特征
def extract_longest_common_subsequence(train_all):
    """
    公共子序列特征
    """
    feature_train = np.zeros((train_all.shape[0], 1), dtype='float32')

    # 计算最长公共子序列
    def get_common_subsequence_len(rawq1, rawq2):
        #构建DP矩阵
        m, n = len(rawq1) + 1, len(rawq2) + 1
        matrix = [[0] * n for i in range(m)]
        for i in range(1, m):
            for j in range(1, n):
                if rawq1[i - 1] == rawq2[j - 1]:
                    matrix[i][j] = matrix[i-1][j-1] + 1
                else:
                    matrix[i][j] = max(matrix[i-1][j],matrix[i][j-1])
        return matrix[m-1][n-1] / float(min(len(rawq1), len(rawq2)))
    for index,row in train_all.iterrows():
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        common_subsequence_len = get_common_subsequence_len(s1,s2)
        feature_train[index] = round(common_subsequence_len,5)

    return feature_train


#抽取n-gram特征，计算两个句子n-gram下的差异
def extract_ngram(train_all,max_ngram = 3):
    '''
    提取ngram特征
    '''
    feature_train = np.zeros((train_all.shape[0], max_ngram), dtype='float32')

    # 提取n-gram词汇
    def get_ngram(rawq, ngram_value):
        result = []
        for i in range(len(rawq)):
            if i + ngram_value < len(rawq) + 1:
                result.append(rawq[i:i + ngram_value])
        return result

    #提取两个句子词的差异（归一化）
    def get_ngram_sim(q1_ngram, q2_ngram):
        q1_dict = {}
        q2_dict = {}

        #统计q1_ngram中个词汇的个数
        for token in q1_ngram:
            if token not in q1_dict:
                q1_dict[token] = 1
            else:
                q1_dict[token] = q1_dict[token] + 1
        #q1_ngram总词汇数
        q1_count = np.sum([value for key, value in q1_dict.items()])

        #统计q2_ngram中个词汇的个数
        for token in q2_ngram:
            if token not in q2_dict:
                q2_dict[token] = 1
            else:
                q2_dict[token] = q2_dict[token] + 1
        #q2_ngram总词汇数
        q2_count = np.sum([value for key, value in q2_dict.items()])

        # ngram1有但是ngram2没有
        q1_count_only = np.sum([value for key, value in q1_dict.items() if key not in q2_dict])
        # ngram2有但是ngram1没有
        q2_count_only = np.sum([value for key, value in q2_dict.items() if key not in q1_dict])
        # ngram1和ngram2都有的话，计算value的差值
        q1_q2_count = np.sum([abs(value - q2_dict[key]) for key, value in q1_dict.items() if key in q2_dict])
        # ngram1和ngram2的总值
        all_count = q1_count + q2_count

        return (1 - float(q1_count_only + q2_count_only + q1_q2_count) / (float(all_count) + 0.00000001))

    for ngram_value in range(max_ngram):
        for index, row in train_all.iterrows():
            s1 = row['s1'].strip()
            s2 = row['s2'].strip()
            ngram1 = get_ngram(s1, ngram_value + 1)
            ngram2 = get_ngram(s2, ngram_value + 1)
            ngram_sim = get_ngram_sim(ngram1, ngram2)
            feature_train[index,ngram_value] = round(ngram_sim,5)

    return feature_train


#抽取两个句子的 相同字的长度/较长句子长度、相同字的长度/较短句子长度、相同字的长度/两句子平均长度、句子1中独有字的长度/句子1长度、句子2中独有字的长度/句子2长度、两个句子的杰卡德距离
def extract_sentence_diff_same(train_all):
    '''
    两个句子的相同和不同的词特征
    '''
    col_num = 6
    feature_train = np.zeros((train_all.shape[0],col_num),dtype='float64')

    #统计两个句子的相同和不同
    def get_word_diff(q1, q2):
        set1 = set(q1.split(" "))
        set2 = set(q2.split(" "))

        #两个句子相同词的长度
        same_word_len = len(set1 & set2)

        #仅句子1中有的词汇个数
        unique_word1_len = len(set1 - set2)

        #仅句子2中有的词汇个数
        unique_word2_len = len(set2 - set1)

        #句子1中词汇个数
        word1_len = len(set1)

        #句子2中词汇个数
        word2_len = len(set2)

        #两句子的平均长度
        avg_len = (word1_len + word2_len) / 2.0

        #两个句子中较长的长度
        max_len = max(word1_len, word2_len)

        #两个句子中较短的长度
        min_len = min(word1_len, word2_len)

        #两个句子的杰卡德距离
        jaccard_sim = same_word_len / float(len(set1 | set2))

        return same_word_len / float(max_len), same_word_len / float(min_len), same_word_len / float(avg_len), \
               unique_word1_len / float(word1_len), unique_word2_len /float(word2_len), jaccard_sim

    for index,row in train_all.iterrows():
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        features = tuple()
        features = get_word_diff(s1,s2)
        for col_index,feature in enumerate(features):
            feature_train[index,col_index] = round(feature,5)

    return feature_train

#抽取疑问词相同的比例
def extract_doubt_sim(train_all):
    '''
    抽取疑问词相同的比例
    '''
    feature_train = np.zeros((train_all.shape[0], 1), dtype='float32')

    with open(doubt_words_path,"r",encoding="utf-8") as file:
        doubt_words = [line.strip() for line in file]

    # 获取疑问词相同的比例
    def get_doubt_sim(q1, q2, doubt_words):
        q1_doubt_words = set(q1.split(" ")) & set(doubt_words)
        q2_doubt_words = set(q2.split(" ")) & set(doubt_words)
        return len(q1_doubt_words & q2_doubt_words) / float(len(q1_doubt_words | q2_doubt_words) + 1)

    for index,row in train_all.iterrows():
        # 因为doubt_words词表加载出来的是Unicode，所以需要将s1,s2解码成Unicode
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        doubt_sim = get_doubt_sim(s1,s2,doubt_words)
        feature_train[index] = round(doubt_sim,5)

    return feature_train

#抽取两个句子中是否同时存在蚂蚁花呗或者蚂蚁借呗的特征,同时包含花呗为1，同时包含借呗为1，否则为0
def extract_sentence_exist_topic(train_all):
    """
    抽取两个句子中是否同时存在蚂蚁花呗或者蚂蚁借呗的特征,同时包含花呗为1，同时包含借呗为1，否则为0
    """
    with open(doubt_words_path,"r",encoding="utf-8") as file:
        doubt_words = [line.strip() for line in file]

    feature_train = np.zeros((train_all.shape[0], 2), dtype='float32')

    def get_exist_same_topic(rawq1,rawq2):
        hua_flag = 0.
        jie_flag = 0.
        if '花呗' in rawq1 and '花呗' in rawq2:
            hua_flag = 1.

        if '借呗' in rawq1 and '借呗' in rawq2:
            jie_flag = 1.

        return hua_flag,jie_flag

    for index,row in train_all.iterrows():
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        hua_flag, jie_flag = get_exist_same_topic(s1,s2)
        feature_train[index,0] = hua_flag
        feature_train[index,1] = jie_flag

    return feature_train

#提取句子的词向量组合的相似度
def extract_word_embedding_sim(train_all,w2v_model_path = train_all_wordvec_path,extern_word2vec_path = zhihu_wordvec_path):
    '''
    提取句子的词向量组合的相似度
    w2v_model_path为词向量文件
    :return:
    '''
    #定义提取特征的空间
    feature_train = np.zeros((train_all.shape[0], 1), dtype='float32')

    train_all_w2v_model = KeyedVectors.load_word2vec_format(w2v_model_path, binary=False)

    zhihu_w2v_model = KeyedVectors.load_word2vec_format(zhihu_wordvec_path, binary=False)

    # 得到句子的词向量组合（tfidf）
    def get_sen_vec(q, train_all_w2v_model, tfidf_dict, tfidf_flag=True):
        sen_vec = 0
        for word in q.split(' '):
            if word in zhihu_w2v_model.vocab:
                word_vec = zhihu_w2v_model.word_vec(word)
                word_tfidf = tfidf_dict.get(word, None)
                if tfidf_flag == True:
                    #tfidf有效，词向量*tfidf权重=句子向量
                    sen_vec += word_vec * word_tfidf
                else:
                    #句子向量
                    sen_vec += word_vec
            elif word in train_all_w2v_model.vocab:
                word_vec = train_all_w2v_model.word_vec(word)
                word_tfidf = tfidf_dict.get(word, None)

                if tfidf_flag == True:
                    #tfidf有效，词向量*tfidf权重=句子向量
                    sen_vec += word_vec * word_tfidf
                else:
                    #句子向量
                    sen_vec += word_vec

            else:
                pass
        sen_vec = sen_vec / np.sqrt(np.sum(np.power(sen_vec, 2)) + 0.000001)
        return sen_vec
    def get_sentece_embedding_sim(q1, q2, train_all_w2v_model, tfidf_dict, tfidf_flag=True):
        # 得到两个问句的词向量组合
        q1_sec = get_sen_vec(q1, train_all_w2v_model, tfidf_dict, tfidf_flag)
        q2_sec = get_sen_vec(q2, train_all_w2v_model, tfidf_dict, tfidf_flag)

        # 曼哈顿距离
        # manhattan_distance = np.sum(np.abs(np.subtract(q1_sec, q2_sec)))

        # 欧式距离
        # enclidean_distance = np.sqrt(np.sum(np.power((q1_sec - q2_sec),2)))

        # 余弦相似度
        molecular = np.sum(np.multiply(q1_sec, q2_sec))
        denominator = np.sqrt(np.sum(np.power(q1_sec, 2))) * np.sqrt(np.sum(np.power(q2_sec, 2)))
        cos_sim = molecular / (denominator + 0.000001)

        # 闵可夫斯基距离
        # minkowski_distance = np.power(np.sum(np.power(np.abs(np.subtract(q1_sec, q2_sec)), 3)), 0.333333)

        # return manhattan_distance, enclidean_distance, cos_sim, minkowski_distance
        return cos_sim

    for index,row in train_all.iterrows():
        s1 = row['s1'].strip()
        s2 = row['s2'].strip()
        sentece_embedding_sim = get_sentece_embedding_sim(s1,s2,train_all_w2v_model,{},False)
        feature_train[index] = round(sentece_embedding_sim,5)

    return feature_train


In [32]:
sentece_length_diff_feature = extract_sentece_length_diff(train_all)

In [45]:
sentece_length_diff_feature.shape

(102477, 1)

In [33]:
edit_distance_feature = extract_edit_distance(train_all)

In [46]:
edit_distance_feature.shape

(102477, 1)

In [34]:
common_substring_feature = extract_longest_common_substring(train_all)

In [49]:
common_substring_feature.shape

(102477, 1)

In [35]:
common_subsequence_feature = extract_longest_common_subsequence(train_all)

In [51]:
common_subsequence_feature.shape

(102477, 1)

In [36]:
ngram_feature = extract_ngram(train_all)

In [52]:
ngram_feature.shape

(102477, 3)

In [37]:
sentence_diff_same_feature = extract_sentence_diff_same(train_all)

In [53]:
sentence_diff_same_feature.shape

(102477, 6)

In [39]:
doubt_sim_feature = extract_doubt_sim(train_all)

In [55]:
doubt_sim_feature.shape

(102477, 1)

In [40]:
sentence_exist_topic_feature = extract_sentence_exist_topic(train_all)

In [56]:
sentence_exist_topic_feature.shape

(102477, 2)

In [42]:
from gensim.models import KeyedVectors



In [43]:
word_embedding_sim_feature = extract_word_embedding_sim(train_all)

In [57]:
word_embedding_sim_feature.shape

(102477, 1)

In [58]:
statistic_feature = np.concatenate([sentece_length_diff_feature,
                            edit_distance_feature,
                            common_substring_feature,
                            common_subsequence_feature,
                            ngram_feature,
                            sentence_diff_same_feature,
                            doubt_sim_feature,
                            sentence_exist_topic_feature,
                            word_embedding_sim_feature],
                            axis = 1)

In [59]:
statistic_feature.shape

(102477, 17)

In [65]:
statistic_feature[0]

array([ 0.45455   ,  0.29412001,  0.06667   ,  0.73333001,  0.57143003,
        0.51064003,  0.40000001,  0.36364   ,  0.8       ,  0.5       ,
        0.2       ,  0.63636   ,  0.33333   ,  0.5       ,  1.        ,
        0.        ,  0.77051002])

In [64]:
len(train_all)

102477

#### 将特征存入pickle中

In [None]:
# 这个只是保存在pickle中了,需要用的时候调用pickle

In [62]:
# 将数据存入pickle中
with open("statistic_feature.pk", 'wb') as f1:
    pickle.dump(statistic_feature, f1)

In [23]:
# 将数据存到一个大列表里面，格式是[[s1,s2,y],[s1,s2,y],[s1,s2,y].......]
all_data = []
for i in range(len(s1_word_id_all)):
    all_data.append([s1_word_id_all[i],s2_word_id_all[i],y_set[i]])

In [24]:
# 将数据存入pickle中
with open("word_data.pk", 'wb') as f1:
    pickle.dump((all_data,word2id,id2word), f1)