In [1]:
import numpy as np
import pandas as pd
import pickle
import jieba
import json
import re 

In [2]:
train_data_path = '../data/atec_nlp_sim_train.csv'  # 训练数据
train_add_data_path = '../data/atec_nlp_sim_train_add.csv'  # 添加训练数据
stop_words_path = '../data/stop_words.txt'  # 停用词路径
tokenize_dict_path = '../data/dict_all.txt'  # jieba分词新自定义字典
spelling_corrections_path = '../data/spelling_corrections.json'

In [3]:
train_data_df = pd.read_csv(train_data_path, sep='\t', header=None,names=["index", "s1", "s2", "label"])
train_add_data_df = pd.read_csv(train_add_data_path, sep='\t', header=None, names=["index", "s1", "s2", "label"])
train_all = pd.concat([train_data_df, train_add_data_df])

In [4]:
train_all = train_all.sample(frac=1).reset_index(drop=True)

In [5]:
train_all.head()

Unnamed: 0,index,s1,s2,label
0,30241,借呗可以升级网商贷吧,我的网商贷可以换回蚂蚁借呗吗,0
1,10179,为什么我的借呗被停用了？我又没失约,为什么这个月不用借呗就挨停了,0
2,34382,为什么我开通了花呗，却不能付款，显示该商户不支持花呗付款,已经开通了花呗支持付款，但是付款显示不支持，是怎么回事,0
3,51574,花呗退款为什么花呗账单不便,花呗退款单为何还需要还款,0
4,3210,反复试过，没有花呗,我从来没弄过花呗,0


### 分词及处理

In [6]:
# 停用词表
stopwords = [line.strip() for line in open(stop_words_path, 'r', encoding='utf-8').readlines()]

In [7]:
# 拼错词替换表
with open(spelling_corrections_path,"r",encoding="utf-8") as file:
    spelling_corrections = json.load(file)

In [8]:
def transform_other_word(str_text,reg_dict):
    """
    替换词
    :param str_text:待替换的句子
    :param reg_dict:替换词字典
    :return:
    """
    for token_str,replac_str in reg_dict.items():
        str_text = str_text.replace(token_str, replac_str)
    return str_text

In [9]:
def preprocessing_char(s1_train, s2_train, stopwords, spelling_corrections):

    # 去除句子中的脱敏数字***，替换成一
    re_object = re.compile(r'\*+')

    s1_all = []
    s2_all = []
    all = []
    for s1_, s2_ in zip(s1_train, s2_train):
        s1 = re_object.sub(u"十一", s1_)
        s2 = re_object.sub(u"十一", s2_)
        spell_corr_s1 = transform_other_word(s1, spelling_corrections)
        spell_corr_s2 = transform_other_word(s2, spelling_corrections)
        spell_corr_s1 = list(spell_corr_s1)
        spell_corr_s2 = list(spell_corr_s2)
        
        all.extend(spell_corr_s1)
        all.extend(spell_corr_s2)
        split_s1 = [i for i in spell_corr_s1 if i not in stopwords and i.strip() != '']
        split_s2 = [i for i in spell_corr_s2 if i not in stopwords and i.strip() != '']

        s1_all.append(split_s1)
        s2_all.append(split_s2)
    source_list = []
    # source_list = list(set(all))
    source_list.append('<UNK>')
    source_list.append('<PAD>')
    source_list.extend(list(set(all)))
    char2id = {}
    id2char = {}
    for index, char in enumerate(source_list):
        char2id[char] = index
        id2char[index] = char

    return s1_all, s2_all, char2id, id2char

In [10]:
s1_train = train_all["s1"].tolist()
s2_train = train_all["s2"].tolist()
y_train = train_all["label"].tolist()

#### 获取对应的词表及词与id的映射

In [11]:
s1_char_all, s2_char_all, char2id, id2char = preprocessing_char(s1_train, s2_train, stopwords, spelling_corrections)

In [12]:
def make_word2id(data, word2id):
    data2id = []
    for word_list in data:
        id_list = [word2id.get(i) if word2id.get(i) is not None else word2id.get('<PAD>') for i in word_list]
        data2id.append(id_list)
    return data2id

In [13]:
def all_data_set(s1_all, s2_all, word2id, y_train, max_l=20):
    pad = word2id['<PAD>']
    all_data = []
    s1_data_id = make_word2id(s1_all, word2id)
    s2_data_id = make_word2id(s2_all, word2id)
    s1_all_new = []
    s2_all_new = []
    y = []
    for i in range(len(s1_data_id)):
        if len(s1_data_id[i]) > max_l:
            s1_set = s1_data_id[i][:max_l]
        else:
            s1_set = np.concatenate((s1_data_id[i], np.tile(pad, max_l - len(s1_data_id[i]))), axis=0)
        if len(s2_data_id[i]) > max_l:
            s2_set = s2_data_id[i][:max_l]
        else:
            s2_set = np.concatenate((s2_data_id[i], np.tile(pad, max_l - len(s2_data_id[i]))), axis=0)
        y_set = [1,0] if y_train[i] == 0 else [0,1]
        s1_all_new.append(s1_set)
        s2_all_new.append(s2_set)
        y.append(y_set)
    return s1_all_new, s2_all_new, y

#### 获取id形式表示的文本特征

In [14]:
s1_char_id_all, s2_char_id_all, y_set = all_data_set(s1_char_all, s2_char_all, char2id, y_train, max_l=20)

In [15]:
train_all["s1_char_all"] = s1_char_all

In [16]:
train_all["s2_char_all"] = s2_char_all

In [17]:
train_all["s1_char_id_all"] = s1_char_id_all

In [18]:
train_all["s2_char_id_all"] = s2_char_id_all

In [19]:
train_all["y_set"] = y_set

In [20]:
train_all.head()

Unnamed: 0,index,s1,s2,label,s1_char_all,s2_char_all,s1_char_id_all,s2_char_id_all,y_set
0,1,﻿怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1,"[﻿, 怎, 更, 换, 花, 手, 机, 号, 码]","[花, 是, 以, 前, 手, 机, 号, 码, 怎, 更, 换, 成, 现, 支, 付, ...","[2073, 523, 1724, 1326, 1974, 1718, 1556, 286,...","[1974, 1930, 1699, 1696, 1718, 1556, 286, 1795...","[0, 1]"
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0,"[开, 不, 花, 样, 完, 事]","[真, 是, 花, 付, 款]","[811, 1819, 1974, 929, 1863, 877, 1, 1, 1, 1, ...","[1150, 1930, 1974, 2144, 1169, 1, 1, 1, 1, 1, ...","[1, 0]"
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0,"[花, 冻, 结, 以, 能, 开, 通]","[条, 件, 可, 以, 开, 通, 花, 借, 款]","[1974, 565, 73, 1699, 369, 811, 1342, 1, 1, 1,...","[1535, 1106, 1173, 1699, 811, 1342, 1974, 1731...","[1, 0]"
3,4,如何得知关闭借呗,想永久关闭借呗,0,"[如, 何, 得, 知, 关, 借]","[永, 久, 关, 借]","[543, 1584, 964, 550, 1776, 1731, 1, 1, 1, 1, ...","[435, 93, 1776, 1731, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 0]"
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0,"[花, 扫, 码, 付, 钱]","[二, 维, 码, 扫, 描, 可, 以, 用, 花]","[1974, 300, 1795, 2144, 317, 1, 1, 1, 1, 1, 1,...","[1860, 1663, 1795, 300, 983, 1173, 1699, 514, ...","[1, 0]"


In [21]:
all_data = []
for i in range(len(s1_char_id_all)):
    one_feature_map = []
    left_input_one = s1_char_id_all[i]
    right_input_one = s2_char_id_all[i]
    for j in range(18):
        left_rows = left_input_one[j:j + 3]
        for k in range(18):
            right_rows = right_input_one[k:k + 3]
            one_feature_map.extend(left_rows)
            one_feature_map.extend(right_rows)
    all_data.append([one_feature_map,y_set[i]])

In [8]:
18*18*6

1944

In [36]:
len(all_data[0][0])

1944

In [22]:
len(all_data)

102477

In [37]:
# 将数据存入pickle中
with open("char_arc_data.pk", 'wb') as f1:
    pickle.dump((all_data[:5000],char2id,id2char), f1)