In [1]:
import numpy as np
import pandas as pd
import pickle
import jieba
import json
import re 

In [2]:
train_data_path = '../data/atec_nlp_sim_train.csv'  # 训练数据
train_add_data_path = '../data/atec_nlp_sim_train_add.csv'  # 添加训练数据
stop_words_path = '../data/stop_words.txt'  # 停用词路径
tokenize_dict_path = '../data/dict_all.txt'  # jieba分词新自定义字典
spelling_corrections_path = '../data/spelling_corrections.json'

In [3]:
train_data_df = pd.read_csv(train_data_path, sep='\t', header=None,names=["index", "s1", "s2", "label"])
train_add_data_df = pd.read_csv(train_add_data_path, sep='\t', header=None, names=["index", "s1", "s2", "label"])
train_all = pd.concat([train_data_df, train_add_data_df])

In [6]:
train_all = train_all.sample(frac=1).reset_index(drop=True)

In [7]:
train_all.head()

Unnamed: 0,index,s1,s2,label
0,31668,借呗借钱三个月每个月都要还吗,蚂蚁借呗干嘛都是每月二十号还,0
1,41666,使用花呗购物，自动全额付款会产生费用吗,花呗付款当时就退款了会产生费用吗,0
2,26291,提前还借呗，借呗会关闭吗,蚂蚁借呗有额度会关闭吗,0
3,29187,花呗收款从哪里上传证件,哪里上传证件信息申请花呗收款,1
4,25416,我的退货款退回了花呗，可是花呗的钱我已经还过了。退回的钱去了哪里,我花呗钱还掉了 然后有退款退到花呗 现在钱去哪了呀,1


### 分词及处理

In [8]:
# 停用词表
stopwords = [line.strip() for line in open(stop_words_path, 'r', encoding='utf-8').readlines()]

In [9]:
# 拼错词替换表
with open(spelling_corrections_path,"r",encoding="utf-8") as file:
    spelling_corrections = json.load(file)

In [10]:
def transform_other_word(str_text,reg_dict):
    """
    替换词
    :param str_text:待替换的句子
    :param reg_dict:替换词字典
    :return:
    """
    for token_str,replac_str in reg_dict.items():
        str_text = str_text.replace(token_str, replac_str)
    return str_text

In [11]:
def preprocessing_char(s1_train, s2_train, stopwords, spelling_corrections):

    # 去除句子中的脱敏数字***，替换成一
    re_object = re.compile(r'\*+')

    s1_all = []
    s2_all = []
    all = []
    for s1_, s2_ in zip(s1_train, s2_train):
        s1 = re_object.sub(u"十一", s1_)
        s2 = re_object.sub(u"十一", s2_)
        spell_corr_s1 = transform_other_word(s1, spelling_corrections)
        spell_corr_s2 = transform_other_word(s2, spelling_corrections)
        spell_corr_s1 = list(spell_corr_s1)
        spell_corr_s2 = list(spell_corr_s2)
        
        all.extend(spell_corr_s1)
        all.extend(spell_corr_s2)
        split_s1 = [i for i in spell_corr_s1 if i not in stopwords and i.strip() != '']
        split_s2 = [i for i in spell_corr_s2 if i not in stopwords and i.strip() != '']

        s1_all.append(split_s1)
        s2_all.append(split_s2)
    source_list = []
    # source_list = list(set(all))
    source_list.append('<UNK>')
    source_list.append('<PAD>')
    source_list.extend(list(set(all)))
    char2id = {}
    id2char = {}
    for index, char in enumerate(source_list):
        char2id[char] = index
        id2char[index] = char

    return s1_all, s2_all, char2id, id2char

In [12]:
s1_train = train_all["s1"].tolist()
s2_train = train_all["s2"].tolist()
y_train = train_all["label"].tolist()

#### 获取对应的词表及词与id的映射

In [13]:
s1_char_all, s2_char_all, char2id, id2char = preprocessing_char(s1_train, s2_train, stopwords, spelling_corrections)

In [14]:
def make_word2id(data, word2id):
    data2id = []
    for word_list in data:
        id_list = [word2id.get(i) if word2id.get(i) is not None else word2id.get('<PAD>') for i in word_list]
        data2id.append(id_list)
    return data2id

In [15]:
def all_data_set(s1_all, s2_all, word2id, y_train, max_l=20):
    pad = word2id['<PAD>']
    all_data = []
    s1_data_id = make_word2id(s1_all, word2id)
    s2_data_id = make_word2id(s2_all, word2id)
    s1_all_new = []
    s2_all_new = []
    y = []
    for i in range(len(s1_data_id)):
        if len(s1_data_id[i]) > max_l:
            s1_set = s1_data_id[i][:max_l]
        else:
            s1_set = np.concatenate((s1_data_id[i], np.tile(pad, max_l - len(s1_data_id[i]))), axis=0)
        if len(s2_data_id[i]) > max_l:
            s2_set = s2_data_id[i][:max_l]
        else:
            s2_set = np.concatenate((s2_data_id[i], np.tile(pad, max_l - len(s2_data_id[i]))), axis=0)
        y_set = [1,0] if y_train[i] == 0 else [0,1]
        s1_all_new.append(s1_set)
        s2_all_new.append(s2_set)
        y.append(y_set)
    return s1_all_new, s2_all_new, y

#### 获取id形式表示的文本特征

In [16]:
s1_char_id_all, s2_char_id_all, y_set = all_data_set(s1_char_all, s2_char_all, char2id, y_train, max_l=20)

In [17]:
train_all["s1_char_all"] = s1_char_all

In [18]:
train_all["s2_char_all"] = s2_char_all

In [19]:
train_all["s1_char_id_all"] = s1_char_id_all

In [20]:
train_all["s2_char_id_all"] = s2_char_id_all

In [21]:
train_all["y_set"] = y_set

In [22]:
train_all.head()

Unnamed: 0,index,s1,s2,label,s1_char_all,s2_char_all,s1_char_id_all,s2_char_id_all,y_set
0,31668,借呗借钱三个月每个月都要还吗,蚂蚁借呗干嘛都是每月二十号还,0,"[借, 借, 钱, 三, 个, 月, 每, 个, 月, 都, 要]","[蚂, 蚁, 借, 干, 都, 是, 每, 月, 二, 十, 号]","[1208, 1208, 611, 846, 879, 310, 563, 879, 310...","[757, 227, 1208, 900, 870, 1305, 563, 310, 197...","[1, 0]"
1,41666,使用花呗购物，自动全额付款会产生费用吗,花呗付款当时就退款了会产生费用吗,0,"[使, 用, 花, 购, 物, 自, 动, 全, 额, 付, 款, 会, 产, 生, 费, 用]","[花, 付, 款, 当, 时, 退, 款, 会, 产, 生, 费, 用]","[1065, 414, 1294, 330, 1791, 2094, 446, 1657, ...","[1294, 1722, 215, 48, 1559, 800, 215, 1785, 19...","[1, 0]"
2,26291,提前还借呗，借呗会关闭吗,蚂蚁借呗有额度会关闭吗,0,"[提, 前, 借, 借, 会, 关]","[蚂, 蚁, 借, 有, 额, 度, 会, 关]","[1675, 180, 1208, 1208, 1785, 1263, 1, 1, 1, 1...","[757, 227, 1208, 1114, 1382, 2136, 1785, 1263,...","[1, 0]"
3,29187,花呗收款从哪里上传证件,哪里上传证件信息申请花呗收款,1,"[花, 收, 款, 哪, 传, 证, 件]","[哪, 传, 证, 件, 信, 息, 申, 请, 花, 收, 款]","[1294, 554, 215, 1489, 1777, 945, 2081, 1, 1, ...","[1489, 1777, 945, 2081, 648, 1348, 1120, 1484,...","[0, 1]"
4,25416,我的退货款退回了花呗，可是花呗的钱我已经还过了。退回的钱去了哪里,我花呗钱还掉了 然后有退款退到花呗 现在钱去哪了呀,1,"[退, 货, 款, 退, 回, 花, 可, 是, 花, 钱, 已, 经, 退, 回, 钱, ...","[花, 钱, 掉, 然, 有, 退, 款, 退, 到, 花, 现, 钱, 去, 哪]","[800, 524, 215, 800, 2154, 1294, 1935, 1305, 1...","[1294, 611, 515, 1720, 1114, 800, 215, 800, 14...","[0, 1]"


In [23]:
# 将数据存到一个大列表里面，格式是[[s1,s2,y],[s1,s2,y],[s1,s2,y].......]
all_data = []
for i in range(len(s1_char_id_all)):
    all_data.append([s1_char_id_all[i],s2_char_id_all[i],y_set[i]])

In [24]:
ratio = int(len(all_data)*0.8)
train_data = all_data[:ratio]
test_data = all_data[ratio:]

In [25]:
# 将数据存入pickle中
with open("char_data.pk", 'wb') as f1:
    pickle.dump((train_data,test_data,char2id,id2char), f1)