# 环境准备

In [1]:
%%time
import jieba
import warnings
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from gensim.models import word2vec

warnings.filterwarnings("ignore")

Wall time: 2.61 s


# 加载数据

In [13]:
%%time
names = ['sentence1', 'sentence2', 'label']
train = pd.read_csv("../xfdata/train.csv", header=None, sep="\t", names=names)
test = pd.read_csv("../xfdata/test.csv", header=None, sep="\t", names=names)

Wall time: 105 ms


# 查看数据

In [15]:
%%time
train.head()

Wall time: 0 ns


Unnamed: 0,sentence1,sentence2,label
0,藏獒为什么这么贵,藏獒见人不咬为什么,0
1,人生应该怎么才算精彩？,人生要怎么过才算精彩啊,1
2,为什么打牌老是输,为什么我枪神纪进不去了,0
3,现在网上卖什么最赚钱,网上卖什么最赚钱,1
4,如何提高气质,怎样提高自身气质？,1


# 特征工程

## 中文分词

In [19]:
%%time
def text_cut_words(short_dialogue_text, mapdict=None)->list:
    """Segment a sentence into a list of tokens with jieba.

    Args:
        short_dialogue_text: The text to segment; passed to ``jieba.cut``
            with ``cut_all=False``.
        mapdict: Optional mapping used to replace tokens. Any object that
            supports ``in`` and ``[]`` works (dict, pandas Series, ...).
            Tokens that are not keys of the mapping are kept unchanged.

    Returns:
        The tokens in their original order, after optional replacement.
    """
    cut_words = list(jieba.cut(short_dialogue_text, cut_all=False))
    # Identity check on purpose: `mapdict != None` would invoke the mapping's
    # own __ne__, which is element-wise (and not usable in `if`) for
    # array-like mappings such as a pandas Series.
    if mapdict is None:
        return cut_words
    return [mapdict[word] if word in mapdict else word for word in cut_words]

# Tokenise both sentence columns of the train and test frames and keep the
# resulting token lists in the matching "*_words" columns.
for frame in (train, test):
    for text_col, words_col in (("sentence1", "sentence1_words"), ("sentence2", "sentence2_words")):
        frame[words_col] = frame[text_col].apply(text_cut_words)

Wall time: 5.19 s


## 词向量化

In [28]:
%%time
def word2vec_model(sentences, vector_size=30, window=10, min_count=1, workers=8, sg=1, seed=1):
    """Train a gensim Word2Vec model on tokenised sentences.

    The defaults reproduce the configuration this notebook has always used,
    so ``word2vec_model(sentences)`` keeps its previous behaviour.

    Args:
        sentences: Iterable of token lists used as the training corpus.
        vector_size: Dimensionality of the learned word vectors.
        window: Context window size passed to gensim.
        min_count: Minimum token frequency passed to gensim.
        workers: Number of training threads.
        sg: Training algorithm flag passed to gensim (1 in this notebook).
        seed: Random seed passed to gensim. NOTE(review): 1 is expected to
            match gensim's own default; fully repeatable runs presumably
            also need ``workers=1`` - confirm against the gensim docs.

    Returns:
        The trained ``word2vec.Word2Vec`` model.
    """
    return word2vec.Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg,
        seed=seed,
    )

def w2v_sent2vec(words, model)->list:
    """Embed a tokenised sentence as the L2-normalised sum of its word vectors.

    Args:
        words: Iterable of tokens; each is converted with ``str`` before the
            lookup.
        model: Object exposing ``model.wv[token]`` lookups that raise
            ``KeyError`` for unknown tokens (e.g. a trained Word2Vec model).

    Returns:
        A list of Python floats (float32 precision). When no token is in the
        vocabulary, or the summed vector has zero norm, a zero vector is
        returned instead of NaN so every row has the same length and stays
        finite.
    """
    vectors = []
    for word in words:
        try:
            vectors.append(model.wv[str(word)])
        except KeyError:
            # Token is not in the vocabulary: leave it out of the sum.
            continue

    if not vectors:
        # Nothing to sum. Fall back to 30, the size this notebook trains with,
        # when the lookup object does not expose its dimensionality.
        dim = int(getattr(model.wv, "vector_size", 30))
        return [0.0] * dim

    total = np.asarray(vectors, dtype=np.float64).sum(axis=0)
    norm = np.sqrt((total ** 2).sum())
    if norm == 0:
        # Vectors cancelled out exactly; dividing would produce NaNs.
        return [0.0] * int(total.shape[0])
    return (total / norm).astype(np.float32).tolist()

# Train the embedding model on every tokenised training sentence (both columns).
w2v_model = word2vec_model(train["sentence1_words"].tolist() + train["sentence2_words"].tolist())

# Expand each sentence embedding into 30 feature columns:
# vec1_* for sentence1, vec2_* for sentence2, on the train and test frames.
for words_col, name_template in (("sentence1_words", "vec1_{}"), ("sentence2_words", "vec2_{}")):
    feature_names = [name_template.format(str(i)) for i in range(30)]
    for frame in (train, test):
        frame[feature_names] = frame.apply(
            lambda row: w2v_sent2vec(row[words_col], w2v_model),
            result_type="expand",
            axis=1,
        )

Wall time: 15.6 s


## 