# 环境准备

In [31]:
%%time
import jieba
import warnings
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from gensim.models import word2vec
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

Wall time: 0 ns


# 加载数据

In [13]:
%%time
names = ['sentence1', 'sentence2', 'label']
train = pd.read_csv("../xfdata/train.csv", header=None, sep="\t", names=names)
test = pd.read_csv("../xfdata/test.csv", header=None, sep="\t", names=names)

Wall time: 105 ms


# 查看数据

In [15]:
%%time
train.head()

Wall time: 0 ns


Unnamed: 0,sentence1,sentence2,label
0,藏獒为什么这么贵,藏獒见人不咬为什么,0
1,人生应该怎么才算精彩？,人生要怎么过才算精彩啊,1
2,为什么打牌老是输,为什么我枪神纪进不去了,0
3,现在网上卖什么最赚钱,网上卖什么最赚钱,1
4,如何提高气质,怎样提高自身气质？,1


# 特征工程

## 中文分词

In [19]:
%%time
def text_cut_words(short_dialogue_text, mapdict=None)->list:
    cut_words = list(jieba.cut(short_dialogue_text, cut_all=False))
    if mapdict != None:
        words = [word if word not in mapdict else mapdict[word] for word in cut_words]
    else:
        words = cut_words
    return words

train["sentence1_words"] = train["sentence1"].apply(text_cut_words)
train["sentence2_words"] = train["sentence2"].apply(text_cut_words)
test["sentence1_words"] = test["sentence1"].apply(text_cut_words)
test["sentence2_words"] = test["sentence2"].apply(text_cut_words)

Wall time: 5.19 s


## 词向量化

In [28]:
%%time
def word2vec_model(sentences):
    return word2vec.Word2Vec(sentences=sentences, vector_size=300, window=10, min_count=1, workers=8, sg=1)

def w2v_sent2vec(words, model)->list:
    matrix = []
    for word in words:
        try:
            matrix.append(model.wv[str(word)])
        except KeyError:
            continue
    if len(matrix) == 0:
        matrix.append(0)
    matrix = 300 * matrix
    matrix = np.array(matrix)
    vector = matrix.sum(axis=0)
    vector_transform = (vector / np.sqrt((vector ** 2).sum())).astype(np.float32).tolist()
    return vector_transform

w2v_model = word2vec_model(train["sentence1_words"].tolist() + train["sentence2_words"].tolist())
feature_names = ["vec1_{}".format(str(i)) for i in range(300)]
train[feature_names] = train.apply(lambda row: w2v_sent2vec(row["sentence1_words"], w2v_model), result_type="expand", axis=1)
test[feature_names] = test.apply(lambda row: w2v_sent2vec(row["sentence1_words"], w2v_model), result_type="expand", axis=1)

feature_names = ["vec2_{}".format(str(i)) for i in range(300)]
train[feature_names] = train.apply(lambda row: w2v_sent2vec(row["sentence2_words"], w2v_model), result_type="expand", axis=1)
test[feature_names] = test.apply(lambda row: w2v_sent2vec(row["sentence2_words"], w2v_model), result_type="expand", axis=1)

Wall time: 15.6 s


## 查看数据

In [30]:
train.head()

Unnamed: 0,sentence1,sentence2,label,sentence1_words,sentence2_words,vec1_1,vec1_0,vec1_2,vec1_3,vec1_4,...,vec2_20,vec2_21,vec2_22,vec2_23,vec2_24,vec2_25,vec2_26,vec2_27,vec2_28,vec2_29
0,藏獒为什么这么贵,藏獒见人不咬为什么,0,"[藏獒, 为什么, 这么, 贵]","[藏獒, 见, 人, 不, 咬, 为什么]",-0.239464,-0.121251,0.135171,0.305566,0.119762,...,-0.112501,-0.085315,-0.060292,0.370697,0.145503,0.08474,0.303218,0.377802,-0.017046,-0.423276
1,人生应该怎么才算精彩？,人生要怎么过才算精彩啊,1,"[人生, 应该, 怎么, 才, 算, 精彩, ？]","[人生, 要, 怎么, 过, 才, 算, 精彩, 啊]",-0.076702,0.043066,0.288888,0.080856,0.027467,...,0.105474,-0.182471,-0.159379,0.462175,0.223113,0.194785,0.177204,0.145913,0.045965,-0.2168
2,为什么打牌老是输,为什么我枪神纪进不去了,0,"[为什么, 打牌, 老是, 输]","[为什么, 我, 枪神, 纪, 进不去, 了]",-0.213117,-0.09219,0.040614,0.113659,0.081719,...,-0.045531,-0.09959,-0.004021,0.307213,0.156712,0.052207,0.125242,0.367955,-0.050377,-0.547881
3,现在网上卖什么最赚钱,网上卖什么最赚钱,1,"[现在, 网上, 卖, 什么, 最, 赚钱]","[网上, 卖, 什么, 最, 赚钱]",0.100445,-0.101776,0.462573,0.001133,-0.056483,...,0.043075,-0.076738,0.06075,0.476237,0.066865,0.14434,0.047456,0.060169,-0.116806,-0.147685
4,如何提高气质,怎样提高自身气质？,1,"[如何, 提高, 气质]","[怎样, 提高, 自身, 气质, ？]",-0.313924,0.014691,0.273738,-0.119044,0.061329,...,0.094003,-0.179929,0.137684,0.294595,0.007612,0.240942,0.043142,-0.010049,0.082187,-0.181067


## 划分数据集

In [32]:
%%time
x_train, x_valid, y_train, y_valid = train_test_split(train.iloc[:, 5:], train.label, 
                                                      test_size=0.3, 
                                                      random_state=20)
x_test = test.iloc[:, 5:]

Wall time: 49.8 ms


# 构建模型

## 搭建一维卷积

In [35]:
%%time
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(300, 50, input_length=300))
model.add(tf.keras.layers.Conv1D(32, 7, activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(3))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 60, 50)            3000      
                                                                 
 conv1d_1 (Conv1D)           (None, 54, 32)            11232     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 18, 32)           0         
 1D)                                                             
                                                                 
 global_average_pooling1d_1   (None, 32)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 14,265
Trainable params: 14,265
Non-trai

## 模型编译

In [37]:
%%time
model.compile(optimizer=tf.keras.optimizers.RMSprop(),
              loss = 'binary_crossentropy',
              metrics=['acc'])

Wall time: 5.98 ms


## 模型训练

In [1]:
%%time
history = model.fit(x_train, y_train, 
                    validation_data=(x_valid, y_valid),
                    epochs=30, batch_size=168)

NameError: name 'model' is not defined