In [50]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append('/home/roger/kaikeba/03_lecture/code')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. 数据预处理

In [51]:
import pandas as pd
import numpy as np
from utils.data_loader import build_dataset,pad_proc,sentences_proc
from utils.config import *
from utils.multi_proc_utils import parallelize
from gensim.models.word2vec import LineSentence, Word2Vec

In [112]:
import tensorflow as tf

In [114]:
tf.__version__

'2.0.0'

# tensorflow 1.0  -> keras

## 1.1加载数据

In [54]:
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)
print('train data size {},test data size {}'.format(len(train_df), len(test_df)))

train data size 82943,test data size 20000


## 1.2 空值填充

In [55]:
train_df.dropna(subset=['Question', 'Dialogue', 'Report'], how='any', inplace=True)
test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)

## 1.3.多进程, 批量数据处理

In [56]:
%%time
train_df = parallelize(train_df, sentences_proc)
test_df = parallelize(test_df, sentences_proc)

CPU times: user 450 ms, sys: 566 ms, total: 1.02 s
Wall time: 36.6 s


## 1.4 合并训练测试数据

In [57]:
train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
test_df['merged'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
print('train data size {},test data size {},merged_df data size {}'.format(len(train_df), len(test_df),len(merged_df)))

train data size 82871,test data size 20000,merged_df data size 102871


## 1.5 保存处理好的 训练 测试集合

In [58]:
train_df = train_df.drop(['merged'], axis=1)
test_df = test_df.drop(['merged'], axis=1)
train_df.to_csv(train_seg_path, index=None, header=True)
test_df.to_csv(test_seg_path, index=None, header=True)
merged_df.to_csv(merger_seg_path, index=None, header=False)

# 2. 词向量

## 2.1 预训练词向量

In [59]:
wv_model = Word2Vec(LineSentence(merger_seg_path),
                    size=300, 
                    negative=5, 
                    workers=8, 
                    iter=wv_train_epochs, 
                    window=3,
                    min_count=5)

2019-11-24 20:23:10,365 : INFO : collecting all words and their counts
2019-11-24 20:23:10,365 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-24 20:23:10,573 : INFO : PROGRESS: at sentence #10000, processed 937272 words, keeping 36653 word types
2019-11-24 20:23:10,783 : INFO : PROGRESS: at sentence #20000, processed 1889030 words, keeping 53934 word types
2019-11-24 20:23:10,992 : INFO : PROGRESS: at sentence #30000, processed 2829438 words, keeping 66706 word types
2019-11-24 20:23:11,196 : INFO : PROGRESS: at sentence #40000, processed 3741912 words, keeping 77607 word types
2019-11-24 20:23:11,413 : INFO : PROGRESS: at sentence #50000, processed 4714603 words, keeping 87459 word types
2019-11-24 20:23:11,639 : INFO : PROGRESS: at sentence #60000, processed 5748572 words, keeping 97387 word types
2019-11-24 20:23:11,868 : INFO : PROGRESS: at sentence #70000, processed 6805872 words, keeping 106963 word types
2019-11-24 20:23:12,079 : INFO : PROGRE

## 2.2. 建立词表

In [60]:
vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
reverse_vocab = {index: word for index, word in enumerate(wv_model.wv.index2word)}
len(vocab)

32800

## Q 1 .使用了min_count,其实部分词不在vocab表中 ,但是训练数据和测试数据中又有这些词?

---


## 2.3. 获取词向量矩阵

In [61]:
embedding_matrix = wv_model.wv.vectors
embedding_matrix.shape

(32800, 300)

## 3. 构建训练数据

+ 可以把Question,Dialogue当做一句 `长文本处理`, 合并构建成X
+ Report作为需要预测的标签,构建Y

In [62]:
train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)

In [63]:
train_df['X'].head()

0    方向机 重 助力 泵 方向机 都 换 新 都 换 助力 泵 方向机 换 方向机 带 助力 重...
1    奔驰 ML500 排气 凸轮轴 调节 错误 有没有 电脑 检测 故障 代码 有发 一下 发动...
2    2010 款 宝马X1 2011 年 出厂 20 排量 通用 6L45 变速箱 原地 换挡 ...
3    30V6 发动机 号 位置 照片 最好 右侧 排气管 上方 缸体 上 靠近 变速箱 是不是 ...
4    2012 款 奔驰 c180 维修保养 动力 值得 拥有 家庭 用车 入手 维修保养 费用 ...
Name: X, dtype: object

# Q 2. 句子长度一样 ? 如何构建训练,batch操作,矩阵 ...

---

## 3.1 填充字段

In [66]:
def pad_proc(sentence, max_len, vocab):
    '''
    < start > < end > < pad > < unk >
    '''
    # 0.按空格统计切分出词
    words = sentence.strip().split(' ')
    # 1. 截取规定长度的词数
    words = words[:max_len]
    # 2. 填充< unk > ,判断是否在vocab中, 不在填充 < unk >
    sentence = [word if word in vocab else '<UNK>' for word in words]
    # 3. 填充< start > < end >
    sentence = ['<START>'] + sentence + ['<STOP>']
    # 4. 判断长度，填充　< pad >
    sentence = sentence + ['<PAD>'] * (max_len + 2 - len(words))
    return ' '.join(sentence)

# Q3. 如何确定max_len的值? 经验 ?

## 3.2 获取适当的Max_Len

In [15]:
def get_max_len(data):
    """
    获得合适的最大长度值
    :param data: 待统计的数据  train_df['Question']
    :return: 最大长度值
    """
    max_lens = data.apply(lambda x: x.count(' '))
    return int(np.mean(max_lens) + 2 * np.std(max_lens))

In [67]:
# 获取输入数据 适当的最大长度
train_y_max_len = get_max_len(train_df['X'])
test_y_max_len = get_max_len(test_df['X'])

x_max_len = max(train_y_max_len, test_y_max_len)

# 获取标签数据 适当的最大长度
train_y_max_len = get_max_len(train_df['Report'])

In [68]:
train_y_max_len

30

In [69]:
x_max_len

257

## 3.3 填充处理

+ < start > - 句子开始
+ < end > - 句子结尾
+ < pad > - 短句填充
+ < unk > - 未知词

In [71]:
# 训练集X处理
train_df['X'] = train_df['X'].apply(lambda x: pad_proc(x, x_max_len, vocab))
# 训练集Y处理
train_df['Y'] = train_df['Report'].apply(lambda x: pad_proc(x, train_y_max_len, vocab))
# 测试集X处理
test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, x_max_len, vocab))

In [76]:
# 保存中间结果数据
train_df['X'].to_csv(train_x_pad_path, index=None, header=False)
train_df['Y'].to_csv(train_y_pad_path, index=None, header=False)
test_df['X'].to_csv(test_x_pad_path, index=None, header=False)

## Q4 新加的符号不在词表 和 词向量矩阵中,怎么办?

## 3.4 词表更新

In [80]:
print('start retrain w2v model')
wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
wv_model.train(LineSentence(train_x_pad_path), epochs=wv_train_epochs, total_examples=wv_model.corpus_count)
print('1/3')
wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
wv_model.train(LineSentence(train_y_pad_path), epochs=wv_train_epochs, total_examples=wv_model.corpus_count)
print('2/3')
wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
wv_model.train(LineSentence(test_x_pad_path), epochs=wv_train_epochs, total_examples=wv_model.corpus_count)

2019-11-24 20:46:55,543 : INFO : collecting all words and their counts
2019-11-24 20:46:55,543 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


start retrain w2v model


2019-11-24 20:46:56,015 : INFO : PROGRESS: at sentence #10000, processed 2610000 words, keeping 22467 word types
2019-11-24 20:46:56,486 : INFO : PROGRESS: at sentence #20000, processed 5220000 words, keeping 27516 word types
2019-11-24 20:46:56,958 : INFO : PROGRESS: at sentence #30000, processed 7830000 words, keeping 29902 word types
2019-11-24 20:46:57,420 : INFO : PROGRESS: at sentence #40000, processed 10440000 words, keeping 31156 word types
2019-11-24 20:46:57,899 : INFO : PROGRESS: at sentence #50000, processed 13050000 words, keeping 31849 word types
2019-11-24 20:46:58,371 : INFO : PROGRESS: at sentence #60000, processed 15660000 words, keeping 32302 word types
2019-11-24 20:46:58,846 : INFO : PROGRESS: at sentence #70000, processed 18270000 words, keeping 32557 word types
2019-11-24 20:46:59,316 : INFO : PROGRESS: at sentence #80000, processed 20880000 words, keeping 32657 word types
2019-11-24 20:46:59,450 : INFO : collected 32683 word types from a corpus of 21629331 raw w

1/3


2019-11-24 20:47:06,039 : INFO : PROGRESS: at sentence #30000, processed 1020000 words, keeping 12278 word types
2019-11-24 20:47:06,119 : INFO : PROGRESS: at sentence #40000, processed 1360000 words, keeping 13807 word types
2019-11-24 20:47:06,198 : INFO : PROGRESS: at sentence #50000, processed 1700000 words, keeping 14983 word types
2019-11-24 20:47:06,278 : INFO : PROGRESS: at sentence #60000, processed 2040000 words, keeping 16099 word types
2019-11-24 20:47:06,360 : INFO : PROGRESS: at sentence #70000, processed 2380000 words, keeping 17191 word types
2019-11-24 20:47:06,439 : INFO : PROGRESS: at sentence #80000, processed 2720000 words, keeping 17950 word types
2019-11-24 20:47:06,462 : INFO : collected 18178 word types from a corpus of 2817614 raw words and 82871 sentences
2019-11-24 20:47:06,463 : INFO : Updating model with new vocabulary
2019-11-24 20:47:06,471 : INFO : New added 7058 unique words (27% of original 25236) and increased the count of 7058 pre-existing words (27

2/3


2019-11-24 20:47:08,146 : INFO : PROGRESS: at sentence #10000, processed 2610000 words, keeping 22626 word types
2019-11-24 20:47:08,618 : INFO : collected 27245 word types from a corpus of 5220000 raw words and 20000 sentences
2019-11-24 20:47:08,618 : INFO : Updating model with new vocabulary
2019-11-24 20:47:08,631 : INFO : New added 12435 unique words (31% of original 39680) and increased the count of 12435 pre-existing words (31% of original 39680)
2019-11-24 20:47:08,680 : INFO : deleting the raw counts dictionary of 27245 items
2019-11-24 20:47:08,681 : INFO : sample=0.001 downsamples 18 most-common words
2019-11-24 20:47:08,681 : INFO : downsampling leaves estimated 3406995 word corpus (65.7% of prior 5189024)
2019-11-24 20:47:08,716 : INFO : estimated required memory for 24870 words and 300 dimensions: 72123000 bytes
2019-11-24 20:47:08,717 : INFO : updating layer weights
2019-11-24 20:47:08,742 : INFO : training model with 8 workers on 32804 vocabulary and 300 features, using

(1734879, 5220000)

In [22]:
# 保存词向量模型
wv_model.save(save_wv_model_path)

2019-11-24 18:16:27,705 : INFO : saving Word2Vec object under /home/roger/kaikeba/03_lecture/code/data/wv/word2vec.model, separately None
2019-11-24 18:16:27,706 : INFO : not storing attribute vectors_norm
2019-11-24 18:16:27,707 : INFO : not storing attribute cum_table
2019-11-24 18:16:28,240 : INFO : saved /home/roger/kaikeba/03_lecture/code/data/wv/word2vec.model


## Q5.为什么不一开始就添加 标志符号,然后训练词向量?

In [84]:
#更新vocab
vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
reverse_vocab = {index: word for index, word in enumerate(wv_model.wv.index2word)}
# 更新词向量矩阵
embedding_matrix = wv_model.wv.vectors
embedding_matrix.shape

(32804, 300)

## Q6. 词可以训练吗?

In [85]:
train_df['X'].head()

0    <START> 方向机 重 助力 泵 方向机 都 换 新 都 换 助力 泵 方向机 换 方向...
1    <START> 奔驰 <UNK> 排气 凸轮轴 调节 错误 有没有 电脑 检测 故障 代码 ...
2    <START> 2010 款 宝马X1 2011 年 出厂 20 排量 通用 <UNK> 变...
3    <START> 30V6 发动机 号 位置 照片 最好 右侧 排气管 上方 缸体 上 靠近 ...
4    <START> 2012 款 奔驰 c180 维修保养 动力 值得 拥有 家庭 用车 入手 ...
Name: X, dtype: object

## 3.4 数值转换

In [88]:
# 遇到未知词就填充unk的索引
unk_index = vocab['<UNK>']
def transform_data(sentence,vocab):
    # 字符串切分成词
    words=sentence.split(' ')
    # 按照vocab的index进行转换
    ids=[vocab[word] if word in vocab else unk_index for word in words]
    return ids

In [89]:
# 将词转换成索引  [<START> 方向机 重 ...] -> [32800, 403, 986, 246, 231
train_ids_x=train_df['X'].apply(lambda x:transform_data(x,vocab))
train_ids_y=train_df['Y'].apply(lambda x:transform_data(x,vocab))
test_ids_x=test_df['X'].apply(lambda x:transform_data(x,vocab))

In [98]:
# 将索引列表转换成矩阵 [32800, 403, 986, 246, 231] --> array([[32800,   403,   986 ]]
train_data_X=np.array(train_ids_x.tolist())
train_data_Y=np.array(train_ids_y.tolist())
test_data_X=np.array(test_ids_x.tolist())

## 4. 简易模型搭建

In [99]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [101]:
def seq2seq(input_length, output_sequence_length, embedding_matrix, vocab_size):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=300, weights=[embedding_matrix], trainable=False,
                        input_length=input_length))
    model.add(Bidirectional(GRU(300, return_sequences=False)))
    model.add(Dense(300, activation="relu"))
    model.add(RepeatVector(output_sequence_length))
    model.add(Bidirectional(GRU(300, return_sequences=True)))
    model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(1e-3))
    model.summary()
    return model

## 4.1 基本参数设置

In [32]:
train_data_X.shape

(82871, 261)

In [103]:
# 输入的长度   x  max_len
input_length = train_data_X.shape[1]
# 输出的长度  y  max_len
output_sequence_length = train_data_Y.shape[1]
# 词表大小
vocab_size=len(vocab)
# 词向量矩阵
embedding_matrix = wv_model.wv.vectors

## 4.2 模型构建

In [104]:
model = seq2seq(input_length,output_sequence_length,embedding_matrix,vocab_size)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 261, 300)          9841200   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 600)               1083600   
_________________________________________________________________
dense_2 (Dense)              (None, 300)               180300    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 34, 300)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 34, 600)           1083600   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 34, 32804)         19715204  
Total params: 31,903,904
Trainable params: 22,062,704
Non-trainable params: 9,841,200
__________________________________

## 4.3 模型训练

In [None]:
model.fit(train_data_X, train_data_Y, batch_size=32, epochs=1, validation_split=0.2)

## 4.4 模型保存

In [None]:
model.save('data/seq2seq_model.h')

## 4 .5 模型预测

In [None]:
test_data_Y = model.predict(test_data_X)

# seq2seq

1. 所有输出端，都以一个通用的<start>标记开头，以<end>标记结尾，这两个标记也视为一个词/字；

2. 将<start>输入decoder，然后得到隐藏层向量，将这个向量与encoder的输出混合，然后送入一个分类器，分类器的结果应当输出P；

3. 将P输入decoder，得到新的隐藏层向量，再次与encoder的输出混合，送入分类器，分类器应输出Q；

4. 依此递归，直到分类器的结果输出<end>。
    

* 回到用seq2seq生成文章标题这个任务上，模型可以做些简化，并且可以引入一些先验知识。比如，由于输入语言和输出语言都是中文，因此encoder和decoder的Embedding层可以共享参数（也就是用同一套词向量）。这使得模型的参数量大幅度减少了。


In [None]:
from keras.preprocessing.text import Tokenizer # 词表构建 单词过滤 词频统计 序列填充
from keras.preprocessing.sequence import pad_sequences # 序列数据填充
from sklearn.model_selection import train_test_split # 数据集划分