#  1. Prepare data
* read from redis
* parse the title and abstract
* calculate the term frequency and document frequency
* build character 2 index dict and index to character index

In [1]:
import redis
import json
import h5py
import pickle
import numpy as np
import random
import tensorflow as tf
from tensorflow.contrib import rnn


In [2]:
class Char:
    def __init__(self,val,tf,df):
        self.val = val
        self.tf = tf
        self.df = df

In [3]:
def parse_all_crawled_data():
    res = []
    keys =  r.keys()
    print "Get [%s] Docs"%(len(keys))
    for data in r.mget(keys):
        data = json.loads(data)
        key = data.get("group_id")
        title = data.get("title").replace('\t',' ')
        abstract = data.get("abstract").replace('\t',' ')
        res.append((key,title,abstract))
    return res    

In [4]:
def cal_char_tf_df(corpus):
    chars = {}
    for doc in corpus:    
        title, abstract = doc[1],doc[2]
        # traverse every char in the text
        text = (title + abstract).lower()
        # acumulate the term frequency
        for char in text:
            if not chars.get(char):
                chars[char] = Char(val = char,tf = 1,df = 0)
            else:
                chars[char].tf += 1
        # acummulate the doc frequency
        for char in set(text):
            chars[char].df += 1
    return chars

In [5]:
def build_idx_for_chars_tf_df(chars,tf_thres = 12,df_thres = 6):
    id_beg = 0
    id_eos = 1
    id_emp = 2
    id_unk = 3
    
    start_idx = id_unk + 1

    char2idx = {}
    idx2char = {}

    char2idx['<eos>'] = id_eos
    char2idx['<unk>'] = id_unk
    char2idx['<emp>'] = id_emp
    char2idx['<beg>'] = id_beg
    #filter out tf>20 and df > 10 terms
    chars = filter(lambda char:char.tf > tf_thres and char.df > df_thres,chars)
    char2idx.update(dict([(char.val,start_idx + idx) for idx,char in enumerate(chars)]))
    idx2char = dict([(idx,char) for char,idx in char2idx.items()])
    return char2idx, idx2char


In [6]:
def prt(label, x):
    print label+':',
    for w in x:
        if w == id_emp:
            continue
        print idx2char[w],
    print

1. Prepare Data

In [9]:
id_beg = 0
id_eos = 1
id_emp = 2
id_unk = 3

total_samples = 26000
val_samples = 1000
train_samples = total_samples - val_samples


DataFile = "data/basic_data_tf.pkl"
UseStoredData = False

if UseStoredData:
    print "use the stored data"
    char2idx, idx2char,X_train, X_test, Y_train, Y_test = pickle.load(open(DataFile))
else:
    r = redis.StrictRedis(host='localhost', port=6379, db=0)
    corpus = parse_all_crawled_data()
    chars_dict = cal_char_tf_df(corpus)

    print "Got [%s] Uniq charaters"%len(chars_dict)
    chars_tf_reverse = sorted(chars_dict.values(),key = lambda x:x.tf,reverse = True)
    print "the Top 10 are:"
    print "\n".join(["%s\t%s\t%s" %(char.val,char.tf,char.df) for char in chars_tf_reverse[:10]])

    char2idx, idx2char = build_idx_for_chars_tf_df(chars_dict.values())
    titles = [[char2idx.get(char,id_unk) for char in doc[1]] for doc in corpus][:total_samples]
    abstracts = [[char2idx.get(char,id_unk) for char in doc[2]] for doc in corpus][:total_samples]
        
    from sklearn.model_selection import train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(abstracts, titles, test_size=val_samples, random_state=10)
    len(X_train), len(Y_train), len(X_test), len(Y_test)

    pickle.dump((char2idx, idx2char,X_train, X_test, Y_train, Y_test),open(DataFile,"wb"),-1)
    


vocab_size = len(char2idx)
print "vocabsize is :[%d]"%vocab_size
i = random.randint(0,len(X_train))
prt('H',Y_train[i])
prt('D',X_train[i])

print len(X_test)
print len(X_train)

Get [26658] Docs
Got [5692] Uniq charaters
the Top 10 are:
，	156756	25825
的	103125	24579
。	52691	21783
一	44808	19812
是	44700	18907
了	34809	16933
不	32371	16205
人	27265	13617
有	27182	15050
在	25308	15057
vocabsize is :[3525]
H: 手 机 曲 面 屏 到 底 有 什 么 用 ？
D: 对 于 手 机 的 曲 面 屏 ， 其 实 并 不 是 一 定 要 有 什 么 作 用 ， 首 先 它 和 现 在 大 部 分 同 质 化 手 机 完 全 不 同 ， 而 且 带 来 了 更 好 的 手 感 和 视 觉 效 果 ， 现 在 的 曲 面 屏 都 是 高 端 手 机 代 表 ， 因 为 它 对 工 艺 技 术 的 有 更 高 的 要 求 ， 所 以 并 不 是 随 便 一 家 手 机 厂 商 都 可 以 做 曲 面 屏 的 。
1000
25000


# 2. Model

## 1. parameters

In [10]:
learning_rate = 0.001
batch_size = 32
display_step = 10
dropout_keep_prob = 1.0


maxlena=150 # 0 - if we dont want to use description at all
maxlent=40
maxlen = maxlena + maxlent
maxlenh = maxlent
maxlend = maxlena

vocab_size = len(char2idx)
embedding_size = 100

empty = id_emp
eos = id_eos
unk = id_unk
beg = id_beg


# for cnn encoder use
filter_sizes = [2,3,4,5,6,8,10,13]
num_filters = 16

# for rnn deocoder use ,GRU cell memory size. same as encoder state
memory_dim = 128

In [11]:
encoder_inputs = tf.placeholder(tf.int32, shape=[None,maxlend], name='encoder_inputs')
decoder_targets = tf.placeholder(tf.int32,shape=(None, maxlenh), name='decoder_targets')
decoder_inputs = tf.placeholder(tf.int32, [None, maxlenh], name = "decoder_inputs")

In [12]:
embeddings = tf.Variable(
    tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))

In [13]:
# cnn as encode
def CNNEncoder(encoder_inputs):
    #train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)
    # to expand one dim for CNN
    embed_expanded = tf.expand_dims(encoder_inputs_embedded,-1)

    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            # Convolution Layer
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            conv = tf.nn.conv2d(
                embed_expanded,
                W,  
                strides=[1, 1, 1, 1], 
                padding="VALID",
                name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            #print h.shape
            # Max-pooling over the outputs
            pooled = tf.nn.max_pool(
                h,  
                ksize=[1, maxlend - filter_size + 1, 1, 1], 
                strides=[1, 1, 1, 1], 
                padding='VALID',
                name="pool")          
            pooled_outputs.append(pooled)
    # Combine all the pooled features
    num_filters_total = num_filters * len(filter_sizes)
    h_pool = tf.concat(pooled_outputs,3)
    #print h_pool.shape
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
    #print h_pool_flat.shape

    with tf.name_scope("dropout"):
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)
    return h_drop

 RNN AS Decoder

In [14]:
def RNNDecoder(encoder_state,decoder_inputs):
    decoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, decoder_inputs)
    #from tensorflow.models.rnn import rnn_cell, seq2seq
    cell = rnn.GRUCell(memory_dim)
    decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
        cell, decoder_inputs_embedded,
        initial_state=encoder_state,
        dtype=tf.float32,scope="plain_decoder1")
    return decoder_outputs, decoder_final_state 


In [32]:
encoder_state = CNNEncoder(encoder_inputs)
decoder_outputs, _ = RNNDecoder(encoder_state,decoder_inputs)

decoder_logits = tf.contrib.layers.linear(decoder_outputs, vocab_size)
labels = tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32)
stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels = labels,
    logits=decoder_logits,
)

loss = tf.reduce_mean(stepwise_cross_entropy)

decoder_prediction = tf.argmax(decoder_logits, 2)

train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)


ValueError: Variable plain_decoder1/gru_cell/gates/weights already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:

  File "/home/hewei/.local/lib/python2.7/site-packages/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py", line 1044, in _linear
    _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], dtype=dtype)
  File "/home/hewei/.local/lib/python2.7/site-packages/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py", line 150, in __call__
    [inputs, state], 2 * self._num_units, True, 1.0))
  File "<ipython-input-14-d003c7c96826>", line 8, in RNNDecoder
    dtype=tf.float32,scope="plain_decoder1")


In [16]:
labels_ = tf.argmax(labels,2)

# 3. Training:

In [20]:
def rpadd(x, maxlen=maxlenh, eos=eos,lpad=True,prefix=None):
    assert maxlen >= 0
    
    if prefix != None:
        x = [prefix] + x
    n = len(x)
    if n > maxlen - 1 :
        x = x[:maxlen - 1]
        n = maxlen - 1
    res = x + [eos] + [empty] * (maxlen - n - 1) 
    assert len(res) == maxlen
    return res

In [21]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [24]:
batch_size = 128
epocs = 1500
saver = tf.train.Saver()

for i in range(epocs):
    j = 0
    while (j < len(X_train)):
        
        encoder_inputs_ = map(lambda x:rpadd(x,maxlend),X_train[j:j+batch_size])
        decoder_inputs_ = map(lambda x:rpadd(x,maxlenh,prefix=beg),Y_train[j:j+batch_size])        
        decoder_targets_ = map(lambda x:x[1:] + [empty],decoder_inputs_)
        
        j = j + batch_size
        _,loss_,labels__,decoder_prediction_ = sess.run([train_op,loss,labels_,decoder_prediction],
            feed_dict={
                encoder_inputs : encoder_inputs_,
                decoder_inputs : decoder_inputs_,
                decoder_targets : decoder_targets_
        })
        
        """
        print "encorder_inputs:", encoder_inputs_
        print "decoder_inputs_:", decoder_inputs_
        print "decoder_targets_", decoder_targets_
        print "lables_",labels__
        print 'decoder_prediction is :' , decoder_prediction_
        """
        if j % (batch_size * 1) == 0:
            print "Runing in EPOC[%d] Batch [%d]" %(i, j / batch_size)
            print "loss", loss_
            
            k = random.randint(0,len( encoder_inputs_)-1)
            print "-" * 20
            x = [beg]
            for i in range(maxlenh):
                
                decoder_prediction = sess.run([decoder_prediction],
                         feed_dict = {
                            encoder_inputs : [encoder_inputs_[k]],
                            decoder_inputs : [rpadd(x,maxlenh,prefix=beg)]
                         }
                )
                if decoder_prediction[i] == eos:
                    pass
                else:
                    x.append(decoder_prediction[i])
            prt("[*预测标题*]",x)
                
            prt("[**描  述**]",encoder_inputs_[k])
            prt("[*预测标题*]",decoder_prediction_[k])
            prt("[*真实标题*]",decoder_inputs_[k])
            print "-" * 20
            
    if i % 10 == 0:
        saver.save(sess,"model/TitleGeneration",global_step = i)
        
    

Runing in EPOC[0] Batch [10]
loss 2.55794
Runing in EPOC[0] Batch [20]
loss 2.51331
Runing in EPOC[0] Batch [30]
loss 2.6242
Runing in EPOC[0] Batch [40]
loss 2.57741
Runing in EPOC[0] Batch [50]
loss 2.71939
Runing in EPOC[0] Batch [60]
loss 2.76793
Runing in EPOC[0] Batch [70]
loss 2.66793
Runing in EPOC[0] Batch [80]
loss 2.76089
Runing in EPOC[0] Batch [90]
loss 2.82169
Runing in EPOC[0] Batch [100]
loss 2.75839
Runing in EPOC[0] Batch [110]
loss 2.83365
Runing in EPOC[0] Batch [120]
loss 2.7559
Runing in EPOC[0] Batch [130]
loss 2.89953
Runing in EPOC[0] Batch [140]
loss 2.67833
Runing in EPOC[0] Batch [150]
loss 2.74658
Runing in EPOC[0] Batch [160]
loss 2.58725
Runing in EPOC[0] Batch [170]
loss 2.64656
Runing in EPOC[0] Batch [180]
loss 2.67401
Runing in EPOC[0] Batch [190]
loss 2.75071
Runing in EPOC[1] Batch [10]
loss 2.53219
Runing in EPOC[1] Batch [20]
loss 2.49097
Runing in EPOC[1] Batch [30]
loss 2.60765
Runing in EPOC[1] Batch [40]
loss 2.55826
Runing in EPOC[1] Batch [5

KeyboardInterrupt: 

In [31]:
test_x = []
test_encode_input = rpadd(X_train[10],maxlend)
prt("test_desc",test_encode_input)
for i in range(maxlenh):
    new_decoder_input = rpadd(test_x,maxlenh,prefix=beg)
    decoder_prediction_ = sess.run([decoder_prediction],
             feed_dict = {
                encoder_inputs : [test_encode_input],
                decoder_inputs : [new_decoder_input]
             }
    )
    #print decoder_prediction
    if decoder_prediction_[0][i] == eos:
        break
    else:
        test_x.append(decoder_prediction[i])
prt("[*预测标题*]",test_x)

test_desc: [ 资 讯 - 牛 车 网 ] 本 田 新 思 域 <unk> y p e   <unk> 量 产 版 已 于 3 月 开 幕 的 2 0 1 7 日 内 瓦 车 展 上 正 式 发 布 ， 也 刚 刚 在 纽 北 夺 回 前 驱 最 速 的 头 衔 ， <unk> y p e <unk> 真 的 算 是 一 辆 神 车 ， 但 因 为 没 有 进 口 所 以 国 内 车 友 对 它 并 不 了 解 ， 但 资 深 的 本 田 粉 却 日 日 夜 夜 盼 着 它 的 到 来 。 <eos>


TypeError: Fetch argument array([[1583, 3168,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
           2,    2,    2,    2,    2,    2,    2]]) has invalid type <type 'numpy.ndarray'>, must be a string or Tensor. (Can not convert a ndarray into a Tensor or Operation.)