#  1. Prepare data
* read from redis
* parse the title and abstract
* calculate the term frequency and document frequency
* build the character-to-index and index-to-character dictionaries

In [1]:
import redis
import json
import h5py
import pickle
import numpy as np
import random
import tensorflow as tf
from tensorflow.contrib import rnn


In [2]:
class Char:
    """Frequency record for a single character.

    Attributes:
        val: the character this record describes.
        tf:  term-frequency counter supplied by the caller.
        df:  document-frequency counter supplied by the caller.
    """
    def __init__(self,val,tf,df):
        # plain data holder: store the three fields exactly as given
        self.val, self.tf, self.df = val, tf, df

In [3]:
def parse_all_crawled_data():
    """Read every crawled document from redis.

    Uses the module-level redis client ``r``. Each stored value is parsed as
    JSON and reduced to its ``group_id``, ``title`` and ``abstract`` fields;
    tab characters in the title and abstract are replaced by single spaces.

    Returns:
        A list of ``(group_id, title, abstract)`` tuples. The list is empty
        when redis holds no keys.
    """
    keys = r.keys()
    print("Get [%s] Docs" % len(keys))
    if not keys:
        # MGET requires at least one key, so do not query with an empty list.
        return []

    res = []
    for data in r.mget(keys):
        if data is None:
            # MGET yields None for a key that no longer exists; skip it
            # instead of letting json.loads fail on None.
            continue
        data = json.loads(data)
        key = data.get("group_id")
        title = data.get("title").replace('\t',' ')
        abstract = data.get("abstract").replace('\t',' ')
        res.append((key,title,abstract))
    return res

In [4]:
def cal_char_tf_df(corpus):
    """Count term and document frequency for every character in ``corpus``.

    Args:
        corpus: iterable of records whose items 1 and 2 are the title and the
            abstract text.

    Returns:
        dict mapping each lowercased character to a ``Char`` whose ``tf`` is
        the total number of occurrences and whose ``df`` is the number of
        documents containing it.
    """
    stats = {}
    for doc in corpus:
        # title and abstract are counted together, case-insensitively
        lowered = (doc[1] + doc[2]).lower()

        # term frequency: one increment per occurrence
        for ch in lowered:
            entry = stats.get(ch)
            if entry:
                entry.tf += 1
            else:
                stats[ch] = Char(val = ch,tf = 1,df = 0)

        # document frequency: one increment per distinct character
        for ch in set(lowered):
            stats[ch].df += 1
    return stats

In [5]:
def build_idx_for_chars_tf_df(chars,tf_thres = 12,df_thres = 6):
    """Build the char -> index and index -> char lookup tables.

    Args:
        chars: iterable of records exposing ``val``, ``tf`` and ``df``.
        tf_thres: a character is kept only if its ``tf`` is strictly greater.
        df_thres: a character is kept only if its ``df`` is strictly greater.

    Returns:
        ``(char2idx, idx2char)``. Indices 0-3 are reserved for ``<beg>``,
        ``<eos>``, ``<emp>`` and ``<unk>``; kept characters are numbered from
        4 in the order they appear in ``chars``.
    """
    # reserved tokens and their fixed indices
    char2idx = {}
    for token, idx in (('<eos>', 1), ('<unk>', 3), ('<emp>', 2), ('<beg>', 0)):
        char2idx[token] = idx

    first_free_idx = 4  # one past the highest reserved index (<unk> = 3)

    # keep only characters above both frequency thresholds
    kept = [c for c in chars if c.tf > tf_thres and c.df > df_thres]
    for offset, c in enumerate(kept):
        char2idx[c.val] = first_free_idx + offset

    # reverse mapping
    idx2char = {idx: ch for ch, idx in char2idx.items()}
    return char2idx, idx2char


In [6]:
def prt(label, x):
    print label+':',
    for w in x:
        if w == id_emp:
            continue
        print idx2char[w],
    print

1. Prepare Data

In [7]:
# Reserved vocabulary indices (same values as in build_idx_for_chars_tf_df).
id_beg = 0
id_eos = 1
id_emp = 2
id_unk = 3

# Dataset size and train/validation split.
total_samples = 26000
val_samples = 1000
train_samples = total_samples - val_samples


DataFile = "data/basic_data_tf.pkl"
UseStoredData = True  # False: rebuild the dataset from redis and overwrite DataFile

if UseStoredData:
    print("use the stored data")
    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # DataFile that was produced by this notebook.
    # The file is written with a binary pickle protocol, so read it in binary mode.
    with open(DataFile, "rb") as data_in:
        char2idx, idx2char,X_train, X_test, Y_train, Y_test = pickle.load(data_in)
else:
    r = redis.StrictRedis(host='localhost', port=6379, db=0)
    corpus = parse_all_crawled_data()
    chars_dict = cal_char_tf_df(corpus)

    print("Got [%s] Uniq charaters" % len(chars_dict))
    chars_tf_reverse = sorted(chars_dict.values(),key = lambda x:x.tf,reverse = True)
    print("the Top 10 are:")
    print("\n".join(["%s\t%s\t%s" %(char.val,char.tf,char.df) for char in chars_tf_reverse[:10]]))

    char2idx, idx2char = build_idx_for_chars_tf_df(chars_dict.values())
    # cal_char_tf_df builds the vocabulary from lowercased text, so the text
    # must be lowercased before lookup too; otherwise every uppercase
    # character is encoded as <unk>.
    sampled_docs = corpus[:total_samples]
    titles = [[char2idx.get(char,id_unk) for char in doc[1].lower()] for doc in sampled_docs]
    abstracts = [[char2idx.get(char,id_unk) for char in doc[2].lower()] for doc in sampled_docs]

    # only needed when the dataset is rebuilt
    from sklearn.model_selection import train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(abstracts, titles, test_size=val_samples, random_state=10)

    with open(DataFile, "wb") as data_out:
        pickle.dump((char2idx, idx2char,X_train, X_test, Y_train, Y_test),data_out,-1)



vocab_size = len(char2idx)
print("vocabsize is :[%d]" % vocab_size)
# random.randint includes its upper bound, so the last valid index is len - 1
i = random.randint(0,len(X_train) - 1)
prt('H',Y_train[i])
prt('D',X_train[i])

print(len(X_test))
print(len(X_train))

use the stored data
vocabsize is :[3525]
H: 白 百 何 羽 凡 感 情 破 裂 豪 宅 被 曝 ， 情 虽 断 但 这 家 还 在 装 修 没 得 说 ！
D: 在 白 百 合 出 轨 事 前 ， 她 和 陈 羽 凡 是 一 对 羡 煞 旁 人 的 恩 爱 夫 妻 。 此 前 ， 羽 凡 凭 着 优 秀 的 音 乐 拿 到 了 某 音 乐 节 上 的 歌 王 ， 然 后 白 百 合 这 边 便 随 着 《 失 恋 3 3 天 》 火 了 。 从 此 ， 百 合 的 星 途 就 是 一 路 绿 灯 ， 然 后 凭 借 着 一 张 更 具 有 灵 气 的 脸 蛋 抢 了 王 <unk> 丹 的 地 位 ， 后 来 更 是 随 着 《 捉 妖 记 》 名 气 大 增 ， 成 为 内 地 一 线 女 演 员 。
1000
25000


# 2. Model

## 1. parameters

In [8]:
# --- optimisation ---
learning_rate, batch_size, display_step = 0.001, 32, 10
dropout_keep_prob = 1.0

# --- sequence lengths ---
# abstract ("description") length; original author's note: 0 if the
# description should not be used at all
maxlend = maxlena = 150
# title ("headline") length
maxlenh = maxlent = 40
maxlen = maxlena + maxlent

# --- vocabulary / embedding ---
vocab_size = len(char2idx)
embedding_size = 100

# short aliases for the reserved token indices
empty, eos, unk, beg = id_emp, id_eos, id_unk, id_beg

# --- CNN encoder ---
filter_sizes = [2,3,4,5,6,8,10,13]
num_filters = 16

# --- RNN decoder ---
# GRU cell memory size, same as the encoder state
memory_dim = 128

In [9]:
# Graph inputs: int32 index matrices with a free batch dimension.
encoder_inputs = tf.placeholder(dtype=tf.int32, shape=(None, maxlend), name='encoder_inputs')
decoder_targets = tf.placeholder(dtype=tf.int32, shape=(None, maxlenh), name='decoder_targets')
decoder_inputs = tf.placeholder(dtype=tf.int32, shape=(None, maxlenh), name="decoder_inputs")

In [10]:
# Embedding table with one embedding_size-wide row per vocabulary index,
# initialised uniformly between -1.0 and 1.0.
initial_embeddings = tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0)
embeddings = tf.Variable(initial_embeddings, name="embeddings")

In [11]:
# CNN as encoder
def CNNEncoder(encoder_inputs):
    """Encode index sequences with parallel convolutions and max-pooling.

    For every width in ``filter_sizes`` a convolution spanning the full
    embedding width is applied, followed by ReLU and a max-pool over all
    positions. The pooled features of all widths are concatenated, flattened
    to ``num_filters * len(filter_sizes)`` values per sample and passed
    through dropout with ``dropout_keep_prob``.

    Args:
        encoder_inputs: int tensor of vocabulary indices. The pooling window
            is sized from ``maxlend``, so the sequence axis is assumed to be
            ``maxlend`` long (as the ``encoder_inputs`` placeholder declares).

    Returns:
        The dropout output tensor, one flattened feature row per sample.
    """
    embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)
    # conv2d needs a channel axis, so append one of size 1
    conv_input = tf.expand_dims(embedded, -1)

    def conv_and_pool(width):
        # one conv -> relu -> max-pool branch for a single filter width
        with tf.name_scope("conv-maxpool-%s" % width):
            W = tf.Variable(
                tf.truncated_normal([width, embedding_size, 1, num_filters], stddev=0.1),
                name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            conv = tf.nn.conv2d(
                conv_input, W,
                strides=[1, 1, 1, 1], padding="VALID", name="conv")
            activated = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            # the window covers every valid conv position, leaving one value per filter
            return tf.nn.max_pool(
                activated,
                ksize=[1, maxlend - width + 1, 1, 1],
                strides=[1, 1, 1, 1], padding='VALID', name="pool")

    pooled_outputs = [conv_and_pool(width) for width in filter_sizes]

    # join the branches along the filter axis, then flatten per sample
    h_pool = tf.concat(pooled_outputs, 3)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters * len(filter_sizes)])

    with tf.name_scope("dropout"):
        return tf.nn.dropout(h_pool_flat, dropout_keep_prob, name="dropout")

 RNN AS Decoder

In [12]:
def RNNDecoder(encoder_state,decoder_inputs):
    """Run a GRU over the embedded decoder inputs, starting from ``encoder_state``.

    Args:
        encoder_state: tensor used as the GRU's initial state.
        decoder_inputs: int tensor of vocabulary indices fed to the decoder.

    Returns:
        ``(decoder_outputs, decoder_final_state)`` as produced by
        ``tf.nn.dynamic_rnn``.
    """
    embedded = tf.nn.embedding_lookup(embeddings, decoder_inputs)
    gru = rnn.GRUCell(memory_dim)
    outputs, final_state = tf.nn.dynamic_rnn(
        gru,
        embedded,
        initial_state=encoder_state,
        dtype=tf.float32,
        scope="plain_decoder1")
    return outputs, final_state


In [13]:
# Encoder output is used directly as the GRU initial state, so its width
# (num_filters * len(filter_sizes)) has to equal memory_dim.
encoder_state = CNNEncoder(encoder_inputs)
decoder_outputs, _ = RNNDecoder(encoder_state,decoder_inputs)

# Project every decoder step onto the vocabulary.
decoder_logits = tf.contrib.layers.linear(decoder_outputs, vocab_size)
labels = tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32)
stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels = labels,
    logits=decoder_logits,
)

# Mean over every position of every sequence in the batch.
loss = tf.reduce_mean(stepwise_cross_entropy,name = "loss")

# Most likely vocabulary index at each decoder step.
decoder_prediction = tf.argmax(decoder_logits, 2,name = "decoder_prediction")

# Use the learning_rate hyper-parameter instead of a second hardcoded copy.
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss,name = "op_adam_minize")


In [14]:
# Index form of the one-hot `labels` tensor (argmax over axis 2).
labels_ = tf.argmax(labels, axis=2, name="labels_")

# 3. Training:

In [15]:
def rpadd(x, maxlen=maxlenh, eos=eos,lpad=True,prefix=None):
    """Fit a token list to exactly ``maxlen`` entries.

    The result is ``[prefix] + x`` (prefix only when given), cut to at most
    ``maxlen - 1`` tokens, followed by one ``eos`` marker and then filled up
    with the module-level ``empty`` index.

    Args:
        x: list of token indices; it is not modified.
        maxlen: length of the returned list, at least 1.
        eos: index appended after the (possibly truncated) tokens.
        lpad: unused; kept so existing callers keep working.
        prefix: optional index placed in front of ``x``.

    Returns:
        A new list of length ``maxlen``.

    Raises:
        AssertionError: if ``maxlen`` is smaller than 1.
    """
    # eos always takes one slot, so a shorter result is impossible
    assert maxlen >= 1, "maxlen must leave room for the eos marker"

    # identity check: a prefix of 0 (the <beg> index) is a valid value
    if prefix is not None:
        x = [prefix] + x

    # slicing is a no-op when x is already short enough
    x = x[:maxlen - 1]
    res = x + [eos] + [empty] * (maxlen - len(x) - 1)
    assert len(res) == maxlen
    return res

In [16]:
import logging
import os
import sys

# Python 2 only: make utf-8 the implicit codec so byte-string labels with
# non-ASCII characters survive the .encode("utf-8") calls in prt below.
reload(sys)
sys.setdefaultencoding("utf-8")

# Both output files live in log/, so make sure the directory exists.
if not os.path.isdir("log"):
    os.makedirs("log")

outfile = open("log/train.20170520.samples",'w')

def prt(label, x):
    """Append ``label:`` and the decoded characters of ``x`` to ``outfile``.

    Replaces the earlier stdout-printing ``prt``. ``x`` is a sequence of
    indices looked up in ``idx2char``; entries equal to ``id_emp`` are
    skipped. One line is written per call and flushed immediately.
    """
    outfile.write((label+':').encode("utf-8"))
    for w in x:
        if w == id_emp:
            continue
        outfile.write(idx2char[w].encode("utf-8"))
    outfile.write("\n")
    outfile.flush()

logger = logging.getLogger('training')
# getLogger returns the same logger on every run of this cell; only attach the
# file handler once, otherwise every message is written once per re-run.
if not logger.handlers:
    hdlr = logging.FileHandler('log/train.20170520.log')
    logger.addHandler(hdlr)
logger.setLevel(logging.INFO)

In [None]:
RESTORE = False
batch_size = 128  # overrides the value from the parameter cell
epocs = 1500

saver = tf.train.Saver()
with tf.Session() as sess:

    if RESTORE:
        # The graph is already built by the cells above, so the saver created
        # from it can load the weights directly; importing the meta graph
        # again would add a second copy of the graph.
        # NOTE(review): weights are restored from 'model/' but checkpoints are
        # written to 'model2/' below - confirm this is intended.
        saver.restore(sess,tf.train.latest_checkpoint('model/'))
    else:
        # Initialise only when starting from scratch; running the initializer
        # after a restore would overwrite the restored weights.
        sess.run(tf.global_variables_initializer())

    graph = tf.get_default_graph()
    for i in range(epocs):
        j = 0
        while (j < len(X_train)):

            # Build one padded batch. The targets are the decoder inputs
            # shifted left by one position, padded with `empty`.
            encoder_inputs_ = [rpadd(x,maxlend) for x in X_train[j:j+batch_size]]
            decoder_inputs_ = [rpadd(x,maxlenh,prefix=beg) for x in Y_train[j:j+batch_size]]
            decoder_targets_ = [x[1:] + [empty] for x in decoder_inputs_]

            j = j + batch_size
            _,loss_,labels__,decoder_prediction_ = sess.run([train_op,loss,labels_,decoder_prediction],
                feed_dict={
                    encoder_inputs : encoder_inputs_,
                    decoder_inputs : decoder_inputs_,
                    decoder_targets : decoder_targets_
            })

            # Every 30 batches: log the loss and write one decoded sample.
            if j % (batch_size * 30) == 0:
                logger.info( "Runing in EPOC[%d] Batch [%d] with loss [%f]" %(i, j // batch_size,loss_))
                k = random.randint(0,len( encoder_inputs_)-1)
                print("-" * 20)
                prt("[**描  述**]",encoder_inputs_[k])

                # Greedy decoding: feed back what has been generated so far
                # and take the prediction at the next position until <eos>.
                test_x = []
                test_encode_input = encoder_inputs_[k]

                for l in range(maxlenh):
                    new_decoder_input = rpadd(test_x,maxlenh,prefix=beg)
                    step_prediction = sess.run([decoder_prediction],
                             feed_dict = {
                                encoder_inputs : [test_encode_input],
                                decoder_inputs : [new_decoder_input]
                             }
                    )
                    next_token = step_prediction[0][0][l]
                    test_x.append(next_token)
                    if next_token == eos:
                        break
                prt("[*预测标题*]",test_x)
                prt("[*真实标题*]",decoder_inputs_[k])

                print("-" * 20)

        # Checkpoint every 10th epoch.
        if i % 10 == 0:
            saver.save(sess,"model2/TitleGeneration",global_step = i)
    

    