In [1]:
import numpy as np
import tensorflow as tf
import utils7
import sys
import os
import pandas as pd
import pickle
import math
import time
import zhihu_cnn2 as cnn

In [2]:
# Load the word- and character-level dictionaries used to size the vocabularies
# and to build batches.
# NOTE(review): unpickling can execute arbitrary code; only load dictionary
# files that come from a trusted source.
# The files are opened in binary mode (what pickle expects on every platform)
# and inside `with` blocks so the handles are closed as soon as loading is done.
with open('./ieee_zhihu_cup/word_dict', 'rb') as word_dict_file:
    word_dict = pickle.load(word_dict_file)
with open('./ieee_zhihu_cup/char_dict', 'rb') as char_dict_file:
    char_dict = pickle.load(char_dict_file)

In [3]:
# ---- Input geometry -------------------------------------------------------
rnn_seq_length = 80             # handed to utils7.Utils as words_num
cnn_seq_length = 50             # handed to utils7.Utils as chars_num
num_sentences = 2
vocab_size = len(word_dict)     # number of entries in the word dictionary
char_size = len(char_dict)      # number of entries in the character dictionary

# ---- Batching / schedule --------------------------------------------------
batch_size = 256
eval_batch_size = 2000
epoch_num = 20
shuffle = True
eval_per_step = 500             # train() checkpoints + evaluates on multiples of this step
save_per_step = 1000            # only forwarded to cnn.CNN; the periodic save in train() is disabled

# ---- Model hyperparameters (forwarded positionally to cnn.CNN; meanings are
# ---- presumed from the names -- confirm against zhihu_cnn2) ----------------
embed_size = 128
hidden_size = 256
atn_hidden_size = 512
filter_sizes = list(range(1, 6))    # [1, 2, 3, 4, 5]
num_filters = 256
keep_prob = 0.5                 # fed to model.output_keep_prob while training
is_train = True

# ---- Optimisation ---------------------------------------------------------
lr = 1e-2
reg_rate = 1e-4
decay_steps = 2000
decay_rate = 0.9

# ---- Paths and run-time state ---------------------------------------------
ckpt_path = './models/'         # directory predict() searches for the latest checkpoint
summary_dir = './Log/'          # directory train() writes TensorFlow summaries to
max_score = 0.0                 # best average eval score so far; updated by save_best_model()

In [4]:
# Batch provider used by the training, evaluation and prediction code below.
# Every argument comes from the configuration cell; `shuffle` now follows the
# `shuffle` flag defined there instead of a hard-coded True, so that changing
# the flag in one place affects both this object and the model.
utils = utils7.Utils(
    words_num=rnn_seq_length,
    chars_num=cnn_seq_length,
    batch_size=batch_size,
    eval_batch_size=eval_batch_size,
    dataset_dir='./ieee_zhihu_cup/',
    epoch_num=epoch_num,
    word_dict=word_dict,
    char_dict=char_dict,
    shuffle=shuffle,
)

In [5]:
def train(sess, utils, num_eval_batches=20, min_eval_step=10000):
    """Train the CNN model, checkpointing and evaluating it periodically.

    The model is built from the module-level hyperparameters, all variables
    are initialised, and every training batch is run through
    ``model.train_op``. A summary is written after each step. Whenever the
    global step is a multiple of ``eval_per_step`` and larger than
    ``min_eval_step``, a checkpoint is saved to ``models/cnn-model-<step>``,
    ``num_eval_batches`` evaluation batches are scored with ``do_eval`` and
    the scores are handed to ``save_best_model``.

    Args:
        sess: open ``tf.Session`` used for every graph execution.
        utils: batch provider exposing ``generate_t_batch()``,
            ``generate_e_batch()`` and ``eval()``.
        num_eval_batches: number of evaluation batches pulled per evaluation
            round (default 20, the previously hard-coded value).
        min_eval_step: evaluation/checkpointing only starts after this global
            step (default 10000, the previously hard-coded value).
    """
    # Build the training and evaluation batch generators.
    batches = utils.generate_t_batch()
    eval_batches = utils.generate_e_batch()
    model = cnn.CNN(cnn_seq_length, rnn_seq_length, batch_size, eval_batch_size, hidden_size,
                    lr, reg_rate, epoch_num, save_per_step, eval_per_step, keep_prob,
                    atn_hidden_size, shuffle, ckpt_path, num_sentences,
                    filter_sizes, num_filters, decay_steps, decay_rate,
                    vocab_size, char_size, embed_size, is_train)
    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(summary_dir, graph=tf.get_default_graph())
    try:
        for batch in batches:
            # The second field of each sample is not fed to the model.
            x_word, _, y = zip(*batch)
            feed_dict = {model.rnn_input: x_word,
                         model.output_keep_prob: keep_prob,
                         model.labels: y}
            train_loss, step, summary, _ = sess.run(
                [model.loss, model.global_step, model.merged, model.train_op],
                feed_dict)
            summary_writer.add_summary(summary, step)

            if step % eval_per_step == 0 and step > min_eval_step:
                path = model.saver.save(sess, "models/cnn-model", global_step=step)
                print("Saved model checkpoint to {}".format(path))

                # Single-string prints behave the same with and without
                # print_function (a multi-argument print(...) would show a
                # tuple under Python 2).
                print("Step: {}".format(step))
                print("Train loss: {}".format(train_loss))

                score_list = []
                eval_loss_list = []
                for _ in range(num_eval_batches):
                    # next() works on both Python 2 and Python 3 iterators.
                    eval_x, _, eval_y, real_labels = zip(*next(eval_batches))
                    score, eval_loss = do_eval(utils, sess, model, eval_x, eval_y, real_labels)
                    score_list.append(score)
                    eval_loss_list.append(eval_loss)
                print("avg eval loss: {}".format(np.mean(eval_loss_list)))
                save_best_model(score_list, step)
    finally:
        # Flush and close the event file even when training is interrupted
        # (e.g. by KeyboardInterrupt), so already-written summaries are kept.
        summary_writer.close()
            
def do_eval(utils, sess, model, rnn_input, y, real_labels):
    """Evaluate one batch on the validation set and report loss and score.

    Args:
        utils: object whose ``eval(pairs)`` turns a list of
            ``(predicted_top5, real_labels)`` pairs into a score.
        sess: ``tf.Session`` the model lives in.
        model: model exposing ``rnn_input``, ``output_keep_prob``, ``labels``,
            ``loss`` and ``logits``.
        rnn_input: batch fed to ``model.rnn_input``.
        y: batch fed to ``model.labels``.
        real_labels: per-sample iterables of true labels, paired with the
            predictions in order.

    Returns:
        ``(score, loss)`` where ``score`` is whatever ``utils.eval`` returns
        and ``loss`` is the value of ``model.loss`` on this batch.
    """
    # Build the top-5 op only once per model. Calling tf.nn.top_k on every
    # evaluation batch would add a new node to the graph each time, so the
    # graph (and memory use) would grow for as long as training runs.
    predict_top_5 = getattr(model, '_top5_op', None)
    if predict_top_5 is None:
        predict_top_5 = tf.nn.top_k(model.logits, k=5)
        model._top5_op = predict_top_5

    feed_dict = {model.rnn_input: rnn_input,
                 model.output_keep_prob: 1.0,  # keep probability 1.0 for evaluation
                 model.labels: y}
    curr_eval_loss, predict_5 = sess.run([model.loss, predict_top_5], feed_dict)

    # top_k yields (values, indices); the indices are the predicted label ids.
    top5_indices = predict_5[1]
    predict_label_and_marked_label_list = [
        (list(predicted), list(marked))
        for predicted, marked in zip(top5_indices, real_labels)
    ]
    score = utils.eval(predict_label_and_marked_label_list)
    return score, curr_eval_loss
    
def predict(utils, restore=True):
    """Run the model over the prediction set and save the top-5 label ids.

    The model is built from the module-level hyperparameters inside a fresh
    graph, so a model that an earlier cell (e.g. training) already placed in
    the default graph cannot clash with it. The top-5 indices of every
    prediction batch are collected and written, one row per sample, to
    ``./Result/predict.txt``.

    Args:
        utils: batch provider exposing ``generate_p_batch()``.
        restore: when True (default) the weights are restored from the latest
            checkpoint found in ``ckpt_path``; when False the variables are
            freshly initialised instead.

    Raises:
        ValueError: if ``restore`` is True and no checkpoint exists in
            ``ckpt_path``.
    """
    graph = tf.Graph()
    with graph.as_default(), tf.Session(graph=graph) as sess:
        # NOTE(review): the model is still constructed with the module-level
        # is_train flag, exactly as before -- confirm that is intended for
        # inference.
        model = cnn.CNN(cnn_seq_length, rnn_seq_length, batch_size, eval_batch_size,
                        hidden_size, lr, reg_rate, epoch_num, save_per_step, eval_per_step,
                        keep_prob, atn_hidden_size, shuffle, ckpt_path, num_sentences,
                        filter_sizes, num_filters, decay_steps, decay_rate,
                        vocab_size, char_size, embed_size, is_train)
        if restore:
            latest_ckpt = tf.train.latest_checkpoint(ckpt_path)
            if latest_ckpt is None:
                raise ValueError("No checkpoint found in {}".format(ckpt_path))
            model.saver.restore(sess=sess, save_path=latest_ckpt)
        else:
            # Previously this path failed because `model` was never created.
            sess.run(tf.global_variables_initializer())

        predict_top_5 = tf.nn.top_k(model.logits, k=5)
        pred_batches = utils.generate_p_batch()

        # Collect per-batch results and concatenate once at the end instead of
        # re-copying the growing array on every batch.
        top5_chunks = []
        for count, pred_batch in enumerate(pred_batches):
            sys.stdout.write("Count %d\r" % count)
            sys.stdout.flush()
            x_word, _ = zip(*pred_batch)
            feed_dict = {model.rnn_input: x_word,
                         model.output_keep_prob: 1.0}
            predict_5 = sess.run(predict_top_5, feed_dict=feed_dict)
            top5_chunks.append(predict_5[1])  # [1] holds the top-k indices

        if top5_chunks:
            predictions = np.concatenate(top5_chunks)
        else:
            # No prediction batches at all: write an empty result file.
            predictions = np.empty((0, 5), dtype=int)

        result_dir = "./Result"
        if not os.path.isdir(result_dir):
            os.makedirs(result_dir)
        np.savetxt("./Result/predict.txt", predictions, fmt='%d')
        
def save_best_model(score_list, step):
    """Log the average evaluation score and keep the best checkpoint so far.

    The mean of ``score_list`` is appended to ``./best_model/avg_loss_file``
    (the directory is created on first use). If the directory had to be
    created, or the mean beats the module-level ``max_score``, ``max_score``
    is updated, any previously kept ``cnn-model*`` files are removed from
    ``./best_model/`` and the files of checkpoint ``./models/cnn-model-<step>``
    are copied there.

    Args:
        score_list: scores of the evaluation batches of this round.
        step: global step of the checkpoint that was just saved.
    """
    # Local imports keep this block self-contained; glob/shutil replace the
    # shell `rm`/`cp` calls.
    import glob
    import shutil

    global max_score
    avg_score = np.mean(score_list)
    avg_score_str = "%d steps avg score: %f\n" % (step, avg_score)
    file_name = 'avg_loss_file'
    save_path = './best_model/'

    first_save = not os.path.exists(save_path)
    if first_save:
        os.mkdir(save_path)

    # Always log the score (the first call used to skip this) and close the
    # file so the line is flushed to disk.
    with open(save_path + file_name, 'a') as score_log:
        score_log.write(avg_score_str)

    if first_save or avg_score > max_score:
        # Keep max_score in sync with the checkpoint that is actually stored;
        # the first call used to leave it at its initial value.
        max_score = avg_score
        for stale in glob.glob(os.path.join(save_path, 'cnn-model*')):
            if os.path.isfile(stale):
                os.remove(stale)
        # Match "cnn-model-<step>." exactly so that e.g. step 10500 does not
        # also pick up the files of step 105000.
        for ckpt_file in glob.glob('./models/cnn-model-%d.*' % step):
            shutil.copy(ckpt_file, save_path)

    print(avg_score_str)
    print("--------------------Parting Line---------------------")

In [6]:
# Open a TensorFlow session for the whole training run and hand it to train().
# train() loops over every batch produced by utils.generate_t_batch(), so this
# cell runs until that generator is exhausted or the kernel is interrupted
# (the recorded output below ends with a KeyboardInterrupt).
with tf.Session() as sess:
    train(sess, utils)

Saved model checkpoint to models/cnn-model-10500
('Step:', 10500)
('Train loss:', 0.0050972379)
('avg eval loss:', 0.0052345358)
10500 steps avg score: 0.302046

--------------------Parting Line---------------------
Saved model checkpoint to models/cnn-model-11000
('Step:', 11000)
('Train loss:', 0.0053465725)
('avg eval loss:', 0.0052435682)
11000 steps avg score: 0.303561

--------------------Parting Line---------------------
Saved model checkpoint to models/cnn-model-11500
('Step:', 11500)
('Train loss:', 0.0054845195)
('avg eval loss:', 0.0052352571)
11500 steps avg score: 0.303405

--------------------Parting Line---------------------
Saved model checkpoint to models/cnn-model-12000
('Step:', 12000)
('Train loss:', 0.0048052236)
('avg eval loss:', 0.0052445675)
12000 steps avg score: 0.302362

--------------------Parting Line---------------------
Saved model checkpoint to models/cnn-model-12500
('Step:', 12500)
('Train loss:', 0.0052740923)
('avg eval loss:', 0.0051871547)
12500 s

KeyboardInterrupt: 

In [6]:
# NOTE(review): nothing in this cell stores a parameter/config file, so the
# message printed below does not describe what the cell does -- confirm
# whether it is a leftover from an earlier version.
print("params conf. file store success!")
# Run inference with the default restore=True, i.e. with the weights of the
# latest checkpoint found in ckpt_path. The recorded output shows a checkpoint
# named "4split-rnn-model-59500" being restored, so "latest" is not
# necessarily a checkpoint written by train() in this notebook.
predict(utils)

params conf. file store success!
INFO:tensorflow:Restoring parameters from ./models/4split-rnn-model-59500
('predict_set_num_batches:', 109)
Count 108

In [8]:
# Scratch inspection: create a fresh evaluation-batch generator to look at the
# structure of one batch in the cells below.
batches = utils.generate_e_batch()

In [9]:
# Pull one evaluation batch and transpose it into per-field tuples.
# next() is used instead of the Python-2-only .next() method.
# NOTE(review): five fields are unpacked here, whereas train() unpacks four
# fields from batches of the same generator -- confirm which layout the
# current utils7.Utils actually produces.
a, b, c, y, r = zip(*next(batches))

In [11]:
# Number of samples in the inspected evaluation batch (one entry of `a` per sample).
len(a)

10

In [12]:
# Second field of the batch: in the recorded output, one list of integer ids
# per sample, padded with trailing zeros (presumably word ids -- confirm).
b

([9643,
  10,
  597,
  3933,
  1586,
  27804,
  1252,
  706,
  26963,
  378,
  5328,
  45,
  19,
  158,
  16,
  17,
  301,
  13,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [9213,
  318,
  16554,
  94,
  329,
  2022,
  19917,
  598,
  12438,
  1331,
  267,
  13,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [985,
  64651,
  53,
  3516,
  417211,
  10,
  4762,
  16,
  17,
  13,
  11200,
  10232,
  1,
  64651,
  53,
  169,
  1923,
  1

In [13]:
# Third field of the batch: one integer per sample. In the recorded output the
# first values (18, 12) equal the number of non-zero ids in the matching rows
# of `b`, so these are presumably the unpadded sequence lengths -- confirm.
c

(18, 12, 45, 80, 11, 15, 58, 7, 18, 80)

In [15]:
# Fifth field of the batch: in the recorded output, one variable-length list
# of integer ids per sample (presumably the true label ids that train() calls
# real_labels -- confirm).
r

([1775, 1239],
 [1672],
 [749, 278, 217],
 [904, 254, 163, 119],
 [1190],
 [417],
 [901, 410, 172, 75],
 [299],
 [1151, 534, 511, 377],
 [323])