# img2text implementation 
- 使用的是show and tell model
- 使用flckr30k dataset
- 使用已有的inception v3提取img features

## generate vocab

In [None]:
import os
import sys
from pprint import pprint

input_description_file = './flickr 30k/results_20130124.token'
output_vocab_file = './flickr 30k/vocab.txt'

def count_vocab(input_file):
    """get vocab and distribution of disc length"""
    with open(input_file,'r',encoding='utf-8') as f:
        lines = f.readlines()
    max_length = 0
    length_dict = {}
    vocab_dict = {}
    for line in lines:
        img_id,desc = line.strip('\n').split('\t')
        words = desc.strip(' ').split()
        max_length = max(max_length,len(words))
        length_dict.setdefault(len(words),0)
        length_dict[len(words)] += 1
        for word in words:
            vocab_dict.setdefault(word,0)
            vocab_dict[word] += 1
    print(max_length)
    pprint(length_dict)
    return vocab_dict

vocab_dict = count_vocab(input_description_file)


In [None]:
def store_vocab(vocab_dict,output_file):
    vocab_dict = sorted(vocab_dict.items(),
                        key=lambda w:w[1],reverse=True)
    with open(output_file,'w',encoding='utf-8') as f:
        f.write('<UNK>\t1000000\n')
        for item in vocab_dict:
            f.write('%s\t%d\n'%item)

store_vocab(vocab_dict,output_vocab_file)

## extract img feature
- use freeze graph tool to combine checkpoint and graph
- check point only save params
- graph contain graph structure
- pb file save graph and checkpoint
    - can get graph structure and params

In [16]:
import tensorflow as tf
from tensorflow import logging
from tensorflow import gfile
import numpy as np
import math
import sklearn 
from pprint import pprint
import os

model_file = './flickr 30k/inception-2015-12-05/classify_image_graph_def.pb'
input_img_dir = './flickr 30k/flickr30k-images'
input_description_file = './flickr 30k/results_20130124.token'
output_dir = './flickr 30k/img_features'

# 防止特征文件过大 所以分开存储在一个dir下
img_feature_batch_size = 1000
'''
使用gfile 
1 保证兼容性更好 
2 在分布式环境下 tf宝恒可移植性 os不保证
'''
if not gfile.Exists(output_dir):
    gfile.MakeDirs(output_dir)

### token prepocessing

In [2]:
def parse_token_file(token_file):
    """
    解析tokenfile 得到一个图像 对应的disc list的dict
    """
    img_name_to_tokens = {}
    with gfile.GFile(token_file,'r') as f:
        lines = f.readlines()
    for line in lines:
        img_id,descs = line.strip('\r\n').split('\t')
        img_name,_ = img_id.split('#')
        img_name_to_tokens.setdefault(img_name,[])
        img_name_to_tokens[img_name].append(descs)
    return img_name_to_tokens
img_name_to_tokens = parse_token_file(input_description_file)

In [3]:
print(img_name_to_tokens['1000092795.jpg'])
img_names = list(img_name_to_tokens.keys())
logging.info('num of imgs is %d\n' % len(img_names))
print(type(img_names))

['Two young guys with shaggy hair look at their hands while hanging out in the yard .', 'Two young , White males are outside near many bushes .', 'Two men in green shirts are standing in a yard .', 'A man in a blue shirt standing in a garden .', 'Two friends enjoy time spent together .']
INFO:tensorflow:num of imgs is 31783

<class 'list'>


### load model

In [13]:
def load_prepocessing_inception_v3_model(model_file):
    with gfile.FastGFile(model_file,'rb') as f:
        '''
        - 获取图定义 
        - 解析已有图定义 
        - 导入图作为默认使用图
            - 效果：后面打开的session默认使用这个graph_def 并且tf.get_default_graph得到的是这个图
        '''
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def,name='inceptionv3_def')

load_prepocessing_inception_v3_model(model_file)

In [20]:
# 由于上面已经import 
# 所以sess 默认的图是inceptionv3 的graph
import pickle
batch_num = int(len(img_names)/hps.batch_size)
if len(img_names) % hps.batch_size != 0:
    batch_num += 1
def get_all_variables_and_tensors_name(sess):
    '''获取全部variable name,tensor name and op name
    由于导入的已训练好的的图结构 所以variables为空 
    但是可以通过graph_editor.get_tensors(graph) 获得tensor(output)
    也可以通过sess.graph_def 得到响应的属性 如node 
    
    通过以上操作 可以得到op_name variables_name 从而可以通过sess.graph.get_tensor_by_name 获取相应的tensor/op
    并且可以通过feed_dict 对于制定的placeholder进行输入 
    '''
    
    print(tf.trainable_variables())
    print('---------------------------------')
    print('---------------------------------')
    # 获取所有variable（每个op中可训练的张量）的name：
    for variable_name in tf.global_variables():
        print(variable_name)
    print('---------------------------------')
    print('---------------------------------')
    # 获取所有tensor（每个op的输出张量）的name
    for tensor_name in tf.contrib.graph_editor.get_tensors(tf.get_default_graph()):
        print(tensor_name)
    print('---------------------------------')
    print('---------------------------------')
    # 获取所有op及其输入输出的name：
    for op_node in sess.graph_def.node:
        print(op_node)
'''
get 
Tensor("inceptionv3_def/DecodeJpeg/contents:0", shape=(), dtype=string)
Tensor("inceptionv3_def/pool_3:0", shape=(1, 1, 1, 2048), dtype=float32)
'''        

# with tf.Session() as sess:
#     get_all_variables_and_tensors_name(sess)

with tf.Session() as sess:
#     get tensor by sess.graph by tensor name
#     获取倒数第二层输出 是global_pooling output
    second_to_last_tensor = sess.graph.get_tensor_by_name(
                            'inceptionv3_def/pool_3:0')
    for i in range(batch_num):
        # TypeError: 'dict_keys' object is not subscriptable
        # python3 中keys对象可以迭代 不可索引
        batch_img_names = img_names[i*hps.batch_size:(i+1)*hps.batch_size]
        batch_features = []
        for img_name in batch_img_names:
            img_path = os.path.join(input_img_dir,img_name)
            logging.info('getting feature from %s' % img_path)
            if not gfile.Exists(img_path):
                print('not exit')
                continue
            img_data = gfile.FastGFile(img_path,'rb').read()
            # 通过sess运行tensor 得到feature
            feature = sess.run(second_to_last_tensor,
                              feed_dict={
                                  'inceptionv3_def/DecodeJpeg/contents:0':img_data
                              })
            # feed_dict 中placeholder 以及tensor的名字需要通过查看graph信息得到(??)
            batch_features.append(feature)
        print(len(batch_features))
        # list 长度为0会报错
        batch_features = np.vstack(batch_features)
        output_filename = os.path.join(output_dir,
                                      'img_feature-%d.pickle'%i)
        logging.info('writing to file %s' % output_filename)
        with gfile.GFile(output_filename,'wb') as f:
            pickle.dump((batch_img_names,batch_features),f)
 

INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1000092795.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\10002456.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1000268201.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1000344755.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1000366164.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1000523639.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1000919630.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\10010052.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1001465944.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1001545525.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1001573224.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1001633352.jpg
INFO:tensorflow:gett

INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1029737941.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1029802110.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\102998070.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1030041880.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\103031977.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1030985833.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\103106960.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\103195344.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1031973097.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\103205630.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1032122270.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1032460886.jpg
INFO:tensorflow:getti

INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1067180831.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1067675215.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1067790824.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1068427675.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\106959209.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\107022557.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1071201387.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1072153132.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1072439304.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1073145694.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\107318069.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1073444492.jpg
INFO:tensorflow:get

INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\109823395.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\109823397.jpg
60
INFO:tensorflow:writing to file ./flickr 30k/img_features\img_feature-4.pickle
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1099034049.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1099434461.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\109982467.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1100214449.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1100708949.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1101207553.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1101291373.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\1101923011.jpg
INFO:tensorflow:getting feature from ./flickr 30k/flickr30k-images\110201971.jpg
INFO:tensorflow:gett

KeyboardInterrupt: 

## data generator
- load vocab
- load img features
- provid data for training

In [17]:
input_description_file = './flickr 30k/results_20130124.token'
input_img_feature_dir = './flickr 30k/img_features'
run_out_dir = './flickr 30k/run_out'
input_vocab_file = './flickr 30k/vocab.txt'

if not gfile.Exists(run_out_dir):
    gfile.MakeDirs(run_out_dir)

### hyper parameters

In [27]:
'''定义超参数'''
def get_default_params():
    """类似rnn中的超参数"""
    return tf.contrib.training.HParams(
        num_vocab_word_threshold = 5,
        num_embedding_nodes = 32,
        num_timesteps = 15,
        num_lstm_nodes = [64,64],
        num_lstm_layers = 2,
        num_fc_nodes = 32,
        cell_type = 'lstm',
        keep_prob_rnn = 0.8,
        keep_prob_fc = 0.7,
        
        learning_rate = 1e-3,
        clip_lstm_grad = 1.0,
        
        batch_size = 60,
        log_frequent = 100,
        save_frequent = 500,
    )
hps = get_default_params()

### vocab 封装

In [8]:
class Vocab:
    """vocab 封装 word2id  id2word 不同于txtclassification
    """
    def __init__(self,filename,word_num_threshold):
        self._filename = filename
        self._word_num_threshold = word_num_threshold
        self._word2id_dict = {}
        self._id2word_dict = {}
        self._unk = -1
        self._eos = -1
        self._read_dict(self._filename)
        
    def _read_dict(self,filename):
        with gfile.GFile(filename,'r') as f:
            lines = f.readlines()
        for line in lines:
            word,freq = line.strip('\r\n').split('\t')
            freq = int(freq)
            if freq < self._word_num_threshold:
                continue
            idx = len(self._word2id_dict)
            if word == '<UNK>':
                self._unk = idx
            elif word == '.':
                self._eos = idx
            if word in self._word2id_dict or idx in self._id2word_dict:
                raise Exception('duplicated words in vocab')
            self._word2id_dict[word] = idx
            self._id2word_dict[idx] = word
    @property
    def unk(self):
        return self._unk
    @property
    def eos(self):
        return self._eos
    def id2word(self,idx):
        return self._id2word_dict.get(idx,'<UNK>')
    def word2id(self,word):
        return self._word2id_dict.get(word,self._unk)
    def size(self):
        return len(self._word2id_dict)
    def encode(self,sentence):
        '''encode sentence into id list'''
        return [self.word2id(word) for word in sentence.split(' ')]
    def decode(self,idlist):
        '''decode id list into sentence'''
        return ' '.join([self.id2word(idx) for idx in idlist])
        
vocab = Vocab(input_vocab_file,hps.num_vocab_word_threshold)
vocab_size = vocab.size()
testidlist = vocab.encode('hello , nice to meet you')
print(vocab.decode([5,6,7,8,9]))
print(vocab_size)
print(testidlist)

the on and is man
8186
[6081, 12, 994, 15, 2144, 1386]


### 辅助函数 借助前面的tokens_dict+vocab->token_ids_dict

In [9]:
def convert_tokens_to_id(img_name_to_tokens,vocab):
    """from token dict to id dict"""
    img_name_to_token_ids = {}
    for img_name in img_name_to_tokens.keys():
        img_name_to_token_ids.setdefault(img_name,[])
        for desc in img_name_to_tokens[img_name]:
            token_ids = vocab.encode(desc)
            img_name_to_token_ids[img_name].append(token_ids)
    return img_name_to_token_ids
# 上面已经实现了parse token file
img_name_to_tokens = parse_token_file(input_description_file)
img_name_to_token_ids = convert_tokens_to_id(
            img_name_to_tokens,vocab)
logging.info('num of img is %d\n'%len(img_name_to_token_ids))
logging.info('num of img is %d\n'%len(img_name_to_tokens))
pprint(img_name_to_token_ids['1000092795.jpg'])
pprint(img_name_to_tokens['1000092795.jpg'])

INFO:tensorflow:num of img is 31783

INFO:tensorflow:num of img is 31783

[[16, 23, 342, 11, 2171, 113, 189, 17, 66, 159, 24, 332, 75, 4, 5, 503, 2],
 [16, 23, 12, 1098, 734, 14, 56, 85, 319, 1465, 2],
 [16, 34, 4, 48, 260, 14, 32, 4, 1, 503, 2],
 [3, 9, 4, 1, 26, 21, 32, 4, 1, 739, 2],
 [16, 523, 830, 617, 0, 140, 2]]
['Two young guys with shaggy hair look at their hands while hanging out in the '
 'yard .',
 'Two young , White males are outside near many bushes .',
 'Two men in green shirts are standing in a yard .',
 'A man in a blue shirt standing in a garden .',
 'Two friends enjoy time spent together .']


 ### dataset 实现
 - 实现通过img_name 得到random desc id + weights(通过timesteps截断后的)
 - 实现读取目录获得img_filenames + features
 - 用filenames 关联得到\[descs,weights,features\]
 - next_batch 提供得到的data (+random_shuffle)

In [24]:
import pickle
class ImageCaptionDataSet:
    def __init__(self,img_name_to_token_ids,
                img_feature_dir,num_time_steps,vocab,
                deterministic = False):
        self._img_name_to_token_ids = img_name_to_token_ids
        self._img_feature_dir = img_feature_dir
        self._num_time_steps = num_time_steps
        self._vocab = vocab # 用于提取填充字符
        self._deterministic = deterministic
        self._indicator = 0
        
        self._img_names = []
        self._img_features = []
        
        # 读取目录 
        self._all_img_feature_filepaths = []
        for filename in gfile.ListDirectory(self._img_feature_dir):
            self._all_img_feature_filepaths.append(
                os.path.join(img_feature_dir,filename))
        logging.info(len(self._all_img_feature_filepaths))
        # 通过目录中的文件路径读取feature
        self._load_img_features()
        # shuffle 
        if not self._deterministic:
            self._random_shuffle()
    def size(self):
        return len(self._all_img_feature_filepaths)
    
    def _random_shuffle(self):
        idx = np.random.permutation(self.size())
        self._img_names[idx]
        self._img_features[idx]
        
    def _load_img_features(self):
        """通过文件名字得到features保存到_img_features"""
        for img_feature_filepath in self._all_img_feature_filepaths:
            logging.info('loading %s'%img_feature_filepath)
#             with gfile.GFile(img_feature_filepath,'rb') as f:
#                 img_names,features = pickle.load(f)
            with open(img_feature_filepath,'rb') as f:
                img_names,features = pickle.load(f)
                # 注意两种不同的处理方式
                self._img_features.append(features)
                self._img_names += img_names
                # 放到with中可以自动缓冲读取?? 如何缓冲读取??
                
        self._img_names = np.asarray(self._img_names)
        
        self._img_features = np.vstack(self._img_features)
        # [1000,1,1,num]*2->[2000,1,1,num]->[2000,num]
#         self._img_features = np.asarray(self._img_features)
        original_shape = self._img_features.shape
        print(original_shape)
        # 只能使用np.reshape 自动实现np array转换
        # self._img_features.reshape((original_shape[0],original_shape[3]))
        # reshape 无效 尽管已经是nparray reshape返回新的np.array对象
        self._img_features = np.reshape(self._img_features,(original_shape[0],original_shape[3]))
        logging.info('shape is:%s\t%s\n' % (self._img_features.shape,
                                            self._img_names.shape))
    def size(self):
        return len(self._img_names)
    
    def img_feature_size(self):
        return self._img_features.shape[1]
    
    def next_batch(self,batch_size):
        """return batchsize data(id,weright,feature)"""
        if batch_size > self.size():
            raise Exception('too large batch_size')
        end_indicator = self._indicator + batch_size
        if end_indicator > self.size():
            if not self._deterministic:
                self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size
        batch_filenames = self._img_names[self._indicator:end_indicator]
        batch_img_features = self._img_features[self._indicator:end_indicator]
        # get descs and weights
        batch_desc_ids,batch_weights = self._get_img_descid_and_w(batch_filenames)
        self._indicator = end_indicator
        return (batch_img_features,batch_desc_ids,
                batch_weights,batch_filenames)
    
    def _get_img_descid_and_w(self,batch_filenames):
        """get desc id and weight by batch filenames"""
        batch_desc_ids = []
        batch_weights = []
        for filename in batch_filenames:
            # 获取 并且进行random choice
            token_ids = self._img_name_to_token_ids[filename]
#             chosen_token_ids = np.random.choice(token_ids)
            chosen_token_ids = token_ids[np.random.randint(0,len(token_ids))]
            # 进行截断 / 补长 并调整weight
            chosen_token_ids_len = len(chosen_token_ids)
            weight = [1 for _ in range(chosen_token_ids_len)]
            
            if chosen_token_ids_len >= self._num_time_steps:
                chosen_token_ids = chosen_token_ids[0:self._num_time_steps]
                weight = weight[0:self._num_time_steps]
            else:
                padding_len = self._num_time_steps - chosen_token_ids_len
                # 用unk / eof 填充
                chosen_token_ids += [self._vocab.eos for i in range(padding_len)] 
                weight += [0 for _ in range(padding_len)]
            batch_desc_ids.append(chosen_token_ids)
            batch_weights.append(weight)
        # 小心 所有return 转成np array
        batch_desc_ids = np.asarray(batch_desc_ids)
        batch_weights = np.asarray(batch_weights)
        return batch_desc_ids,batch_weights
# demo    
caption_dataset = ImageCaptionDataSet(img_name_to_token_ids,
                                     input_img_feature_dir,
                                     hps.num_timesteps,
                                     vocab,False)
img_feature_dim = caption_dataset.img_feature_size()
caption_dataset_size = caption_dataset.size()
logging.info('img_feature_dim:%d'%img_feature_dim)
logging.info('caption data size:%d'%caption_dataset_size)

batch_img_features,batch_desc_ids,batch_weights,batch_filenames \
    = caption_dataset.next_batch(2)
pprint(batch_img_features)
pprint(batch_weights)
pprint(batch_desc_ids)
pprint(batch_filenames)

INFO:tensorflow:530
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-0.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-1.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-10.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-100.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-101.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-102.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-103.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-104.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-105.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-106.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-107.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-108.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-109.pickle
INFO:tensorflow:loading ./flickr 30k

INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-20.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-200.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-201.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-202.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-203.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-204.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-205.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-206.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-207.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-208.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-209.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-21.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-210.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img

INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-301.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-302.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-303.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-304.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-305.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-306.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-307.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-308.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-309.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-31.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-310.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-311.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-312.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\im

INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-403.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-404.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-405.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-406.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-407.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-408.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-409.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-41.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-410.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-411.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-412.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-413.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-414.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\im

INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-505.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-506.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-507.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-508.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-509.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-51.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-510.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-511.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-512.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-513.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-514.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-515.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\img_feature-516.pickle
INFO:tensorflow:loading ./flickr 30k/img_features\im

## create model

### 计算图构建
- 输入不同：第一个是img_feature embedding 
- 输出不同：不再是一个输出 二十多个输出 
- loss不同：需要考虑每个输出的得到的loss的gd 而不再是最后一个的
    - weight控制loss计算
- 网络结构可选

In [25]:
tf.reset_default_graph()
def create_rnn_cell(hidden_dim,cell_type):
    """return cell by cell type and hidden_dim(output_dim)"""
    if cell_type == 'lstm':
        return tf.nn.rnn_cell.BasicLSTMCell(hidden_dim,state_is_tuple=True)
    elif cell_type == 'gru':
        # gru 不需要指定state_is_tupe 变量
        return tf.nn.rnn_cell.GRUCell(hidden_dim)
    else:
        raise Exception('%s type has not been supported' % cell_type)
        
def dropout(cell,keep_prob):
    """wrapper dropout layer"""
    return tf.nn.rnn_cell.DropoutWrapper(cell,
                                        output_keep_prob=keep_prob)
def get_train_model(hps,vocab_size,img_feature_dim):
    '''
    Args:
    - vocab_size:embedding table + output logits(用于预测)
    - img_feature_dim:transfer into num_embedding_nodes by fc
    '''
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size
    # placeholder
    img_features = tf.placeholder(tf.float32,(batch_size,img_feature_dim))
    descs = tf.placeholder(tf.int32,(batch_size,num_timesteps))
    weight_mask = tf.placeholder(tf.int32,(batch_size,num_timesteps))
    keep_prob = tf.placeholder(tf.float32,[],name='keep_prob')
    
    global_step = tf.Variable(tf.zeros([],tf.int64),
                             name='global_step',
                             trainable=False)
    # set up embedding layer
    embedding_initializer = tf.random_uniform_initializer(-1.0,1.0)
    with tf.variable_scope('embedding',initializer=embedding_initializer):
        embedding_table = tf.get_variable(
            'embedding_table',
            [vocab_size,hps.num_embedding_nodes],
            tf.float32)
        # apply to descs --(id->embedding_table[id])
        # 并且保证第一个是img_feature_embedding 所以 descs少用一个
        embed_descs = tf.nn.embedding_lookup(embedding_table,
                            descs[: , 0:num_timesteps-1])
    # embedding img_feature by fc
    #全连接层通常可用uniform_unit_scaling_initializer
    img_feature_embed_initializer = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('img_feature_embed',initializer=img_feature_embed_initializer):
        '''
        [batch_size,num_timesteps-1,num_embedding_nodes]
        [batch_size,img_feature_dim]->[batch_size,num_embedding_nodes]
        [batch_size,num_embedding_nodes]->[batch_size,1,num_embedding_nodes]
        concat -> [batch_size,num_timesteps,num_embedding_nodes]
        '''
        embed_img = tf.layers.dense(img_features,hps.num_embedding_nodes)
        embed_img = tf.expand_dims(embed_img,axis=1)
        embed_rnn_inputs = tf.concat([embed_img,embed_descs],axis=1)
    # set up rnn network
    scale = 1.0/math.sqrt(hps.num_embedding_nodes+hps.num_lstm_nodes[-1])/3.0
    lstm_initializer = tf.random_uniform_initializer(-scale,scale)
    with tf.variable_scope('lstmnn',initializer=lstm_initializer):
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = create_rnn_cell(hps.num_lstm_nodes[i],hps.cell_type)
            cell = dropout(cell,keep_prob)
            cells.append(cell)
        multirnncell = tf.nn.rnn_cell.MultiRNNCell(cells)
        # call rnncell by dynamic_rnn
        initial_state = multirnncell.zero_state(hps.batch_size,tf.float32)
        # rnn_outputs: [batch_size, num_timesteps, hps.num_lstm_node[-1]]
        rnn_outputs,_ = tf.nn.dynamic_rnn(multirnncell,
                                          embed_rnn_inputs,
                                          initial_state=initial_state)
    
    # set up fc layer
    fc_initializer = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('fc',initializer=fc_initializer):
        # 全连接 转成2dim 保存num_lstm_node[-1]不变
        # 否则不知道怎么全连接 必须是2-dim才可进行
        rnn_outputs = tf.reshape(rnn_outputs,(-1,hps.num_lstm_nodes[-1]))
        fc1 = tf.layers.dense(rnn_outputs,hps.num_fc_nodes,name='fc1')
        #droped_fc1 = tf.contrib.nn.dropout(fc1,keep_prob)
        droped_fc1 = tf.nn.dropout(fc1,keep_prob=keep_prob)
        activated_fc1 = tf.nn.relu(droped_fc1)
        # 变成vocab_size 分类问题 巧妙
        logits = tf.layers.dense(activated_fc1,vocab_size,name='logits')
    
    # calculate loss and accuracy
    with tf.name_scope('loss_and_accu'):
        # 由于输出[batch_size*num_timesteps,vocab_size]
        # 所以对比需要用[batch_size*num_timesteps,] -> weight/descsid
        # flatten 即可 []表示没有维度 [None]/[-1]表示自动维度??
        descs_flatten = tf.reshape(descs,[-1])
        weight_mask_flatten = tf.reshape(weight_mask,[-1])
        mask_sum = tf.reduce_sum(weight_mask_flatten)
        mask_sum = tf.cast(mask_sum,tf.float32)
        # get loss(需要细致处理 所以不能直接reduce_mean 所以不用losses下的)
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits,labels=descs_flatten)
        # 运算时记得类型转换
        weighted_softmax_loss = tf.multiply(softmax_loss,
                                           tf.cast(weight_mask_flatten,tf.float32))
        loss = tf.reduce_sum(weighted_softmax_loss) / mask_sum
        
        y_pred = tf.argmax(logits,1,output_type=tf.int32)
        correction_pred = tf.equal(y_pred,descs_flatten)
        weighted_correction = tf.multiply(tf.cast(correction_pred,tf.float32),
                                         tf.cast(weight_mask_flatten,tf.float32))
        accuracy = tf.reduce_sum(weighted_correction) / mask_sum
        # scalar summary for tensorboard logging 
        tf.summary.scalar('accu',accuracy)
        tf.summary.scalar('loss',loss)

        
    # train_op
    with tf.name_scope('train_op'):
        # 控制梯度大小 防止爆炸
        tvars = tf.trainable_variables()
        for var in tvars:
            logging.info('var name is %s'%var.name)
        grads = tf.gradients(loss,tvars)
        clipped_grads,global_normal = tf.clip_by_global_norm(grads,
                                                            hps.clip_lstm_grad)
        for grad,var in zip(clipped_grads,tvars):
            tf.summary.histogram('grad_%s'%var.name,grad)
        
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(zip(clipped_grads,tvars),global_step)

    
    return ((img_features,descs,weight_mask,keep_prob),
           (loss,accuracy),
           (train_op,global_step))

placeholders,metrics,others = get_train_model(hps,vocab_size,img_feature_dim)
img_features,descs,weight_mask,keep_prob = placeholders
loss,accuracy = metrics
train_op,global_step = others
        
init_op = tf.global_variables_initializer()
summary_op = tf.summary.merge_all()
# save sess 通过max_to_keep 制定最多保存的模型数目 以及保存时间 
saver = tf.train.Saver(max_to_keep=10)

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.
Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').
INFO:tensorflow:var name is embedding/embedding_table:0
INFO:tensorflow:var name is img_feature_embed/dense/kernel:0
INFO:tensorflow:var name is img_feature_embed/dense/bias:0
INFO:tensorflow:var name is lstmnn/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0
INFO:tensorflow:var name is lstmnn/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0
INFO:tensorflow:var name is lstmnn/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0
INFO:tensorflow:var name is lstmnn/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0
INFO:tensorflow:var name is fc/fc1/kernel:0
INFO:tensorflow:var name is fc/fc1/bias:0
INFO:tensorflow:var name is fc/logits/ke

## train process

In [30]:
# ? tf.clip_by_global_norm
train_steps = 10000

with tf.Session() as sess:
    sess.run(init_op)
    # 声明 tensrboard记录的文件句柄 
    # 不同于saver：saver只需要在save时指定
    writer = tf.summary.FileWriter(run_out_dir,sess.graph)
    for i in range(train_steps):
        batch_data = caption_dataset.next_batch(hps.batch_size)
        input_vals = batch_data[0:3]+(hps.keep_prob_rnn,)
        
        feed_dict = dict(zip(placeholders,input_vals))
        fetches = [global_step,loss,accuracy,train_op]
        
        should_log = (i+1)%hps.log_frequent == 0
        should_save = (i+1)%hps.save_frequent == 0
        
        if should_log:
            fetches += [summary_op]
        outputs = sess.run(fetches,feed_dict)
        global_step_val,loss_val,accuracy_val = outputs[0:3]
        if should_log:
            summary_str = outputs[4]
            writer.add_summary(summary_str,global_step_val+1)
            logging.info('Step: %5d, loss: %3.3f, accuracy: %3.3f'
                         % (global_step_val+1, loss_val, accuracy_val))
        if should_save:
            logging.info("Step: %d, image caption model saved" % (global_step_val+1))
            saver.save(sess,os.path.join(run_out_dir,"img_caption"),global_step=global_step_val+1)

# training_steps = 10000

# with tf.Session() as sess:
#     sess.run(init_op)
#     writer = tf.summary.FileWriter(output_dir, sess.graph)
#     for i in range(training_steps):
#         batch_img_features, batch_sentence_ids, batch_weights, _ = caption_dataset.next_batch(hps.batch_size)
#         input_vals = (batch_img_features, batch_sentence_ids, batch_weights, hps.keep_prob_rnn)
        
#         feed_dict = dict(zip(placeholders, input_vals))
#         fetches = [global_step, loss, accuracy, train_op]
        
#         should_log = (i + 1) % hps.log_frequent == 0
#         should_save = (i + 1) % hps.save_frequent == 0
#         if should_log:
#             fetches += [summary_op]
#         outputs = sess.run(fetches, feed_dict)
#         global_step_val, loss_val, accuracy_val = outputs[0:3]
#         if should_log:
#             summary_str = outputs[4]
#             writer.add_summary(summary_str, global_step_val)
#             logging.info('Step: %5d, loss: %3.3f, accuracy: %3.3f'
#                          % (global_step_val, loss_val, accuracy_val))
#         if should_save:
#             logging.info("Step: %d, image caption model saved" % (global_step_val))
#             saver.save(sess, os.path.join(output_dir, "image_caption"), global_step=global_step_val)

INFO:tensorflow:Step:   100, loss: 5.294, accuracy: 0.242
INFO:tensorflow:Step:   200, loss: 4.826, accuracy: 0.266
INFO:tensorflow:Step:   300, loss: 4.871, accuracy: 0.251
INFO:tensorflow:Step:   400, loss: 4.718, accuracy: 0.265
INFO:tensorflow:Step:   500, loss: 5.038, accuracy: 0.255
INFO:tensorflow:Step: 500, image caption model saved
INFO:tensorflow:Step:   600, loss: 4.350, accuracy: 0.328
INFO:tensorflow:Step:   700, loss: 4.103, accuracy: 0.364
INFO:tensorflow:Step:   800, loss: 4.231, accuracy: 0.312
INFO:tensorflow:Step:   900, loss: 4.159, accuracy: 0.334
INFO:tensorflow:Step:  1000, loss: 4.556, accuracy: 0.311
INFO:tensorflow:Step: 1000, image caption model saved
INFO:tensorflow:Step:  1100, loss: 4.013, accuracy: 0.352
INFO:tensorflow:Step:  1200, loss: 3.933, accuracy: 0.357
INFO:tensorflow:Step:  1300, loss: 3.849, accuracy: 0.385


KeyboardInterrupt: 