In [2]:
import string
import os
from os import listdir, mkdir
from pickle import dump, load
import zhon.hanzi
import jieba

import numpy as np
from numpy import array, argmax

from keras.applications.inception_v3 import InceptionV3, preprocess_input
# from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, TimeDistributed, Lambda, BatchNormalization
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model
from keras.callbacks import ModelCheckpoint
from keras import regularizers
import keras.backend as K

from nltk.translate.bleu_score import corpus_bleu
# from kulc.attention import ExternalAttentionRNNWrapper

Using TensorFlow backend.


# inception_v3提取图片特征

In [4]:
#cp /public/keras_pretrained_model/inception_v3_weights_tf_dim_ordering_tf_kernels.h5 ~/.keras/models
def extract_features(directory):
    # 去除最后一层，因为目的不是分类，而是特征抽取
    in_layer = Input(shape=(224, 224, 3))
    base_model = InceptionV3(weights='imagenet', input_tensor=in_layer) 
    model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)
    model.summary()
    
    # 每张图片抽取特征
    features = dict()
    for name in listdir(directory):
        filename = directory + '/' + name
        image = load_img(filename, target_size=(224, 224))
        # 将像素转存为数组形式
        image = img_to_array(image)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # VGG的函数，预处理并预测
        image = preprocess_input(image)
        feature = model.predict(image, verbose=0)
        #存储图片特征
        image_id = name.split('.')[0]
        features[image_id] = feature
        # print('>%s' % name)
    return features

features = extract_features('Data/Images')
print('Extracted Features: %d' % len(features))

if not os.path.exists('MiddleFiles_New'):
    mkdir('MiddleFiles_New')
dump(features, open('MiddleFiles_New/features.pkl', 'wb'))


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
conv2d_95 (Conv2D)              (None, 111, 111, 32) 864         input_2[0][0]                    
__________________________________________________________________________________________________
batch_normalization_95 (BatchNo (None, 111, 111, 32) 96          conv2d_95[0][0]                  
__________________________________________________________________________________________________
activation_95 (Activation)      (None, 111, 111, 32) 0           batch_normalization_95[0][0]     
____________________________________________________________________________________________

# 提取文本信息

In [5]:
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

# 用字典存储标题-语句信息
def load_descriptions(doc):
    mapping = dict()
    for line in doc.split('\n'):
        if not line:
            continue
        # 以空格分隔
        tokens = line.split()
        # 第一个是图片标题，后面的是描述语句
        image_id, image_desc = tokens[0], tokens[1]
        if image_id[-1] != '0':
            continue
        # 去除.jpg的后缀
        image_id = image_id.split('.')[0]
        #以字典形式存储
        mapping[image_id] = image_desc
    return mapping

#保存文件
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc in descriptions.items():
        lines.append(key + ' ' + desc)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()


doc = load_doc('Data/Captions/flickr8kzhc.caption.txt')

# 加载描述语句
descriptions = load_descriptions(doc)
print('Loaded: %d' % len(descriptions))

# 保存文件
if not os.path.exists('MiddleFiles_New'):
    mkdir('MiddleFiles_New')
save_descriptions(descriptions, 'MiddleFiles_New/descriptions.txt')

Loaded: 8091


# 加载数据集

In [6]:
# 获取训练集的图片名称
def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    for line in doc.split('\n'):
        # 可能有空行
        if not line:
            continue
        dataset.append(line)
    return set(dataset)

import jieba.posseg
# 根据图片名称加载描述语句
def load_clean_descriptions(filename, dataset):
    punctuations = string.punctuation + zhon.hanzi.punctuation
    poslist = ['dg','d','e','o','u','x','w','y','un']
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        if not line:
            continue
        tokens = line.split()
        image_id, image_desc = tokens[0], tokens[1]
        image_descl = jieba.posseg.cut(image_desc)
        image_desc = []
        for word, pos in image_descl:
            if pos not in poslist:
                image_desc.append(word)
        
        # 描述语句首尾加开始/终止词，用于序列输入
        image_desc.insert(0, 'startseq')
        image_desc.append('endseq')
        
        # 跳过不在训练集中的图片
        if image_id in dataset:
            descriptions[image_id] = image_desc
    return descriptions

# 根据图片名称加载图片的特征向量
def load_photo_features(filename, dataset):
    all_features = load(open(filename, 'rb'))
    
    # features = {k: all_features[k].reshape(1, 49, 512) for k in dataset}
    return features


#加载图片名称
image_ids_train = load_set('Data/Partition/flickr8ktrain.txt')
image_ids_val = load_set('Data/Partition/flickr8kval.txt')
image_ids_test = load_set('Data/Partition/flickr8ktest.txt')

image_ids = set.union(image_ids_train,image_ids_val)

print('Dataset: %d' % len(image_ids))

#通过图片名称得到图片的描述语句
descriptions_train = load_clean_descriptions('MiddleFiles_New/descriptions.txt', image_ids_train)
print('Descriptions_train: %d' % len(descriptions_train))

descriptions_val = load_clean_descriptions('MiddleFiles_New/descriptions.txt', image_ids_val)
print('Descriptions_val: %d' % len(descriptions_val))

descriptions = load_clean_descriptions('MiddleFiles_New/descriptions.txt', image_ids)
print('Descriptions: %d' % len(descriptions))


#通过图片名称得到图片的特征向量
features_train = load_photo_features('MiddleFiles_New/features.pkl', image_ids_train)
print('Photos_train: %d' % len(features_train))

#通过图片名称得到图片的特征向量
features_val = load_photo_features('MiddleFiles_New/features.pkl', image_ids_val)
print('Photos_val: %d' % len(features_val))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


Dataset: 7000


Loading model cost 0.654 seconds.
Prefix dict has been built successfully.


Descriptions_train: 6000
Descriptions_val: 1000
Descriptions: 7000
Photos_train: 8091
Photos_val: 8091


# 创建单词映射与序列

In [7]:
# 建立分词器tokenizer
def create_tokenizer(descriptions):
    lines = list(descriptions.values())
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def one_hot_encode(data, MAXIMUM_CAPTION_LENGTH, n_classes):
    result = np.zeros((len(data), MAXIMUM_CAPTION_LENGTH, n_classes))
    for i, item in enumerate(data):
        a = 0
        for j, word in enumerate(item):
            result[i, j, word] = 1.0
            a = j
        for k in range(a+1, MAXIMUM_CAPTION_LENGTH):
            result[i, k, 0] = 1.0
    return result

def data_generator(batch_size, captions, get_image, tokenizer, max_length, vocab_size):
    while True:
        for i in range(0,len(captions),batch_size):
            if((i+batch_size)<(len(captions))):
                batch_indices = np.arange(i, i + batch_size)
            else :
                batch_indices = np.arange(i, len(captions))
        
            L = list(captions.keys())

            batch_image_features = np.empty((len(batch_indices), 2048))
            for i, j in enumerate(batch_indices):
                batch_image_features[i] = get_image[L[j]]

            batch_captions1 = [captions[L[item]][:-1] for item in batch_indices]
            batch_captions2 = [captions[L[item]][1:] for item in batch_indices]

            input_captions = tokenizer.texts_to_sequences(batch_captions1)
            output_captions = tokenizer.texts_to_sequences(batch_captions2)

            input_captions = pad_sequences(input_captions, maxlen= max_length, padding='post')
            output_captions = one_hot_encode(output_captions, max_length, vocab_size)
       
            batch_image_features = np.array(batch_image_features, dtype=np.float32)

            x_data = [batch_image_features,input_captions, 
                      np.zeros([input_captions.shape[0], unit_size]), np.zeros([input_captions.shape[0], unit_size])]
            y_data = output_captions

            yield (x_data, y_data)


#输出所有语句不同的单词总数
tokenizer = create_tokenizer(descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

#输出单个语句最大长度
max_length = max([len(desc) for desc in descriptions.values()])
print('Description Length: %d' % max_length)

Vocabulary Size: 2166
Description Length: 20


# 模型定义与训练

In [70]:
unit_size = 2048

In [None]:
def NICmodel(vocab_size, max_len, reg):

    # 图像处理
    inputs1 = Input(shape=(2048,))
    X_img = Dropout(0.5)(inputs1)
    X_img = Dense(unit_size, use_bias = False, 
                        kernel_regularizer=regularizers.l2(reg),
                        name = 'dense_img')(X_img)
    X_img = BatchNormalization(name='batch_normalization_img')(X_img)
    X_img = Lambda(lambda x : K.expand_dims(x, axis=1))(X_img)

    # 文本处理层
    inputs2 = Input(shape=(max_len,))
    X_text = Embedding(vocab_size, unit_size, mask_zero = True, name = 'emb_text')(inputs2)
    X_text = Dropout(0.5)(X_text)

    # 初始化
    a0 = Input(shape=(unit_size,))
    c0 = Input(shape=(unit_size,))

    LSTMLayer = LSTM(unit_size, return_sequences = True, return_state = True, dropout=0.5, name = 'lstm')

    # 将图片转为向量作为input
    _, a, c = LSTMLayer(X_img, initial_state=[a0, c0])

    A, _, _ = LSTMLayer(X_text, initial_state=[a, c])
    output = TimeDistributed(Dense(vocab_size, activation='softmax',
                                     kernel_regularizer = regularizers.l2(reg), 
                                     bias_regularizer = regularizers.l2(reg)), name = 'time_distributed_softmax')(A)

    return Model(inputs=[inputs1, inputs2, a0, c0], outputs=output, name='NIC')


def greedy_inference_model(vocab_size, max_len):
    
    EncoderDense = Dense(unit_size, use_bias=False, name = 'dense_img')
    EmbeddingLayer = Embedding(vocab_size, unit_size, mask_zero = True, name = 'emb_text')
    LSTMLayer = LSTM(unit_size, return_state = True, name = 'lstm')
    SoftmaxLayer = Dense(vocab_size, activation='softmax', name = 'time_distributed_softmax')
    BatchNormLayer = BatchNormalization(name='batch_normalization_img')

    # 图片特征向量化
    inputs1 = Input(shape=(2048,))
    X_img = EncoderDense(inputs1)
    X_img = BatchNormLayer(X_img)
    X_img = Lambda(lambda x : K.expand_dims(x, axis=1))(X_img)

    # 描述文本向量化
    inputs2 = Input(shape=(1,))
    X_text = EmbeddingLayer(inputs2)

    # 初始化
    a0 = Input(shape=(unit_size,))
    c0 = Input(shape=(unit_size,))

    a, _, c = LSTMLayer(X_img, initial_state=[a0, c0])

    x = X_text

    outputs = []
    for i in range(max_len):
        
        a, _, c = LSTMLayer(x, initial_state=[a, c])
        output = SoftmaxLayer(a)
        outputs.append(output)
        x = Lambda(lambda x : K.expand_dims(K.argmax(x)))(output)
        x = EmbeddingLayer(x)

    return Model(inputs=[inputs1, inputs2, a0, c0], outputs=outputs, name='NIC_greedy_inference_v2')


lr=0.01
decay=0.
reg = 1e-4
max_len= max_length
model = NICmodel(vocab_size, max_len, reg)


# 编译模型
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr = lr, decay=decay), metrics=['accuracy'])

# 保存模型
filepath = 'model-ep{epoch:03d}-loss{loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, verbose=1, save_best_only=True)

from keras.callbacks import  EarlyStopping,ReduceLROnPlateau
tfcallbacks = [
    EarlyStopping(monitor='loss',patience = 15,min_delta=0),
    ReduceLROnPlateau(monitor='loss', patience=10, mode='auto')
]

steps = len(descriptions)
batch_size = 64

# create the data generator
generator = data_generator(batch_size, descriptions_train, features_train, tokenizer, max_length, vocab_size)
validation_generator = data_generator(batch_size, descriptions_val, features_val, tokenizer, max_length, vocab_size)

# fit for one epoch
history = model.fit_generator(generator, epochs=500, validation_data=validation_generator,
                              validation_steps=int(np.ceil(len(descriptions_val) / batch_size)),callbacks=tfcallbacks,
                              steps_per_epoch=int(np.ceil(len(descriptions_train) / batch_size)), verbose=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500

In [None]:
model_dir = 'model_weights2.h5'
model.save_weights(model_dir)

# 模型评价

In [None]:
model_dir = 'model_weights2.h5'
# 数字映射到词语
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# 把模型预测转换成语句
def generate_desc(model, tokenizer, photo, max_length): ## 需要修改!!!!!
    in_text = 'startseq'
    # 模型每次生成一个单词，直到endseq停止
    for i in range(max_length-1):
        # 把上一个循环为止生成的所有单词构成序列
        intext = in_text.split()
        sequence = tokenizer.texts_to_sequences([intext])[0]
        now = sequence
        # 填充剩余部分，让序列长度为maxlength
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
        # 输入模型，预测下一个单词
        yhat = model.predict([sequence,photo], verbose=0)[0][i+1]
        yhat[0] = 0            #去掉停止词
        for i in range(len(now)):
            yhat[now[i]] = 0
        # 选择概率最高的为下一个单词
        yhat = yhat.argmax()
        word = word_for_id(yhat, tokenizer)
        
        # 无法预测，停止
        if word is None:
            break
        # 逐步添加单词
        in_text +=' '+  word
        # 遇到结尾词，停止
        if word == 'endseq':
            break
    return in_text[0:len(in_text)]

def inference(image_features, plot_attention):
    image_features = np.array([image_features])
    state_h, state_c = initial_state_inference_model.predict(image_features)

    caption = [le.transform_word("<START>")]
    attentions = []

    current_word = None
    for t in range(MAXIMUM_CAPTION_LENGTH):
        caption_array = np.array(caption).reshape(1, -1)
        output, state_h, state_c, attention = inference_model.predict([image_features, caption_array, state_h, state_c])
        attentions.append(attention[0, -1].reshape((14, 14)))

        current_word = np.argmax(output[0, -1])
        caption.append(current_word)

        if current_word == le.transform_word("<STOP>"):
            break
    sentence = [le._index_word_map[i] for i in caption[1:]]

    if plot_attention:
        print(len(attentions))
        x = int(np.sqrt(len(attentions)))
        y = int(np.ceil(len(attentions) / x))
        _, axes = plt.subplots(y, x, sharex="col", sharey="row")
        axes = axes.flatten()
        for i in range(len(attentions)):
            atn = skimage.transform.pyramid_expand(attentions[i], upscale=16, sigma=20)
            axes[i].set_title(sentence[i])
            axes[i].imshow(atn, cmap="gray")

        plt.show()

    return " ".join(sentence) + " ({0})".format(len(caption)-1)



#模型BLEU分数计算
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    cout=0
    for key, desc_list in descriptions.items():
        # yhat为模型的输出语句
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # references使实际的描述语句
        references = [d.split() for d in desc_list]
        
        actual.append(references)
        predicted.append(yhat.split())
        
    # 计算BLEU分数
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# evaluate_model(model, descriptions, features, tokenizer, max_length)

In [None]:
# save the tokenizer
dump(tokenizer, open('MiddleFiles/tokenizer.pkl', 'wb'))

In [None]:
NIC_inference = greedy_inference_model(vocab_size, max_len)
NIC_inference.load_weights(model_dir, by_name = True, skip_mismatch=True)

In [None]:
# load the tokenizer
# tokenizer = load(open('tokenizer.pkl', 'rb'))
# pre-define the max sequence length (from training)
max_length = 20

# extract features from each photo in the directory
def extract_features(filename):
    #通过model.predict提取图片特征
    in_layer = Input(shape=(224, 224, 3))

    base_model = InceptionV3(weights='imagenet', input_tensor=in_layer) 
    #base_model = VGG16(include_top=False,weights='imagenet')
    base_model.trainable=False
    model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)
    image = load_img(filename, target_size=(224, 224))
    image = img_to_array(image)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)

    return model.predict(image)

def decoder(inf_model, tokenizer, features, post_process = True):

    '''
    采用greedy search的算法
    '''

    assert(features.shape[0]>0 and features.shape[1] == 2048)

    N = features.shape[0]

    startseq = np.repeat([tokenizer.word_index['startseq']], N)
    a0 = np.zeros([N, unit_size])
    c0 = np.zeros([N, unit_size])

    # 注意：输出维度为: [32, N, 7378]
    y_preds = np.array(inf_model.predict([features, startseq, a0, c0], verbose = 1))

    # 注意：输出维度改为了: [N, 32, 7378]
    y_preds = np.transpose(y_preds, axes = [1,0,2])
    print(y_preds)
    
    sequences = np.argmax(y_preds,axis = -1)
    sents = tokenizer.sequences_to_texts(sequences)

    if post_process:
        sents_pp = []
        for sent in sents:
            if 'endseq' in sent.split():
                words = sent.split()
                sents_pp.append(' '.join(words[:words.index('endseq')]))
            else:
                sents_pp.append(sent)
        sents = sents_pp

    return sents

def generate_caption_from_directory(file_directory):
    # Encoder
    img_features_dict = extract_features(file_directory)
    # Decoder
    captions = decoder(NIC_inference, tokenizer, img_features_dict, True)
    
    return captions

# 载入图片进行预测
image_id = '2294598473_40637b5c04'
file_directory = f'Data/Images/{image_id}.jpg'
description = generate_caption_from_directory(file_directory)
description

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
image_file_dir = f'Data/Images/{image_id}.jpg'

# 展示图片
img = mpimg.imread(image_file_dir)
plt.imshow(img)
#真实标签
''.join(descriptions[image_id][1:-1])

In [None]:
# 载入图片进行预测
file_directory = 'test/0.jpg'
description = generate_caption_from_directory(file_directory)
print(description)

# 展示图片
img = mpimg.imread(file_directory)
plt.imshow(img)