# 读取每张图片描述语句信息

In [1]:
import string

# load doc into memory
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

def load_descriptions(doc):
    mapping = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        if len(line) < 2:
            continue
        #take the first token as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]
        # remove filename from image id(.jpg)
        image_id = image_id.split('.')[0]
        # convert description tokens back to string
        image_desc = ' '.join(image_desc)
        # store the first description for each image
        if image_id not in mapping:
            mapping[image_id] = image_desc
    return mapping

def clean_descriptions(descriptions):
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for key, desc in descriptions.items():
        # tokenize
        desc = desc.split()
        # convert to lower case
        desc = [word.lower() for word in desc]
        # remove punctuation from each token
        desc = [w.translate(table) for w in desc]
        # remove hanging 's' and 'a'
        desc = [word for word in desc if len(word)>1]
        # store as string
        descriptions[key] =  ' '.join(desc)

# save descriptions to file, one per line
def save_doc(descriptions, filename):
    lines = list()
    for key, desc in descriptions.items():
        lines.append(key + ' ' + desc)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

filename = 'Flickr8k_text/Flickr8k.token.txt'
doc = load_doc(filename)
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))
clean_descriptions(descriptions)

# summarize vocabulary
all_tokens = ' '.join(descriptions.values()).split()
vocabulary = set(all_tokens)
print('Vocabulary Size: %d' % len(vocabulary))

# save descriptions
save_doc(descriptions, 'descriptions.txt')

Loaded: 8092 
Vocabulary Size: 4484


# 利用训练好的卷积神经网络VGG提取图片特征

In [2]:
from os import listdir
from pickle import dump
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.layers import Input
 
# extract features from each photo in the directory
def extract_features(directory):
    # load the model
    in_layer = Input(shape=(224, 224, 3))
    model = VGG16(include_top=False, input_tensor=in_layer)
    print(model.summary())
    # extract features from each photo
    features = dict()
    for name in listdir(directory):
        # load an image from file
        filename = directory + '/' + name
        image = load_img(filename, target_size=(224, 224))
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        # reshape data for the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # prepare the image for the VGG model
        image = preprocess_input(image)
        # get features
        feature = model.predict(image, verbose=0)
        # get image id
        image_id = name.split('.')[0]
        # store feature
        features[image_id] = feature
        print('>%s' % name)
    return features
 
extract features from all images
directory = 'Flicker8k_Dataset'
features = extract_features(directory)
print('Extracted Features: %d' % len(features))
save to file
dump(features, open('features.pkl', 'wb'))

# 准备好训练集和测试集

In [8]:
from pickle import load

# load a pre-defined list of photo identifiers
def load_set(filename):
    doc = load_doc(filename)
    dataset = list()
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # get the image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)
 
# split a dataset into train/test elements
def train_test_split(dataset):
    # order keys so the split is consistent
    ordered = sorted(dataset)
    # return split dataset as two new sets
    return set(ordered[:800]), set(ordered[800:1000])
 
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # store
            descriptions[image_id] = 'startseq ' + ' '.join(image_desc) + ' endseq'
    return descriptions
 
# load photo features
def load_photo_features(filename, dataset):
    # load all features
    all_features = load(open(filename, 'rb'))
    # filter features
    features = {k: all_features[k] for k in dataset}
    return features
 
# load dev set
filename = 'Flickr8k_text/Flickr_8k.devImages.txt'
dataset = load_set(filename)
print('Dataset: %d' % len(dataset))
# train-test split
train, test = train_test_split(dataset)
print('Train=%d, Test=%d' % (len(train), len(test)))
# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: train=%d, test=%d' % (len(train_descriptions), len(test_descriptions)))
# photo features
train_features = load_photo_features('features.pkl', train)
test_features = load_photo_features('features.pkl', test)
print('Photos: train=%d, test=%d' % (len(train_features), len(test_features)))

Dataset: 1000
Train=800, Test=200
Descriptions: train=800, test=200
Photos: train=800, test=200


# 分词器把描述中的单词转换为数字

In [9]:
from keras.preprocessing.text import Tokenizer

# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    lines = list(descriptions.values())
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# prepare tokenizer
tokenizer = create_tokenizer(descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 4485


# 模型训练

In [23]:
from numpy import array
from numpy import argmax
from pandas import DataFrame
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from pickle import load

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers import Embedding
from keras.layers.merge import concatenate
from keras.layers.pooling import GlobalMaxPooling2D,GlobalAveragePooling2D

# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, desc, image, max_length):
    Ximages, XSeq, y = list(), list(),list()
    vocab_size = len(tokenizer.word_index) + 1
    # integer encode the description
    seq = tokenizer.texts_to_sequences([desc])[0]
    # split one sequence into multiple X,y pairs
    for i in range(1, len(seq)):
        # select
        in_seq, out_seq = seq[:i], seq[i]
        # pad input sequence
        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
        # encode output sequence
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        # store
        Ximages.append(image)
        XSeq.append(in_seq)
        y.append(out_seq)
    # Ximages, XSeq, y = array(Ximages), array(XSeq), array(y)
    return [Ximages, XSeq, y]

# define the captioning model
def define_model(vocab_size, max_length):
    # feature extractor (encoder)
    inputs1 = Input(shape=(7, 7, 512))
    fe1 = GlobalMaxPooling2D()(inputs1)
    fe2 = Dense(128, activation='relu')(fe1)
    fe3 = RepeatVector(max_length)(fe2)
    # embedding
    inputs2 = Input(shape=(max_length,))
    emb2 = Embedding(vocab_size, 50, mask_zero=True)(inputs2)
    emb3 = LSTM(256, return_sequences=True)(emb2)
    emb3 = LSTM(256, return_sequences=True)(emb3)
    emb4 = TimeDistributed(Dense(128, activation='relu'))(emb3)
    # merge inputs
    merged = concatenate([fe3, emb4])
    # language model (decoder)
    lm2 = LSTM(256)(merged)
    lm3 = Dense(256, activation='relu')(lm2)
    outputs = Dense(vocab_size, activation='softmax')(lm3)
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    plot_model(model, show_shapes=True, to_file='plot.png')
    return model

# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, features, tokenizer, max_length, n_step):
    # loop until we finish training
    while 1:
        # loop over photo identifiers in the dataset
        keys = list(descriptions.keys())
        for i in range(0, len(keys), n_step):
            Ximages, XSeq, y = list(), list(),list()
            for j in range(i, min(len(keys), i+n_step)):
                image_id = keys[j]
                # retrieve photo feature input
                image = features[image_id][0]
                # retrieve text input
                desc = descriptions[image_id]
                # generate input-output pairs
                in_img, in_seq, out_word = create_sequences(tokenizer, desc, image, max_length)
                for k in range(len(in_img)):
                    Ximages.append(in_img[k])
                    XSeq.append(in_seq[k])
                    y.append(out_word[k])
            # yield this batch of samples to the model
            yield ([array(Ximages), array(XSeq)], array(y))

# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
    # seed the generation process
    in_text = 'startseq'
    # iterate over the whole length of the sequence
    for i in range(max_length):
        # integer encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        yhat = model.predict([photo,sequence], verbose=0)
        # convert probability to integer
        yhat = argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if we cannot map the word
        if word is None:
            break
        # append as input for generating the next word
        in_text += ' ' + word
        # stop if we predict the end of the sequence
        if word == 'endseq':
            break
    return in_text

# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted
        actual.append([desc.split()])
        predicted.append(yhat.split())
    
    # calculate BLEU score
    bleu = corpus_bleu(actual, predicted, smoothing_function=SmoothingFunction().method1)
    return bleu

# determine the maximum sequence length
max_length = max(len(s.split()) for s in list(train_descriptions.values()))
print('Description Length: %d' % max_length)

# define experiment
model_name = 'baseline1'
verbose = 2
n_epochs = 50
n_photos_per_update = 2
n_batches_per_epoch = int(len(train) / n_photos_per_update)
n_repeats = 3

# run experiment
train_results, test_results = list(), list()
for i in range(n_repeats):
    # define the model
    model = define_model(vocab_size, max_length)
    # fit model
    model.fit_generator(data_generator(train_descriptions, train_features, tokenizer, max_length, n_photos_per_update), steps_per_epoch=n_batches_per_epoch, epochs=n_epochs, verbose=verbose)
    
    
    # evaluate model on training data
    train_score = evaluate_model(model, train_descriptions, train_features, tokenizer, max_length)
    test_score = evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)
    # store
    train_results.append(train_score)
    test_results.append(test_score)
    print('>%d: train=%f test=%f' % ((i+1), train_score, test_score))
# save results to file
df = DataFrame()
df['train'] = train_results
df['test'] = test_results
print(df.describe())
df.to_csv(model_name+'.csv', index=False)

Description Length: 25
Model: "model_28"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_57 (InputLayer)           [(None, 7, 7, 512)]  0                                            
__________________________________________________________________________________________________
input_58 (InputLayer)           [(None, 25)]         0                                            
__________________________________________________________________________________________________
flatten (Flatten)               (None, 25088)        0           input_57[0][0]                   
__________________________________________________________________________________________________
embedding_28 (Embedding)        (None, 25, 50)       18300       input_58[0][0]                   
____________________________________________________________________

Epoch 1/50
50/50 - 4s - loss: 5.4370 - accuracy: 0.0870
Epoch 2/50
50/50 - 4s - loss: 5.1516 - accuracy: 0.0956
Epoch 3/50
50/50 - 4s - loss: 5.0527 - accuracy: 0.0975
Epoch 4/50
50/50 - 4s - loss: 4.9543 - accuracy: 0.0975
Epoch 5/50
50/50 - 4s - loss: 4.8870 - accuracy: 0.0985
Epoch 6/50
50/50 - 4s - loss: 4.7902 - accuracy: 0.0994
Epoch 7/50
50/50 - 4s - loss: 4.7034 - accuracy: 0.1023
Epoch 8/50
50/50 - 4s - loss: 4.5651 - accuracy: 0.1004
Epoch 9/50
50/50 - 4s - loss: 4.4313 - accuracy: 0.0994
Epoch 10/50
50/50 - 4s - loss: 4.2899 - accuracy: 0.0994
Epoch 11/50
50/50 - 4s - loss: 4.1696 - accuracy: 0.1071
Epoch 12/50
50/50 - 4s - loss: 3.9509 - accuracy: 0.1071
Epoch 13/50
50/50 - 4s - loss: 3.7314 - accuracy: 0.1109
Epoch 14/50
50/50 - 4s - loss: 3.5652 - accuracy: 0.1080
Epoch 15/50
50/50 - 4s - loss: 3.4851 - accuracy: 0.1052
Epoch 16/50
50/50 - 4s - loss: 3.3401 - accuracy: 0.1013
Epoch 17/50
50/50 - 4s - loss: 3.2749 - accuracy: 0.1033
Epoch 18/50
50/50 - 4s - loss: 3.1910 - 

Epoch 43/50
50/50 - 4s - loss: 2.6451 - accuracy: 0.1472
Epoch 44/50
50/50 - 4s - loss: 2.6654 - accuracy: 0.1453
Epoch 45/50
50/50 - 4s - loss: 2.6311 - accuracy: 0.1472
Epoch 46/50
50/50 - 4s - loss: 2.6433 - accuracy: 0.1453
Epoch 47/50
50/50 - 4s - loss: 2.6332 - accuracy: 0.1358
Epoch 48/50
50/50 - 4s - loss: 2.6239 - accuracy: 0.1453
Epoch 49/50
50/50 - 4s - loss: 2.6179 - accuracy: 0.1520
Epoch 50/50
50/50 - 4s - loss: 2.6254 - accuracy: 0.1597
>3: train=0.004642 test=0.004212
          train      test
count  3.000000  3.000000
mean   0.005146  0.003641
std    0.001180  0.000895
min    0.004302  0.002610
25%    0.004472  0.003356
50%    0.004642  0.004101
75%    0.005569  0.004157
max    0.006495  0.004212


In [20]:
model_name = 'baseline_generate'

# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted
        actual.append([desc.split()])
        predicted.append(yhat.split())
        print('Actual:    %s' % desc)
        print('Predicted: %s' % yhat)
        if len(actual) >= 5:
            break
    # calculate BLEU score
    bleu = corpus_bleu(actual, predicted)
    return bleu

# evaluate model on training data
train_score = evaluate_model(model, train_descriptions, train_features, tokenizer, max_length)
test_score = evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

Actual:    startseq child and woman are at waters edge in big city endseq
Predicted: startseq child and woman are at waters edge in city endseq
Actual:    startseq boy with stick kneeling in front of goalie net endseq
Predicted: startseq boy with stick kneeling in of goalie goalie net endseq
Actual:    startseq woman crouches near three dogs in field endseq
Predicted: startseq woman crouches near three dogs field endseq
Actual:    startseq boy bites hard into treat while he sits outside endseq
Predicted: startseq boy bites hard into treat while sits outside endseq
Actual:    startseq person eats takeout while watching small television endseq
Predicted: startseq person eats takeout while watching small television endseq
Actual:    startseq couple with young child wrapped in blanket sitting on concrete step endseq
Predicted: startseq man of people while while in in traffic endseq
Actual:    startseq adults and children stand and play in front of steps near wooded area endseq
Predicted: s