In [None]:
import tensorflow as tf
import numpy as np

# Load the pre-trained image recognition model as a feature extractor.
# include_top=False drops the ImageNet classification head and pooling='avg'
# averages the last convolutional feature map into one vector per image, so
# vgg16.predict() yields image features instead of 1000-way class
# probabilities.
vgg16 = tf.keras.applications.VGG16(
    weights='imagenet',
    include_top=False,
    pooling='avg',
)

# Define the RNN model
class RNNCaptioningModel(tf.keras.Model):
    """Greedy LSTM caption decoder driven by pre-extracted image features.

    The image feature vector is projected to the embedding width and used as
    the first LSTM input; every following step feeds back the embedding of the
    word predicted at the previous step.

    Args:
        vocab_size: Number of distinct token ids the model can emit.
        max_length: Maximum number of decoding steps (must be >= 1).
        end_token: Token id that marks the end of a caption.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.

    NOTE(review): ``call`` returns argmax token ids, which carry no gradient.
    Training against a token-level loss would need the per-step logits
    instead -- confirm how this model is meant to be trained.
    """

    def __init__(self, vocab_size, max_length=100, end_token=0):
        super().__init__()

        if max_length < 1:
            raise ValueError("max_length must be at least 1")
        self.max_length = max_length
        self.end_token = end_token

        # Image features are continuous values, not token ids, so they are
        # mapped into the word-vector space with a dense projection rather
        # than an embedding lookup.
        self.feature_projection = tf.keras.layers.Dense(128)

        # Create an embedding layer to convert words to vectors
        self.embedding = tf.keras.layers.Embedding(vocab_size, 128)

        # return_state=True makes the layer return (output, h, c) so the
        # state can be carried from one decoding step to the next.
        self.rnn = tf.keras.layers.LSTM(128, return_state=True)

        # Named `output_layer` because `output` is a read-only property on
        # Keras layers/models and cannot be assigned.
        self.output_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, image_features):
        """Decode one caption per image with greedy (argmax) search.

        Args:
            image_features: Batch of image feature vectors; assumed to be
                shaped (batch, feature_dim) -- TODO confirm against callers.

        Returns:
            An int64 tensor of token ids shaped (batch, steps) with
            ``steps <= max_length``. Positions after a caption's end token
            are filled with ``end_token``.
        """
        projected = self.feature_projection(image_features)

        # The LSTM layer consumes (batch, time, features); each decoding step
        # is a sequence of length one.
        step_input = tf.expand_dims(projected, axis=1)
        rnn_state = None

        end = tf.cast(self.end_token, tf.int64)
        finished = tf.zeros(tf.shape(projected)[:1], dtype=tf.bool)

        caption = []
        for _ in range(self.max_length):
            output, state_h, state_c = self.rnn(
                step_input, initial_state=rnn_state
            )
            rnn_state = [state_h, state_c]

            # Predict the next word in the sequence
            next_word = tf.argmax(self.output_layer(output), axis=1)

            # Captions that already ended keep emitting the end token.
            next_word = tf.where(finished, end, next_word)
            caption.append(next_word)

            finished = tf.logical_or(finished, tf.equal(next_word, end))

            # Early exit needs a concrete boolean, which only exists when
            # running eagerly; in graph mode all max_length steps are run.
            if tf.executing_eagerly() and bool(tf.reduce_all(finished)):
                break

            # Feed the predicted word back in as the next input.
            step_input = tf.expand_dims(self.embedding(next_word), axis=1)

        return tf.stack(caption, axis=1)

# Load the image captioning dataset
# NOTE(review): the documented tf.keras.datasets loaders do not include a
# `flickr8k` module -- confirm where this loader comes from or swap in the
# project's own Flickr8k reader.
(train_images, train_captions), (val_images, val_captions) = tf.keras.datasets.flickr8k.load_data()

# Create a vocabulary from the training captions
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_captions)

# Convert the training and validation captions to sequences of word ids
train_captions_vectors = tokenizer.texts_to_sequences(train_captions)
val_captions_vectors = tokenizer.texts_to_sequences(val_captions)

# Pad the captions to the same length.
# pad_sequences defaults to padding/truncating at the *front*; 'post' keeps
# each caption's first words at position 0 and puts the 0 padding id (which
# the model treats as end-of-sequence) after the caption instead of before it.
max_caption_length = 100
train_captions_vectors = tf.keras.preprocessing.sequence.pad_sequences(
    train_captions_vectors,
    maxlen=max_caption_length,
    padding='post',
    truncating='post',
)
val_captions_vectors = tf.keras.preprocessing.sequence.pad_sequences(
    val_captions_vectors,
    maxlen=max_caption_length,
    padding='post',
    truncating='post',
)

# Build the captioning model with a 5000-token vocabulary
model = RNNCaptioningModel(vocab_size=5000)

# Configure the optimiser, loss and reported metric.
# NOTE(review): RNNCaptioningModel.call returns argmax token ids, while
# sparse_categorical_crossentropy expects per-class scores -- confirm the
# training setup is wired as intended.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training pairs for ten epochs
model.fit(x=train_images, y=train_captions_vectors, epochs=10)

# Score the validation split; the result is unpacked as (loss, accuracy)
val_metrics = model.evaluate(x=val_images, y=val_captions_vectors)
val_loss, val_accuracy = val_metrics

print('Validation loss:', val_loss)
print('Validation accuracy:', val_accuracy)

# Generate a caption for a new image
# Force three colour channels so greyscale/CMYK JPEGs still match the
# network's RGB input.
new_image = tf.image.decode_jpeg(tf.io.read_file('image.jpg'), channels=3)

# The ImageNet VGG16 weights were trained on 224x224 inputs normalised by the
# model's own preprocess_input; feed the network the same representation.
new_image = tf.image.resize(new_image, (224, 224))
new_image = tf.keras.applications.vgg16.preprocess_input(new_image)
new_image_features = vgg16.predict(new_image[tf.newaxis, ...])

# Generate the caption
caption = model(new_image_features)[0]

# Decode the caption.
# sequences_to_texts looks each id up in a dict, so convert the tensor to a
# flat list of plain Python ints first.
caption_ids = np.asarray(caption).ravel().tolist()
decoded_caption = tokenizer.sequences_to_texts([caption_ids])[0]

# Print the caption
print('Caption:', decoded_caption)
