# In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, LSTM
from tensorflow.keras.callbacks import ModelCheckpoint

# Load the Flickr8k dataset
# Read the whole caption file into memory. The `with` block guarantees the
# file handle is closed even if reading fails (the handle was previously
# left open for the lifetime of the process).
filename = 'Flickr8k_text/Flickr8k.token.txt'
with open(filename, 'r') as file:
    doc = file.read()

# Parse the captions and create a dictionary with image filenames and their captions
# Each useful line is "<image file>#<n> <caption words...>". The result maps
# the image id (the first token up to its first '.') to the list of caption
# strings found for that image.
captions = {}
for line in doc.splitlines():
    tokens = line.split()
    # Guard on the token count rather than the raw line length: a line made
    # only of whitespace has no tokens (tokens[0] would raise IndexError),
    # and a line holding only an image id carries no caption to keep.
    if len(tokens) < 2:
        continue
    image_id = tokens[0].split('.')[0]
    image_caption = ' '.join(tokens[1:])
    captions.setdefault(image_id, []).append(image_caption)

# Load the image features extracted by InceptionV3
# NOTE(review): every entry is indexed as entry[0] / entry[1]; presumably these
# are (image id, feature vector) pairs -- confirm against the script that
# wrote features.npy.
features = np.load('features.npy')

# Create a dictionary with image filenames and their features
# Later entries overwrite earlier ones that share the same key.
features_dict = {entry[0]: entry[1] for entry in features}

# Tokenize the captions
# Vocabulary cap shared by the tokenizer, the embedding and the output layer.
VOCAB_LIMIT = 10000
# Markers wrapped around every caption so the model can learn where a
# caption begins and ends.
START_TOKEN = 'startseq'
END_TOKEN = 'endseq'

# Flatten {image_id: [caption, ...]} into two parallel lists with one entry
# per caption. The Tokenizer treats a *list* element as an already-tokenized
# text, so handing it the per-image caption lists directly would turn every
# whole caption into a single "word".
caption_image_ids = []
caption_texts = []
for image_id, image_captions in captions.items():
    # Captions whose image has no extracted features cannot be trained on.
    if image_id not in features_dict:
        continue
    for caption in image_captions:
        caption_image_ids.append(image_id)
        caption_texts.append(' '.join((START_TOKEN, caption, END_TOKEN)))

if not caption_texts:
    raise ValueError('No caption matches an image id present in features_dict')

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_LIMIT)
tokenizer.fit_on_texts(caption_texts)
sequences = tokenizer.texts_to_sequences(caption_texts)

# Word ids start at 1 (0 is the padding id) and texts_to_sequences only emits
# ids below num_words, so this bounds every id the model will ever see.
vocab_size = min(VOCAB_LIMIT, len(tokenizer.word_index) + 1)

# Pad the sequences
max_length = max(len(seq) for seq in sequences)

# Create input and output data
# One row of features per distinct image; training samples refer to rows by
# index so each vector is converted only once.
image_row = {
    image_id: row
    for row, image_id in enumerate(dict.fromkeys(caption_image_ids))
}
image_matrix = np.array(
    [features_dict[image_id] for image_id in image_row], dtype='float32'
).reshape(len(image_row), -1)  # flattens e.g. (1, d) vectors to (d,)

# Every caption yields one sample per word after the first:
# (image, words seen so far) -> next word.
sample_rows = []
prefixes = []
next_words = []
for image_id, seq in zip(caption_image_ids, sequences):
    for split in range(1, len(seq)):
        sample_rows.append(image_row[image_id])
        prefixes.append(seq[:split])
        next_words.append(seq[split])

# NOTE: X repeats an image's feature vector for every sample of that image;
# switch to a generator / tf.data pipeline if this does not fit in memory.
X = image_matrix[np.array(sample_rows)]
padded_sequences = pad_sequences(prefixes, maxlen=max_length, padding='post')
y = np.array(next_words, dtype='int32')

# Define the model architecture
# Image branch: regularize the features and project them to the LSTM width so
# the two branches can be summed.
input_layer = Input(shape=(image_matrix.shape[1],))
dropout_layer = Dropout(0.5)(input_layer)
image_projection = Dense(256, activation='relu')(dropout_layer)

# Text branch: a symbolic Input (not the numpy data) feeds the embedding.
# mask_zero=True makes the LSTM skip the post-padding zeros.
caption_input = Input(shape=(max_length,), dtype='int32')
embedding_layer = Embedding(
    input_dim=vocab_size, output_dim=256, mask_zero=True
)(caption_input)
lstm_layer = LSTM(256)(embedding_layer)

# Merge both branches and predict a distribution over the next word.
merged_layer = tf.keras.layers.Add()([image_projection, lstm_layer])
output_layer = Dense(vocab_size, activation='softmax')(merged_layer)
model = Model(inputs=[input_layer, caption_input], outputs=output_layer)

# Compile the model
# Targets are integer word ids, so the sparse loss avoids materializing
# one-hot vectors of width vocab_size.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Train the model
model.fit([X, padded_sequences], y, epochs=50, batch_size=64)
