# In [None]:
#####Step 1: Import the necessary libraries and download the dataset.

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import to_categorical
import pickle
import os
import re

# Download and extract the Flickr8k dataset: https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip
# Download the captions file: https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip

# Set the paths to your dataset and captions
# image_folder: directory holding the extracted Flickr8k images. It is not
#   referenced anywhere else in this section.
#   NOTE(review): confirm this matches the folder name the archive actually
#   unpacks to.
# captions_file: token file from Flickr8k_text.zip; it is opened for reading
#   in Step 2. Both paths are relative to the current working directory.
image_folder = 'Flickr8k_Dataset'
captions_file = 'Flickr8k.token.txt'


######Step 2: Preprocess the dataset, including image feature extraction and text tokenization.

# Image feature extraction using InceptionV3
# Lazily built InceptionV3 feature extractor, shared by every call so the
# ImageNet weights are loaded only once per process.
_feature_extractor = None


def _get_feature_extractor():
    """Return the shared InceptionV3 feature extractor, building it on first use."""
    global _feature_extractor
    if _feature_extractor is None:
        # pooling='avg' applies global average pooling to the last convolutional
        # block, so the network emits one flat vector per image (the shape the
        # captioning model's image input expects) instead of a spatial map.
        _feature_extractor = InceptionV3(
            include_top=False, weights='imagenet', pooling='avg'
        )
    return _feature_extractor


def preprocess_images(image_path):
    """Extract an InceptionV3 feature vector for a single image.

    The image is loaded and resized to 299x299, converted to an array, given a
    leading batch axis, and normalised with InceptionV3's own
    ``preprocess_input`` before being run through the network.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The array returned by ``predict`` for a batch of one image; with
        global average pooling this is one feature vector per image,
        i.e. shape ``(1, 2048)``.
    """
    image = keras.preprocessing.image.load_img(image_path, target_size=(299, 299))
    image = keras.preprocessing.image.img_to_array(image)
    image = np.expand_dims(image, axis=0)  # add the batch dimension
    image = keras.applications.inception_v3.preprocess_input(image)

    return _get_feature_extractor().predict(image)

# Tokenize captions
def tokenize_captions(captions):
    """Fit a Keras ``Tokenizer`` on the given captions and return it.

    Args:
        captions: Iterable of caption strings to build the vocabulary from.

    Returns:
        The fitted ``Tokenizer`` (``num_words=5000``, OOV token ``"<OOV>"``).
    """
    # The default filter set contains '<' and '>', which would turn the
    # '<start>' / '<end>' sequence tags into the ordinary words 'start' / 'end'.
    # Use the default punctuation filters minus the angle brackets so the tags
    # survive tokenisation intact.
    tag_preserving_filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
    tokenizer = Tokenizer(
        num_words=5000,
        oov_token="<OOV>",
        filters=tag_preserving_filters,
    )
    tokenizer.fit_on_texts(captions)
    return tokenizer

# Load and preprocess captions
# Read every line of the captions file. The encoding is given explicitly so
# the result does not depend on the platform's default locale.
with open(captions_file, 'r', encoding='utf-8') as file:
    captions = file.readlines()

# Initialize a list to store preprocessed captions (one entry per input line,
# in the same order as `captions`)
preprocessed_captions = []

for caption in captions:
    # Preprocess one caption at a time.
    # Each line of Flickr8k.token.txt has the form
    #     <image file>#<caption number>\t<caption text>
    # Keep only the text after the first tab; otherwise fragments of the image
    # file name would be cleaned and tokenised as if they were caption words.
    # Lines without a tab are used whole.
    if '\t' in caption:
        caption = caption.split('\t', 1)[1]

    # Convert to lowercase
    caption = caption.lower()

    # Remove digits, special characters, and anything that is not a letter
    caption = re.sub(r'[^a-zA-Z]', ' ', caption)

    # Remove additional spaces
    caption = re.sub(r'\s+', ' ', caption).strip()

    # Add start and end tags to the caption
    caption = '<start> ' + caption + ' <end>'

    preprocessed_captions.append(caption)

tokenizer = tokenize_captions(preprocessed_captions)
# +1 because tokenizer indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(preprocessed_captions)
# Pad every sequence (with trailing zeros) to the length of the longest one.
max_sequence_length = max(len(sequence) for sequence in sequences)
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')




#####Step 3: Create an image captioning model using LSTM and train it.
# Image captions model
# Image branch: 2048-d feature vector -> dropout -> 256-unit ReLU projection.
image_model = keras.Sequential()
image_model.add(keras.layers.Input(shape=(2048,)))
image_model.add(keras.layers.Dropout(0.5))
image_model.add(Dense(256, activation='relu'))

# Caption branch: padded word-index sequence -> embedding (index 0 masked as
# padding) -> LSTM -> 256-unit ReLU projection.
caption_model = keras.Sequential()
caption_model.add(keras.layers.Input(shape=(max_sequence_length,)))
caption_model.add(Embedding(input_dim=vocab_size, output_dim=256, mask_zero=True))
caption_model.add(LSTM(256))
caption_model.add(Dense(256, activation='relu'))

# Join both branches and predict a probability for every vocabulary index.
combined_model = keras.layers.Concatenate()(
    [image_model.output, caption_model.output]
)
x = Dense(256, activation='relu')(combined_model)
outputs = Dense(vocab_size, activation='softmax')(x)

# Two-input model: [image features, caption prefix] -> next-word distribution.
model = Model(
    inputs=[image_model.input, caption_model.input],
    outputs=outputs,
)
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

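# Train the model using your dataset.
# NOTE: no training call (e.g. model.fit) appears in this section; the model
# must be fitted before Step 4 can produce meaningful captions.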


#######Step 4: Generate captions for images using the trained model
# Generate captions for images
def generate_caption(image_path):
    """Generate a caption for one image by greedy word-by-word decoding.

    Starting from the start tag, the model is asked repeatedly for the most
    probable next word given the image features and the words produced so far.
    Decoding stops at the end tag, at the padding index, or after
    ``max_sequence_length`` steps.

    Args:
        image_path: Path of the image file, passed to ``preprocess_images``.

    Returns:
        The predicted words joined by single spaces, without the start and end
        tags. An empty string if the first prediction already ends the caption.
    """
    image_features = preprocess_images(image_path)
    # If the extractor returned a 4-D (batch, height, width, channels) feature
    # map, average over the spatial axes to get one flat vector per image.
    if image_features.ndim == 4:
        image_features = image_features.mean(axis=(1, 2))
    # `model` is built on `image_model.input`, so it applies the image branch
    # itself. It must receive the raw feature vector, not the already projected
    # output of `image_model`.

    # Use the tags in the form the tokenizer actually stored them: with the
    # angle brackets if they survived tokenisation, bare words otherwise.
    vocabulary = tokenizer.word_index
    start_token = '<start>' if '<start>' in vocabulary else 'start'
    end_token = '<end>' if '<end>' in vocabulary else 'end'

    caption = [start_token]
    for _ in range(max_sequence_length):
        # A list of tokens is looked up word by word by the tokenizer.
        caption_sequence = tokenizer.texts_to_sequences([caption])
        caption_sequence = pad_sequences(
            caption_sequence, maxlen=max_sequence_length, padding='post'
        )

        next_word_prob = model.predict(
            [image_features, caption_sequence], verbose=0
        )[0]
        # Index 0 is the padding slot and has no entry in index_word; treat it
        # as the end of the caption instead of raising KeyError.
        next_word = tokenizer.index_word.get(int(np.argmax(next_word_prob)))

        if next_word is None or next_word == end_token:
            break
        caption.append(next_word)

    # The end tag is never appended, so only the leading start tag is dropped.
    return ' '.join(caption[1:])

# Example usage
# Caption a single image and print the result.
# 'example.jpg' is a relative path, so it is resolved against the current
# working directory.
# Note: this rebinds the module-level name `caption`, which the Step 2 loop
# also uses as its loop variable.
image_path = 'example.jpg'
caption = generate_caption(image_path)
print(caption)
