# Step 1: Load and prepare the dataset

In [1]:
import os
import string
from numpy import array
from PIL import Image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
import numpy as np

# Load and preprocess a single image for VGG16 (images are assumed to be in a 'Flickr30k_images' folder)

In [2]:
def load_image(path, target_size=(224, 224)):
    """Read one image from disk and return it as a VGG16-ready array.

    Args:
        path: Location of the image file, handed to ``load_img``.
        target_size: Forwarded to ``load_img`` as ``target_size``;
            defaults to ``(224, 224)``.

    Returns:
        The image array after ``img_to_array`` and VGG16's
        ``preprocess_input``. No batch axis is added here.
    """
    pixels = img_to_array(load_img(path, target_size=target_size))
    return preprocess_input(pixels)

# Load captions file

In [3]:
def load_captions(filename):
    """Read the whole captions file into a single string.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.

    Args:
        filename: Path of the captions file to read.

    Returns:
        The complete file contents as one ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()

# Parse captions

In [4]:
def parse_captions(captions):
    """Group captions by image ID.

    Each line of ``captions`` is split on tab characters. Lines with fewer
    than two tab-separated fields are ignored. The first field, truncated at
    its first ``'.'``, is the image ID; the second field is the caption.
    Any further fields on the line are discarded.

    Args:
        captions: Full text of the captions file, lines separated by ``'\\n'``.

    Returns:
        A dict mapping each image ID to the list of its captions, in the
        order they appear in the text.
    """
    descriptions = {}
    for record in captions.split('\n'):
        fields = record.split('\t')
        if len(fields) >= 2:
            # Keep only the text before the first dot of the first field.
            key = fields[0].partition('.')[0]
            descriptions.setdefault(key, []).append(fields[1])
    return descriptions

# Load and parse the captions file

In [None]:
# Relative path: resolved against the notebook's current working directory.
captions_file = 'Flickr30k.token.txt'
# Entire captions file as one string.
captions_text = load_captions(captions_file)
# Dict mapping each image ID to the list of captions parsed for that image.
descriptions = parse_captions(captions_text)

In [9]:
# Status message only: it is printed unconditionally and does not inspect the parsed result.
print("Loaded and parsed captions successfully!")

Loaded and parsed captions successfully!


In [10]:

from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.models import Model
from tensorflow.keras.applications.vgg16 import VGG16


In [11]:
def create_encoder_model():
    """Build the frozen VGG16 image encoder.

    VGG16 is instantiated with ImageNet weights, without its top
    classification layers, for ``(224, 224, 3)`` inputs. Every layer is
    marked non-trainable.

    Returns:
        A ``Model`` mapping the VGG16 input to the output of its last layer.
    """
    backbone = VGG16(include_top=False, weights="imagenet", input_shape=(224, 224, 3))
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False
    final_output = backbone.layers[-1].output
    return Model(inputs=backbone.input, outputs=final_output)

In [12]:
def create_decoder_model(vocab_size, max_length):
    """Build the caption decoder that merges image and text features.

    The image branch takes a ``(7, 7, 512)`` feature map, pools it to a
    single 512-dim vector, and projects it to 256 units. The text branch
    embeds a padded word-index sequence of length ``max_length`` (index 0 is
    masked) and summarises it with an LSTM into 256 units. Both vectors are
    added and mapped to a softmax over the vocabulary.

    Args:
        vocab_size: Number of words in the vocabulary; sets the embedding
            input dimension and the width of the softmax output.
        max_length: Length of the padded caption input sequence.

    Returns:
        A ``Model`` with inputs ``[image_features, caption_sequence]`` and a
        single ``(batch, vocab_size)`` next-word distribution as output.
    """
    # Local import: the notebook's layer imports do not include a pooling layer.
    from tensorflow.keras.layers import GlobalAveragePooling2D

    # Input for image features
    inputs1 = Input(shape=(7, 7, 512))
    # Dense only transforms the last axis, so feeding the 7x7 grid straight
    # into it keeps the image branch rank-4 and the merged softmax would be
    # emitted per spatial cell. Pool the grid to one vector per image first.
    pooled = GlobalAveragePooling2D()(inputs1)
    fe1 = Dense(256, activation='relu')(pooled)
    fe2 = Dropout(0.5)(fe1)
    fe3 = Dense(256, activation='relu')(fe2)

    # Input for the sequence of words (captions)
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Decoder (combines image and text features); both branches are (batch, 256)
    decoder1 = add([fe3, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    return Model(inputs=[inputs1, inputs2], outputs=outputs)


In [13]:
# Instantiate the encoder and decoder models
encoder = create_encoder_model()
# NOTE(review): vocab_size=10000 and max_length=34 are hard-coded here; presumably they
# should be derived from the fitted tokenizer and the longest caption — confirm before training.
decoder = create_decoder_model(vocab_size=10000, max_length=34)  # Adjust vocab_size and max_length as needed

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [15]:
# Status message only: it is printed unconditionally and does not check the models' shapes.
print("Model created successfully!")

Model created successfully!
