1. Step 1: Install Necessary Libraries

In [1]:
pip install numpy pandas tensorflow keras pillow matplotlib scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


2. Import Required Libraries


In [2]:
import numpy as np
import pandas as pd
import os
from PIL import Image
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Sequential
import string





3. Load and Preprocess the Dataset

In [3]:
# Load the VGG16 model pre-trained on ImageNet
model = VGG16(weights='imagenet')
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)  # Remove the final dense layer

# Function to preprocess the image and extract features
def extract_features(directory):
    features = {}
    for img_name in os.listdir(directory):
        # Load image from file
        filename = os.path.join(directory, img_name)
        image = load_img(filename, target_size=(224, 224))
        
        # Convert image pixels to a numpy array
        image = img_to_array(image)
        image = np.expand_dims(image, axis=0)
        
        # Prepare image for VGG16
        image = preprocess_input(image)
        
        # Extract features
        feature = model.predict(image, verbose=0)
        image_id = img_name.split('.')[0]
        features[image_id] = feature
    return features








In [4]:
# Directory where images are stored
image_dir = r'C:\Users\Houda\Downloads\archive (2)\Images'
features = extract_features(image_dir)

3. Process Captions

In [5]:
# Load the caption file
def load_doc(filename):
    with open(filename, 'r') as file:
        text = file.read()
    return text

# Parse the caption file
def load_descriptions(doc):
    mapping = {}
    for line in doc.split('\n'):
        tokens = line.split()
        if len(tokens) < 2:
            continue
        img_id, img_caption = tokens[0], tokens[1:]
        img_id = img_id.split('.')[0]  # Remove file extension
        img_caption = ' '.join(img_caption)
        if img_id not in mapping:
            mapping[img_id] = []
        mapping[img_id].append(img_caption)
    return mapping

filename = r'C:\Users\Houda\Downloads\archive (2)\captions.txt'
doc = load_doc(filename)
descriptions = load_descriptions(doc)

# Clean the descriptions by removing punctuation and converting to lowercase
def clean_descriptions(descriptions):
    table = str.maketrans('', '', string.punctuation)
    for key, desc_list in descriptions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            desc = desc.split()
            desc = [word.lower() for word in desc]
            desc = [w.translate(table) for w in desc]
            desc_list[i] = ' '.join(desc)

clean_descriptions(descriptions)


4. Tokenize the Captions

In [6]:
# Convert descriptions to a list of captions
def to_lines(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

# Create tokenizer and fit on captions
all_desc = to_lines(descriptions)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_desc)

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Convert a description to a sequence of integers
def create_sequences(tokenizer, max_length, desc_list, photo):
    X1, X2, y = [], [], []
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

# Maximum length of caption sequences
max_length = max(len(d.split()) for d in all_desc)


5. Define the Image Captioning Model 

In [7]:
# Define the image captioning model
def define_model(vocab_size, max_length):
    # Image feature extractor
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    # Sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    # Decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    # Merge models
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

model = define_model(vocab_size, max_length)
model.summary()



Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 35)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 4096)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 35, 256)              2238976   ['input_3[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 4096)                 0         ['input_2[0][0]']             
                                                                                           

6. Train the Model 

In [8]:
# Convert descriptions dictionary to lists for splitting
keys = list(descriptions.keys())
values = [descriptions[key] for key in keys]

# Split the keys and values into training and validation sets
train_keys, test_keys, train_descs, test_descs = train_test_split(keys, values, test_size=0.2, random_state=42)

# Convert back to dictionary format
train_descriptions = {k: descriptions[k] for k in train_keys}
test_descriptions = {k: descriptions[k] for k in test_keys}

# Train the model
epochs = 20
for epoch in range(epochs):
    for key, desc_list in train_descriptions.items():
        if key not in features:
            print(f"Skipping missing feature for image ID: {key}")
            continue
        
        photo = features[key][0]  # Get image feature
        X1, X2, y = create_sequences(tokenizer, max_length, desc_list, photo)
        # Check shapes of data before fitting
        print(f"Epoch {epoch+1}/{epochs} - X1 shape: {X1.shape}, X2 shape: {X2.shape}, y shape: {y.shape}")
        model.fit([X1, X2], y, epochs=1, verbose=1)


Epoch 1/20 - X1 shape: (42, 4096), X2 shape: (42, 35), y shape: (42, 8746)



7. Generate Captions 

In [None]:
def generate_desc(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text

# Example usage (using a test image ID)
test_image_id = test_keys[0]  # Replace with an actual test image ID
if test_image_id in features:
    photo = features[test_image_id][0]  # Get image feature
    caption = generate_desc(model, tokenizer, photo, max_length)
    print('Generated Caption:', caption)
else:
    print(f"Image ID {test_image_id} not found in features")
