In [4]:
import os
import pickle

import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add
from tensorflow.keras.utils import to_categorical
from tqdm.auto import tqdm

In [19]:
# Dataset locations. BASE_DIR is the dataset root; the image and caption
# folders are spelled relative to it so the root only appears once.
BASE_DIR = "E:\\126156019\\ImageCaptioningReducedSamples\\Image Captioning"
IMG_DIR = BASE_DIR + "\\Images-20241006T055258Z-001\\image"
CAP_DIR = BASE_DIR + "\\Captions"

In [6]:
# Step 1: Load and preprocess images
def extract_features(directory):
    """Encode every file in `directory` with a truncated VGG16 network.

    The network is cut at its second-to-last layer, so each image is mapped
    to that layer's activations instead of the final classification output.

    Args:
        directory: Folder to scan; every entry returned by os.listdir is
            loaded as an image.

    Returns:
        dict mapping each file name (the part before its first '.') to the
        array returned by `predict` for that image. The leading batch axis of
        size 1 is kept.
    """
    vgg = VGG16()
    encoder = Model(inputs=vgg.inputs, outputs=vgg.layers[-2].output)

    encoded = {}
    for file_name in tqdm(os.listdir(directory)):
        # Resize to 224x224 on load, then convert to a float array.
        pixels = img_to_array(
            load_img(os.path.join(directory, file_name), target_size=(224, 224))
        )
        # Prepend a batch axis of size 1 before the VGG16 preprocessing.
        batch = preprocess_input(pixels.reshape((1,) + pixels.shape))
        encoded[file_name.split('.')[0]] = encoder.predict(batch, verbose=0)

    return encoded


print("Extracting image features...")
features = extract_features(IMG_DIR)

Extracting image features...


  0%|          | 0/2000 [00:00<?, ?it/s]



In [11]:
# Persist the extracted features to features.pkl under BASE_DIR.
# The context manager guarantees the file is flushed and closed; the previous
# bare open() left the handle to be closed whenever it got garbage-collected.
with open(os.path.join(BASE_DIR, 'features.pkl'), 'wb') as features_file:
    pickle.dump(features, features_file)

In [72]:
def load_captions(filename):
    """Read a comma-separated captions file into a per-image caption table.

    Each line is split on ','. The first field is taken as the image file
    name (only the part before its first '.' is kept as the id); all remaining
    fields are re-joined with single spaces to form the caption. Lines shorter
    than two characters are ignored.

    Args:
        filename: Path of the captions text file.

    Returns:
        dict mapping image id -> list of caption strings, in file order.
    """
    with open(filename, 'r') as handle:
        raw_text = handle.read()

    grouped = {}
    for row in tqdm(raw_text.split('\n')):
        if len(row) < 2:
            continue
        file_name, *caption_parts = row.split(',')
        grouped.setdefault(file_name.split('.')[0], []).append(" ".join(caption_parts))

    return grouped

# Load captions
captions_file = os.path.join(CAP_DIR, 'captions.txt')
mapping = load_captions(captions_file)

  0%|          | 0/10001 [00:00<?, ?it/s]

In [73]:
# Step 3: Clean captions
def clean_captions(mapping):
    """Normalise every caption in `mapping` in place.

    Each caption is lower-cased, stripped of every character that is neither
    a letter nor a space, collapsed to single spaces and wrapped as
    'startseq <caption> endseq'.

    Calling the function again on already-cleaned captions leaves them
    unchanged: an existing startseq/endseq wrapper is removed before the
    wrapper is re-applied, so re-running the notebook cell does not stack
    duplicate markers.

    Args:
        mapping: dict of image id -> list of caption strings. The lists are
            modified in place.

    Returns:
        None.
    """
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            # Keep letters and spaces only, then split on whitespace runs.
            words = ''.join(
                char for char in caption.lower() if char.isalpha() or char == ' '
            ).split()
            # Drop a wrapper left by a previous call so the result is stable.
            if len(words) >= 2 and words[0] == 'startseq' and words[-1] == 'endseq':
                words = words[1:-1]
            captions[i] = 'startseq ' + ' '.join(words) + ' endseq'

# Normalise the captions; the lists inside `mapping` are rewritten in place.
clean_captions(mapping)

In [74]:
# Step 4: Prepare text data
# Flatten the per-image caption lists into one corpus for the tokenizer.
all_captions = []
for key in mapping:
    all_captions.extend(mapping[key])

# Create tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# One more than the number of distinct words seen by the tokenizer.
vocab_size = 1 + len(tokenizer.word_index)

# Find maximum caption length
# Length is measured in whitespace-separated words, markers included.
max_length = max(map(len, map(str.split, all_captions)))



In [75]:
#Step 5 Prepare training data
def create_sequences(tokenizer, max_length, captions_list, feature, vocab_size=None):
    """Expand the captions of one image into next-word training samples.

    Every caption is encoded with `tokenizer`; for each position i >= 1 one
    sample is produced whose input is the first i tokens (padded to
    `max_length`) and whose target is token i, one-hot encoded.

    Args:
        tokenizer: Fitted tokenizer providing `texts_to_sequences` and
            `word_index`.
        max_length: Length the input token prefixes are padded to.
        captions_list: Caption strings belonging to the image.
        feature: Image feature repeated unchanged for every sample.
        vocab_size: Width of the one-hot target. When omitted it is derived
            from the tokenizer as len(word_index) + 1, so the function no
            longer depends on a module-level `vocab_size` variable.

    Returns:
        Tuple (X1, X2, y) of numpy arrays: image features, padded token
        prefixes and one-hot next-word targets, one row per sample.
    """
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1

    X1, X2, y = [], [], []

    for caption in captions_list:
        # Encode sequence
        seq = tokenizer.texts_to_sequences([caption])[0]

        # Split into input-output pairs
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            # Pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # Encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)

    return np.array(X1), np.array(X2), np.array(y)

# Step 6: Split data
# The first 80% of the image ids (in `mapping` order) train the model; the
# remainder is held out. No shuffling is applied.
image_ids = [*mapping]
split = int(0.8 * len(image_ids))
train_ids, test_ids = image_ids[:split], image_ids[split:]

In [76]:
# Step 7: Build the model
def build_model(vocab_size, max_length):
    """Assemble and compile the two-input captioning network.

    One branch takes a 4096-wide image feature vector, the other a token
    sequence of length `max_length`. Both are reduced to 256 units, summed,
    and passed through a dense layer to a softmax over `vocab_size` words.

    Args:
        vocab_size: Size of the embedding table and of the softmax output.
        max_length: Length of the token-sequence input.

    Returns:
        The compiled Keras model (categorical cross-entropy loss, Adam
        optimizer) with inputs [image_features, token_sequence].
    """
    # Image branch: dropout, then project to 256 units.
    image_input = Input(shape=(4096,))
    image_dropped = Dropout(0.5)(image_input)
    image_dense = Dense(256, activation='relu')(image_dropped)

    # Text branch: embedding with index 0 masked, dropout, then an LSTM.
    text_input = Input(shape=(max_length,))
    text_embedded = Embedding(vocab_size, 256, mask_zero=True)(text_input)
    text_dropped = Dropout(0.5)(text_embedded)
    text_state = LSTM(256)(text_dropped)

    # Decoder: element-wise sum of both branches, then next-word softmax.
    merged = Add()([image_dense, text_state])
    hidden = Dense(256, activation='relu')(merged)
    next_word_probs = Dense(vocab_size, activation='softmax')(hidden)

    caption_model = Model(inputs=[image_input, text_input], outputs=next_word_probs)
    caption_model.compile(loss='categorical_crossentropy', optimizer='adam')

    return caption_model
model = build_model(vocab_size, max_length)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

None


In [80]:
# Step 8: Data generator
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size=32):
    """Yield training batches forever, `batch_size` images at a time.

    For every image id the captions are expanded into next-word samples: the
    input is the image feature plus the first i caption tokens (padded to
    `max_length`), the target is token i one-hot encoded over `vocab_size`.
    A batch is emitted once samples from `batch_size` images have been
    collected, so the number of rows per batch varies with caption length.
    Samples left over at the end of a pass over `data_keys` are carried into
    the next pass.

    Image ids that have no entry in `features` are skipped and do not count
    towards the batch. Previously the membership check only guarded the
    caption lookup, so a missing id still raised KeyError on the feature
    lookup.

    Args:
        data_keys: Re-iterable collection of image ids to cycle through.
        mapping: dict of image id -> list of caption strings.
        features: dict of image id -> feature array with a leading batch
            axis; element [0] is used.
        tokenizer: Fitted tokenizer providing `texts_to_sequences`.
        max_length: Length the token prefixes are padded to.
        vocab_size: Width of the one-hot targets.
        batch_size: Number of images per yielded batch.

    Yields:
        ((X1, X2), y) numpy arrays: image features, padded token prefixes and
        one-hot next-word targets.

    Raises:
        ValueError: If none of `data_keys` has an entry in `features`; the
            loop would otherwise spin forever without yielding.
    """
    if not any(key in features for key in data_keys):
        raise ValueError("data_generator: none of the data_keys has extracted features")

    X1, X2, y = [], [], []
    n = 0

    while True:
        for key in data_keys:
            if key not in features:
                continue

            n += 1
            feature = features[key][0]

            # Create sequences for each caption
            for caption in mapping[key]:
                seq = tokenizer.texts_to_sequences([caption])[0]

                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    X1.append(feature)
                    X2.append(in_seq)
                    y.append(out_seq)

            if n >= batch_size:
                yield (np.array(X1), np.array(X2)), np.array(y)
                X1, X2, y = [], [], []
                n = 0

In [81]:
# Training configuration: `steps` counts whole batches of images per pass.
batch_size = 32
epochs = 2
steps = len(train_ids) // batch_size

print("Training model...")
for i in range(epochs):
    # A fresh generator per pass restarts iteration from the first training id.
    generator = data_generator(
        data_keys=train_ids,
        mapping=mapping,
        features=features,
        tokenizer=tokenizer,
        max_length=max_length,
        vocab_size=vocab_size,
        batch_size=batch_size,
    )
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

Training model...




50/50 ━━━━━━━━━━━━━━━━━━━━ 96s 2s/step - loss: 6.1607
50/50 ━━━━━━━━━━━━━━━━━━━━ 89s 2s/step - loss: 4.8910


In [83]:
# Save the model
# Join with os.path.join: plain string concatenation dropped the path
# separator, so the file was written next to BASE_DIR (with the folder name
# glued onto the file name) instead of inside it.
model.save(os.path.join(BASE_DIR, 'image_captioning_model.h5'))





In [84]:
# Step 10: Generate captions
def idx_to_word(integer, tokenizer):
    """Return the word the tokenizer assigned to index `integer`.

    The tokenizer's reverse table (`index_word`) is consulted first when it
    exists, which avoids scanning the whole vocabulary for every generated
    word. If the tokenizer has no such table, or the index is not in it, the
    function falls back to a linear search of `word_index`.

    Args:
        integer: Word index to look up.
        tokenizer: Object exposing `word_index` (word -> index) and
            optionally `index_word` (index -> word).

    Returns:
        The matching word, or None when no word has that index.
    """
    reverse_table = getattr(tokenizer, 'index_word', None)
    if reverse_table:
        word = reverse_table.get(integer)
        if word is not None:
            return word

    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption for one image.

    Starting from 'startseq', the text generated so far is encoded, padded to
    `max_length` and fed to the model together with the image feature; the
    highest-scoring word is appended. Decoding stops after `max_length` words,
    when 'endseq' is produced, or when the predicted index has no word.

    Args:
        model: Model taking [image, padded_sequence] and returning word scores.
        image: Image feature passed to the model unchanged.
        tokenizer: Fitted tokenizer used for encoding and reverse lookup.
        max_length: Padding length and maximum number of generated words.

    Returns:
        The generated caption, including the leading 'startseq' and, if it
        was produced, the trailing 'endseq'.
    """
    words = ['startseq']

    for _ in range(max_length):
        encoded = tokenizer.texts_to_sequences([" ".join(words)])[0]
        padded = pad_sequences([encoded], max_length)

        scores = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(np.argmax(scores), tokenizer)

        # An index with no vocabulary entry ends decoding without appending.
        if next_word is None:
            break

        words.append(next_word)
        if next_word == 'endseq':
            break

    return " ".join(words)

# Test the model
print("\nTesting the model...")
# Slice rather than index 0..4 so a hold-out set with fewer than five images
# is handled instead of raising IndexError.
for key in test_ids[:5]:
    image = features[key]
    caption = predict_caption(model, image, tokenizer, max_length)

    print(f"Image: {key}")
    print(f"Generated Caption: {caption}")
    # Show at most the first two reference captions for comparison.
    print(f"Actual Captions: {mapping[key][:2]}")
    print("-" * 50)

print("Image captioning model training completed!")


Testing the model...




Image: 3595216998_0a19efebd0
Generated Caption: startseq a dog dog is is in a endseq
Actual Captions: ['startseq a black dog leaps in the air while playing outside endseq', 'startseq a small black and white dog jumps with red plastic fence in background endseq']
--------------------------------------------------
Image: 3595408539_a7d8aabc24
Generated Caption: startseq a man in a man in a shirt endseq
Actual Captions: ['startseq a man and a woman holding cups with another man nearby endseq', 'startseq a woman stands next to a man in a hat holding a cup standing next to another man in a yellow hat endseq']
--------------------------------------------------
Image: 3595992258_6f192e6ae7
Generated Caption: startseq a brown dog dog in a brown endseq
Actual Captions: ['startseq a brown dog is running on a rock endseq', 'startseq a brown dog running endseq']
--------------------------------------------------
Image: 3596131692_91b8a05606
Generated Caption: startseq a man is in a ball endseq
Act