In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:



import os
import pickle
import numpy as np
import re
from tqdm import tqdm
from PIL import Image

import tensorflow as tf
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Concatenate, Layer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Import necessary libraries for BLEU score
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# USER PATHS
ZIP_PATH = "/content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/images"           # Folder with images (not zipped)
CAPTION_FILE = "/content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/captions.txt"  # Captions file
FEATURES_FILE = "/content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/features.pkl"
TOKENIZER_FILE = "/content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/tokenizer.pkl"
MODEL_PATH = "/content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5"

# 1. Extract or Load Features

def extract_image_id(filename):
    return os.path.splitext(os.path.basename(filename))[0]

if not os.path.exists(FEATURES_FILE):
    print("Extracting features from images folder...")
    base_model = InceptionV3(weights='imagenet')
    model_incep = Model(inputs=base_model.input, outputs=base_incep.layers[-2].output)

    features = {}
    image_files = [f for f in os.listdir(ZIP_PATH) if f.lower().endswith('.jpg')]
    print(f"Total images found: {len(image_files)}")
    for file_name in tqdm(image_files):
        img_path = os.path.join(ZIP_PATH, file_name)
        img = Image.open(img_path).convert('RGB')
        img = img.resize((299, 299))
        image = img_to_array(img)
        image = np.expand_dims(image, axis=0)
        image = preprocess_input(image)
        feature = model_incep.predict(image, verbose=0)
        img_id = extract_image_id(file_name)
        features[img_id] = feature
    with open(FEATURES_FILE, 'wb') as f:
        pickle.dump(features, f)
    print(f"Saved features for {len(features)} images.")
else:
    print(f"Loading existing features from {FEATURES_FILE}...")
    with open(FEATURES_FILE, 'rb') as f:
        features = pickle.load(f)
    print(f"Loaded features for {len(features)} images.")



Loading existing features from /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/features.pkl...
Loaded features for 8091 images.


In [8]:
#  2. Load and Process Captions

mapping = {}
print(f"Reading captions from: {CAPTION_FILE}")
with open(CAPTION_FILE, 'r') as f:
    captions_doc = f.read()

lines = captions_doc.strip().split('\n')
print(f"Total lines in captions file: {len(lines)}")

for line in lines:
    if len(line) < 2 or ',' not in line:
        continue
    image_id, caption = line.split(',', 1)
    image_id = image_id.strip()
    caption = caption.strip()
    image_id = image_id.split('.')[0]
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(caption)

print(f"Unique image IDs in captions: {len(mapping)}")

def clean_captions(mapping):
    for key, caps in mapping.items():
        for i in range(len(caps)):
            caption = caps[i].lower()
            caption = re.sub(r'[^a-z ]', '', caption)
            caption = re.sub(r'\s+', ' ', caption).strip()
            caps[i] = 'startseq ' + ' '.join([w for w in caption.split() if len(w) > 1]) + ' endseq'

clean_captions(mapping)
print("Captions cleaned.")

caption_ids = set(mapping.keys())
feature_ids = set(features.keys())

print(f"Number of caption image IDs: {len(caption_ids)}")
print(f"Number of feature image IDs: {len(feature_ids)}")

# Add print statements to help diagnose the mismatch
print("\nFirst 10 Feature Image IDs:", list(feature_ids)[:10])
print("First 10 Caption Image IDs:", list(caption_ids)[:10])


common_ids = caption_ids.intersection(feature_ids)
print(f"Number of common IDs: {len(common_ids)}")

if len(common_ids) == 0:
    raise ValueError("No common image IDs between captions and features! Check dataset and filenames.")

filtered_mapping = {img_id: mapping[img_id] for img_id in common_ids}
image_ids = list(filtered_mapping.keys())

all_captions = [cap for caps in filtered_mapping.values() for cap in caps]




Reading captions from: /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/captions.txt
Total lines in captions file: 40456
Unique image IDs in captions: 8092
Captions cleaned.
Number of caption image IDs: 8092
Number of feature image IDs: 8091

First 10 Feature Image IDs: ['473220329_819a913bbb', '864290968_eccb46d5ab', '3429465163_fb8ac7ce7f', '2701603045_6cbdc4ce7c', '3690431163_1d81e19549', '2255332561_3375897ff0', '3106787167_e5f2312622', '309049466_1d7e7d5fc2', '2924870944_90ff9eca1a', '2249264723_d08655d9f2']
First 10 Caption Image IDs: ['473220329_819a913bbb', '864290968_eccb46d5ab', '3429465163_fb8ac7ce7f', '2701603045_6cbdc4ce7c', '3690431163_1d81e19549', '2255332561_3375897ff0', '3106787167_e5f2312622', '309049466_1d7e7d5fc2', '2924870944_90ff9eca1a', '2249264723_d08655d9f2']
Number of common IDs: 8091


In [11]:
# 3. Create or Load Tokenizer

if not os.path.exists(TOKENIZER_FILE):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_captions)
    with open(TOKENIZER_FILE, 'wb') as f:
        pickle.dump(tokenizer, f)
    print(f"Tokenizer created and saved to {TOKENIZER_FILE}")
else:
    with open(TOKENIZER_FILE, 'rb') as f:
        tokenizer = pickle.load(f)
    print("Tokenizer loaded.")

vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(c.split()) for c in all_captions)

print(f"Vocabulary size: {vocab_size}")
print(f"Max caption length: {max_length}")

Tokenizer loaded.
Vocabulary size: 8768
Max caption length: 34


In [12]:
#  4. Train/Validation Split

train_ids, val_ids = train_test_split(image_ids, test_size=0.1, random_state=42)
print(f"Training samples: {len(train_ids)}, Validation samples: {len(val_ids)}")

Training samples: 7281, Validation samples: 810


In [13]:
# 5. Data Generator

batch_size = 64

def data_generator(image_ids, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    X1, X2, y = [], [], []
    n = 0
    while True:
        for img_id in image_ids:
            caps = mapping[img_id]
            for cap in caps:
                seq = tokenizer.texts_to_sequences([cap])[0]
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length, padding='post')[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    X1.append(features[img_id][0])
                    X2.append(in_seq)
                    y.append(out_seq)
                    n += 1

                    if n == batch_size:
                        yield (np.array(X1), np.array(X2)), np.array(y)
                        X1, X2, y = [], [], []
                        n = 0

In [14]:
#  6. Bahdanau Attention
class BahdanauAttention(Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

In [15]:
#  7. Build the Model

units = 256

inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(units, activation='relu')(fe1)

inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, units, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(units, return_sequences=True)(se2)

attention = BahdanauAttention(units)
context_vector, attention_weights = attention(fe2, se3)

decoder1 = Concatenate(axis=-1)([fe2, context_vector])
decoder2 = Dense(units, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

print(model.summary())

None


In [16]:
#  8. Wrap Generators into tf.data.Dataset

output_signature = (
    (
        tf.TensorSpec(shape=(None, 2048), dtype=tf.float32),
        tf.TensorSpec(shape=(None, max_length), dtype=tf.int32),
    ),
    tf.TensorSpec(shape=(None, vocab_size), dtype=tf.float32)
)

train_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(train_ids, filtered_mapping, features, tokenizer, max_length, vocab_size, batch_size),
    output_signature=output_signature,
).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(val_ids, filtered_mapping, features, tokenizer, max_length, vocab_size, batch_size),
    output_signature=output_signature,
).prefetch(tf.data.AUTOTUNE)

steps_per_epoch = max(len(train_ids) * 5 // batch_size, 1)  # approx 5 captions/image
validation_steps = max(len(val_ids) * 5 // batch_size, 1)

In [17]:
checkpoint = ModelCheckpoint(MODEL_PATH, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
earlystop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

print("Starting model training...")
history = model.fit(
    train_dataset,
    epochs=20,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_dataset,
    validation_steps=validation_steps,
    callbacks=[checkpoint, earlystop]
)
print("Model training finished.")

Starting model training...
Epoch 1/20
[1m566/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - loss: 6.2915
Epoch 1: val_loss improved from inf to 5.33909, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 26ms/step - loss: 6.2888 - val_loss: 5.3391
Epoch 2/20
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 5.2125
Epoch 2: val_loss improved from 5.33909 to 4.93244, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 27ms/step - loss: 5.2123 - val_loss: 4.9324
Epoch 3/20
[1m566/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - loss: 4.7978
Epoch 3: val_loss improved from 4.93244 to 4.70125, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - loss: 4.7976 - val_loss: 4.7012
Epoch 4/20
[1m567/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 16ms/step - loss: 4.5633
Epoch 4: val_loss improved from 4.70125 to 4.53338, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - loss: 4.5631 - val_loss: 4.5334
Epoch 5/20
[1m566/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - loss: 4.3800
Epoch 5: val_loss improved from 4.53338 to 4.42670, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - loss: 4.3797 - val_loss: 4.4267
Epoch 6/20
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 4.3352
Epoch 6: val_loss improved from 4.42670 to 4.36298, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - loss: 4.3352 - val_loss: 4.3630
Epoch 7/20
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 4.1522
Epoch 7: val_loss improved from 4.36298 to 4.30534, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - loss: 4.1523 - val_loss: 4.3053
Epoch 8/20
[1m567/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 16ms/step - loss: 4.2492
Epoch 8: val_loss improved from 4.30534 to 4.23047, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - loss: 4.2491 - val_loss: 4.2305
Epoch 9/20
[1m566/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - loss: 4.2026
Epoch 9: val_loss improved from 4.23047 to 4.18354, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - loss: 4.2025 - val_loss: 4.1835
Epoch 10/20
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 4.0592
Epoch 10: val_loss did not improve from 4.18354
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - loss: 4.0592 - val_loss: 4.2223
Epoch 11/20
[1m566/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 16ms/step - loss: 4.0086
Epoch 11: val_loss improved from 4.18354 to 4.17645, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - loss: 4.0082 - val_loss: 4.1765
Epoch 12/20
[1m567/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - loss: 3.8399
Epoch 12: val_loss did not improve from 4.17645
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - loss: 3.8398 - val_loss: 4.1893
Epoch 13/20
[1m565/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 16ms/step - loss: 3.7144
Epoch 13: val_loss improved from 4.17645 to 4.11873, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - loss: 3.7144 - val_loss: 4.1187
Epoch 14/20
[1m565/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - loss: 3.7895
Epoch 14: val_loss improved from 4.11873 to 4.09006, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 15ms/step - loss: 3.7891 - val_loss: 4.0901
Epoch 15/20
[1m566/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 16ms/step - loss: 3.6412
Epoch 15: val_loss improved from 4.09006 to 4.06266, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - loss: 3.6411 - val_loss: 4.0627
Epoch 16/20
[1m566/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - loss: 3.6384
Epoch 16: val_loss did not improve from 4.06266
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - loss: 3.6385 - val_loss: 4.0714
Epoch 17/20
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 3.6370
Epoch 17: val_loss did not improve from 4.06266
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - loss: 3.6369 - val_loss: 4.0682
Epoch 18/20
[1m566/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - loss: 3.6589
Epoch 18: val_loss improved from 4.06266 to 4.04901, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - loss: 3.6590 - val_loss: 4.0490
Epoch 19/20
[1m567/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - loss: 3.6229
Epoch 19: val_loss improved from 4.04901 to 4.02630, saving model to /content/drive/MyDrive/Predictive Analysis Lab/Evo Astra/model_9.h5




[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - loss: 3.6229 - val_loss: 4.0263
Epoch 20/20
[1m566/568[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 16ms/step - loss: 3.5953
Epoch 20: val_loss did not improve from 4.02630
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - loss: 3.5954 - val_loss: 4.0696
Restoring model weights from the end of the best epoch: 19.
Model training finished.


In [18]:
# 10. Evaluate with BLEU Score (on validation set)

print("\nCalculating BLEU score on the validation set...")


Calculating BLEU score on the validation set...


In [19]:
#  Helper function for BLEU evaluation

# Map predicted index back to a word
def idx_to_word(integer, tokenizer):
    return next((word for word, index in tokenizer.word_index.items() if index == integer), None)

# Function to generate caption for BLEU scoring
def generate_caption_for_bleu(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = idx_to_word(yhat, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text.replace("startseq", "").replace("endseq", "").split()  # return as list of words

#  BLEU score calculation
references = []
candidates = []

for img_id in tqdm(val_ids):
    if img_id in features and img_id in filtered_mapping:
        # Reference captions
        reference_captions = [
            cap.replace("startseq", "").replace("endseq", "").strip().split()
            for cap in filtered_mapping[img_id]
        ]
        references.append(reference_captions)

        # Candidate caption
        candidate_caption = generate_caption_for_bleu(model, tokenizer, features[img_id], max_length)
        candidates.append(candidate_caption)
    else:
        print(f"Warning: Skipping BLEU calculation for {img_id} due to missing data.")

if references and candidates and len(references) == len(candidates):
    print("Calculating BLEU scores...")
    bleu1 = corpus_bleu(references, candidates, weights=(1, 0, 0, 0))
    bleu2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0, 0))
    bleu3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33, 0))
    bleu4 = corpus_bleu(references, candidates, weights=(0.25, 0.25, 0.25, 0.25))

    print(f"BLEU-1: {bleu1:.4f}")
    print(f"BLEU-2: {bleu2:.4f}")
    print(f"BLEU-3: {bleu3:.4f}")
    print(f"BLEU-4: {bleu4:.4f}")
else:
    print("Skipping BLEU score calculation: No valid references or candidates found.")

100%|██████████| 810/810 [09:38<00:00,  1.40it/s]


Calculating BLEU scores...
BLEU-1: 0.4372
BLEU-2: 0.2415
BLEU-3: 0.1328
BLEU-4: 0.0706


In [20]:
# Generate captions for validation images and collect references
for img_id in tqdm(val_ids):
    if img_id in features and img_id in filtered_mapping:
        # Get reference captions (cleaned and split into words)
        reference_captions = [cap.replace("startseq", "").replace("endseq", "").strip().split() for cap in filtered_mapping[img_id]]
        references.append(reference_captions)

        # Generate candidate caption (split into words)
        candidate_caption = generate_caption_for_bleu(model, tokenizer, features[img_id], max_length)
        candidates.append(candidate_caption)
    else:
        print(f"Warning: Skipping BLEU calculation for {img_id} due to missing data.")


if references and candidates and len(references) == len(candidates):
    # Calculate BLEU scores
    # weights=(1, 0, 0, 0) for BLEU-1, (0.5, 0.5, 0, 0) for BLEU-2, etc.
    print("Calculating BLEU scores...")
    bleu1 = corpus_bleu(references, candidates, weights=(1, 0, 0, 0))
    bleu2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0, 0))
    bleu3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33, 0))
    bleu4 = corpus_bleu(references, candidates, weights=(0.25, 0.25, 0.25, 0.25))

    print(f"BLEU-1: {bleu1:.4f}")
    print(f"BLEU-2: {bleu2:.4f}")
    print(f"BLEU-3: {bleu3:.4f}")
    print(f"BLEU-4: {bleu4:.4f}")
else:
    print("Skipping BLEU score calculation: No valid references or candidates found.")

100%|██████████| 810/810 [09:50<00:00,  1.37it/s]


Calculating BLEU scores...
BLEU-1: 0.4372
BLEU-2: 0.2415
BLEU-3: 0.1328
BLEU-4: 0.0706


In [23]:
from google.colab import output
from PIL import Image
from io import BytesIO
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.applications.inception_v3 import InceptionV3

# Global variable to store uploaded image
uploaded_img = None
model_incep = None  # will hold InceptionV3 model for feature extraction

def idx_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def upload_image(file_bytes):
    global uploaded_img
    uploaded_img = Image.open(BytesIO(bytearray(file_bytes))).convert('RGB')
    return "Image received!"

def generate_caption():
    global uploaded_img, model, tokenizer, max_length, model_incep

    if uploaded_img is None:
        return "No image uploaded yet!"

    img_resized = uploaded_img.resize((299, 299))
    image_array = img_to_array(img_resized)
    image_array = np.expand_dims(image_array, axis=0)
    image_array = preprocess_input(image_array)

    if model_incep is None:
        base_model = InceptionV3(weights='imagenet')
        model_incep = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)

    image_features = model_incep.predict(image_array, verbose=0)

    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
        yhat = model.predict([image_features, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = idx_to_word(yhat, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break

    caption = in_text.replace("startseq", "").replace("endseq", "").strip()
    return caption

output.register_callback('notebook.upload_image', upload_image)
output.register_callback('notebook.generate_caption', generate_caption)

In [None]:
from IPython.display import display, HTML

display(HTML('''
<link href="https://fonts.googleapis.com/css2?family=Libre+Baskerville&display=swap" rel="stylesheet">

<style>
  body {
    font-family: 'Poppins', sans-serif;
    background: linear-gradient(120deg, #3c8ce7, #00eaff);
    background-size: 200% 200%;
    animation: auroraBG 8s ease infinite;
    color: white;
    text-align: center;
    padding: 40px;
  }

  @keyframes auroraBG {
    0% {background-position: 0% 50%;}
    50% {background-position: 100% 50%;}
    100% {background-position: 0% 50%;}
  }

  h2 {
    font-size: 2rem;
    margin-bottom: 20px;
    background: linear-gradient(90deg, #000080, #000000);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-weight: 600;
  }

  .glass-card {
    backdrop-filter: blur(15px);
    background: rgba(255, 255, 255, 0.15);
    border: 1px solid rgba(255, 255, 255, 0.25);
    border-radius: 20px;
    padding: 25px;
    width: 340px;
    margin: auto;
    box-shadow: 0 8px 32px rgba(0,0,0,0.2);
    transition: transform 0.3s ease, box-shadow 0.3s ease;
  }
  .glass-card:hover {
    transform: translateY(-5px);
    box-shadow: 0 12px 40px rgba(0,0,0,0.3);
  }

  .glass-card img {
    width: 100%;
    border-radius: 15px;
    box-shadow: 0 5px 20px rgba(0,0,0,0.2);
    margin-bottom: 15px;
  }

  #fileInput {
    margin-top: 10px;
    padding: 12px 18px;
    font-weight: 500;
    font-size: 0.95rem;
    border: none;
    border-radius: 8px;
    background: rgba(255,255,255,0.2);
    color: #000000;
    cursor: pointer;
    transition: background 0.3s ease;
  }
  #fileInput:hover {
    background: rgba(174, 228, 255, 0.5);
  }

  #generate-btn {
    background: linear-gradient(90deg, #3c8ce7, #00eaff);
    border: none;
    padding: 12px 28px;
    font-size: 1rem;
    font-weight: 600;
    border-radius: 25px;
    cursor: pointer;
    color: #000000;
    box-shadow: 0 5px 20px rgba(60, 140, 231, 0.4);
    transition: transform 0.2s ease, box-shadow 0.3s ease;
    margin-top: 10px;
  }
  #generate-btn:disabled {
    background: rgba(255,255,255,0.3);
    cursor: not-allowed;
    box-shadow: none;
  }
  #generate-btn:hover:not(:disabled) {
    transform: translateY(-2px);
    box-shadow: 0 8px 25px rgba(60, 140, 231, 0.6);
  }

  #caption-output {
    margin-top: 20px;
    font-size: 1.1rem;
    font-family: 'Libre Baskerville', serif;
    color: #000000;
    min-height: 40px;
  }
</style>

<div class="glass-card">
  <h2>Automatic Image Caption Generator</h2>
  <input type="file" id="fileInput" accept="image/*" />
  <img id="preview" src="" style="display:none;" />
  <br/>
  <button id="generate-btn" disabled>Generate Caption</button>
  <div id="caption-output"></div>
</div>

<script>
  const input = document.getElementById('fileInput');
  const preview = document.getElementById('preview');
  const generateBtn = document.getElementById('generate-btn');
  const captionOutput = document.getElementById('caption-output');

  input.onchange = evt => {
    const [file] = input.files;
    if (file) {
      preview.src = URL.createObjectURL(file);
      preview.style.display = 'block';
      generateBtn.disabled = false;
      captionOutput.textContent = '';

      const reader = new FileReader();
      reader.onload = function() {
        const arrayBuffer = reader.result;
        const bytes = new Uint8Array(arrayBuffer);
        google.colab.kernel.invokeFunction('notebook.upload_image', [Array.from(bytes)], {});
      };
      reader.readAsArrayBuffer(file);
    }
  };

  generateBtn.onclick = () => {
    captionOutput.textContent = 'Generating caption...';

    google.colab.kernel.invokeFunction('notebook.generate_caption', [], {}).then(result => {
      captionOutput.textContent = result.data['text/plain'].replace(/'/g, "");
    }).catch(() => {
      captionOutput.textContent = '⚠️ Error generating caption';
    });
  };
</script>
'''))


In [37]:
from IPython.display import display, HTML

display(HTML('''
<link href="https://fonts.googleapis.com/css2?family=Libre+Baskerville&family=Poppins:wght@400;600&display=swap" rel="stylesheet">

<style>
/* --- Base styles --- */
body {
  font-family: 'Poppins', sans-serif;
  background: #000;
  padding: 40px;
  text-align: center;
  color: #fff;
  min-height: 100vh;
  box-sizing: border-box;
}

/* --- Responsive & Layout --- */
.glass-card {
  backdrop-filter: blur(25px) saturate(180%);
  -webkit-backdrop-filter: blur(25px) saturate(180%);
  background: rgba(255, 255, 255, 0.08);
  border-radius: 28px;
  border: 1px solid rgba(255, 255, 255, 0.15);
  padding: 32px 24px 28px 24px;
  max-width: 400px;
  margin: 40px auto;
  box-shadow: 0 4px 30px rgba(0,0,0,0.6);
  transition: transform 0.3s, box-shadow 0.3s;
  position: relative;
}
@media (max-width: 600px) {
  .glass-card {
    max-width: 96vw;
    padding: 16px 6vw 20px 6vw;
  }
}

/* --- Step Progress Indicator --- */
.steps {
  display: flex;
  justify-content: space-between;
  margin-bottom: 20px;
  gap: 3px;
  font-size: 1rem;
}
.step {
  flex: 1 1 0;
  text-align: center;
  padding: 7px 0 5px 0;
  border-radius: 18px;
  background: rgba(255,255,255,0.05);
  color: #8eeeff;
  font-weight: 500;
  transition: background 0.3s, color 0.3s;
  letter-spacing: 0.01em;
  user-select: none;
}
.step.active {
  background: linear-gradient(90deg, #3c8ce7 60%, #00eaff 100%);
  color: #172a40;
  font-weight: 700;
  box-shadow: 0 2px 12px #00eaff40;
}

/* --- Heading Gradient --- */
h2 {
  font-size: 2.1rem;
  font-weight: 600;
  margin-bottom: 14px;
  background: linear-gradient(90deg, #3c8ce7, #00eaff);
  -webkit-background-clip: text;
  -webkit-text-fill-color: transparent;
  letter-spacing: 0.02em;
}

/* --- Drag & Drop --- */
#dropzone {
  border: 2px dashed #3c8ce7;
  border-radius: 18px;
  background: rgba(50, 95, 170, 0.05);
  color: #8eeeff;
  padding: 28px 0 18px 0;
  margin-bottom: 12px;
  font-size: 1.04rem;
  transition: border-color 0.3s, background 0.3s;
  cursor: pointer;
  position: relative;
}
#dropzone.dragover {
  border-color: #00eaff;
  background: rgba(0,234,255,0.07);
}
#dropzone:focus {
  outline: 2px solid #00eaff;
}

/* --- File input / Demo button --- */
#fileInput, #demo-btn {
  margin: 6px 0 0 0;
  padding: 12px 22px;
  font-weight: 500;
  font-size: 1rem;
  border: none;
  border-radius: 18px;
  background: rgba(255,255,255,0.14);
  color: #fff;
  backdrop-filter: blur(12px);
  cursor: pointer;
  transition: background 0.3s, transform 0.1s;
}
#fileInput:hover, #demo-btn:hover {
  background: rgba(255,255,255,0.26);
  transform: scale(1.03);
}
#fileInput:focus, #demo-btn:focus {
  outline: 2px solid #00eaff;
}
/* Demo Button special */
#demo-btn {
  background: linear-gradient(90deg, #3c8ce7, #00eaff);
  color: #192e3a;
  margin-left: 7px;
  font-weight: 600;
}

/* --- Image preview --- */
#preview {
  width: 100%;
  border-radius: 18px;
  box-shadow: 0 4px 18px rgba(0,0,0,0.5);
  margin: 18px 0 12px 0;
  display: none;
  outline: 0;
  transition: box-shadow 0.2s;
}
#preview:focus {
  box-shadow: 0 0 0 3px #00eaff80;
}

/* --- Generate Button --- */
#generate-btn {
  background: linear-gradient(90deg, #3c8ce7, #00eaff);
  border: none;
  padding: 12px 28px;
  font-size: 1.02rem;
  font-weight: 700;
  border-radius: 28px;
  cursor: pointer;
  color: #fff;
  box-shadow: 0 5px 20px rgba(0,122,255,0.3);
  margin: 13px auto 4px auto;
  transition: transform 0.2s, box-shadow 0.3s, background 0.2s;
  display: block;
}
#generate-btn:disabled {
  background: rgba(255,255,255,0.12);
  color: #b2b2b2;
  cursor: not-allowed;
  box-shadow: none;
}
#generate-btn:focus {
  outline: 2px solid #00eaff;
}
#generate-btn:hover:not(:disabled) {
  transform: translateY(-2px) scale(1.03);
  box-shadow: 0 9px 36px #00eaff55;
}

/* --- Spinner loader --- */
#spinner {
  display: none;
  margin: 15px auto 2px auto;
  width: 37px;
  height: 37px;
}
.spinner-inner {
  box-sizing: border-box;
  border: 4px solid #3c8ce7;
  border-top: 4px solid #00eaff;
  border-radius: 50%;
  width: 37px;
  height: 37px;
  animation: spin 0.8s linear infinite;
  margin: auto;
}
@keyframes spin {
  0% { transform: rotate(0);}
  100% { transform: rotate(360deg);}
}

/* --- Caption Output --- */
#caption-output {
  min-height: 38px;
  margin-top: 16px;
  font-size: 1.09rem;
  font-family: 'Libre Baskerville', serif;
  color: #fff;
  background: rgba(60, 140, 231, 0.08);
  border-radius: 12px;
  padding: 10px;
  box-shadow: 0 2px 12px #3c8ce715;
  transition: background 0.2s;
  word-break: break-word;
}

/* --- Error state --- */
#error-msg {
  color: #ff6e6e;
  font-weight: 600;
  margin: 7px 0 2px 0;
  display: none;
}

/* --- Accessibility: focus and ARIA enhancements, improved contrast, transitions --- */
/* --- Micro-interaction: soft glows, animated step highlight, hover/focus as above --- */
</style>

<div class="glass-card" role="region" aria-label="Image Caption Generator Card">
  <h2 tabindex="0">Automatic Image Caption Generator</h2>

  <div class="steps" aria-label="Steps Indicator">

    <div class="step" id="step1">1. Upload</div>
    <div class="step" id="step2">2. Preview</div>
    <div class="step" id="step3">3. Generate</div>
  </div>
  <div id="dropzone" tabindex="0" aria-label="File drop area or click to select image">
    <span id="dropzonetext">Drag & drop an image here, or click to select</span>
    <input type="file" id="fileInput" accept="image/*" aria-label="Choose image file" style="opacity:0;width:0.1px;position:absolute;z-index:-1;" />
    <button id="demo-btn" type="button" aria-label="Try sample image">Demo Image</button>
  </div>
  <img id="preview" src="" alt="Selected preview" tabindex="0" />
  <span id="error-msg" role="alert"></span>
  <button id="generate-btn" disabled aria-label="Generate Caption">Generate Caption</button>
  <div id="spinner"><div class="spinner-inner" aria-hidden="true"></div></div>
  <div id="caption-output" aria-live="polite"></div>
</div>

<script>
// --- ELEMENTS ---
const steps = [document.getElementById('step1'), document.getElementById('step2'), document.getElementById('step3')];
const dropzone = document.getElementById('dropzone');
const fileInput = document.getElementById('fileInput');
const preview = document.getElementById('preview');
const generateBtn = document.getElementById('generate-btn');
const captionOutput = document.getElementById('caption-output');
const errorMsg = document.getElementById('error-msg');
const spinner = document.getElementById('spinner');
const demoBtn = document.getElementById('demo-btn');
const dropzonetext = document.getElementById('dropzonetext');

// SVG demo image (a colorful placeholder, replace with a real img if desired)
const demoImgURL = 'https://upload.wikimedia.org/wikipedia/commons/8/89/Portrait_Placeholder.png';

// --- ACCESSIBILITY helpers ---
dropzone.addEventListener('keydown', function(e) {
  if (e.key === ' ' || e.key === 'Enter') {
    fileInput.click();
    e.preventDefault();
  }
});
dropzone.addEventListener('click', function() {
  fileInput.click();
});

// --- DRAG & DROP ---
dropzone.addEventListener('dragover', (e) => {
  e.preventDefault(); dropzone.classList.add('dragover');
});
dropzone.addEventListener('dragleave', (e) => {dropzone.classList.remove('dragover');});
dropzone.addEventListener('drop', (e) => {
  e.preventDefault(); dropzone.classList.remove('dragover');
  if (e.dataTransfer.files.length > 0) {
    fileInput.files = e.dataTransfer.files;
    fileInput.dispatchEvent(new Event('change'));
  }
});

// --- INPUT HANDLERS ---
fileInput.onchange = evt => {
  errorMsg.style.display = "none";
  const [file] = fileInput.files;
  if (!file) return;
  if (!file.type.startsWith('image/')) {
    errorMsg.textContent = "File is not an image!"; errorMsg.style.display = "block";
    return;
  }
  preview.src = URL.createObjectURL(file);
  preview.style.display = 'block';
  preview.setAttribute('aria-label', 'Selected image preview');
  // Update steps
  steps.forEach(s=>s.classList.remove('active'));
  steps[1].classList.add('active');
  generateBtn.disabled = false; captionOutput.textContent = '';
  // Upload to kernel for notebook
  const reader = new FileReader();
  reader.onload = function() {
    const arrayBuffer = reader.result;
    const bytes = new Uint8Array(arrayBuffer);
    if (window.google && google.colab && google.colab.kernel)
      google.colab.kernel.invokeFunction('notebook.upload_image', [Array.from(bytes)], {});
  };
  reader.readAsArrayBuffer(file);
};

// --- DEMO BUTTON HANDLER ---
demoBtn.onclick = () => {
  errorMsg.style.display = "none";
  preview.src = demoImgURL;
  preview.style.display = 'block';
  preview.setAttribute('aria-label', 'Sample image preview');
  // Simulate step
  steps.forEach(s=>s.classList.remove('active'));
  steps[1].classList.add('active');
  generateBtn.disabled = false; captionOutput.textContent = '';
  // If desired, also load into kernel via fetch/XHR...
};

// --- GENERATE CAPTION ---
generateBtn.onclick = () => {
  errorMsg.style.display = "none";
  spinner.style.display = "block";
  captionOutput.textContent = '';
  generateBtn.disabled = true;
  steps.forEach(s=>s.classList.remove('active'));
  steps[2].classList.add('active');

  // Always open new tab on user interaction for popup unblock
  let outputTab = window.open('', '_blank');
  if (!outputTab) {
    captionOutput.textContent = "⚠️ Please allow pop-ups for this site to view the output in a new tab.";
    spinner.style.display = "none";
    generateBtn.disabled = false;
    return;
  }

  // Helper to build and update the tab
  function updateTabUI(imgSrc, captionText) {
    let tabHtml = `
    <html>
    <head>
      <title>Automatic Image Caption Generator Output</title>
      <link href="https://fonts.googleapis.com/css2?family=Libre+Baskerville&family=Poppins:wght@400;600&display=swap" rel="stylesheet">
      <style>
        body {font-family: 'Poppins', sans-serif; background: #000; color: #fff; padding: 40px 0; margin: 0;}
        .glass-card {
          backdrop-filter: blur(25px) saturate(180%);
          -webkit-backdrop-filter: blur(25px) saturate(180%);
          background: rgba(255,255,255,0.08); border-radius: 28px;
          border: 1px solid rgba(255,255,255,0.15); padding: 38px 22px 32px 22px;
          max-width: 400px; margin: 48px auto;
          box-shadow: 0 4px 30px rgba(0,0,0,0.6); text-align: center;
        }
        h2 {
          font-size: 2.1rem;font-weight: 600;margin-bottom: 14px;
          background: linear-gradient(90deg, #3c8ce7, #00eaff);
          -webkit-background-clip: text;-webkit-text-fill-color: transparent;
        }
        .steps {
          display: flex;justify-content: space-between;margin-bottom: 18px;gap:2.5px;font-size: 1rem;
        }
        .step {
          flex: 1 1 0;text-align: center;padding:6px 0 4px 0;border-radius:18px;
          background: rgba(255,255,255,0.05); color:#8eeeff;font-weight:500;
        }
        .step.active { background: linear-gradient(90deg,#3c8ce7 60%,#00eaff 100%);
          color:#172a40;font-weight:700;box-shadow:0 2px 12px #00eaff40;}
        #preview { width: 100%; border-radius: 18px; box-shadow: 0 4px 18px rgba(0,0,0,0.5); margin:15px 0 12px 0;}
        #caption-output {
          min-height:38px;margin-top:12px;
          font-size:1.09rem;font-family:'Libre Baskerville', serif;
          color:#fff;background:rgba(60,140,231,0.08);
          border-radius:12px;padding:10px;box-shadow:0 2px 12px #3c8ce715;
          transition: background 0.2s;word-break:break-word;
        }
      </style>
    </head>
    <body>
      <div class="glass-card" role="region" aria-label="Image Caption Generator Card">
        <h2 tabindex="0">Automatic Image Caption Generator</h2>
        <div class="steps" aria-label="Steps Indicator">
          <div class="step">1. Upload</div>
          <div class="step">2. Preview</div>
          <div class="step active">3. Generate</div>
        </div>
        <img id="preview" src="${imgSrc}" alt="Selected preview" tabindex="0" style="display:block;" />
        <div id="caption-output" aria-live="polite">${captionText ? captionText : 'Generating caption...'}</div>
      </div>
    </body>
    </html>
    `;
    outputTab.document.open(); outputTab.document.write(tabHtml); outputTab.document.close();
  }

  // Get image for output tab: uploaded (file) or demo (URL)
  let imgSrc = preview.src;
  let isDemo = !(imgSrc.startsWith('blob:'));
  // Show initial UI in output tab (loading spinner equivalent)
  updateTabUI(imgSrc, "Generating caption...");

  // Now run the caption kernel function as before
  if (window.google && google.colab && google.colab.kernel) {
    google.colab.kernel.invokeFunction('notebook.generate_caption', [], {})
      .then(result => {
        spinner.style.display = "none";
        let text = (result && result.data && result.data['text/plain']) ? result.data['text/plain'].replace(/'/g, "") : "";
        // For uploaded file, re-convert to img DataURL so it always works cross-tab
        if (!isDemo) {
          // Find the file from file input
          let file = fileInput.files[0];
          if (file) {
            let reader = new FileReader();
            reader.onload = function(evt) {
              let dataUrl = evt.target.result;
              updateTabUI(dataUrl, text || "No caption returned.");
            }
            reader.readAsDataURL(file);
          } else {
            updateTabUI('', text || "No caption returned.");
          }
        } else {
          updateTabUI(imgSrc, text || "No caption returned.");
        }
        generateBtn.disabled = false;
      })
      .catch(() => {
        spinner.style.display = "none";
        let errMsg = "⚠️ Error generating caption";
        if (!isDemo) {
          let file = fileInput.files;
          if (file) {
            let reader = new FileReader();
            reader.onload = function(evt) {
              updateTabUI(evt.target.result, errMsg);
            }
            reader.readAsDataURL(file);
          } else {
            updateTabUI('', errMsg);
          }
        } else {
          updateTabUI(imgSrc, errMsg);
        }
        generateBtn.disabled = false;
      });
  } else {
    setTimeout(() => {
      spinner.style.display = "none";
      let errMsg = "Notebook kernel API not available.";
      if (!isDemo) {
        let file = fileInput.files;
        if (file) {
          let reader = new FileReader();
          reader.onload = function(evt) {
            updateTabUI(evt.target.result, errMsg);
          }
          reader.readAsDataURL(file);
        } else {
          updateTabUI('', errMsg);
        }
      } else {
        updateTabUI(imgSrc, errMsg);
      }
      generateBtn.disabled = false;
    }, 800);
  }
};



// --- INITIAL STATE STEP HIGHLIGHT ---
steps.forEach(s=>s.classList.remove('active'));
steps[0].classList.add('active');

/* --- Keyboard a11y for demo button, preview, etc. */
demoBtn.tabIndex = 0;
preview.tabIndex = 0;
fileInput.tabIndex = -1;

/* --- Error state reset for every action --- */
['click','focus','input'].forEach(ev=>{
  preview.addEventListener(ev, ()=>{errorMsg.style.display="none"; });
  fileInput.addEventListener(ev, ()=>{errorMsg.style.display="none"; });
  demoBtn.addEventListener(ev, ()=>{errorMsg.style.display="none"; });
});
</script>
'''))
