In [None]:
import tensorflow as tf
from tensorflow.keras.applications import VGG16  # Or ResNet50
from tensorflow.keras.layers import Dense, LSTM, Embedding, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import os

In [None]:
# Colab-specific upload helper; whatever the call returns is kept in `uploaded`.
# NOTE(review): presumably this is where Cars.zip (opened by the extraction
# cell below) is supplied — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
import os
import zipfile

# Destination directory and the archive that gets unpacked into it.
extract_path = '/content/images'
zip_path = 'Cars.zip'

# Unpack every member of the archive; the context manager closes the file.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

# image_dir is simply another name for the extraction directory.
image_dir = extract_path
print("Images extracted to:", image_dir)

In [None]:
def preprocess_image(image_path):
    """Load one image file and prepare it as a single-item batch for VGG16.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, and passed through the VGG16 ``preprocess_input`` routine.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The preprocessed array, with a leading batch axis of size 1.
    """
    pixels = img_to_array(load_img(image_path, target_size=(224, 224)))
    batch = np.expand_dims(pixels, axis=0)
    return tf.keras.applications.vgg16.preprocess_input(batch)

# Collect every .jpg directly inside the extraction directory. os.listdir
# returns entries in arbitrary order, so sort to keep the image order (and
# every "first N" preview below) stable across runs.
image_files = sorted(
    os.path.join(extract_path, f)
    for f in os.listdir(extract_path)
    if f.lower().endswith('.jpg')
)

# Preprocess each image exactly once; the arrays are reused for feature
# extraction below instead of being recomputed per image.
preprocessed_images = [preprocess_image(path) for path in image_files]

print(f"Processed {len(preprocessed_images)} images.")

# VGG16 with ImageNet weights and without its classification head, used as a
# fixed feature extractor.
cnn_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Map each image path to its flattened 1-D feature vector.
image_features = {}
for img_path, img_batch in zip(image_files, preprocessed_images):
    features = cnn_model.predict(img_batch)
    image_features[img_path] = features.flatten()
print("Extracted features from images.")

In [None]:
# Second Colab upload; this rebinds `uploaded`, replacing the value stored by
# the earlier upload cell.
# NOTE(review): presumably this is where Captions.txt (named by `caption_file`
# below) is supplied — confirm.
from google.colab import files
uploaded = files.upload()


In [None]:
# Name of the caption file read by the caption-parsing cells below.
caption_file = 'Captions.txt'

In [None]:
# Parse the caption file into {image_path: [caption, ...]}.
image_captions = {}
with open(caption_file, 'r') as f:
    for line in f:
        # Split on the first run of whitespace so both space- and
        # tab-separated files parse. Blank or caption-less lines are skipped
        # instead of breaking the two-value unpacking with a ValueError.
        parts = line.strip().split(maxsplit=1)
        if len(parts) != 2:
            if parts:
                print("Skipping malformed line:", line.strip())
            continue
        image_path, caption = parts
        image_captions.setdefault(image_path, []).append(caption)

# Fit the tokenizer on every caption of every image.
all_captions = [caption for captions in image_captions.values() for caption in captions]
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(all_captions)
# word_index holds every word seen, but texts_to_sequences only emits indices
# below num_words, so the effective vocabulary is capped at num_words.
vocab_size = min(len(tokenizer.word_index) + 1, tokenizer.num_words)

# Integer-encode the captions, keeping them grouped per image.
caption_sequences = {
    img_path: tokenizer.texts_to_sequences(captions)
    for img_path, captions in image_captions.items()
}

# The longest encoded caption sets the padded length; fail with a clear
# message rather than an opaque max() error when nothing was parsed.
caption_lengths = [len(seq) for seqs in caption_sequences.values() for seq in seqs]
if not caption_lengths:
    raise ValueError(f"No captions could be parsed from {caption_file!r}")
max_caption_length = max(caption_lengths)

# Pad every caption (zeros appended at the end) to max_caption_length.
padded_sequences = {
    img_path: pad_sequences(sequences, maxlen=max_caption_length, padding='post')
    for img_path, sequences in caption_sequences.items()
}

# Flat {image_path: caption} lookup; filled in by the cell that follows.
captions_dict = {}

In [None]:
# Fill captions_dict with one caption per image path. When an image path
# appears on several lines, the last caption read wins.
with open(caption_file, 'r') as f:
    for line in f:
        stripped = line.strip()
        if not stripped:
            continue  # blank lines carry no data and are not worth reporting
        # Split on the first run of whitespace so both tab- and
        # space-separated caption files are accepted.
        parts = stripped.split(maxsplit=1)
        if len(parts) == 2:
            image_path, caption = parts
            captions_dict[image_path] = caption
        else:
            print("Skipping malformed line:", stripped)

In [None]:
# The image features stored in image_features are fully flattened CNN outputs,
# so the image input must be as wide as the product of all non-batch output
# dimensions. output_shape[1] alone is only one dimension of the CNN's
# multi-dimensional feature map and would not match the stored vectors.
image_feature_dim = int(np.prod(cnn_model.output_shape[1:]))
image_input = Input(shape=(image_feature_dim,))

# Padded, integer-encoded caption of fixed length.
caption_input = Input(shape=(max_caption_length,))

# mask_zero=True makes downstream layers ignore the zero padding.
embedding_layer = Embedding(vocab_size, 256, mask_zero=True)(caption_input)

# Summarise the caption into a single 256-unit vector.
rnn_layer = LSTM(256)(embedding_layer)

# Join the image features with the caption summary.
decoder = tf.keras.layers.concatenate([image_input, rnn_layer])

# Softmax distribution over the vocabulary.
output = Dense(vocab_size, activation='softmax')(decoder)

model = Model(inputs=[image_input, caption_input], outputs=output)

model.compile(optimizer='adam', loss='categorical_crossentropy')



In [None]:
# Spot-check the first two images: print each file's caption (or a
# placeholder when none is recorded) and the shape of its preprocessed array.
for img_path in image_files[:2]:
    file_name = os.path.basename(img_path)
    caption = captions_dict.get(file_name, "No caption found")
    preprocessed = preprocess_image(os.path.join(extract_path, file_name))
    print(f"{file_name}: {caption} | Preprocessed shape: {preprocessed.shape}")

In [None]:
# Summary of everything built so far: image arrays, CNN features, captions,
# tokenizer vocabulary, padded sequences and the model architecture.
print("\n--- Preprocessing and Feature Extraction Results ---")
if not preprocessed_images:
    print("No images were preprocessed.")
else:
    print("Shape of the first preprocessed image:", preprocessed_images[0].shape)

print("\nFirst 2 entries of image_features:")
for key, value in list(image_features.items())[:2]:
    print(f"{key}: shape {value.shape}")

print("\nFirst 2 entries of image_captions:")
for key, value in list(image_captions.items())[:2]:
    print(f"{key}: {value}")

print(f"\nVocabulary size: {vocab_size}")
print(f"Maximum caption length: {max_caption_length}")
print("\nFirst 10 words in tokenizer's word index:")
# Only the leading items of the word index are shown as a vocabulary sample.
for word, index in list(tokenizer.word_index.items())[:10]:
    print(f"{word}: {index}")

print("\nShape of padded sequences for the first image:")
padded_values = list(padded_sequences.values())
if padded_values:
    print(padded_values[0].shape)
else:
    print("No padded sequences generated.")

print("\n--- Model Summary ---")
model.summary()
