In [3]:
# Design notes for this notebook, kept as a plain string.
# NOTE(review): these notes and the code in this file disagree in places — the
# import cell brings in ResNet50 (not ResNet-18), captions are tokenized with
# str.split() (not NLTK), and the embedding is a Keras Embedding layer (not
# Word2Vec). Confirm which one is intended.
README = """
* Use NLTK for tokenizing input - https://www.nltk.org/
* Word2Vec for Embedding vector
* Use Resnet-18
* Use RNN
"""

In [2]:
import pandas as pd
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Embedding, Flatten, SimpleRNN,Reshape, Dense, Input, Concatenate
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from keras.preprocessing.image import load_img, img_to_array 
from tqdm import tqdm
import numpy as np
import cv2

2025-02-04 04:03:52.254266: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738641832.376476     400 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738641832.415065     400 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-04 04:03:52.721405: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Read the captions table (columns `image` and `caption`) and report its size.
captions_csv = "../data/flickr8k_dataset_kaggle/captions.txt"
data = pd.read_csv(captions_csv)
print(f"num of examples {data.shape[0]}")

data.head()

num of examples 40455


Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [5]:
# Lower-case every caption and split it on whitespace, one token list per row
# of `data` (same order as the rows).
# Iterating the column directly avoids building a row Series via
# `data.iloc[i]` for each of the rows, which is far slower and yields the
# same result.
tokenized_sentences = [caption.lower().split() for caption in data["caption"]]

In [6]:
# Vocabulary = every distinct token seen in any caption.
tokens = {token for token_list in tokenized_sentences for token in token_list}

# Longest caption, measured in tokens (0 when there are no captions).
max_len = max((len(token_list) for token_list in tokenized_sentences), default=0)

num_tokens = len(tokens)
print(f"Num of tokens : {num_tokens}")
print(f"Max Length of sentence : {max_len}")

Num of tokens : 8918
Max Length of sentence : 38


In [7]:
# Bidirectional token <-> integer-id lookup tables. Ids start at 1, so id 0 is
# never assigned to a token.
# `tokens` is a set of strings, and set iteration order for strings changes
# between interpreter sessions (hash randomization). Enumerating the sorted
# vocabulary makes the ids identical on every kernel restart, so encoded data
# and trained weights stay compatible with the mapping.
token2ind = {word: ind for ind, word in enumerate(sorted(tokens), start=1)}
ind2token = {ind: word for word, ind in token2ind.items()}

In [8]:
# ImageNet-pretrained ResNet50 without its classification head, with its
# weights frozen so it is used purely as a fixed feature extractor.
resnet_model = ResNet50(include_top=False, weights="imagenet")
resnet_model.trainable = False

I0000 00:00:1738641880.627281     400 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1767 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [9]:
# Directory prefix for the image files; image file names are appended to it
# directly, so the trailing slash is required.
folder_path = "../data/flickr8k_dataset_kaggle/Images/"

def load(image_path, size = (224,224)):
    """Read an image file and prepare it as a single-item batch for ResNet50.

    Args:
        image_path: Path of the image file to read.
        size: Target size the image is resized to while loading.

    Returns:
        The image as an array with a new leading axis, passed through the
        ResNet50 `preprocess_input` function.
    """
    pixels = img_to_array(load_img(image_path, target_size=size))
    # Add the leading batch axis expected by the model.
    batch = pixels[np.newaxis, ...]
    return preprocess_input(batch)

def extract_feature(image_path, model): 
    """Run one image through `model` and return its output as a flat vector.

    Args:
        image_path: Path of the image file; it is loaded with `load`.
        model: Object exposing `predict(batch, verbose=...)`.

    Returns:
        The prediction for the image, flattened to one dimension.
    """
    predictions = model.predict(load(image_path), verbose = 0)
    return predictions.flatten()

In [None]:
# Compute one ResNet feature vector per row of `data`, in row order.
# Fix: the original indexed `data.iloc[0]` on every iteration, so every row
# received the features of the first image instead of its own.
# The same image file appears on several caption rows, so features are
# computed once per file name and reused. Rows of the same image therefore
# share one array object — copy before mutating an entry in place.
image_features = list()
features_by_image = dict()
for image_name in tqdm(data["image"]):
    if image_name not in features_by_image:
        features_by_image[image_name] = extract_feature(
            folder_path + image_name, resnet_model
        )
    image_features.append(features_by_image[image_name])

In [34]:
# Token-embedding layer with 100-dimensional vectors. `token2ind` assigns ids
# starting at 1, so the table has `num_tokens + 1` rows to also cover id 0
# (presumably reserved for padding — confirm).
embedding_model = Embedding(output_dim=100, input_dim=num_tokens + 1)

In [36]:
def get_model():
    """Build the two-input (image + token sequence) model.

    The image branch runs the module-level `resnet_model` and flattens its
    output; the text branch runs the module-level `embedding_model` followed by
    a `SimpleRNN(256)`. Both branches are concatenated and passed through a
    100-unit ReLU layer and a softmax layer with `num_tokens` units.

    Returns:
        An uncompiled `Model` whose inputs are `[image, token_ids]`, with
        shapes `(224, 224, 3)` and `(max_len,)` per example.
    """
    image_input = Input(shape=(224, 224, 3))
    image_branch = resnet_model(image_input)
    image_branch = Flatten()(image_branch)

    # Keep the text Input tensor under its own name: the original rebound the
    # same variable to the embedding output and listed only the image input in
    # `Model(inputs=...)`, leaving the graph disconnected from the text input.
    text_input = Input(shape=(max_len,))
    text_branch = embedding_model(text_input)
    text_branch = SimpleRNN(256)(text_branch)

    output = Concatenate()([image_branch, text_branch])
    output = Dense(100, activation="relu")(output)
    # NOTE(review): token ids run 1..num_tokens while this layer has
    # `num_tokens` units (classes 0..num_tokens-1). If raw token ids are used
    # as targets, this likely needs `num_tokens + 1` units — confirm against
    # the training code.
    output = Dense(num_tokens, activation="softmax")(output)

    model = Model(inputs=[image_input, text_input], outputs=output)
    return model
    
# Build the model once and print its layer-by-layer summary as a sanity check.
get_model().summary()