In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing import image

import numpy as np

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import os

from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model

# load the ResNet50 Model
feature_extractor = ResNet50(weights='imagenet', include_top=False)
feature_extractor_new = Model(feature_extractor.input, feature_extractor.layers[-2].output)
feature_extractor_new.summary()

image_path = "Images/"

data_images = []

i = 0
for file in os.listdir(image_path):
    if i >= 1000:
        break
    path = image_path + file
    img = image.load_img(path, target_size=(90, 90))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)
    
    feature = feature_extractor_new.predict(img_data)
    feature_reshaped = np.array(feature).flatten()
    
    data_images.append(feature_reshaped)
    
    i+= 1

2021-12-06 14:37:57.423505: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, None,  0           []                               
                                 3)]                                                              
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, None, None,   0           ['input_1[0][0]']                
                                3)                                                                
                                                                                                  
 conv1_conv (Conv2D)            (None, None, None,   9472        ['conv1_pad[0][0]']              
                                64)                                                           

In [2]:
print (len(data_images))

1000


Split train and val

In [3]:
image_list = os.listdir(image_path)
total = 1000
split = 857
image_list = image_list[:total]
len(image_list)
train_data_images = image_list[:split]
test_data_images = image_list[split:]

Tokenization

In [4]:
# loading captions from captions file
import pandas as pd

# loading captions.txt
#captions = pd.read_csv('/kaggle/input/flickr8k/captions.txt', sep=",")
captions = pd.read_csv('captions.txt', sep=",")
captions = captions.rename(columns=lambda x: x.strip().lower())
captions['image'] = captions['image'].apply(lambda x: x.split(".")[0])
captions = captions[['image', 'caption']]
# adding <start> and <end> to every caption
captions['caption'] = "<start> " + captions['caption'] + " <end>"

# in case we have any missing caption/blank caption drop it
print(captions.shape)
captions = captions.dropna()
print(captions.shape)

# training and testing image captions split
train_image_captions = {}
test_image_captions = {}

# list for storing every caption
all_captions = []

# storing training data
i = 0
for image in train_data_images:
    tempDf = captions['caption'][i]
    list_of_captions = tempDf.split()
    train_image_captions[image] = list_of_captions
    all_captions.append(list_of_captions)
    i += 1

# store testing data\
for image in test_data_images:
    tempDf = captions['caption'][i]
    list_of_captions = tempDf.split()
    print(list_of_captions)
    test_image_captions[image] = list_of_captions
    all_captions.append(list_of_captions)
    i+=1

print("Data Statistics")
print(f"Training Images Captions {len(train_image_captions.keys())}")
print(f"Testing Images Captions {len(test_image_captions.keys())}")

(40455, 2)
(40455, 2)
['<start>', 'A', 'man', 'and', 'woman', 'standing', 'in', 'front', 'of', 'a', 'refreshment', 'stand', '<end>']
['<start>', 'A', 'man', 'in', 'a', 'black', 'shirt', 'and', 'jeans', 'walks', 'beside', 'a', 'woman', 'in', 'black', 'pants', 'and', 'a', 'flowery', 'shirt', 'in', 'front', 'of', 'a', 'refreshment', 'stand', 'with', 'others', 'walking', 'nearby', '.', '<end>']
['<start>', 'A', 'woman', 'with', 'a', 'floral', 'shirt', 'and', 'purse', 'and', 'a', 'man', 'with', 'a', 'black', 'shirt', 'are', 'walking', 'while', 'another', 'man', 'walks', 'behind', 'them', '.', '<end>']
['<start>', 'A', 'boy', 'lays', 'on', 'a', 'picnic', 'table', 'bench', '.', '<end>']
['<start>', 'A', 'child', 'is', 'laying', 'down', 'on', 'a', 'wooden', 'bench', '.', '<end>']
['<start>', 'A', 'child', 'is', 'on', 'a', 'bench', 'with', 'arms', 'stretched', 'out', '.', '<end>']
['<start>', 'A', 'little', 'boy', 'in', 'a', 'red', 'shirt', 'sitting', 'on', 'a', 'wooden', 'picnic', 'table', 'be

Tokenizer

In [35]:
import spacy
nlp = spacy.load("en_core_web_sm")

# tokenize evry captions, remove punctuations, lowercase everything
for key, value in train_image_captions.items():
    ls = []
    for v in value:
        doc = nlp(v)
        new_v = " "
        for token in doc:
            if not token.is_punct:
                if token.text not in [" ", "\n", "\n\n"]:
                    new_v = new_v + " " + token.text.lower()
        
        new_v = new_v.strip()
        ls.append(new_v)
    train_image_captions[key] = ls
#print(train_image_captions)

# create a vocabulary of all the unique words present in captions
# flatten the list
all_captions = [caption for list_of_captions in all_captions for caption in list_of_captions]

# use spacy to convert to lowercase and reject any special characters 
tokens = []
for captions in all_captions:
    doc = nlp(captions)
    for token in doc:
        if not token.is_punct:
            if token.text not in [" ", "\n", "\n\n"]:
                tokens.append(token.text.lower())

# get tokens with frequency less than 10
import collections
word_count_dict = collections.Counter(tokens)
reject_words = []
for key, value in word_count_dict.items():
    if value < 10:
        reject_words.append(key)
        
reject_words.append("<")
reject_words.append(">")

 # remove tokens that are in reject words
tokens = [x for x in tokens if x not in reject_words]
#print(tokens)
# convert the token to equivalent index using Tokenizer class of Keras
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token='0')
tokenizer.fit_on_texts(tokens)

KeyboardInterrupt: 

In [34]:
tokenizer.word_index

{'0': 1,
 'a': 2,
 't': 3,
 'e': 4,
 'n': 5,
 'r': 6,
 'i': 7,
 's': 8,
 'o': 9,
 'd': 10,
 'l': 11,
 'h': 12,
 'g': 13,
 'w': 14,
 'c': 15,
 'b': 16,
 'u': 17,
 'p': 18,
 'm': 19,
 'y': 20,
 'f': 21,
 'k': 22,
 'v': 23,
 'j': 24,
 'x': 25,
 'z': 26}

In [7]:

# compute length of vocabulary and maximum length of a caption (for padding)
vocab_len = len(tokenizer.word_counts) + 1
print(f"Vocabulary length - {vocab_len}")

max_caption_len = max([len(x) for x in all_captions])
#max_caption_len = 10

print(f"Maximum length of caption - {max_caption_len}")

Vocabulary length - 156
Maximum length of caption - 13


BEGINNNING OF LSTM FIX FIRST BLOCK

In [31]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# generator function to generate inputs for model
def create_training_data(captions, images, tokenizer, max_caption_length, vocab_len, photos_per_batch):
    
    X1, X2, y = list(), list(), list()
    n=0
    for i in range(len(captions)):
        #img_name = captions[i][0]
        img = images[i]
        cap = captions[img]

        tokenized_captions = tokenizer.texts_to_sequences(cap)
        print(cap)
        print(tokenized_captions)
        input_seq = pad_sequences([inp], maxlen=max_caption_length)[0]
        X1
        
    '''
    # loop through every image
    while 1:
        for key, cap in list(captions.items()):
            n+=1
            # retrieve the photo feature
            image = key
            
            for c in cap:
                # encode the sequence #TODO
                sequence = [tokenizer.word_index[word] for word in c.split(' ') if word in list(tokenizer.word_index.keys())]
                
                # split one sequence into multiple X, y pairs
                
                for i in range(1, len(sequence)):
                    # creating input, output
                    inp, out = sequence[:i], sequence[i]
                    # padding input                     
                    input_seq = pad_sequences([inp], maxlen=max_caption_length)[0]
                    # encode output sequence
                    output_seq = to_categorical([out], num_classes=vocab_len)[0]
                    # store
                    X1.append(image)
                    X2.append(input_seq)
                    y.append(output_seq)
                    
            # yield the batch data
            if n==photos_per_batch:
                yield ([np.array(X1), np.array(X2)], np.array(y))
                X1, X2, y = list(), list(), list()
                n=0
    '''

In [9]:
import keras

def create_model(max_caption_length, vocab_length):
    
    # sub network for handling the image feature part
    input_layer1 = keras.Input(shape=(18432,)) #18432
    feature1 = keras.layers.Dropout(0.2)(input_layer1)
    feature2 = keras.layers.Dense(max_caption_length*4, activation='relu')(feature1)
    feature3 = keras.layers.Dense(max_caption_length*4, activation='relu')(feature2)
    feature4 = keras.layers.Dense(max_caption_length*4, activation='relu')(feature3)
    feature5 = keras.layers.Dense(max_caption_length*4, activation='relu')(feature4)
    
    # sub network for handling the text generation part
    input_layer2 = keras.Input(shape=(max_caption_len,)) #max_caption_length
    cap_layer1 = keras.layers.Embedding(vocab_length, 300, input_length=max_caption_length)(input_layer2)
    cap_layer2 = keras.layers.Dropout(0.2)(cap_layer1)
    cap_layer3 = keras.layers.LSTM(max_caption_length*4, activation='relu', return_sequences=True)(cap_layer2)
    cap_layer4 = keras.layers.LSTM(max_caption_length*4, activation='relu', return_sequences=True)(cap_layer3)
    cap_layer5 = keras.layers.LSTM(max_caption_length*4, activation='relu', return_sequences=True)(cap_layer4)
    cap_layer6 = keras.layers.LSTM(max_caption_length*4, activation='relu')(cap_layer5)
    
    # merging the two sub network
    decoder1 = keras.layers.merge.add([feature5, cap_layer6])
    decoder2 = keras.layers.Dense(256, activation='relu')(decoder1)
    decoder3 = keras.layers.Dense(256, activation='relu')(decoder2)
    
    # output is the next word in sequence
    output_layer = keras.layers.Dense(vocab_length, activation='softmax')(decoder3)
    model = keras.models.Model(inputs=[input_layer1, input_layer2], outputs=output_layer)
    
    model.summary()

    return model

In [10]:
print(list(train_image_captions.items())[0])

('2387197355_237f6f41ee.jpg', ['< start >', 'a', 'child', 'in', 'a', 'pink', 'dress', 'is', 'climbing', 'up', 'a', 'set', 'of', 'stairs', 'in', 'an', 'entry', 'way', '', '< end >'])


In [11]:
import spacy
nlp = spacy.load('en_core_web_lg')

# create word embeddings
embedding_dimension = 300
embedding_matrix = np.zeros((vocab_len, embedding_dimension))

# travel through every word in vocabulary and get its corresponding vector
for word, index in tokenizer.word_index.items():
    doc = nlp(word)
    embedding_vector = np.array(doc.vector)
    embedding_matrix[index] = embedding_vector

predictive_model = create_model(max_caption_len, vocab_len)
# adding embeddings to model
predictive_model.layers[2]
predictive_model.layers[2].set_weights([embedding_matrix])
predictive_model.layers[2].trainable = False

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 13)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 18432)]      0           []                               
                                                                                                  
 embedding (Embedding)          (None, 13, 300)      46800       ['input_3[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 18432)        0           ['input_2[0][0]']                
                                                                                            

In [32]:
# get training data
train_data = create_training_data(train_image_captions, train_data_images, tokenizer, max_caption_len, vocab_len, 32)
# initialize model
model = create_model(max_caption_len, vocab_len)
steps_per_epochs = len(train_image_captions)//32

# compile model
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.fit_generator(train_data, epochs=100, steps_per_epoch=steps_per_epochs)

['< start >', 'a', 'child', 'in', 'a', 'pink', 'dress', 'is', 'climbing', 'up', 'a', 'set', 'of', 'stairs', 'in', 'an', 'entry', 'way', '', '< end >']
[[3], [1], [30], [4], [1], [141], [142], [7], [61], [51], [1], [], [12], [], [4], [26], [], [], [], [2]]
['< start >', 'a', 'girl', 'going', 'into', 'a', 'wooden', 'building', '', '< end >']
[[3], [1], [16], [], [42], [1], [], [116], [], [2]]
['< start >', 'a', 'little', 'girl', 'climbing', 'into', 'a', 'wooden', 'playhouse', '', '< end >']
[[3], [1], [32], [16], [61], [42], [1], [], [], [], [2]]
['< start >', 'a', 'little', 'girl', 'climbing', 'the', 'stairs', 'to', 'her', 'playhouse', '', '< end >']
[[3], [1], [32], [16], [61], [5], [], [23], [69], [], [], [2]]
['< start >', 'a', 'little', 'girl', 'in', 'a', 'pink', 'dress', 'going', 'into', 'a', 'wooden', 'cabin', '', '< end >']
[[3], [1], [32], [16], [4], [1], [141], [142], [], [42], [1], [], [], [], [2]]
['< start >', 'a', 'black', 'dog', 'and', 'a', 'spotted', 'dog', 'are', 'fighti

TypeError: 'NoneType' object is not an iterator

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
%matplotlib inline

# method for generating captions
def generate_captions(model, image, tokenizer, max_caption_length):
    # input is <start>
    input_text = '<start>'
    
    # keep generating words till we have encountered <end>
    for i in range(max_caption_length):
        seq = [tokenizer.word_index[w] for w in in_text.split() if w in list(tokenizer.word_index.keys())]
        seq = pad_sequences([sequence], maxlen=max_caption_length)
        prediction = model.predict([photo,sequence], verbose=0)
        prediction = np.argmax(prediction)
        word = tokenizer.index_word[prediction]
        input_text += ' ' + word
        if word == '<end>':
            break
    
    # remove <start> and <end> from output and return string
    output = in_text.split()
    output = output[1:-1]
    output = ' '.join(output)
    return output

# traverse through testing images to generate captions
count = 0
for key, value in test_data_images.items():
    test_image = test_data_images[key]
    test_image = np.expand_dims(test_image, axis=0)
    final_caption = generate_captions(predictive_model, test_image, tokenizer.word_index, max_caption_len, tokenizer.index_word)
    
    plt.figure(figsize=(7,7))
    image = Image.open(image_path + "//" + key + ".jpg")
    plt.imshow(image)
    plt.title(final_caption)
    
    count = count + 1
    if count == 3:
        break

NameError: name 'test_image_features' is not defined