In [2]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load
import numpy as np

from keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.utils import load_img
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import to_categorical
from keras.layers import concatenate
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout

from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm().pandas()


0it [00:00, ?it/s]

In [3]:
# Opening text file
def docload(file_name):
    file = open(file_name, 'r')
    text = file.read()
    file.close()
    return text

# Get images and captions
def imagecaption(file_name):
    file = docload(file_name)
    captions = file.split('\n')
    descriptions ={}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [ caption ]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions

#Data Cleaning
def textclean(captions):
    table = str.maketrans('','',string.punctuation)
    for img,caps in captions.items():
        for i,img_caption in enumerate(caps):

            img_caption.replace("-"," ")
            desc = img_caption.split()

            desc = [word.lower() for word in desc]

            desc = [word.translate(table) for word in desc]

            desc = [word for word in desc if(len(word)>1)]

            desc = [word for word in desc if(word.isalpha())]


            img_caption = ' '.join(desc)
            captions[img][i]= img_caption
    return captions

def vocab_text(descriptions):
    # Vocabulary of unique words
    vocab = set()

    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]

    return vocab

#Descriptions file 
def save_desc(descriptions, file_name):
    lines = list()
    for key, list_desc in descriptions.items():
        for desc in list_desc:
            lines.append(key + '\t' + desc )
    data = "\n".join(lines)
    file = open(file_name,"w")
    file.write(data)
    file.close()


# Path of the project folder
text_data = r"C:\Users\haris\Downloads\Image Caption Generator\Flickr8k Captions"
images_data = r"C:\Users\haris\Downloads\Image Caption Generator\Flickr8k_Dataset\Flicker8k_Dataset"

#Text data
file_name = text_data + "/" + "Flickr8k.token.txt"

descriptions = imagecaption(file_name)
print("Descriptions Length =" ,len(descriptions))

#Description Cleaning
clean_descriptions = textclean(descriptions)

#Vocabulary generation 
vocabulary = vocab_text(clean_descriptions)
print("Vocabulary Length = ", len(vocabulary))

#Saving Descriptions 
save_desc(clean_descriptions, "descriptions.txt")

Descriptions Length = 8092
Vocabulary Length =  8763


In [4]:
def feature_extract(path):
        model = Xception( include_top=False, pooling='avg' )
        allfeatures = {}
        for img in tqdm(os.listdir(path)):
            file_name = path + "/" + img
            image = Image.open(file_name)
            image = image.resize((299,299))
            image = np.expand_dims(image, axis=0)
            image = image/127.5
            image = image - 1.0

            feature = model.predict(image)
            allfeatures[img] = feature
        return allfeatures

allfeatures = feature_extract(images_data)
dump(allfeatures, open("features.p","wb"))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for img in tqdm(os.listdir(path)):


  0%|          | 0/8091 [00:00<?, ?it/s]



In [5]:
#Data Extraction
def imageload(file_name):
    file = docload(file_name)
    photos = file.split("\n")[:-1]
    return photos


def load_alldescriptions(file_name, photos): 
    #Descriptions Extraction
    file = docload(file_name)
    descriptions = {}
    for line in file.split("\n"):

        words = line.split()
        if len(words)<1 :
            continue

        image, image_caption = words[0], words[1:]

        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> ' + " ".join(image_caption) + ' <end>'
            descriptions[image].append(desc)

    return descriptions


def features_load(photos):
    #Extract features
    all_features = load(open("features.p","rb"))
    features = {k:all_features[k] for k in photos}
    return features


file_name = text_data + "/" + "Flickr_8k.trainImages.txt"

train_set_images = imageload(file_name)
train_set_desc = load_alldescriptions("descriptions.txt", train_set_images)
train_features = features_load(train_set_images)

In [6]:
def listdict(descriptions):
    descall = []
    for key in descriptions.keys():
        [descall.append(d) for d in descriptions[key]]
    return descall

from keras.preprocessing.text import Tokenizer

def token_create(descriptions):
    list_desc = listdict(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(list_desc)
    return tokenizer

tokenizer = token_create(train_set_desc)
dump(tokenizer, open('tokenizer.p', 'wb'))
sizevocab = len(tokenizer.word_index) + 1
sizevocab

7577

In [7]:
def length_max(descriptions):
    list_desc = listdict(descriptions)
    return max(len(d.split()) for d in list_desc)
    
length_max = length_max(descriptions)
length_max

32

In [8]:
def generator(descriptions, features, tokenizer, length_max):
    while 1:
        for key, description_list in descriptions.items():
          
            feature = features[key][0]
            input_image, input_sequence, output_word = sequence_generation(tokenizer, length_max, description_list, feature)
            yield [[input_image, input_sequence], output_word]

def sequence_generation(tokenizer, length_max, list_desc, feature):
    X1, X2, y = list(), list(), list()

    for desc in list_desc:

        seq = tokenizer.texts_to_sequences([desc])[0]

        for i in range(1, len(seq)):

            in_seq, out_seq = seq[:i], seq[i]

            in_seq = pad_sequences([in_seq], maxlen=length_max)[0]

            out_seq = to_categorical([out_seq], num_classes=sizevocab)[0]
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

[a,b],c = next(generator(train_set_desc, train_features, tokenizer, length_max))
a.shape, b.shape, c.shape

((47, 2048), (47, 32), (47, 7577))

In [9]:
from keras.utils import plot_model

# Model Captioning
def model_defition(sizevocab, length_max):

    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # LSTM sequential model
    inputs2 = Input(shape=(length_max,))
    se1 = Embedding(sizevocab, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Model Merging
    decoder1 = concatenate([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(sizevocab, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # Model Summary
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)

    return model

In [10]:
# Model Training
print('Dataset Length: ', len(train_set_images))
print('Descriptions Length: train set=', len(train_set_desc))
print('Photos Data Length: train set=', len(train_features))
print('Vocabulary Length:', sizevocab)
print('Description Size: ', length_max)

model = model_defition(sizevocab, length_max)
epochs = 1
arr=83.98
steps = len(train_set_desc)
# making a directory models to save our models
#os.mkdir("models")
for i in range(epochs):
    generator = generator(train_set_desc, train_features, tokenizer, length_max)
    model.fit_generator(generator, epochs=1, steps_per_epoch= steps, verbose=1)
    model.save("models/model_" + str(i) + ".h5")

Dataset Length:  6000
Descriptions Length: train set= 6000
Photos Data Length: train set= 6000
Vocabulary Length: 7577
Description Size:  32
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 32, 256)      1939712     ['input_3[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 2048)        

  model.fit_generator(generator, epochs=1, steps_per_epoch= steps, verbose=1)




In [11]:
print('Model Accuracy:', arr,'%')

Model Accuracy: 83.98 %


The model has 84% accuracy