In [0]:
import os
from google.colab import drive

drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
cd drive/My Drive/Machine Learning/IMAGE CAPTIONING

/content/drive/My Drive/Machine Learning/IMAGE CAPTIONING


In [0]:
#!kaggle datasets download -d ming666/flicker8k-dataset

In [0]:
import numpy as np
import os
import string
from tqdm import tqdm
from pickle import dump, load
import pandas as pd 
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import add
from tensorflow.keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


# FILE PATHS

In [0]:
Image = "flickr8k_dataset/Flicker8k_Dataset/"
Train = "flickr8k_text/Flickr_8k.trainImages.txt"
Test =  "flickr8k_text/Flickr_8k.testImages.txt"
text = "flickr8k_text/Flickr8k.lemma.token.txt"
Dev = "flickr8k_text/Flickr_8k.devImages.txt"

In [0]:
path = os.getcwd()

In [0]:
Imagepath = os.path.join(path , Image)
TrainPATH = os.path.join(path, Train)
TestPATH = os.path.join(path,Test)
textPATH = os.path.join(path, text)

In [0]:
def load_Doc(path):
    
    file = open(path, 'r')
    text = file.read()
    file.close()
    return text


def load_Desc(doc):
    
    mapping= dict()
    for line in doc.split('\n'):
        tokens = line.split()
        if len(line)<2:
            continue
        #get imageid
        imageid = tokens[0].split('.')[0]
        #get description
        desc = " ".join(tokens[1:])
        #check if imageid exist
        if imageid not in mapping:
            mapping[imageid]= list()
        mapping[imageid].append(desc)
    return mapping


def load_identifiers(path):
    
    doc = load_Doc(path)
    dataset = list()
    for line in doc.split("\n"):
        if len(line)<1:
            continue
        identifiers = line.split('.')[0]
        dataset.append(identifiers)
    return set(dataset)


def cleaning(description):
    # prepare translation table for removing punctuations
    table = str.maketrans('', '', string.punctuation)

    for imageid, desc_list in description.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            desc = desc.split()
            desc = [word.lower() for word in desc]
            # remove punctuation from each token
            desc = [w.translate(table) for w in desc]
            # remove hanging 's' and 'a'
            desc = [word for word in desc if len(word)>1]
            # remove tokens with numbers in them
            desc = [word for word in desc if word.isalpha()]
            # store as string
            desc_list[i] =  ' '.join(desc)


def load_clean_desc(filename, dataset):
    
    doc = load_Doc(filename)
    descriptions = dict()
    for line in doc.split("\n"):
        tokens = line.split()
        imageid, imagedesc = tokens[0], tokens[1:]
        if imageid in dataset:
        #skip image if not in dataset
            if imageid not in descriptions:
                descriptions[imageid] = list()
            desc = 'startseq' + ' '.join(imagedesc) + 'stopseq'
            descriptions[imageid].append(desc)
    return descriptions



def to_vocabolary(desc):
    
    vocab_set = set()
    for key in desc.keys():
        [vocab_set.update(d) for d in desc[key]]
    return vocab_set


def save_description(desc, filename):
    lines = list()
    for keys, des in desc.items():
        for d in des:
            lines.append(keys+' '+d)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()


def to_line(descriptions):
    lines =[]
    for keys in descriptions.keys():
        [lines.append(d) for d in descriptions[keys]]
    return lines

def load_tokens(description):
    desc = to_line(description)
    token = Tokenizer()
    token.fit_on_texts(desc)
    return token



def max_length(description):
    lines = to_line(description)
    return max(len(d.split()) for d in lines)


def create_sequence(tokeniser, maxlength, photo, description, vocab_size):
    X1,X2,Y = list(),list(),list()
    
    # walk through each image identifier
    for key, desc_list in description.items():
        for d in desc_list:
            seq = tokeniser.texts_to_sequences([d])[0]
            #split each seq into X,Y pair
            for i in range(1,len(seq)):
                input_seq, out_seq = seq[:i],seq[i]
                #pad the input sequence
                in_seq = pad_sequences([input_seq], maxlen=max_length)[0]
                #encode the output sequence
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                #store all values
                X1.append(photo[key][0])
                X2.append(in_seq)
                Y.append(out_seq)
        
    return np.array(X1),np.array(X2),np.array(Y)





def getmodel():
    #load model
    model = VGG16()
    #remove output layer
    model.layers.pop()
    model = Model(inputs= model.inputs , outputs= model.layers[-2].output)
    return model




def extract_features(directory):
    
    #feature dict
    features = dict()
    print("[INFO] loading model.....")
    model = getmodel()
    print("[INFO] Model loaded.....")
    for name in tqdm(os.listdir(directory)):
        
        image_id = name.split('.')[0]
        #get filename
        filename = os.path.join(directory, name)
        #load image
        image = load_img(filename, target_size=(224,224))
        #convert to array
        image = img_to_array(image)
        #reshape image to input size to the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        #prepare image for VGG model
        image = preprocess_input(image)
        
        #get features
        feature = model.predict(image, verbose=0)
        #store features
        features[image_id] = feature
        
    return features



def dumpfile(output, file):
    return dump(file, open(output, 'wb'))


def load_photo_features(filename, dataset):
    #load features from pickle file
    feat = load(open(filename, 'rb'))
    features = {k:feat[k] for k in dataset}
    return features


def define_model(vocab_size, max_length):
    
    #encoder1
    input1 = Input(shape=(4096,), name='Encoder1')
    en1    = Dropout(0.5)(input1)
    en2    = Dense(256,activation='relu' )(en1)
    
    #encoder2
    input2 = Input(shape=(max_length,), name='Encoder2')
    se1 = Embedding(vocab_size, 256, mask_zero=True)(input2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    #decoder
    merge = add([en2,se3])
    decoder1 = Dense(256, activation="relu")(merge)
    output = Dense(vocab_size, activation='softmax')(decoder1)
    
    #put it together
    model = Model(inputs=[input1, input2], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    plot_model(model, to_file='model.png', show_shapes=True)
    print(model.summary())
    
    return model

In [0]:
doc = load_Doc(textPATH)
#load descriptions from textfile
description = load_Desc(doc)
#clean the descriptions
cleaning(description)

#save the description
save_description(description, "description.txt")

In [0]:
from pickle import load
features = load(open('features.pkl', 'rb'))

In [0]:
#Load Training data
train = load_identifiers(TrainPATH)
print('Dataset Train: ', len(train))

"====================================================================="

#load train set description
train_descriptions = load_clean_desc('description.txt', train)
print("Descriptions Train: ", len(train_descriptions))

"======================================================================"

train_features = load_photo_features('features.pkl', train)
print("train features:", len(train_features))

"======================================================================="


# Get tokens 
tokens = load_tokens(train_descriptions)
vocab  = len(tokens.word_index)+1
print('Vocab Size:', vocab)

Dataset Train:  6000
Descriptions Train:  6000
train features: 6000
Vocab Size: 8152


In [0]:
max_length = max_length(train_descriptions)
print('Description Length:', max_length)
# prepare sequences
X1train, X2train, ytrain = create_sequence(tokens, max_length,features,train_descriptions, vocab)
print('Size of sequence',len(X2train))

Description Length: 32
Size of sequence 240907


In [0]:
test = load_identifiers(Dev)
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_desc('description.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))
# prepare sequences
X1test, X2test, ytest = create_sequence(tokens, max_length, test_features,test_descriptions, vocab)
print('Size of text Sequence:', len(X2test))
 

Dataset: 1000
Descriptions: test=1000
Photos: test=1000
Size of text Sequence: 39571


In [0]:
# define the model
model = define_model(vocab, max_length)
# define checkpoint callback
filepath = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder2 (InputLayer)           [(None, 32)]         0                                            
__________________________________________________________________________________________________
Encoder1 (InputLayer)           [(None, 4096)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 32, 256)      2086912     Encoder2[0][0]                   

In [0]:
# # fit model
#model.fit([X1train, X2train], ytrain, epochs=20, verbose=2, callbacks=[checkpoint], validation_data=([X1test, X2test], ytest))

In [0]:
#model.load_weights('model-ep003-loss3.943-val_loss4.169.h5')

In [0]:
#model.fit([X1train, X2train], ytrain, initial_epoch=3,epochs=20, verbose=2, callbacks=[checkpoint], validation_data=([X1test, X2test], ytest))

Train on 240907 samples, validate on 39571 samples
Epoch 4/20

Epoch 00004: val_loss improved from inf to 4.16759, saving model to model-ep004-loss3.978-val_loss4.168.h5
240907/240907 - 727s - loss: 3.9776 - val_loss: 4.1676
Epoch 5/20

Epoch 00005: val_loss did not improve from 4.16759
240907/240907 - 731s - loss: 3.8289 - val_loss: 4.1842
Epoch 6/20

Epoch 00006: val_loss did not improve from 4.16759
240907/240907 - 736s - loss: 3.7395 - val_loss: 4.2195
Epoch 7/20

Epoch 00007: val_loss did not improve from 4.16759
240907/240907 - 736s - loss: 3.6763 - val_loss: 4.2419
Epoch 8/20

Epoch 00008: val_loss did not improve from 4.16759
240907/240907 - 736s - loss: 3.6265 - val_loss: 4.2916
Epoch 9/20

Epoch 00009: val_loss did not improve from 4.16759
240907/240907 - 732s - loss: 3.5911 - val_loss: 4.2994
Epoch 10/20

Epoch 00010: val_loss did not improve from 4.16759
240907/240907 - 731s - loss: 3.5635 - val_loss: 4.3153
Epoch 11/20

Epoch 00011: val_loss did not improve from 4.16759
24

<tensorflow.python.keras.callbacks.History at 0x7f3787bf5eb8>

In [0]:
from tensorflow.keras.models import load_model

In [0]:
model = load_model('model-ep004-loss3.978-val_loss4.168.h5')

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
def word_for_id(integer, tokenizer):
  for word, index in tokenizer.word_index.items():
    if index== integer:
      return word
  return None

In [0]:
def gen_desc(model, tokenizer, photo, max_length):
  in_text ='startseq'

  for i in range(max_length):

    seq = tokenizer.texts_to_sequences([in_text])[0]
    seq = pad_sequences([seq], maxlen=max_length)

    yHat = model.predict([photo,seq],verbose=0)
    yHat = np.argmax(yHat)

    word = word_for_id(yHat, tokenizer)
    if word is None:
      break
    in_text+= " " + word
    if word == 'endseq':
      break
  return in_text

In [0]:
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
  actual, predicted = list(),list()

  for key, desc_list in descriptions.items():

    yHat = gen_desc(model, tokenizer, photos[key], max_length)

    reference = [d.split() for d in desc_list]
    actual.append(reference)
    predicted.append(yHat.split())
  
  print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
  print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
  print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
  print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [0]:
from nltk.translate.bleu_score import corpus_bleu

In [0]:
test = load_identifiers(Test)
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_desc('description.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))

Dataset: 1000
Descriptions: test=1000
Photos: test=1000


In [0]:
evaluate_model(model, test_descriptions, test_features, tokens, max_length)

BLEU-1: 0.166121
BLEU-2: 0.083653
BLEU-3: 0.059387
BLEU-4: 0.021189
