In [39]:
import numpy as np
from numpy import array

import base64

from PIL import Image
from pickle import load

from tensorflow.keras.models import load_model
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.applications.inception_v3 import preprocess_input

In [40]:
# Directory prefix (note the trailing slash) that is string-concatenated with
# the model file name when the trained captioning model is loaded.
project_path = "NeuraIC/"

In [41]:
# load the tokenizer
# The file is opened in a `with` block so the handle is closed as soon as the
# pickle has been read instead of being left open for the kernel's lifetime.
# NOTE(review): unpickling can execute arbitrary code; only load a
# tokenizer.pkl that comes from a trusted source.
# NOTE(review): the tokenizer is read from the working directory, whereas the
# model below is read from project_path -- confirm this is intended.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)
# pre-define the max sequence length (from training)
max_length = 34
# load the model
model = load_model(project_path+'model_img_cap_pad.h5')

In [42]:
def preprocess(imageFile):
    """Turn a PIL image into a preprocessed, batched array for InceptionV3.

    Parameters
    ----------
    imageFile : PIL.Image.Image
        Image to prepare. It is converted to RGB first, so grayscale,
        palette and RGBA images are handled as well.

    Returns
    -------
    numpy.ndarray
        The 299x299 image as an array with an extra leading axis of size 1,
        passed through the Inception v3 ``preprocess_input``.
    """
    # Force three colour channels: without this a grayscale or RGBA image
    # produces an array with 1 or 4 channels, which the Inception v3 network
    # (built with ImageNet weights) cannot consume.
    img = imageFile.convert('RGB')
    # Convert all the images to size 299x299 as expected by the inception v3 model
    img = img.resize((299, 299))
    # Convert PIL image to numpy array of 3-dimensions
    x = image.img_to_array(img)
    # Add one more dimension (the batch axis)
    x = np.expand_dims(x, axis=0)
    # preprocess the images using preprocess_input() from inception module
    x = preprocess_input(x)
    return x

In [43]:
# Lazily built InceptionV3 feature extractor, shared by every encode() call.
_feature_extractor = None


def _get_feature_extractor():
    """Build the InceptionV3 feature extractor on first use and reuse it afterwards."""
    global _feature_extractor
    if _feature_extractor is None:
        InceptionV3_model = InceptionV3(weights='imagenet')
        # Use the output of the second-to-last layer as the image features,
        # i.e. everything except the final layer of the network.
        _feature_extractor = Model(InceptionV3_model.input, InceptionV3_model.layers[-2].output)
    return _feature_extractor


# Function to encode a given image into a vector of size (2048, )
def encode(image):
    """Encode a PIL image into a 1-D feature vector with InceptionV3.

    The network is constructed only once (see ``_get_feature_extractor``);
    previously it was rebuilt, weights included, on every call.

    Parameters
    ----------
    image : PIL.Image.Image
        Image to encode. Note that this parameter name shadows the Keras
        ``image`` module inside this function only.

    Returns
    -------
    numpy.ndarray
        The prediction for the single input image, flattened from
        ``(1, N)`` to ``(N,)``.
    """
    batch = preprocess(image) # preprocess the image

    model_iV3 = _get_feature_extractor()

    fea_vec = model_iV3.predict(batch) # Get the encoding vector for the image
    fea_vec = np.reshape(fea_vec, fea_vec.shape[1]) # reshape from (1, 2048) to (2048, )
    return fea_vec

In [44]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Return the first word in ``tokenizer.word_index`` whose index equals ``integer``.

    Returns ``None`` when no word carries that index.
    """
    matches = (
        candidate
        for candidate, position in tokenizer.word_index.items()
        if position == integer
    )
    return next(matches, None)

In [45]:
def greedySearch(photo):
    """Generate a caption for an encoded photo by greedy decoding.

    Starting from ``'startseq'``, the model is asked for the next word given
    the photo features and the text so far; the highest-scoring word is
    appended each step. Decoding stops when ``'endseq'`` is predicted, when
    the predicted index has no word in the tokenizer, or after
    ``max_length`` steps.

    Parameters
    ----------
    photo : numpy.ndarray
        Image features passed as the first model input.

    Returns
    -------
    str
        The generated words joined by single spaces, without the
        ``startseq`` / ``endseq`` markers.
    """
    in_text = 'startseq'
    caption_words = []
    for _ in range(max_length):
        # integer encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo,sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = word_for_id(yhat,tokenizer)
        # word_for_id returns None for an index with no word (nothing maps
        # to it in word_index); concatenating None would raise TypeError,
        # so stop decoding instead.
        if word is None or word == 'endseq':
            break
        caption_words.append(word)
        in_text += ' ' + word
    # Collecting the words explicitly (rather than slicing off the last token
    # of in_text) keeps the final word when max_length is reached without an
    # 'endseq' being produced.
    return ' '.join(caption_words)

In [46]:
def NeuraIC(imageFile):
    """Caption an image: encode it, then decode the features greedily.

    ``imageFile`` is handed to ``encode``; the resulting vector is reshaped
    to ``(1, 2048)`` and passed to ``greedySearch``, whose result is returned.
    """
    feature_vector = encode(imageFile)
    feature_batch = feature_vector.reshape((1, 2048))
    caption = greedySearch(feature_batch)
    return caption

In [47]:
# Open the sample image in a context manager so its file handle is released
# once the caption has been generated.
with Image.open('dog.jpg') as imageFile:
    print(NeuraIC(imageFile))

black dog is running through the water
