In [2]:
from utils import *
from tensorflow.keras.models import Sequential, Model, Model
from tensorflow.keras.layers import LSTM, Embedding, Input, add

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/thinkdeep/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/thinkdeep/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Loading data 

In [3]:
# Location of the Flickr8k images and caption files
dataset_image_path ="flickr8k/Images/"
dataset_text_path  ="flickr8k/captions.txt" 
# Wanted shape for images
wanted_shape = (224,224,3)

# TO WORK ON A REDUCED DATASET
#n_images_considered = 899 # = 8091 / 9

In [4]:
# To obtain the text dataset corresponding to images
train, infer = True, False

if train:
    df_texts = pd.read_csv(dataset_text_path, sep=",") #["image","caption"] 
elif infer:
    df_texts = pd.read_csv("text_feature_maps.csv") # ["image","caption","cleaned","cleaned_tokenized","embedded"]

n_img = df_texts.count()/5 # 40455/5 
unique_img = pd.unique(df_texts["image"])# 8091 unique images

### Preprocessing images with pretrained VGG16 : FEATURE MAPS 4096

In [5]:
base_model = VGG16(
    include_top=True, weights='imagenet', input_tensor=None,
    input_shape=wanted_shape, pooling=None, classes=1000
)
# Feature extraction
vgg_model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output) #end the modèle with a 4096 feature layer

In [6]:
import time
charge_image, one_by_one, load_fm_image_csv = False, False, True# false to gain time when testing other parts
# To obtain the feature maps
if charge_image:
    feature_maps = np.array([vgg_model.predict(load_img_from_ds(unique_img[i])) for i in range(len(unique_img))])
    print(f"Shape des fm {feature_maps.shape}")

elif one_by_one:
    feature_maps=[]
    for i in range(len(unique_img)):
        if i!=0:
            print(f"{i}/{len(unique_img)} - time elapsed :{time.time()-a}")
        else:
            print(f"{i}/{len(unique_img)}")
        a=time.time()
        img = load_img_from_ds(unique_img[i])
        feature_map = vgg_model.predict(img)
        feature_maps.append(feature_map)
    feature_maps=np.array(feature_maps)
    #save to csv
    feature_maps_sav=feature_maps[:,0,:]
    df_fm = pd.DataFrame(feature_maps_sav)
    df_fm.to_csv("image_feature_maps.csv")

elif load_fm_image_csv:
    df_fm = pd.read_csv("image_feature_maps.csv")
    feature_maps = np.array(df_fm.drop([df_fm.columns[0]], axis=1))
    print(f"Image feature maps loaded : {feature_maps.shape}")

Image feature maps loaded : (8091, 4096)


### Preprocessing captions - WORD2VEC : EMBEDDINGS 4096

In [7]:
# Not enough memory yet
word2vec_training = False

if train :
    # Text preprocessing
    df_texts["cleaned"]=[process_sentence(s) for s in df_texts["caption"]]
    df_texts["cleaned_tokenized"]=[word_tokenize(w) for w in df_texts["cleaned"]]
    if(word2vec_training):
        import logging
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        word2vec_model = gensim.models.Word2Vec([word_tokenize(w) for w in df_texts["cleaned"]], min_count=1, size=4096)
        word2vec_model.save("word2vec.model")
    else:
        word2vec_model = gensim.models.Word2Vec.load("word2vec.model")
    df_texts["embedded"] = word2vec(df_texts,word2vec_model.wv)
    df_texts.to_csv("text_feature_maps.csv", columns=df_texts.columns)

elif infer:
    word2vec_model = gensim.models.Word2Vec.load("word2vec.model")

In [5]:
import logging
import gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
word2vec_model = gensim.models.Word2Vec.load("word2vec.model")

2021-01-13 17:35:18,199 : INFO : loading Word2Vec object from word2vec.model
2021-01-13 17:35:18,257 : INFO : loading wv recursively from word2vec.model.wv.* with mmap=None
2021-01-13 17:35:18,278 : INFO : loading vectors from word2vec.model.wv.vectors.npy with mmap=None
2021-01-13 17:35:18,519 : INFO : setting ignored attribute vectors_norm to None
2021-01-13 17:35:18,525 : INFO : loading vocabulary recursively from word2vec.model.vocabulary.* with mmap=None
2021-01-13 17:35:18,531 : INFO : loading trainables recursively from word2vec.model.trainables.* with mmap=None
2021-01-13 17:35:18,536 : INFO : loading syn1neg from word2vec.model.trainables.syn1neg.npy with mmap=None
2021-01-13 17:35:18,748 : INFO : setting ignored attribute cum_table to None
2021-01-13 17:35:18,768 : INFO : loaded word2vec.model


In [6]:
df_texts["cleaned"]=[process_sentence(s) for s in df_texts["caption"]]
df_texts["cleaned_tokenized"]=[word_tokenize(w) for w in df_texts["cleaned"]]

In [14]:
df_texts.head()

Unnamed: 0,image,caption,cleaned,cleaned_tokenized
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...,startseq child pink dress climbing set stairs ...,"[startseq, child, pink, dress, climbing, set, ..."
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .,startseq girl going wooden building endseq,"[startseq, girl, going, wooden, building, endseq]"
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .,startseq little girl climbing wooden playhouse...,"[startseq, little, girl, climbing, wooden, pla..."
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...,startseq little girl climbing stairs playhouse...,"[startseq, little, girl, climbing, stairs, pla..."
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...,startseq little girl pink dress going wooden c...,"[startseq, little, girl, pink, dress, going, w..."


In [8]:
word2vec(df_texts,word2vec_model.wv)

In [1]:
dimages = feature_maps

# Split du dataset
prop_test, prop_val = 0.2, 0.2
N = len(df_texts["embedded"])
Ntest, Nval = int(N*prop_test), int(N*prop_val)

NameError: name 'feature_maps' is not defined

In [34]:
# dt = true image caption cleaned
dt_test, dt_val, dt_train = split_test_val_train(df_texts["embedded"], Ntest, Nval)
# di = true image array
di_test, di_val, di_train = split_test_val_train(dimages, Ntest, Nval)
# fnm = image_name
fnm_test, fnm_val, fnm_train = split_test_val_train(df_texts["image"], Ntest, Nval)

In [35]:
vocab_size = len(word2vec_model.wv.vocab)

Xtext_train, Ximage_train, ytext_train = finalpreprocessing(dt_train, di_train, vocab_size) 
Xtext_val, Ximage_val, ytext_val = finalpreprocessing(dt_val, di_val, vocab_size)

print(f"Training set : \n \tInput image : {Ximage_train.shape}\n\tInput text : {Xtext_train.shape}\n\tOutput text : {ytext_train.shape}")

NameError: name 'dtext' is not defined

### RNN Model 

In [None]:
dim_embedding=64

# image input
input_img = Input(shape=(Ximage_train.shape[1],), name="InputImage") )
input_img = ( Dense(units=256,activation='relu',name="CompressedImageFeatures") )(input_img)
# text input
input_txt = Input(shape=(maxlen,), name="InputSequence"))
input_txt = ( Embedding(vocab_size,dim_embedding, mask_zero=True))(input_txt)
input_txt = ( LSTM(units=8, activation="relu", name="CaptionFeatures") )(input_txt)

# Common part
common = add(input_txt, input_img)
common = Dense(256, activation='relu') (common)
common = Dense(vocab_size, activation='softmax')(common)

#Model
total_model  = Model(inputs=[input_image, input_txt],outputs=output)

model.compile(loss='categorical_crossentropy', optimizer='adam')
print(model.summary())

### Model training 

In [None]:
hist = model.fit([Ximage_train, Xtext_train], ytext_train, epochs=5, verbose=2, batch_size=64, validation_data=([Ximage_val, Xtext_val], ytext_val))

### Model evaluation

In [None]:
for label in ["loss", "val_loss"]:
    plt.plot(hist.history[label], label=label)]
plt.legend()
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.show()

In [None]:
'''
PREDICTION
'''

# 1 couche 256 LSTM ?
# A partir de combien de couce=hes c est ok 1 8 16 32 256 
# Temps d entrainement : compromis 
# Voir si dimensions pas trop grandes ?
# GRU ! :D mieux (3 params au lieu de 4)
# simpleRNN ? 
# Etude comparative : 3 RNN (simple, LSTM, GRU & Etude de perf)
# Limiter Dataset ! => entrainements en O(heure)

#tf.keras.utils.get_file(origin="lien", fname="nom_que_tu_veux_donner_au_fichier.zip", extract=True)