In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
import glob
import numpy as np
import pandas as pd
import cv2

In [None]:
from keras.layers import Input, merge, Concatenate, Dense, Dropout, Conv2D, Add, Dot, Lambda, Conv2DTranspose, Dot, Activation, Reshape, BatchNormalization, UpSampling2D, AveragePooling2D, GlobalAveragePooling2D, Multiply, Softmax, LeakyReLU, Flatten, MaxPool2D, MaxPool3D, Embedding, GRU
from keras.layers.convolutional import Convolution2D, UpSampling2D, SeparableConv2D, UpSampling3D
from keras.layers import PReLU, LeakyReLU
from keras.models import Model
from keras.models import load_model
import keras.backend as K
from keras import layers
from keras import activations
from keras.preprocessing.sequence import pad_sequences

from nltk.tokenize import sent_tokenize, word_tokenize 
import tensorflow as tf

In [None]:
# Read the caption table and keep its first 1600 rows as raw records.
captions = pd.read_csv('./coco-captions-2017-clean.csv')
annot_list = captions.values[:1600]

# One lower-cased token list per caption; the caption text is column 2 of each row.
data = [[token.lower() for token in word_tokenize(row[2])] for row in annot_list]

# Token count of the longest caption (remains -inf when there are no rows).
max_sequence_length = max((len(tokens) for tokens in data), default=float("-inf"))

# Vocabulary ids are 1-based and follow the order in which tokens first appear.
corpus_dict = {}
for tokens in data:
    for token in tokens:
        corpus_dict.setdefault(token, len(corpus_dict) + 1)

corpus_size = len(corpus_dict)

In [None]:
# Display the longest caption length (in tokens) found while tokenizing the captions.
max_sequence_length

In [None]:
# load external text embedding model (word2vec)
# Caption tokens are later looked up in this model to obtain their vectors.
w2v_model = Word2Vec.load('./text_encoding.bin')

In [None]:
# Number of words in the loaded embedding model's vocabulary.
# NOTE(review): `wv.vocab` looks like the gensim 3.x attribute — confirm the pinned gensim version.
len(w2v_model.wv.vocab)

In [None]:
# encode all our captions
# captions_X collects one (n_tokens, dim) array per caption; caption_strings keeps the
# matching lower-cased caption text at the same index.
captions_X = []
caption_strings = []

for row in annot_list:
    caption_text = row[2].lower()
    # Index the KeyedVectors (`.wv`) rather than the Word2Vec model itself:
    # model-level `w2v_model[word]` is the deprecated spelling of this lookup.
    # A token missing from the embedding vocabulary raises KeyError here.
    caption_conv = [w2v_model.wv[word] for word in word_tokenize(caption_text)]
    captions_X.append(np.array(caption_conv))
    caption_strings.append(caption_text)
    
    
    

In [None]:
# Pad with 0-vectors to max_sequence_length
# Each caption is copied into a freshly allocated (max_sequence_length, embedding_dim)
# buffer instead of being resized in place. In-place ndarray.resize only works on arrays
# that own their data, so the original cell could not be re-run once captions_X had
# been stacked into a single array (its rows are then views).
embedding_dim = 100  # width of one word vector, as used throughout this notebook
captions_padded = []
for caption_vectors in captions_X:
    caption_vectors = np.asarray(caption_vectors)
    padded = np.zeros((max_sequence_length, embedding_dim), dtype=caption_vectors.dtype)
    # Captions longer than max_sequence_length are truncated, as resize did.
    n_tokens = min(len(caption_vectors), max_sequence_length)
    if n_tokens:
        padded[:n_tokens] = caption_vectors[:n_tokens]
    captions_padded.append(padded)
captions_X = np.array(captions_padded)

In [None]:
def data_loader(file_path):
    """Read every file directly under ``file_path`` as an image.

    Args:
        file_path: Directory to scan; every entry matching ``file_path + '/*'`` is read.

    Returns:
        np.ndarray built from the list of decoded images, each divided by 255.0.
        An empty directory yields an empty array.

    Raises:
        IOError: If OpenCV cannot decode one of the files.
    """
    # NOTE(review): glob.glob does not guarantee an ordering; callers that pair these
    # images with captions by index should confirm the order is the intended one.
    img_paths = glob.glob(file_path + '/*')
    data = []
    for img_path in img_paths:
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports an unreadable or non-image file by returning None;
            # fail with the offending path instead of an opaque `None / 255.0` TypeError.
            raise IOError('could not read image file: ' + img_path)
        data.append(img / 255.0)
    return np.array(data)

In [None]:
# Load every image under ./cleaned-data/ and keep the first 1600 as training targets.
# NOTE(review): assumes image i corresponds to caption i of annot_list — confirm the file ordering.
imgs_all = data_loader('./cleaned-data/')
imgs_y = imgs_all[:1600]

In [None]:
# custom metric to avoid binary accuracy rounding
import keras.metrics
def image_closeness(y_pred, y_true):
    """Mean of ``1 - |y_pred - y_true|`` over the last axis.

    The absolute difference is symmetric, so the result does not depend on which of
    the two tensors is passed first.
    """
    abs_error = K.abs(y_pred - y_true)
    closeness = 1 - abs_error
    return K.mean(closeness, axis=-1)
# Attach the metric to the keras.metrics module under its own name.
# NOTE(review): presumably so load_model can resolve 'image_closeness' by name — confirm.
keras.metrics.image_closeness = image_closeness




In [None]:
# load autoencoder and get decoder portion
autoencoder = load_model('./autoencoder-v2-models/autoencoder-v2-9k-epochs.h5') # transpose conv
# NOTE(review): assumes the decoder sub-model is the autoencoder's layer at index 2 — confirm with autoencoder.summary().
decoder = autoencoder.layers[2]

In [None]:
# freeze decoder weights
# Both the individual layers and the decoder model itself are marked non-trainable.
for layer in decoder.layers:
    layer.trainable = False
decoder.trainable = False

In [None]:
def build_intermediate_model(input_shape, name='textencoder', encode_channels=[4, 8, 16, 32, 32]):
    """Build the text encoder that maps an embedded caption to a 32x32 feature map.

    A GRU summarises the caption sequence; its final state goes through a 1024-unit
    Dense layer, is reshaped to (32, 32, 1) and refined by one residual convolution
    block per entry of ``encode_channels``.

    Args:
        input_shape: Shape of one embedded caption, e.g. (sequence_length, vector_size).
        name: Name given to the returned Keras model.
        encode_channels: Number of filters for each residual block, in order. The list
            is only iterated, never modified.

    Returns:
        A Keras ``Model`` whose output has spatial size 32x32 and
        ``encode_channels[-1]`` channels.
    """
    gru_size = 1024
    input_layer = Input(shape=input_shape)

    # With return_state=True the GRU returns (output, final_state); only the state is used.
    _, state_i = GRU(gru_size, return_state=True, name='gru')(input_layer)
    state_i = Dense(1024)(state_i)
    state_i = LeakyReLU()(state_i)

    # 1024 Dense units are laid out as a single-channel 32x32 map (32 * 32 == 1024).
    encoder_block = Reshape((32, 32, 1))(state_i)

    for channel in encode_channels:
        # Projection so the skip path has the same channel count as the main path.
        # NOTE(review): trainable=False keeps this projection at its initial weights — confirm intended.
        shortcut = Conv2D(channel, 3, padding='same', trainable=False)(encoder_block)
        encoder_block = BatchNormalization()(encoder_block)
        encoder_block = LeakyReLU()(encoder_block)
        encoder_block = Conv2D(channel, 3, padding='same')(encoder_block)

        encoder_block = BatchNormalization()(encoder_block)
        encoder_block = LeakyReLU()(encoder_block)
        encoder_block = Conv2D(channel, 3, padding='same')(encoder_block)

        encoder_block = Add()([encoder_block, shortcut])

        encoder_block = Conv2D(channel, 3, padding='same')(encoder_block)
        encoder_block = LeakyReLU()(encoder_block)

    return Model(input_layer, encoder_block, name=name)

In [None]:
# Build the text encoder for captions of max_sequence_length tokens with 100-dimensional vectors.
# NOTE(review): build_combined_model feeds this model a (27, 100) input — confirm max_sequence_length is 27.
model_intermediate = build_intermediate_model((max_sequence_length, 100))
model_intermediate.summary()

In [None]:
# Report how many layers the text encoder contains.
num_layers = len(model_intermediate.layers)
print(f'Model is {num_layers} layers deep.')

In [None]:
def build_combined_model(input_shape=(27, 100)):
    """Chain the text encoder and the decoder into one text-to-image model.

    Uses the module-level ``model_intermediate`` (caption -> latent map) and
    ``decoder`` (latent map -> image) as they exist at call time.

    Args:
        input_shape: Shape of one embedded caption. Defaults to (27, 100), the shape
            previously hard-coded here; it must match what ``model_intermediate``
            was built for.

    Returns:
        A Keras ``Model`` named "text2img" mapping an embedded caption to a decoded image.
    """
    input_caption = Input(shape=input_shape)
    latent_representation = model_intermediate(input_caption)
    decoded_img = decoder(latent_representation)
    # Returned directly rather than bound to a local called `autoencoder`, which
    # shadowed the module-level autoencoder model.
    return Model(input_caption, decoded_img, name="text2img")

In [None]:
# Assemble the caption -> image model from the current text encoder and the frozen decoder.
model_text2img = build_combined_model()

In [None]:
# Print the layer summary of the combined text-to-image model.
model_text2img.summary()

In [None]:
# Train against pixel-wise mean squared error; image_closeness is the custom metric defined earlier.
# NOTE(review): 'accuracy' on continuous pixel targets may not be meaningful — confirm it is wanted.
model_text2img.compile(optimizer='adam', loss='mse', metrics=[image_closeness, 'accuracy'])

In [None]:
# Reload a previously saved text-to-image checkpoint into `m` (separate from model_text2img).
m = load_model('./text2img-models-v3/text2img-3k-epochs.h5')

In [None]:
# Continue training the reloaded checkpoint `m` for a single epoch on the caption/image pairs.
history = m.fit(captions_X, imgs_y, epochs=1, batch_size=32, shuffle=True)
# plot_history(history)

In [None]:
# Train in rounds of 1000 epochs (i = 1, 2) and checkpoint the model after each round.
# NOTE(review): plot_history is defined in a later cell — it must have been run before this one.
for i in range (1, 3):
    history = model_text2img.fit(captions_X, imgs_y, epochs=1000, batch_size=32, shuffle=True)
    print(str(i) + 'k epochs~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    # Forward slashes, like every other path in this notebook: the previous
    # backslash-separated path only addressed a sub-directory on Windows.
    # The target directory must already exist.
    model_text2img.save('./text2img-models-v5/text2img-' + str(i) + 'k-epochs.h5')
    plot_history(history)
#     show_converted(4)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

In [None]:
# model_text2img = load_model('./text2img-models-v2/text2img-1k-epochs.h5')
# Print the layer summary of the current text-to-image model.
model_text2img.summary()

In [None]:
# for generating figures
# Saved checkpoint used by the figure loop that calls show_converted(model_text2img_t, ...).
model_text2img_t = load_model('./text2img-models-v4/text2img-2k-epochs.h5')

In [None]:
# Print every (lower-cased) caption that contains the search phrase.
search_phrase = 'woman on a surfboard'
for caption in filter(lambda text: search_phrase in text, caption_strings):
    print(caption)

In [None]:
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import textwrap
# figure(num=None, figsize=(10, 10), dpi=200, facecolor='w', edgecolor='k')
def rgb_imshow(img, name):
    """Draw a BGR image on the current matplotlib axes with an optional title.

    Axis ticks are hidden. A non-None ``name`` is wrapped at 20 characters and used
    as a small (fontsize 8) title; ``None`` is passed to ``plt.title`` unchanged.
    The image is cast to float32 and converted from BGR to RGB before display.
    Nothing is shown here; the caller decides when to call ``plt.show()``.
    """
    plt.xticks([])
    plt.yticks([])
    if name is None:
        plt.title(name)
    else:
        wrapped_lines = textwrap.wrap(name, 20)
        plt.title('\n'.join(wrapped_lines), fontsize=8)
    rgb_image = cv2.cvtColor(img.astype(np.float32), cv2.COLOR_BGR2RGB)
    plt.imshow(rgb_image)

def show_converted(model, caption):
    """Generate and display the image ``model`` produces for a caption string.

    Args:
        model: Object with a Keras-style ``predict`` method taking a batch of encoded captions.
        caption: Caption text; it is encoded with ``caption_encoding`` and also used
            as the figure title.
    """
    # Author's "vault" of sample indices, kept for reference:
    # 1604, 1610, 1628, 1642, 1748, 1771, 1772, 1860, 1926, 13018, 13145, 13189, 13435, 13517
    figure(num=None, dpi=300, facecolor='w', edgecolor='k')

    caption_input = caption_encoding(caption)
    # predict works on batches: wrap the single caption and unwrap the single result.
    decoded_img = model.predict(np.array([caption_input]), steps=None)[0]
    rgb_imshow(decoded_img, caption)

    plt.tight_layout()
#     plt.savefig(f'./testing-final-images/{caption}.png', bbox_inches='tight')
    plt.show()


# show_converted(2)
# show_converted(4)
# show_converted(18)
# show_converted(65)
# for i in range(65, 70):

# [68, 124, 47, 38]
# Images, encoded captions and caption strings for the four fixed sample indices.
testimgs = [imgs_y[i] for i in (68, 124, 47, 38)]
testcaptions = [captions_X[i] for i in (68, 124, 47, 38)]
teststrings = [caption_strings[i] for i in (68, 124, 47, 38)]

# Captions rendered, in this order, by the figure-generation model.
caps = [
    'A herd of animals walking across a dry grass field.',
    'A man riding skis on top of a snow covered slope.',
    'A man riding a wave on top of a surfboard',
    'A little kid playing second base at a baseball game.',
    'A young woman on a surfboard surfing on a wave',
    'A cat is sitting inside of a suitcase.',
    'A large propeller airplane flying through a blue sky.',
    'A man flying a kite above in a blue sky.',
    'A herd of animals walking across a dry grass field.',
]

for cap in caps:
    show_converted(model_text2img_t, cap)
# show_converted(3, model_text2img_u, caption=caption_encoding(caption_strings[img_num]))




# decoded_img1 = model_text2img.predict(np.array([captions_X[4]]))[0]
# decoded_img2 = model_text2img.predict(np.array([captions_X[5]]))[0]

# flat_img = np.array(decoded_img2 - decoded_img1)
# flat_img = flat_img.flatten()
# nonzero_elems = [x for x in flat_img if x != 0]
# print (len(decoded_img1.flatten()))
# print (len(nonzero_elems))
# print (np.max(flat_img))


In [None]:
def caption_encoding(string, max_length=27, embedding_dim=100):
    """Encode a caption as a zero-padded matrix of word vectors.

    The caption is lower-cased and tokenized, each token is looked up in
    ``w2v_model``, and the vectors are stacked row by row.

    Args:
        string: Caption text.
        max_length: Number of rows in the result; shorter captions are zero-padded,
            longer ones truncated. Defaults to 27, the previously hard-coded value.
        embedding_dim: Width of one word vector. Defaults to 100, the previously
            hard-coded value.

    Returns:
        np.ndarray of shape (max_length, embedding_dim).

    Raises:
        KeyError: If a token is not in the embedding vocabulary.
        ValueError: If the model's vectors are not ``embedding_dim`` wide.
    """
    # Index the KeyedVectors (`.wv`); model-level `w2v_model[word]` is the deprecated spelling.
    word_vectors = np.array([w2v_model.wv[word] for word in word_tokenize(string.lower())])
    # Copy into a fresh buffer rather than ndarray.resize, which would silently
    # re-flow the data if the vector width did not match embedding_dim.
    encoded = np.zeros((max_length, embedding_dim), dtype=word_vectors.dtype)
    n_tokens = min(len(word_vectors), max_length)
    if n_tokens:
        encoded[:n_tokens] = word_vectors[:n_tokens]
    return encoded

In [None]:
def plot_history(history):
    """Plot the accuracy and loss curves recorded by a Keras ``fit`` call.

    Args:
        history: Object whose ``history`` attribute maps metric names to per-epoch values.

    Raises:
        KeyError: If the history has neither an 'acc' nor an 'accuracy' entry, or no 'loss' entry.
    """
    metrics = history.history
    # Depending on the Keras version the 'accuracy' metric is recorded as 'acc' or 'accuracy'.
    accuracy_key = next((key for key in ('acc', 'accuracy') if key in metrics), None)
    if accuracy_key is None:
        raise KeyError("no 'acc' or 'accuracy' entry in history; available keys: "
                       + ', '.join(sorted(metrics)))
    plt.plot(metrics[accuracy_key])
    plt.title("Accuracy")
    plt.show()
    plt.plot(metrics['loss'])
    plt.title("Loss")
    plt.show()

In [None]:
# Shape of the image generated for the first caption (batch of one, first result).
np.shape(model_text2img.predict(np.array([captions_X[0]]))[0])

In [None]:
def pad_text(text):
    """Right-pad the second axis of a rank-2 tensor with 52 constant-mode entries.

    The first axis is left untouched.
    """
    right_pad = 52
    pad_spec = tf.constant([[0, 0], [0, right_pad]])
    return tf.pad(text, pad_spec, 'CONSTANT')

def build_discriminator(encode_channels=[8, 16, 32], dropout=False, residual=False, concat_captions=False):
    """Build the convolutional real/fake image classifier.

    Each entry of ``encode_channels`` adds one block of two 3x3 convolutions followed
    by a stride-2 convolution that halves the spatial size. The flattened features
    feed a single sigmoid unit.

    Args:
        encode_channels: Number of filters for each block, in order. The list is only
            iterated, never modified.
        dropout: If True, apply Dropout(0.2) after every convolution of a block.
        residual: If True, add a frozen 3x3 projection of the block input to the
            block's second convolution output.
        concat_captions: If True, the model also takes a (27, 100) caption input whose
            flattened values are concatenated to the image features.

    Returns:
        A Keras ``Model`` named 'Discriminator'. Its input is the (128, 128, 3) image,
        or ``[caption, image]`` when ``concat_captions`` is True.
    """
    if concat_captions:
        text_input = Input(shape=(27, 100))
    image_input = Input(shape=(128, 128, 3))

    if concat_captions:
        text_attn = Flatten()(text_input)

    # The first block reads the image; every later block reads the previous block's output.
    encoder_block = image_input
    for channel in encode_channels:
        if residual:
            shortcut = Conv2D(channel, 3, padding='same', trainable=False)(encoder_block)
        encoder_block = BatchNormalization()(encoder_block)
        encoder_block = LeakyReLU()(encoder_block)
        encoder_block = Conv2D(channel, 3, padding='same')(encoder_block)
        if dropout:
            encoder_block = Dropout(0.2)(encoder_block)

        encoder_block = BatchNormalization()(encoder_block)
        encoder_block = LeakyReLU()(encoder_block)
        encoder_block = Conv2D(channel, 3, padding='same')(encoder_block)
        if dropout:
            encoder_block = Dropout(0.2)(encoder_block)
        if residual:
            encoder_block = Add()([encoder_block, shortcut])

        # Strided convolution performs the downsampling.
        encoder_block = Conv2D(channel, 3, padding='same', strides=2)(encoder_block)
        encoder_block = LeakyReLU()(encoder_block)
        if dropout:
            encoder_block = Dropout(0.2)(encoder_block)

    encoder_block = Flatten()(encoder_block)
    if concat_captions:
        encoder_block = Concatenate()([encoder_block, text_attn])

    dense = Dense(1, activation='sigmoid')(encoder_block)

    if concat_captions:
        # The caption tensor is part of the graph, so it has to be declared as a model
        # input; building the model from image_input alone leaves the graph disconnected.
        return Model([text_input, image_input], dense, name='Discriminator')
    return Model(image_input, dense, name='Discriminator')

In [None]:
def build_gan():
    """Stack the generator and the discriminator into one adversarial model.

    Uses the module-level ``model_text2img`` (caption -> image) and ``discriminator``
    (image -> real/fake score) as they exist at call time.

    Returns:
        A Keras ``Model`` mapping a (27, 100) caption to
        ``[discriminator_output, generated_image]``.
    """
    input_caption = Input(shape=(27, 100))
    generated_image = model_text2img(input_caption)
    discriminator_output = discriminator(generated_image) #discriminator([input_caption, generated_image])

    # Two outputs so the model can be trained on the adversarial score and on the
    # generated image at the same time.
    gan = Model(input_caption, [discriminator_output, generated_image])
    return gan




In [None]:
# Build the generator, discriminator and combined adversarial model.
model_intermediate = build_intermediate_model((27, 100))
model_text2img = build_combined_model()
discriminator = build_discriminator()
# `Adam` is the optimizer's public class name; the lowercase `adam` is only a legacy alias.
d_adam = keras.optimizers.Adam(lr=0.00001, beta_1=0.0, beta_2=0.999)
discriminator.compile(optimizer=d_adam, loss='binary_crossentropy', metrics=['binary_accuracy'])
# The discriminator is compiled above while trainable, then frozen before it is
# embedded in the adversarial model, so adversarial updates only move the generator.
discriminator.trainable = False
adversarial_net = build_gan()
g_adam = keras.optimizers.Adam(lr=0.0001, beta_1=0.0, beta_2=0.999)
# Outputs are [discriminator score, generated image]: binary cross-entropy on the score
# (weight 1) plus mean squared error on the image (weight 10).
adversarial_net.compile(optimizer=g_adam, loss=['binary_crossentropy', 'mse'], loss_weights=[1, 10], metrics=['binary_accuracy'])

In [None]:
# Restrict captions, images and caption strings to the same first 128 samples.
captions_X = captions_X[:128]
imgs_y = imgs_y[:128]
caption_strings = caption_strings[:128]

In [None]:
from keras.utils.generic_utils import Progbar
from time import time

# Per-epoch means of the batch-level statistics recorded during training.
d_loss_means = []
d_acc_means = []
a_loss_means = []
t_loss_means = []

# custom adversarial training logic
BATCHSIZE = 32
EPOCHS = 1000
for epoch in range(EPOCHS):
    num_data_pts = len(captions_X)
    # Only full batches are used; a trailing partial batch is dropped.
    num_batches = num_data_pts // BATCHSIZE
    # One seeded permutation per epoch selects captions and images with the same
    # indices, so pairs stay aligned without reseeding the global NumPy RNG or
    # shuffling captions_X / imgs_y / caption_strings in place.
    epoch_order = np.random.RandomState(epoch).permutation(num_data_pts)

    d_hist_loss = []
    d_hist_acc = []
    a_hist_loss = []
    t_hist_loss = []
    print()
    print("epoch {} of {}".format(epoch+1, EPOCHS))
    progress_bar = Progbar(target=num_batches)

    for index in range(num_batches):
        batch_ids = epoch_order[BATCHSIZE*index:BATCHSIZE*(index+1)]
        images_real = imgs_y[batch_ids]
        captions_batch = captions_X[batch_ids]
        labels_real = np.ones([BATCHSIZE,1], dtype=np.float32)

        # Train the discriminator on every 4th batch of the epoch (batch index 0, 4, 8, ...).
        if index % 4 == 0:
            # Fake images are only needed for the discriminator step, so they are
            # generated here rather than for every batch.
            labels_fake = np.zeros([BATCHSIZE,1], dtype=np.float32)
            images_fake = model_text2img.predict(captions_batch)
            train_imgs = np.concatenate((images_real, images_fake))
            train_labels = np.concatenate((labels_real, labels_fake))
            # Mix real and fake samples, applying the same order to images and labels.
            mix = np.random.RandomState(index).permutation(len(train_labels))
            d_loss, d_acc = discriminator.train_on_batch(train_imgs[mix], train_labels[mix])
            d_hist_loss.append(d_loss)
            d_hist_acc.append(d_acc)

        # The generator is trained to make the discriminator answer "real" and to
        # reproduce the real image.
        # NOTE(review): the five returned values are assumed to be (total loss,
        # score loss, image mse, score accuracy, image accuracy) — confirm with
        # adversarial_net.metrics_names.
        a_loss_1, _, t_loss, _, t_acc = adversarial_net.train_on_batch(captions_batch, [labels_real, images_real])
        a_hist_loss.append(a_loss_1)
        t_hist_loss.append(t_loss)
        # Report completed batches so the bar reaches its target.
        progress_bar.update(index + 1)

    t_loss_means.append(np.mean(t_hist_loss))
    d_loss_means.append(np.mean(d_hist_loss))
    a_loss_means.append(np.mean(a_hist_loss))
    d_acc_means.append(np.mean(d_hist_acc))
    print(np.mean(d_hist_acc))



# adversarial_net.save_weights('.\\gan-v1\\gan-2k-epochs-weights.h5')
# model_text2img.save_weights('.\\gan-v1\\text2img-2k-epochs-weights.h5')

In [None]:
# One figure per recorded per-epoch statistic, in the order listed.
# To save a figure, call plt.savefig(...) before plt.show(); the earlier runs used
# './gan-v1/discriminator_loss-2k', './gan-v1/discriminator_acc-2k',
# './gan-v1/adversarial_loss-2k' and './gan-v1/text2img_loss-2k'.
for curve_title, curve_values in (
    ('Discriminator Loss', d_loss_means),
    ('Discriminator Accuracy', d_acc_means),
    ('Adversarial Loss', a_loss_means),
    ('Text To Image Loss', t_loss_means),
):
    plt.title(curve_title)
    plt.plot(curve_values)
    plt.show()

In [None]:


# For each selected sample, show the real image followed by the generated one.
vault = [68, 124, 47, 38]
for img_num in vault:
    print(img_num)
    rgb_imshow(imgs_y[img_num], caption_strings[img_num]) 
    plt.show()
    # show_converted expects (model, caption string); the previous call passed the
    # sample index as the model and the model as the caption.
    show_converted(model_text2img, caption_strings[img_num])


In [None]:

# Rebuild all models from scratch with the same architecture.
model_intermediate = build_intermediate_model((27, 100))
model_text2img = build_combined_model()
#model_text2img.compile(optimizer='adam', loss='mse', metrics=[image_closeness, 'accuracy'])
discriminator = build_discriminator()

# Fresh optimizers with the same hyperparameters as the first GAN setup: reusing the
# earlier d_adam / g_adam instances would carry their state over to the new models.
d_adam = keras.optimizers.Adam(lr=0.00001, beta_1=0.0, beta_2=0.999)
discriminator.compile(optimizer=d_adam, loss='binary_crossentropy', metrics=['binary_accuracy'])

# Freeze the discriminator inside the adversarial model only.
discriminator.trainable = False
adversarial_net = build_gan()
g_adam = keras.optimizers.Adam(lr=0.0001, beta_1=0.0, beta_2=0.999)
adversarial_net.compile(optimizer=g_adam, loss=['binary_crossentropy', 'mse'], loss_weights=[1, 10], metrics=['binary_accuracy'])




