# Text to Image Generation using DCGAN Architecture


In [2]:
# Necessary imports
import pandas as pd
import urllib.request
import imageio
import os
import numpy as np
import gensim

from urllib.request import urlopen

import tensorflow as tf
from tensorflow.keras.layers import Input, Reshape, Dropout, Dense, Concatenate 
from tensorflow.keras.layers import Flatten, BatchNormalization
from tensorflow.keras.layers import Activation, ZeroPadding2D
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import UpSampling2D, Conv2D, Conv2DTranspose
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras import initializers
from sklearn.metrics import mean_squared_error

import numpy as np
from PIL import Image
from tqdm import tqdm
import os 
import time
import matplotlib.pyplot as plt

In [None]:
# Generation resolution - Must be square 
# Training data is also scaled to this.
GENERATE_RES = 2 # Generation resolution factor 
# (1=32, 2=64, 3=96, 4=128, etc.)
GENERATE_SQUARE = 32 * GENERATE_RES # rows/cols (should be square)
IMAGE_CHANNELS = 3

# Preview image 
PREVIEW_ROWS = 4
PREVIEW_COLS = 7
PREVIEW_MARGIN = 16

# Size vector to generate images from
SEED_SIZE = 100
EMBEDDING_SIZE = 300

# Configuration
DATA_PATH = "./jpg"
MODEL_PATH = " "
EPOCHS = 50
BATCH_SIZE = 64
BUFFER_SIZE = 4000

print(f"Will generate {GENERATE_SQUARE}px square images.")

# Data Preprocessing


#### Image Embedding

In [3]:
# importing the model GoogleNews-vectors-negative300.bin using gensim for embedding

model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)


In [7]:
# This section of code is for image embedding
# Image set has 8,188 images, the image embeddings are saved as a numpy file.


embedding_file = os.path.join('./training_data_/',
        f'image_embedding') 
start = time.time()
print("Loading training images...")

training_data = []
# flowers_path = sorted(os.listdir(DATA_PATH))

flowers_path = sorted(os.listdir('./flower_images/'))

for filename in range(len(flowers_path)):
    path = os.path.join('./flower_images/',flowers_path[filename])
    # print(path)
    try:
      image = Image.open(path).resize((64,64),Image.ANTIALIAS) # reducing the image size into 64px
      channel = np.asarray(image).shape[2]
      if channel == 3:
        training_data.append(np.asarray(image))
    except:
      pass
training_data = np.reshape(training_data,(-1,64,64,3))     #reshaping numpy array into (64,64,3)
training_data = training_data.astype(np.float32)
     
training_data = training_data / 127.5 - 1.            #Normalizing the input

print("Image embedding finished and saving...")
np.save(embedding_file + ".npy",training_data)

print (f'Time taken to complete embedding: {time.time()-start}')


Loading training images...


  image = Image.open(path).resize((64,64),Image.ANTIALIAS) # reducing the image size into 64px


Image embedding finished and saving...
Time taken to complete embedding: 41.230995178222656


### Text Embedding


In [9]:
# This section of code is for text embedding

text_path = "./text_c10/captions"
text_files = sorted(os.listdir(text_path))
captions = []
caption_embeddings = np.zeros((len(text_files),300),dtype=np.float32)
for filename in range(len(text_files)):
    path = os.path.join(text_path,text_files[filename])
    # print(path)
    f = open(path,'r')
    data = f.read()
    data = data.split("\n")
    f.close()
    for d in range(1):
      x = data[d].lower()
      x = x.replace("  ","")
      # x = x.replace("'","")
      captions.append(x)
      count = 0
      for t in x:
        try:
          caption_embeddings[filename] += model[t]
          count += 1
        except:
          pass
      caption_embeddings[filename] /= count
np.save('./training_data_/flowers_text_embedding.npy',caption_embeddings)
print("text embedding completed and saved")

text embedding completed and saved


In [21]:
# The captions are stored as a csv file for later usage
df_captions = pd.DataFrame([])
df_captions['captions'] = captions
df_captions.to_csv("./text_c10/flowercaptions.csv",index=None)
df_captions.head()

Unnamed: 0,captions
0,"prominent purple stigma,petals are white inc olor"
1,"this flower is blue and green in color, with p..."
2,"outer petals are green in color and klarger,in..."
3,"there are several shapes, sizes, and colors of..."
4,the stamen are towering over the stigma which ...


### Loading the Data

In [25]:
# Embedded files are loaded

caption_embeddings = np.load('./training_data_/flowers_text_embedding.npy')[:8128]
image_embeddings = np.load('./training_data_/image_embedding.npy')[:8128]
save_images_embeddings = np.copy(caption_embeddings[:28])
save_images_npy = np.copy(image_embeddings[:28])

In [None]:
# preparing the data as batches 

train_dataset = tf.data.Dataset.from_tensor_slices({'images': image_embeddings.astype('float32'),
                                                    'embeddings': caption_embeddings.astype('float32')}).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

## Defining Generator and Discriminator

In [30]:

def build_generator_func(seed_size,embedding_size, channels):
  input_seed = Input(shape=seed_size)
  input_embed = Input(shape = embedding_size)
  d0 = Dense(128)(input_embed)
  leaky0 = LeakyReLU(alpha=0.2)(d0)

  merge = Concatenate()([input_seed, leaky0])

  d1 = Dense(4*4*256,activation="relu")(merge)
  reshape = Reshape((4,4,256))(d1)

  upSamp1 = UpSampling2D()(reshape)
  conv2d1 = Conv2DTranspose(256,kernel_size=5,padding="same",kernel_initializer=initializers.RandomNormal(stddev=0.02))(upSamp1)
  batchNorm1 = BatchNormalization(momentum=0.8)(conv2d1)
  leaky1 = LeakyReLU(alpha=0.2)(batchNorm1)

  upSamp2 = UpSampling2D()(leaky1)
  conv2d2 = Conv2DTranspose(256,kernel_size=5,padding="same",kernel_initializer=initializers.RandomNormal(stddev=0.02))(upSamp2)
  batchNorm2 = BatchNormalization(momentum=0.8)(conv2d2)
  leaky2 = LeakyReLU(alpha=0.2)(batchNorm2)

  upSamp3 = UpSampling2D()(leaky2)
  conv2d3 = Conv2DTranspose(128,kernel_size=4,padding="same",kernel_initializer=initializers.RandomNormal(stddev=0.02))(upSamp3)
  batchNorm3 = BatchNormalization(momentum=0.8)(conv2d3)
  leaky3 = LeakyReLU(alpha=0.2)(batchNorm3)

  upSamp4 = UpSampling2D(size=(GENERATE_RES,GENERATE_RES))(leaky3)
  conv2d4 = Conv2DTranspose(128,kernel_size=4,padding="same",kernel_initializer=initializers.RandomNormal(stddev=0.02))(upSamp4)
  batchNorm4 = BatchNormalization(momentum=0.8)(conv2d4)
  leaky4 = LeakyReLU(alpha=0.2)(batchNorm4)

  outputConv = Conv2DTranspose(channels,kernel_size=3,padding="same",kernel_initializer=initializers.RandomNormal(stddev=0.02))(leaky4)
  outputActi = Activation("tanh")(outputConv)          # Activation function tanh() is used

  model = Model(inputs=[input_seed,input_embed], outputs=outputActi)
  return model

def build_discriminator_func(image_shape, embedding_size):
  input_shape = Input(shape=image_shape)
  input_embed = Input(shape=embedding_size)

  conv2d1 = Conv2D(32,kernel_size=4,strides=2,input_shape=image_shape,padding="same",kernel_initializer=initializers.RandomNormal(stddev=0.02))(input_shape)
  leaky1 = LeakyReLU(alpha=0.2)(conv2d1)

  drop2 = Dropout(0.25)(leaky1)
  conv2d2 = Conv2D(64, kernel_size=4, strides=2, padding="same",kernel_initializer=initializers.RandomNormal(stddev=0.02))(drop2)
  batchNorm2 = BatchNormalization(momentum=0.8)(conv2d2)
  leaky2 = LeakyReLU(alpha=0.2)(batchNorm2)

  drop3 = Dropout(0.25)(leaky2)
  conv2d3 = Conv2D(128, kernel_size=4, strides=2, padding="same",kernel_initializer=initializers.RandomNormal(stddev=0.02))(drop3)
  batchNorm3 = BatchNormalization(momentum=0.8)(conv2d3)
  leaky3 = LeakyReLU(alpha=0.2)(batchNorm3)

  drop4 = Dropout(0.25)(leaky3)
  conv2d4 = Conv2D(256, kernel_size=4, strides=2, padding="same",kernel_initializer=initializers.RandomNormal(stddev=0.02))(drop4)
  batchNorm4 = BatchNormalization(momentum=0.8)(conv2d4)
  leaky4 = LeakyReLU(alpha=0.2)(batchNorm4)

  dense_embed = Dense(128,kernel_initializer=initializers.RandomNormal(stddev=0.02))(input_embed)
  leaky_embed = LeakyReLU(alpha=0.2)(dense_embed)
  reshape_embed = Reshape((4,4,8))(leaky_embed)
  merge_embed = Concatenate()([leaky4, reshape_embed])

  drop5 = Dropout(0.25)(merge_embed)
  conv2d5 = Conv2D(512, kernel_size=4,kernel_initializer=initializers.RandomNormal(stddev=0.02))(drop5)
  batchNorm5 = BatchNormalization(momentum=0.8)(conv2d5)
  leaky5 = LeakyReLU(alpha=0.2)(batchNorm5)

  drop6 = Dropout(0.25)(leaky5)
  flatten = Flatten()(drop6)
  output = Dense(1,activation="sigmoid")(flatten)              # Activation function sigmoid is used

  model = Model(inputs=[input_shape,input_embed], outputs=output)
  return model

In [35]:
# Initlializing a generator and discriminator

image_shape = (GENERATE_SQUARE,GENERATE_SQUARE,IMAGE_CHANNELS)
generator = build_generator_func(SEED_SIZE,EMBEDDING_SIZE, IMAGE_CHANNELS)
discriminator = build_discriminator_func(image_shape,EMBEDDING_SIZE)

generator_optimizer = tf.keras.optimizers.Adam(learning_rate=2.0e-4,beta_1 = 0.5)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=2.0e-4,beta_1 = 0.5)

# generator.load_weights("/content/drive/Shareddrives/D4NLP Project/flowers data/flowers/model/text_to_image_generator_cub_character.h5")

## Training

In [None]:
# Function for finding the Inception Score
# This section of code is adapted from https://machinelearningmastery.com/how-to-implement-the-inception-score-from-scratch-for-evaluating-generated-images/

from math import floor
from numpy import ones
from numpy import expand_dims
from numpy import log
from numpy import mean
from numpy import std
from numpy import exp
from numpy.random import shuffle
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input
from keras.datasets import cifar10
from skimage.transform import resize
from numpy import asarray

# scale an array of images to a new size
def scale_images(images, new_shape):
	images_list = list()
	for image in images:
		# resize with nearest neighbor interpolation
		new_image = resize(image, new_shape, 0)
		# store
		images_list.append(new_image)
	return asarray(images_list)

# assumes images have any shape and pixels in [0,255]
def calculate_inception_score(images, n_split=10, eps=1E-16):
	# load inception v3 model
	model = InceptionV3()
	# enumerate splits of images/predictions
	scores = list()
	n_part = floor(images.shape[0] / n_split)
	for i in range(n_split):
		# retrieve images
		ix_start, ix_end = i * n_part, (i+1) * n_part
		subset = images[ix_start:ix_end]
		# convert from uint8 to float32
		subset = subset.astype('float32')
		# scale images to the required size
		subset = scale_images(subset, (299,299,3))
		# pre-process images, scale to [-1,1]
		subset = preprocess_input(subset)
		# predict p(y|x)
		p_yx = model.predict(subset)
		# calculate p(y)
		p_y = expand_dims(p_yx.mean(axis=0), 0)
		# calculate KL divergence using log probabilities
		kl_d = p_yx * (log(p_yx + eps) - log(p_y + eps))
		# sum over classes
		sum_kl_d = kl_d.sum(axis=1)
		# average over images
		avg_kl_d = mean(sum_kl_d)
		# undo the log
		is_score = exp(avg_kl_d)
		# store
		scores.append(is_score)
	# average across images
	is_avg, is_std = mean(scores), std(scores)
	return is_avg, is_std

In [49]:
# function for saving the image at each epochs while training

# inception_score = []
# steps = []

def save_images(cnt,noise,embeds,testing = False):
  
  image_array = np.full(( 
      
       + (PREVIEW_ROWS * (GENERATE_SQUARE+PREVIEW_MARGIN)), 
      PREVIEW_MARGIN + (PREVIEW_COLS * (GENERATE_SQUARE+PREVIEW_MARGIN)), 3), 
      255, dtype=np.uint8)
  if not testing:                                         # saving fake images while training
    generated_images = generator.predict((noise,embeds))
    generated_images = 0.5 * generated_images + 0.5       # De-normalising the images
    
    output_path = "./fake"
    filename = os.path.join(output_path,f"epoch-{cnt}.png")
    
    
    #################################################################################################
    # This part of the cell is to plot the inception score : ( Contributed myself)

    # is_avg, is_std = calculate_inception_score(generated_images)
    # print('score', is_avg, is_std)
    # inception_score.append(is_avg)
    # steps.append(cnt+1)
    # plt.plot(steps, inception_score)
    # plt.xlabel('epochs')
    # plt.ylabel('Inception Score')
    # plt.title('Inception Score Vs Epochs')
    # graphpath = os.path.join("./IS graphs",f"IS.png")
    # plt.savefig(graphpath)
    # plt.clf()
    #################################################################################################
  
  
  else:                                                       # saving test images
    output_path = "predictions"
    filename = os.path.join(output_path,f"fake-{cnt}.png")
  
  image_count = 0
  for row in range(PREVIEW_ROWS):
      for col in range(PREVIEW_COLS):
        r = row * (GENERATE_SQUARE+16) + PREVIEW_MARGIN
        c = col * (GENERATE_SQUARE+16) + PREVIEW_MARGIN
        image_array[r:r+GENERATE_SQUARE,c:c+GENERATE_SQUARE] = generated_images[image_count] * 255
        image_count += 1

  if not os.path.exists(output_path):
    os.makedirs(output_path)
  im = Image.fromarray(image_array)
  im.save(filename)

In [34]:
# Function for calculating losses
cross_entropy = tf.keras.losses.BinaryCrossentropy()

def discriminator_loss(real_image_real_text, fake_image_real_text, real_image_fake_text):
    real_loss = cross_entropy(tf.random.uniform(real_image_real_text.shape,0.8,1.0), real_image_real_text)
    fake_loss = (cross_entropy(tf.random.uniform(fake_image_real_text.shape,0.0,0.2), fake_image_real_text) + 
                 cross_entropy(tf.random.uniform(real_image_fake_text.shape,0.0,0.2), real_image_fake_text))/2

    total_loss = real_loss + fake_loss
    return total_loss

def generator_loss(fake_output):
    return cross_entropy(tf.ones_like(fake_output), fake_output)

In [36]:
#@tf.function for training the models

@tf.function
def train_step(images,captions,fake_captions):
  seed = tf.random.normal([BATCH_SIZE, SEED_SIZE],dtype=tf.float32)

  with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
    generated_images = generator((seed,captions), training=True)
    real_image_real_text = discriminator((images,captions), training=True)
    real_image_fake_text = discriminator((images,fake_captions), training=True)
    fake_image_real_text = discriminator((generated_images,captions), training=True)

    gen_loss = generator_loss(fake_image_real_text)
    disc_loss = discriminator_loss(real_image_real_text, fake_image_real_text, real_image_fake_text)
    # print(gen_loss)
    # print(disc_loss)

    gradients_of_generator = gen_tape.gradient(\
        gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(\
        disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(
        gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(
        gradients_of_discriminator, 
        discriminator.trainable_variables))
  return gen_loss,disc_loss

In [45]:
# The training process is called by this function

# g_loss_list,d_loss_list =[],[]
def train(train_dataset, epochs):
  fixed_seed = np.random.normal(0, 1, (PREVIEW_ROWS * PREVIEW_COLS, 
                                       SEED_SIZE))
  fixed_embed = save_images_embeddings

  start = time.time()

  for epoch in range(epochs):
    print("epoch start...")
    epoch_start = time.time()

    gen_loss_list = []
    disc_loss_list = []

    for batch in train_dataset[:-1]:
      # train_batch = training_data[BATCH_SIZE*image_batch : BATCH_SIZE*image_batch + BATCH_SIZE]
      # caption_batch = captions[BATCH_SIZE*image_batch : BATCH_SIZE*image_batch + BATCH_SIZE]
      train_batch = batch['images']
      caption_batch = batch['embeddings']
      
      fake_caption_batch = np.copy(caption_batch)
      np.random.shuffle(fake_caption_batch)
      
      t = train_step(train_batch,caption_batch,fake_caption_batch)
      # print(t)
      gen_loss_list.append(t[0])
      disc_loss_list.append(t[1])
      # if image_batch%50 == 0:
      #   print(image_batch)
      # print("here")
    print("now")
    g_loss = sum(gen_loss_list) / len(gen_loss_list)
    d_loss = sum(disc_loss_list) / len(disc_loss_list)

    # g_loss_list.append(g_loss)
    # d_loss_list.append(d_loss)

    save_images(epoch,fixed_seed,fixed_embed)

    #################################################################################################
    # This part of the code is to plot the loss graph (my contribution)
    
    # plt.plot(steps, g_loss_list,label = 'Generator loss')
    # plt.plot(steps,d_loss_list, label = 'Discriminator loss')
    # plt.xlabel('Epochs')
    # plt.ylabel('Avg Loss')
    # plt.title('Loss Vs Epochs')
    # plt.legend()
    # losspath = os.path.join("./loss graphs",f"loss.png")
    # plt.savefig(losspath)
    # plt.clf()
    #################################################################################################
    
    generator.compile(optimizer='adam', loss='binary_crossentropy')
    discriminator.compile(optimizer='adam', loss='binary_crossentropy')
    generator.save(os.path.join('./models',"flower_gen.h5"))
    discriminator.save(os.path.join('./models',"flower_disc.h5"))
    print("model saved")
    print(f'Epoch {epoch+1}, gen loss={g_loss},disc loss={d_loss}, {time.time()-epoch_start}')

  
  print ('Total Training time:', time.time()-start)


train(list(train_dataset.as_numpy_iterator()), 500)      # loading the dataset to train function

# Testing

In [None]:
# This is a function for calculating the inception score
# This section of code is adapted from https://machinelearningmastery.com/how-to-implement-the-frechet-inception-distance-fid-from-scratch/

from numpy import iscomplexobj
from scipy.linalg import sqrtm
from numpy import trace
from numpy import cov



fid_model = InceptionV3(include_top=False, pooling='avg', input_shape=(299,299,3))
# scale an array of images to a new size
def scale_images(images, new_shape):
	images_list = list()
	for image in images:
		# resize with nearest neighbor interpolation
		new_image = resize(image, new_shape, 0)
		# store
		images_list.append(new_image)
	return asarray(images_list)
 
# calculate frechet inception distance
def calculate_fid(model, images1, images2):
  
    images1 = images1.astype('float32')
    images2 = images2.astype('float32')
    # resize images
    images1 = scale_images(images1, (299,299,3))
    images2 = scale_images(images2, (299,299,3))
    print('Scaled', images1.shape, images2.shape)
    # pre-process images
    images1 = preprocess_input(images1)
    images2 = preprocess_input(images2)

    # calculate activations
    act1 = model.predict(images1)
    act2 = model.predict(images2)
    # calculate mean and covariance statistics
    mu1, sigma1 = act1.mean(axis=0), cov(act1, rowvar=False)
    mu2, sigma2 = act2.mean(axis=0), cov(act2, rowvar=False)
    # calculate sum squared difference between means
    ssdiff = np.sum((mu1 - mu2)**2.0)
    # calculate sqrt of product between cov
    covmean = sqrtm(sigma1.dot(sigma2))
    # check and correct imaginary numbers from sqrt
    if iscomplexobj(covmean):
      covmean = covmean.real
    # calculate score
    fid = ssdiff + trace(sigma1 + sigma2 - 2.0 * covmean)
    return fid

In [54]:
# Loading the pretrained generator model
gen_model = tf.keras.models.load_model('./models/flower_gen.h5')

In [50]:
# This is a function to test the model with new inputs

def test_image(text,num):
  test_embeddings = np.zeros((1,300),dtype=np.float32)

  x = text.lower()
  x = x.replace("  ","")
  count = 0
  for t in x:
    try:
      test_embeddings[0] += model[t]
      count += 1
    except:
      print(t)
      pass
  test_embeddings[0] /= count
  test_embeddings =  np.repeat(test_embeddings,28,axis=0)
  noise = tf.random.normal([28, 100])
  save_images(num,noise,test_embeddings)

In [None]:
test_image("this flower is yellow in color",600)
# test_image("this flower is very much yellow in color",305)
# test_image("this flower is purple in color",306)
# test_image("this flower has clusters of orange and red petals surrounding brown-tinted stamen",309)

In [None]:
# Displaying the predicted image
import IPython
IPython.display.Image('./predictions/fake-600.png')