# M2177.003100 Deep Learning <br> Final Project: Text-to-Image Synthesis 

Copyright (C) Data Science & AI Laboratory, Seoul National University. This material is for educational uses only. Some contents are based on the material provided by other paper/book authors and may be copyrighted by them. 

**For understanding of this work, please carefully look at given PDF file.**

**Note**: certain details are missing or ambiguous on purpose, in order to test your knowledge on the related materials. However, if you really feel that something essential is missing and cannot proceed to the next step, then contact the teaching staff with clear description of your problem.

## 1. Load datasets

In [1]:
import os
os.chdir('/home/jaehyun/Downloads/22Final/')
import torch
from torch.autograd import Variable
import torchvision.transforms as transforms

import numpy as np

from models import GENERATOR
from config import cfg, cfg_from_file
import pprint
import datetime
import dateutil.tz
import time
from PIL import Image

from encoders import CNN_ENCODER, RNN_ENCODER
from dataset import CUBDataset
from utils import *
from loss import *

  from .autonotebook import tqdm as notebook_tqdm
2022-12-19 15:38:47.324031: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-19 15:38:47.404230: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-19 15:38:47.737384: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda-11.3/lib64
2022-12-19 15:38:47.

In [2]:
# Set a config file as 'train_birds.yml' in training, as 'eval_birds.yml' for evaluation
cfg_from_file('cfg/eval_birds.yml') # eval_birds.yml

print('Using config:')
pprint.pprint(cfg)

os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU_ID

now = datetime.datetime.now(dateutil.tz.tzlocal())
timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
output_dir = 'sample/%s_%s_%s' % (cfg.DATASET_NAME, cfg.CONFIG_NAME, timestamp)

os.makedirs(output_dir, exist_ok=True)

Using config:
{'BATCH_SIZE': 10,
 'CA': False,
 'CHECKPOINT_DIR': './checkpoints',
 'CHECKPOINT_NAME': '',
 'CNN': {'EMBEDDING_DIM': 32, 'H_DIM': 256},
 'CONFIG_NAME': 'text-to-image',
 'CUDA': True,
 'DATASET_NAME': 'birds',
 'DATA_DIR': 'data/birds',
 'EMBEDDING_TYPE': 'cnn-rnn',
 'GAN': {'B_ATTENTION': False,
         'B_CONDITION': False,
         'B_DCGAN': False,
         'CONDITION_DIM': 100,
         'DF_DIM': 32,
         'EMBEDDING_DIM': 256,
         'GF_DIM': 64,
         'R_NUM': 2,
         'Z_DIM': 100},
 'GPU_ID': '0',
 'IMAGE_SIZE': 128,
 'NUM_BATCH_FOR_TEST': 4,
 'RANDOM_SEED': 1234,
 'RNN': {'EMBEDDING_DIM': 0,
         'H_DIM': 0,
         'TYPE': 'LSTM',
         'VOCAB_SIZE': 0,
         'WORD_EMBEDDING_DIM': 0},
 'R_PRECISION_DIR': '',
 'R_PRECISION_FILE': '',
 'R_PRECISION_FILE_HIDDEN': '',
 'TEST': {'B_EXAMPLE': False,
          'GENERATED_HIDDEN_TEST_IMAGES': '',
          'GENERATED_TEST_IMAGES': './evaluation/generated_images'},
 'TEXT': {'CAPTIONS_PER_IMAGE

In [3]:
imsize = cfg.TREE.BASE_SIZE * (4 ** (cfg.TREE.BRANCH_NUM - 1))
image_transform = transforms.Compose([
    transforms.Resize(int(imsize)),
    transforms.RandomCrop(imsize),
    transforms.RandomHorizontalFlip()])

#### Load the dataset
test_dataset = CUBDataset(cfg.DATA_DIR, transform=image_transform, split='test')

print(f'test data directory:\n{test_dataset.split_dir}\n')
print(f'# of test filenames:{test_dataset.filenames.shape}\n')
print(f'example of filename of valid image:{test_dataset.filenames[0]}\n')
print(f'example of caption and its ids:\n{test_dataset.captions[0]}\n{test_dataset.captions_ids[0]}\n')
print(f'# of test captions:{np.asarray(test_dataset.captions).shape}\n')
print(f'# of test caption ids:{np.asarray(test_dataset.captions_ids).shape}\n')

self.current_dir:
/home/jaehyun/Downloads/22Final

self.data_dir:
/home/jaehyun/Downloads/22Final/data/birds

self.image_dir:
/home/jaehyun/Downloads/22Final/data/birds/CUB-200-2011/images

filepath /home/jaehyun/Downloads/22Final/data/birds/captions.pickle
Load from:  /home/jaehyun/Downloads/22Final/data/birds/captions.pickle
test data directory:
/home/jaehyun/Downloads/22Final/data/birds/test

# of test filenames:(2933,)

example of filename of valid image:001.Black_footed_Albatross/Black_Footed_Albatross_0046_18

example of caption and its ids:
['this', 'is', 'a', 'small', 'bird', 'that', 'has', 'a', 'brilliant', 'blue', 'color', 'on', 'it', 's', 'body', 'a', 'slightly', 'darker', 'blue', 'on', 'it', 's', 'head', 'a', 'teal', 'color', 'on', 'it', 's', 'wings', 'and', 'a', 'light', 'colored', 'beak']
[18, 19, 1, 250, 2, 33, 13, 1, 853, 50, 37, 86, 53, 54, 15, 1, 178, 31, 50, 86, 53, 54, 25, 1, 1054, 37, 86, 53, 54, 17, 8, 1, 67, 89, 10]

# of test captions:(29330,)

# of test caption

  print(f'# of test captions:{np.asarray(test_dataset.captions).shape}\n')
  print(f'# of test caption ids:{np.asarray(test_dataset.captions_ids).shape}\n')


In [4]:
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=cfg.BATCH_SIZE, drop_last=True, shuffle=False, num_workers=int(cfg.WORKERS),
                                              collate_fn = my_collate_fn)

## 2. Define models and load weights

In [5]:
# load the text encoder model to generate images for evaluation
text_encoder = RNN_ENCODER(test_dataset.n_words, nhidden=cfg.TEXT.EMBEDDING_DIM)

#######################################################
# TODO
# (1) Build Generator and Network
######################################################
netG = GENERATOR()

state_dict = torch.load(os.path.join(cfg.CHECKPOINT_DIR, cfg.TRAIN.GENERATOR), map_location=lambda storage, loc: storage)
text_encoder.load_state_dict(state_dict['text_encoder'])
for p in text_encoder.parameters():
    p.requires_grad = False
netG.load_state_dict(state_dict['netG'])
for p in netG.parameters():
    p.requires_grad = False

print('Load model from:', os.path.join(cfg.CHECKPOINT_DIR, cfg.TRAIN.GENERATOR))
text_encoder.eval()
netG.eval()

bs = cfg.BATCH_SIZE
noise = Variable(torch.FloatTensor(bs, cfg.GAN.Z_DIM))

if cfg.CUDA:
    text_encoder = text_encoder.cuda()
    netG = netG.cuda()
    noise = noise.cuda()

IsADirectoryError: [Errno 21] Is a directory: './checkpoints/'

## 3. Generate Images and save them to evaluate metrics

In [None]:
for step, data in enumerate(test_dataloader, 0):
    _, captions, cap_lens, class_ids, keys, sent_idx = prepare_data(data)
    
    #######################################################
    # TODO
    # (2) Compute Text Embeddings using RNN Encoder
    ######################################################

    #######################################################
    # TODO
    # (3) Generate fake images using Generator
    ######################################################
    noise.data.normal_(0, 1)

    fake_imgs = None

    for j in range(bs):
        if not os.path.exists(os.path.join(cfg.TEST.GENERATED_TEST_IMAGES, keys[j].split('/')[0])):
            os.makedirs(os.path.join(cfg.TEST.GENERATED_TEST_IMAGES, keys[j].split('/')[0]))

        im = fake_imgs[j].data.cpu().numpy()
        im = (im + 1.0) * 127.5
        im = im.astype(np.uint8)
        im = np.transpose(im, (1, 2, 0))
        im = Image.fromarray(im)
        print(os.path.join(cfg.TEST.GENERATED_TEST_IMAGES, keys[j] + '_{}.png'.format(sent_idx[j])))
        im.save(os.path.join(cfg.TEST.GENERATED_TEST_IMAGES, keys[j] + '_{}.png'.format(sent_idx[j])))