In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals


import torch.utils.data as data
from PIL import Image
import PIL
import os
import os.path
import pickle
import random
import numpy as np
import pandas as pd
import torchvision.transforms as transforms

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
from torch.optim.lr_scheduler import StepLR
from torch.autograd import Variable

from datetime import datetime
import random


In [2]:
class TextDataset(data.Dataset):
    def __init__(self, data_dir, data_dir2, split='train', embedding_type='skip-thought',
                 imsize=64, transform=None, target_transform=None):

        self.transform = transform
        self.target_transform = target_transform
        self.imsize = imsize
        self.data = []
        self.data_dir = data_dir
        self.bbox = self.load_bbox()
        split_dir = os.path.join(data_dir2, split)

        self.filenames = self.load_filenames(split_dir)
        self.embeddings = self.load_embedding(split_dir, embedding_type)
        self.captions = self.load_all_captions()

    def get_img(self, img_path, bbox):
        img = Image.open(img_path).convert('RGB')
        width, height = img.size
        if bbox is not None:
            R = int(np.maximum(bbox[2], bbox[3]) * 0.75)
            center_x = int((2 * bbox[0] + bbox[2]) / 2)
            center_y = int((2 * bbox[1] + bbox[3]) / 2)
            y1 = np.maximum(0, center_y - R)
            y2 = np.minimum(height, center_y + R)
            x1 = np.maximum(0, center_x - R)
            x2 = np.minimum(width, center_x + R)
            img = img.crop([x1, y1, x2, y2])
        load_size = int(self.imsize * 76 / 64)
        img = img.resize((load_size, load_size), PIL.Image.Resampling.BILINEAR)
        if self.transform is not None:
            img = self.transform(img)
        return img

    def load_bbox(self):
        data_dir = self.data_dir
        bbox_path = os.path.join(data_dir, 'CUB_200_2011/bounding_boxes.txt')
        df_bounding_boxes = pd.read_csv(bbox_path,
                                        delim_whitespace=True,
                                        header=None).astype(int)
        #
        filepath = os.path.join(data_dir, 'CUB_200_2011/images.txt')
        df_filenames = \
            pd.read_csv(filepath, delim_whitespace=True, header=None)
        filenames = df_filenames[1].tolist()
        print('Total filenames: ', len(filenames), filenames[0])
        #
        filename_bbox = {img_file[:-4]: [] for img_file in filenames}
        numImgs = len(filenames)
        for i in range(0, numImgs):
            # bbox = [x-left, y-top, width, height]
            bbox = df_bounding_boxes.iloc[i][1:].tolist()

            key = filenames[i][:-4]
            if "_rgb" in key:
                key = key.replace("_rgb", "" ).strip()
            filename_bbox[key] = bbox
        #
        return filename_bbox

    def load_all_captions(self):
        caption_dict = {}
        for key in self.filenames:
            if "_rgb" in key:
                key = key.replace("_rgb", "" ).strip()
            caption_name = '%s/cvpr2016_cub/text_c10/%s.txt' % (self.data_dir, key)
            captions = self.load_captions(caption_name)
            caption_dict[key] = captions
        return caption_dict

    def load_captions(self, caption_name):
        cap_path = caption_name
        with open(cap_path, "r") as f:
            captions = f.read().split('\n')
        captions = [cap.replace("\ufffd\ufffd", " ")
                    for cap in captions if len(cap) > 0]
        return captions

    def load_embedding(self, data_dir, embedding_type):
        if embedding_type == 'cnn-rnn':
            embedding_filename = '/char-CNN-RNN-embeddings.pickle'
        elif embedding_type == 'cnn-gru':
            embedding_filename = '/char-CNN-GRU-embeddings.pickle'
        elif embedding_type == 'skip-thought':
            embedding_filename = '/skip-thought-embeddings.pickle'

        with open(data_dir + embedding_filename, 'rb') as f:
            embeddings = pickle.load(f)
            embeddings = np.array(embeddings)
            # embedding_shape = [embeddings.shape[-1]]
            print('embeddings: ', embeddings.shape)
        return embeddings

    def load_filenames(self, data_dir):
        filepath = os.path.join(data_dir, 'filenames.pickle')
        with open(filepath, 'rb') as f:
            filenames = pickle.load(f)
        print('Load filenames from: %s (%d)' % (filepath, len(filenames)))
        return filenames

    def __getitem__(self, index):
        key = self.filenames[index]
        if "_rgb" in key:
            key = key.replace("_rgb", "" ).strip()
        if self.bbox is not None:
            bbox = self.bbox[key]
            data_dir = '%s/CUB_200_2011' % self.data_dir
        else:
            bbox = None
            data_dir = self.data_dir

        captions = self.captions[key]
        embeddings = self.embeddings[index, :, :]
        img_name = '%s/images/%s.jpg' % (data_dir, key)
        img = self.get_img(img_name, bbox)

        rand_ix = random.randint(0, embeddings.shape[0]-1)
        embedding = embeddings[rand_ix, :]
        if self.target_transform is not None:
            embedding = self.target_transform(embedding)
        return img, embedding,captions[rand_ix]

    def __len__(self):
        return len(self.filenames)

In [3]:
ngpu = 1
nz = 100
ngf = 64
ndf = 64
nc = 3
nt = 256
nte = 4800

lr = 0.0002
niter = 400
workers = 2
beta1 = 0.5
imageSize = 64
BATCH_SIZE = 32

working_dir = "/kaggle/working/"
snapshots = os.path.join(working_dir, 'snapshots')
models_path = os.path.join(working_dir, 'models')


In [4]:
image_transform = transforms.Compose([
    transforms.RandomCrop(imageSize),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0, 0, 0), (1, 1, 1))
])

In [5]:
dataroot = "/kaggle/input/cub2002011"
dataroot2 = "/kaggle/input/birdsdata"

In [6]:
if not os.path.exists(training_binary_path):
    os.makedirs(training_binary_path)

if not os.path.exists(logs_files):
    os.makedirs(logs_files)

if not os.path.exists(snapshots):
    os.makedirs(snapshots)

if not os.path.exists(models_path):
    os.makedirs(models_path)

In [7]:
manualSeed = 42
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
torch.manual_seed(manualSeed)

Random Seed:  42


<torch._C.Generator at 0x7b872fa0a7d0>

In [8]:
cuda = True
cudnn.benchmark = True

if torch.cuda.is_available() and not cuda:
    print(
    "WARNING: You have a CUDA device, so you should probably run with --cuda"
    )

In [9]:
# custom weights initialization called on netG and netD
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

In [10]:
class _netG(nn.Module):
    def __init__(self, ngpu, nz, ngf, nc, nte, nt):
        super(_netG, self).__init__()
        self.nt = nt
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d(nz + nt, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            # nn.ReLU(True),
            # state size. (ngf*8) x 4 x 4

            nn.Conv2d(ngf*8,ngf*2,1,1),
            nn.Dropout2d(inplace=True),            
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # nn.SELU(True),

            nn.Conv2d(ngf*2,ngf*2,3,1,1),
            nn.Dropout2d(inplace=True),            
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # nn.SELU(True),

            nn.Conv2d(ngf*2,ngf*8,3,1,1),
            nn.Dropout2d(inplace=True),            
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(inplace=True),
            # nn.SELU(True),


            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),   
            nn.BatchNorm2d(ngf * 4),
            # nn.ReLU(True),
            # state size. (ngf*4) x 8 x 8
            
            nn.Conv2d(ngf*4,ngf,1,1),
            nn.Dropout2d(inplace=True),            
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # nn.SELU(True),

            nn.Conv2d(ngf,ngf,3,1,1),
            nn.Dropout2d(inplace=True),            
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # nn.SELU(True),

            nn.Conv2d(ngf,ngf*4,3,1,1),
            nn.Dropout2d(inplace=True),            
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # nn.SELU(True),            
            
            nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # nn.SELU(True),
            
            # state size. (ngf*2) x 16 x 16
            nn.ConvTranspose2d(ngf * 2,     ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # nn.SELU(True),

            # state size. (ngf) x 32 x 32
            nn.ConvTranspose2d(    ngf,      nc, 4, 2, 1, bias=False),
            nn.Tanh()
            # state size. (nc) x 64 x 64
        )

        self.encode_text = nn.Sequential(
            nn.Linear(nte, nt), nn.LeakyReLU(0.2, inplace=True))

    def forward(self, input, text_embedding):
        if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
            encoded_text = nn.parallel.data_parallel(self.encode_text, text_embedding, )
            input_new = torch.cat((input, encoded_text))
            output = nn.parallel.data_parallel(self.main,input_new, range(self.ngpu))
        else:
            encoded_text = self.encode_text(text_embedding).view(-1,self.nt,1,1)
            output = self.main(torch.cat((input, encoded_text), 1))
        return output


In [11]:
class _netD(nn.Module):
    def __init__(self, ngpu, nc, ndf, nte, nt):
        super(_netD, self).__init__()
        self.ngpu = ngpu
        self.nt = nt
        self.nte = nte
        self.main = nn.Sequential(
            # input is (nc) x 64 x 64
            nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf) x 32 x 32
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*2) x 16 x 16
            nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*4) x 8 x 8
            nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 8),

            nn.Conv2d(ndf*8,ndf*2,1,1),
            # nn.Dropout2d(inplace=True),            
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(ndf*2,ndf*2,3,1,1),
            # nn.Dropout2d(inplace=True),            
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(ndf*2,ndf*8,3,1,1),
            # nn.Dropout2d(inplace=True),            
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True))

        # state size. (ndf*8) x 4 x 4

        self.encode_text = nn.Sequential(
            nn.Linear(nte, nt),
            nn.LeakyReLU(0.2, inplace=True)

        )

        self.concat_image_n_text = nn.Sequential(
            nn.Conv2d(ndf * 8 + nt, ndf * 8, 1, 1, 0, bias=False), ## TODO: Might want to change the kernel size and stride
            nn.BatchNorm2d(ndf*8),
            nn.LeakyReLU(0.2,inplace=True),
            nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
            nn.Sigmoid()
        )

    def forward(self, input, text_embedding):
        if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
            encoded_img = nn.parallel.data_parallel(self.main, input,
                                               range(self.ngpu))
            encoded_text = nn.parallel.data_parallel(self.encode_text, text_embedding, range(self.ngpu))

        else:
            encoded_img = self.main(input)
            encoded_text = self.encode_text(text_embedding)
            encoded_text = encoded_text.view(-1, self.nt, 1,1)
            encoded_text = encoded_text.repeat(1, 1, 4, 4) ## can also directly expand, look into the syntax
            output = self.concat_image_n_text(torch.cat((encoded_img, encoded_text),1))

        return output.view(-1, 1).squeeze(1)

In [12]:
saved_gen = '/kaggle/input/birdsdata/models_v2/models/netG_epoch_600.pth'
saved_dis = '/kaggle/input/birdsdata/models_v2/models/netD_epoch_600.pth'

In [13]:
netG = _netG(ngpu, nz, ngf, nc, nte, nt)
netG.apply(weights_init)
netG.load_state_dict(torch.load(saved_gen))
print(netG)

_netG(
  (main): Sequential(
    (0): ConvTranspose2d(356, 512, kernel_size=(4, 4), stride=(1, 1), bias=False)
    (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
    (3): Dropout2d(p=0.5, inplace=True)
    (4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): Dropout2d(p=0.5, inplace=True)
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): Dropout2d(p=0.5, inplace=True)
    (12): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU(inplace=True)
    (14): ConvTranspose2d(512, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (15

In [14]:
netD = _netD(ngpu, nc, ndf, nte, nt)
netD.apply(weights_init)
netD.load_state_dict(torch.load(saved_dis))
print(netD)

_netD(
  (main): Sequential(
    (0): Conv2d(3, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): Conv2d(64, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): LeakyReLU(negative_slope=0.2, inplace=True)
    (5): Conv2d(128, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (6): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): LeakyReLU(negative_slope=0.2, inplace=True)
    (8): Conv2d(256, 512, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (9): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): LeakyReLU(negative_slope=0.

In [15]:
criterion = nn.BCELoss()

input = torch.FloatTensor(BATCH_SIZE, 3, imageSize, imageSize)
noise = torch.FloatTensor(BATCH_SIZE, nz, 1, 1)
fixed_noise = torch.FloatTensor(BATCH_SIZE, nz, 1, 1).normal_(0, 1)
label = torch.FloatTensor(BATCH_SIZE)
real_label = 1
fake_label = 0

In [16]:
if cuda:
    netD.cuda()
    netG.cuda()
    criterion.cuda()
    input, label = input.cuda(), label.cuda()
    noise, fixed_noise = noise.cuda(), fixed_noise.cuda()

fixed_noise = Variable(fixed_noise)

# Train

In [6]:
train_dataset = TextDataset(dataroot, dataroot2, transform=image_transform)

Total filenames:  11788 001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg
Load filenames from: /kaggle/input/birdsdata/train/filenames.pickle (8251)
embeddings:  (8251, 10, 4800)


In [7]:
train_dataloader = torch.utils.data.DataLoader(
                    train_dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    num_workers=int(workers))

In [20]:
optimizerD = optim.Adam(
  netD.parameters(), lr=lr, betas=(beta1, 0.999))
optimizerG = optim.Adam(
  netG.parameters(), lr=lr, betas=(beta1, 0.999))

In [21]:
import csv

header = ["epoch", "g_loss", "d_loss", "d_loss_fake", "d_loss_real"]
with open(f"{snapshots}/logs.csv", "w+") as f:
    writer = csv.writer(f)
    writer.writerow(header)

In [22]:
for epoch in range(1, niter + 1):
    loss_d = 0
    loss_g = 0
    d_loss_fake_ = 0
    d_loss_real_ = 0
    if epoch % 75 == 0:
        optimizerG.param_groups[0]['lr'] /= 2
        optimizerD.param_groups[0]['lr'] /= 2
    for i, data in enumerate(train_dataloader, 0):
        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        # train with real
        netD.zero_grad()
        real_cpu, text_embedding, _ = data
        batch_size = real_cpu.size(0)
        text_embedding = Variable(text_embedding)

        if cuda:
            real_cpu = real_cpu.cuda()
            text_embedding = text_embedding.cuda()

        input.resize_as_(real_cpu).copy_(real_cpu)
        label.resize_(batch_size).fill_(real_label)
        inputv = Variable(input)
        labelv = Variable(label)

        output = netD(inputv, text_embedding)
        errD_real = criterion(output, labelv)  ##
        errD_real.backward()
        D_x = output.data.mean()

        ### calculate errD_wrong
        inputv = torch.cat((inputv[1:], inputv[:1]), 0)
        output = netD(inputv, text_embedding)
        errD_wrong = criterion(output, labelv) * 0.5
        errD_wrong.backward()

        # train with fake
        noise.resize_(batch_size, nz, 1, 1).normal_(0, 1)
        noisev = Variable(noise)
        fake = netG(noisev, text_embedding)
        labelv = Variable(label.fill_(fake_label))
        output = netD(fake.detach(), text_embedding)
        errD_fake = criterion(output, labelv) * 0.5
        errD_fake.backward()
        D_G_z1 = output.data.mean()

        errD = errD_real + errD_fake + errD_wrong
        # errD.backward()
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        labelv = Variable(label.fill_(
            real_label))  # fake labels are real for generator cost
        output = netD(fake, text_embedding)
        errG = criterion(output, labelv)  ##
        errG.backward()
        D_G_z2 = output.data.mean()
        optimizerG.step()
        loss_d += errD.data
        loss_g += errG.data
        d_loss_fake_ += D_G_z1/D_G_z2
        d_loss_real_ += D_x
        
        if i % 100 == 0:
            vutils.save_image(
                real_cpu, '%s/real_samples.png' % snapshots, normalize=True)
            fake = netG(fixed_noise, text_embedding)
            vutils.save_image(
                fake.data,
                '%s/fake_samples_epoch_%03d.png' % (snapshots, epoch),
                normalize=True)
    loss_d_ = loss_d/len(train_dataloader)
    loss_g_ = loss_g/len(train_dataloader)
    d_loss_fake__ = d_loss_fake_/len(train_dataloader)
    d_loss_real__ = d_loss_real_/len(train_dataloader)
    print(
        '[%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f'
        % (epoch, niter, loss_d_,
        loss_g_, d_loss_real__, d_loss_fake__))
    with open(f"{snapshots}/logs.csv", "a+") as logs_file:
        writer = csv.writer(logs_file)
        writer.writerow([epoch,loss_g_,loss_d_,d_loss_fake__,d_loss_real__])

    # do checkpointing
    if epoch % 20 == 0:
        torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (models_path,
                                                                epoch))
        torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (models_path,
                                                                epoch))

[1/400] Loss_D: 0.3578 Loss_G: 6.8431 D(x): 0.9553 D(G(z)): 3469935.2500
[2/400] Loss_D: 0.4028 Loss_G: 5.1199 D(x): 0.9472 D(G(z)): 1099.7072
[3/400] Loss_D: 0.1900 Loss_G: 5.8324 D(x): 0.9672 D(G(z)): 562.7416
[4/400] Loss_D: 0.2282 Loss_G: 6.1631 D(x): 0.9697 D(G(z)): 4870.7666
[5/400] Loss_D: 0.1599 Loss_G: 6.5995 D(x): 0.9727 D(G(z)): 21517.8945
[6/400] Loss_D: 0.3181 Loss_G: 5.8115 D(x): 0.9534 D(G(z)): 7240.9951
[7/400] Loss_D: 0.2632 Loss_G: 5.4497 D(x): 0.9624 D(G(z)): 1328.9270
[8/400] Loss_D: 0.4164 Loss_G: 5.2738 D(x): 0.9535 D(G(z)): 2290.0430
[9/400] Loss_D: 0.3586 Loss_G: 4.9950 D(x): 0.9460 D(G(z)): 449.2559
[10/400] Loss_D: 0.1363 Loss_G: 6.1112 D(x): 0.9721 D(G(z)): 2217.8484
[11/400] Loss_D: 0.2070 Loss_G: 6.6967 D(x): 0.9786 D(G(z)): 74713.7422
[12/400] Loss_D: 0.3303 Loss_G: 5.5383 D(x): 0.9550 D(G(z)): 5560.7246
[13/400] Loss_D: 0.3503 Loss_G: 5.2873 D(x): 0.9486 D(G(z)): 708.8704
[14/400] Loss_D: 0.2656 Loss_G: 5.3297 D(x): 0.9593 D(G(z)): 1766.7609
[15/400] Loss

In [23]:
from IPython.display import FileLink

os.chdir(r'/kaggle/working/')

!tar -czf snapshots.tar.gz snapshots

FileLink(r'snapshots.tar.gz')

In [24]:
!tar -czf models.tar.gz models

FileLink(r'models.tar.gz')

# Evaluation

In [18]:
test_dataset = TextDataset(dataroot, dataroot2, transform=image_transform,split='test')

Total filenames:  11788 001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg
Load filenames from: /kaggle/input/birdsdata/test/filenames.pickle (3537)
embeddings:  (3537, 10, 4800)


In [19]:
## Completed - TODO: Make a new DataLoader and Dataset to include embeddings
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=int(workers))

In [28]:
file_path = './eval_results'
if not os.path.exists(file_path):
    os.makedirs(file_path)

for i, data in enumerate(test_dataloader, 0):
    real_image, text_embedding,caption = data
    batch_size = real_image.size(0)
    text_embedding = Variable(text_embedding)

    if cuda:
        real_image = real_image.cuda()
        text_embedding = text_embedding.cuda()

    input.resize_as_(real_image).copy_(real_image)
    inputv = Variable(input)

    noise.resize_(batch_size, nz, 1, 1).normal_(0, 1)
    noisev = Variable(noise)
    num_test_outputs = 10

    print(f"Creating images for batch {i}")
    synthetic_image = netG(noisev, text_embedding)
    synthetic_image = synthetic_image.detach()
    
    for y in range(synthetic_image.size()[0]):
        
        with open(f"{file_path}/caption_{i+y}.txt", "w+") as f:
            f.write(caption[y])
        
        try:
            vutils.save_image(synthetic_image[y].data,f"{file_path}/{i+y}_image.jpg")
        except e:
            print (e)

Creating images for batch 0
Creating images for batch 1
Creating images for batch 2
Creating images for batch 3
Creating images for batch 4
Creating images for batch 5
Creating images for batch 6
Creating images for batch 7
Creating images for batch 8
Creating images for batch 9
Creating images for batch 10
Creating images for batch 11
Creating images for batch 12
Creating images for batch 13
Creating images for batch 14
Creating images for batch 15
Creating images for batch 16
Creating images for batch 17
Creating images for batch 18
Creating images for batch 19
Creating images for batch 20
Creating images for batch 21
Creating images for batch 22
Creating images for batch 23
Creating images for batch 24
Creating images for batch 25
Creating images for batch 26
Creating images for batch 27
Creating images for batch 28
Creating images for batch 29
Creating images for batch 30
Creating images for batch 31
Creating images for batch 32
Creating images for batch 33
Creating images for batc

In [29]:
from IPython.display import FileLink

os.chdir(r'/kaggle/working/')

!tar -czf eval_results.tar.gz eval_results

FileLink(r'eval_results.tar.gz')

# Inception score

In [30]:
# Code adapted from
# https://github.com/openai/improved-gan/blob/master/inception_score/model.py
# which was in turn derived from
# tensorflow/tensorflow/models/image/imagenet/classify_image.py

# Code from https://gist.github.com/jcjohnson/0779568bf0e4e64d141cae22414da549
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import tarfile

import numpy as np
from six.moves import urllib
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
# import tensorflow as tf
import glob
from imageio import imread
from skimage.transform import resize
import math
import sys

In [31]:
input_image_dir  = '/kaggle/working/eval_results'
num_splits = 1
tensor_layout = 'NHWC'
IMAGE_EXTS = ['.png', '.jpg', '.jpeg']


MODEL_DIR = '/tmp/imagenet'
DATA_URL = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
softmax = None
image_size = 64

In [32]:
def load_images(image_dir):
    print('Loading images from ', image_dir)
    images = []
    for fn in os.listdir(image_dir):
        ext = os.path.splitext(fn)[1].lower()
        if ext not in IMAGE_EXTS:
            continue
        img_path = os.path.join(image_dir, fn)
        img = imread(img_path)
        
        if image_size is not None:
            img = resize(img, (image_size, image_size))
            img = img * 255.0
        images.append(img)
    print('Found %d images' % len(images))
    return images


# Call this function with list of images. Each of elements should be a 
# numpy array with values ranging from 0 to 255.
def get_inception_score(images):
    splits = num_splits
    layout = tensor_layout

    assert(type(images) == list)
    assert(type(images[0]) == np.ndarray)
    assert(len(images[0].shape) == 3)
    print(images[0].min(), images[0].max(), images[0].dtype)
    assert(np.max(images[0]) > 10)
    assert(np.min(images[0]) >= 0.0)
    inps = []
    for img in images:
        img = img.astype(np.float32)
        inps.append(np.expand_dims(img, 0))
    bs = 100
    with tf.Session() as sess:
        preds = []
        n_batches = int(math.ceil(float(len(inps)) / float(bs)))
        n_preds = 0
        for i in range(n_batches):
            sys.stdout.write(".")
            sys.stdout.flush()
            inp = inps[(i * bs):min((i + 1) * bs, len(inps))]
            inp = np.concatenate(inp, 0)
            if layout == 'NCHW':
                inp = inp.transpose(0, 2, 3, 1)
            pred = sess.run(softmax, {'ExpandDims:0': inp})
            preds.append(pred)
            n_preds += pred.shape[0]
            print('Ran %d / %d images' % (n_preds, len(images)))
    preds = np.concatenate(preds, 0)
    scores = []
    for i in range(splits):
        part = preds[(i * preds.shape[0] // splits):((i + 1) * preds.shape[0] // splits), :]
        kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, 0), 0)))
        kl = np.mean(np.sum(kl, 1))
        scores.append(np.exp(kl))
    return np.mean(scores), np.std(scores)


In [25]:

# This function is called automatically.
def _init_inception():
    global softmax
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)
    filename = DATA_URL.split('/')[-1]
    filepath = os.path.join(MODEL_DIR, filename)
    if not os.path.exists(filepath):
        def _progress(count, block_size, total_size):
            sys.stdout.write('\r>> Downloading %s %.1f%%' % (
              filename, float(count * block_size) / float(total_size) * 100.0))
            sys.stdout.flush()
        filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
        print()
        statinfo = os.stat(filepath)
        print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.')
    tarfile.open(filepath, 'r:gz').extractall(MODEL_DIR)
    with tf.gfile.FastGFile(os.path.join(
        MODEL_DIR, 'classify_image_graph_def.pb'), 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        _ = tf.import_graph_def(graph_def, name='')
  # Works with an arbitrary minibatch size.
    with tf.Session() as sess:
        pool3 = sess.graph.get_tensor_by_name('pool_3:0')
        ops = pool3.graph.get_operations()
        for op_idx, op in enumerate(ops):
            for o in op.outputs:
                shape = o.get_shape()
                shape = [s.value for s in shape]
                new_shape = []
                for j, s in enumerate(shape):
                    if s == 1 and j == 0:
                        new_shape.append(None)
                    else:
                        new_shape.append(s)
                o.__dict__['_shape_val'] = tf.TensorShape(new_shape)
                #o._shape = tf.TensorShape(new_shape)
    w = sess.graph.get_operation_by_name("softmax/logits/MatMul").inputs[1]
    logits = tf.matmul(tf.squeeze(pool3,[1,2]), w)
    #sslogits = tf.matmul(tf.squeeze(pool3), w)
    softmax = tf.nn.softmax(logits)


if softmax is None:
    _init_inception()


>> Downloading inception-2015-12-05.tgz 100.0%
Succesfully downloaded inception-2015-12-05.tgz 88931400 bytes.


In [33]:

images = load_images(f"{input_image_dir}")

#print("images.shape")
#sprint(images.shape)
mean, std = get_inception_score(images)
print('Inception mean: ', mean)
print('Inception std: ', std)

Loading images from  /kaggle/working/eval_results


  if __name__ == "__main__":


Found 705 images
0.0 255.0 float64
.

TypeError: Argument `fetch` = None has invalid type "NoneType". Cannot be None

# Inference

In [17]:
!wget http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
!wget http://www.cs.toronto.edu/~rkiros/models/utable.npy
!wget http://www.cs.toronto.edu/~rkiros/models/btable.npy
!wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz
!wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl
!wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz
!wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl

--2023-04-09 03:54:49--  http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7996547 (7.6M) [text/plain]
Saving to: ‘dictionary.txt’


2023-04-09 03:54:50 (10.8 MB/s) - ‘dictionary.txt’ saved [7996547/7996547]

--2023-04-09 03:54:51--  http://www.cs.toronto.edu/~rkiros/models/utable.npy
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2342138474 (2.2G)
Saving to: ‘utable.npy’


2023-04-09 03:55:44 (42.9 MB/s) - ‘utable.npy’ saved [2342138474/2342138474]

--2023-04-09 03:55:45--  http://www.cs.toronto.edu/~rkiros/models/btable.npy
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.ed

In [20]:
!cp -r /kaggle/input/skipthoughts/* ./

In [21]:
import skipthoughts
import numpy as np
import pickle
import os

In [22]:
model = skipthoughts.load_model(path_to_models="/kaggle/working/",
                               path_to_tables="/kaggle/working/")
print("Model loaded.")

Loading model parameters...
Compiling encoders...
Loading tables...
Packing up...
Model loaded.


In [23]:
encoder = skipthoughts.Encoder(model)

In [69]:
# this bird had a bright red head and belly with a short triangular beak and a black throat.
sentence = ["this bird is red with a short beak that curves down and black markings around its bill."]

sent_embeddings = encoder.encode(sentence, verbose=False)

Preprocessing...
Preprocess completed.
Running encoding...


100%|██████████| 1/1 [00:00<00:00,  7.46it/s]

Encoding completed.





In [70]:
working_dir = "/kaggle/working/"

sent_embeddings = torch.from_numpy(sent_embeddings)
sent_embeddings = Variable(sent_embeddings)

nz = 100
noise = torch.FloatTensor(1, nz, 1, 1)

In [71]:
noise.resize_(1, nz, 1, 1).normal_(0, 1)
noisev = Variable(noise)

if cuda:
    sent_embeddings = sent_embeddings.cuda()
    noisev=noisev.cuda()

synthetic_image = netG(noisev, sent_embeddings)
synthetic_image = synthetic_image.detach()

In [72]:
vutils.save_image(synthetic_image.data,f"{working_dir}sample1.jpg")