In [None]:
# Connect with your Google Drive: mount it at /content/drive so that the
# project folder entered by the later `%cd` cell is reachable.
from google.colab import drive
drive.mount('/content/drive') 
# Shell escape: delete the /content/sample_data directory.
!rm -rf /content/sample_data

Mounted at /content/drive


# Set up the required environment

In [None]:
# Install the required environment
!pip install wget
!pip install git+https://github.com/openai/CLIP.git
!pip install DALL-E

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9672 sha256=553cd247808c201e17329e26a9b48899a5f4b64c9b70676cf1933d68cc0d1076
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-l3ick2ji
  Running command git clone -q https://github.com/openai/CLIP.git /tmp/pip-req-build-l3ick2ji
Collecting ftfy
  Downloading ftfy-6.0.3.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 1.7 MB/s 
Building wheels for collected packages: clip, ftfy
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl

# Import packages and set the parameters

In [None]:
# Work from the project folder on the mounted Drive: the relative paths used
# further down (./poetry_EN.txt, ./test/...) are resolved against it.
%cd '/content/drive/MyDrive/Portfolio/Poem2Image'

/content/drive/MyDrive/Portfolio/Poem2Image


In [None]:
# Import packages
from dall_e import map_pixels, unmap_pixels, load_model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchvision
import torchvision.transforms as T
import torch.nn.functional as F
import torch.nn as nn
import random
import clip
import os
from PIL import Image

# Compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Run parameters
image_size, epochs = (
    512,  # image side length handed to computing_loss to bound the random crops
    300,  # number of optimisation steps performed for every poem
)

# Load the CLIP and DALL-E models

In [None]:
# Load the pre-trained CLIP model onto the selected device and put it in
# evaluation mode. `preprocess` is CLIP's own image transform; it is returned
# by clip.load but not used by the cells below.
perceptor, preprocess = clip.load('ViT-B/32', device=device)
perceptor = perceptor.eval()

# Load the pre-trained discrete VAE decoder (DALL-E) onto the same device, so
# both networks follow the `device` chosen in the configuration cell instead
# of a hard-coded 'cuda'.
model = load_model("https://cdn.openai.com/dall-e/decoder.pkl", device).eval()

100%|███████████████████████████████████████| 338M/338M [00:05<00:00, 68.5MiB/s]


# Load Poetry

In [None]:
# Read the poems: every line of the text file becomes one prompt, with the
# surrounding whitespace (including the trailing newline) removed.
poems = []
with open('./poetry_EN.txt') as poem_file:
    poems.extend(raw_line.strip() for raw_line in poem_file)

# Initialization and Training Process

In [None]:
# Per-channel mean / standard deviation used to normalise the image batch
# right before it is handed to the CLIP image encoder (see computing_loss).
_NORM_MEAN = (0.48145466, 0.4578275, 0.40821073)
_NORM_STD = (0.26862954, 0.26130258, 0.27577711)
nom = T.Normalize(_NORM_MEAN, _NORM_STD)

class Pars(nn.Module):
    '''
    Learnable latents for the image decoder.

    The module owns a single logit tensor ``normu`` of shape
    ``(1, vocab_size, grid_size, grid_size)``, initialised to zeros, and turns
    it into a relaxed sample with Gumbel-softmax every time it is called.

    Args:
        vocab_size: size of dimension 1 of the latent tensor (default 8192).
        grid_size: height and width of the latent grid (default 64).
        device: device on which the logits are allocated. ``None`` (default)
            selects CUDA when it is available and falls back to the CPU
            otherwise, so constructing the module no longer fails on a
            runtime without a GPU.
    '''
    def __init__(self, vocab_size=8192, grid_size=64, device=None):
        super(Pars, self).__init__()
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.normu = nn.Parameter(
            torch.zeros(1, vocab_size, grid_size, grid_size, device=device)
        )

    def forward(self):
        '''
        Draw a fresh Gumbel-softmax sample (tau = 2) from the logits.

        Returns:
            A tensor with the same shape as ``self.normu``.
        '''
        shape = self.normu.shape
        # Flatten the two grid dimensions into one trailing axis.
        # NOTE(review): the softmax is taken over that flattened grid axis
        # (dim=-1), not over dimension 1 -- kept exactly as before; confirm
        # this is the intended axis.
        flat = self.normu.view(shape[0], shape[1], -1)
        return F.gumbel_softmax(flat, dim=-1, tau=2).view(shape)


def pad_augs(image):
    '''
    Surround an image batch with a randomly sized, randomly placed border.

    A total of 1-50 pixels of padding is added along each of the last two
    dimensions. For each of them a random 10%-90% share of that total goes in
    front of the content and the remainder behind it. The border is filled
    with the constant value 1.
    '''
    total = random.randint(1, 50)
    share_x = random.randint(10, 90) / 100
    share_y = random.randint(10, 90) / 100

    lead_x = int(total * share_x)
    lead_y = int(total * share_y)

    # F.pad order: (last dim front, last dim back, 2nd-to-last front, 2nd-to-last back)
    borders = (lead_x, total - lead_x, lead_y, total - lead_y)
    return F.pad(image, borders, mode="constant", value=1)

def computing_loss(model, lats, image_size, perceptor, percep, tokenizedtxt):
    '''
    Decode the current latents into an image and score it against the text.

    The latents are sampled by calling ``lats``, decoded with ``model`` (first
    three channels, sigmoid, ``unmap_pixels``), and 32 random square crops of
    the result are padded, resized to 224x224, normalised with ``nom`` and
    encoded by ``perceptor``. The loss is -100 times the mean cosine
    similarity between ``percep`` and those crop encodings.

    ``tokenizedtxt`` is accepted but not used here.

    Returns:
        A three-element list ``[loss, zs, out]``: the scalar loss, the sampled
        latents and the decoded image tensor.
    '''
    n_cutouts = 32

    zs = lats()
    decoded = model(zs)[:, :3].float()
    out = unmap_pixels(torch.sigmoid(decoded))

    cutouts = []
    for _ in range(n_cutouts):
        # Crop side length: a Gaussian draw clamped to [.362, .7099], taken as
        # a fraction of image_size.
        fraction = torch.zeros(1,).normal_(mean=.39, std=.865).clip(.362, .7099)
        size = int(image_size * fraction)

        # Random start of the crop along each of the two image dimensions.
        start_a = torch.randint(0, image_size - size, ())
        start_b = torch.randint(0, image_size - size, ())
        crop = out[:, :, start_a:start_a + size, start_b:start_b + size]

        crop = F.interpolate(pad_augs(crop), (224, 224), mode='nearest')
        cutouts.append(crop)

    batch = nom((torch.cat(cutouts, 0) + 1) / 2)
    image_features = perceptor.encode_image(batch)

    similarity = torch.cosine_similarity(percep, image_features)
    loss = -100 * similarity.view(-1, 1).T.mean()
    return [loss, zs, out]

def train(i, model, lats, image_size, perceptor, percep, optimizer, tokenizedtxt):
    '''
    Run a single optimisation step on the latents.

    Args:
        i: index of the current step; kept for the caller's convenience, the
            step itself does not depend on it.
        model, lats, image_size, perceptor, percep, tokenizedtxt: forwarded
            unchanged to ``computing_loss``.
        optimizer: optimizer whose gradients are cleared and which is stepped
            exactly once.

    Returns:
        The latent sample ``zs`` that ``computing_loss`` drew for this step
        (i.e. sampled before the parameter update).

    The decoded image returned by ``computing_loss`` is deliberately not
    copied to the CPU here: nothing in the step uses it, and the copy would
    cost a device-to-host transfer on every iteration.
    '''
    output = computing_loss(model, lats, image_size, perceptor, percep, tokenizedtxt)

    # the loss of the model (mean() leaves an already scalar loss unchanged)
    loss = output[0].mean()
    zs = output[1]

    # update the parameters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return zs

# Main Loop

In [None]:
# Generate one image for each poem in the dataset
lr = .05

# Create the output folder up front: Image.save() does not create missing
# directories, and failing only after a full optimisation run wastes the run.
os.makedirs('./test', exist_ok=True)

for poem_idx, text in enumerate(poems):
    # Blank lines in the poem file carry no prompt. Skip them, but keep the
    # index tied to the line number so output file names stay aligned.
    if not text:
        continue

    # fresh latents and optimizer for every poem
    lats = Pars().to(device)
    optimizer = torch.optim.Adam([lats.normu], lr)

    # 250 characters can still tokenize to more than CLIP's context length,
    # which makes clip.tokenize raise; truncate=True cuts the token sequence
    # instead.
    txt = clip.tokenize(text[:250], truncate=True)
    percep = perceptor.encode_text(txt.to(device)).detach().clone()

    # training loop
    for i in range(epochs):
        zs = train(i, model, lats, image_size, perceptor, percep, optimizer, txt)

    # decode the last sampled latents into the final image
    with torch.no_grad():
        img = unmap_pixels(torch.sigmoid(model(zs)[:, :3]).cpu().float())
        img = np.array(img[0])
        img = np.transpose(img, (1, 2, 0))   # channels-first -> channels-last for PIL
        im = Image.fromarray((img * 255).astype(np.uint8))
        display(im)
        im.save('./test/output'+str(poem_idx)+'.jpg')
        print(text)

# References:
@ Phil Wang, BigSleep, https://github.com/lucidrains/big-sleep

@ Yannic Kilcher, CLIP Music Video, https://github.com/yk/clip_music_video

@ OpenAI, DALLE, https://github.com/openai/dall-e

@ OpenAI, CLIP, https://github.com/openai/CLIP