# SimSyn

Generate visually similar synthetic images using image captions as text prompts to a generative model.

### Setup

In [None]:
# Print the NVIDIA GPU status for this runtime (fails if no NVIDIA driver is present).
!nvidia-smi

In [None]:
!mkdir images

In [None]:
# Install or upgrade diffusers, transformers and scipy in the current runtime.
pip install --upgrade diffusers transformers scipy

In [None]:
# Log in to the Hugging Face Hub; the Stable Diffusion weights below are
# loaded with use_auth_token=True.
!huggingface-cli login

### Define Image-To-Text and Text-To-Image models

In [None]:
import os 
import torch
import matplotlib.pyplot as plt
import numpy as np

from PIL import Image, ImageOps
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel
from torch import autocast
from diffusers import StableDiffusionPipeline

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Caption generation settings, forwarded to `captioner.generate`
max_length = 16
num_beams = 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)


#### Setup image captioning model

# Taken from https://huggingface.co/nlpconnect/vit-gpt2-image-captioning
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
captioner = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)


#### Setup stable diffusion model

# Taken from https://huggingface.co/CompVis/stable-diffusion-v1-4
model_id = "CompVis/stable-diffusion-v1-4"
generator = StableDiffusionPipeline.from_pretrained(model_id, use_auth_token=True).to(device)

### Helper functions

In [None]:
def read_resize_image(image_path, new_width=224, new_height=224):
    """
    Load an image, convert it to RGB and fit it to a desired size.

    The image is resized and cropped with `ImageOps.fit`, so the result
    always has exactly the requested width and height.

    Arguments:
        image_path (str): Path of the image to load
        new_width (int): Width of the returned image in pixels
        new_height (int): Height of the returned image in pixels
    Returns:
        img (np.array): Resized RGB image as a numpy array
    Raises:
        AssertionError: If `image_path` is not a string
    Examples:
        >>> img = read_resize_image("images/doggo.jpeg")
    """

    assert isinstance(image_path, str), f"Should be a path, got: {image_path} which is {type(image_path)}"

    # The context manager closes the underlying file once the pixels are loaded;
    # convert() returns an independent in-memory copy.
    with Image.open(image_path) as source:
        # Convert before resizing: Pillow silently uses nearest-neighbour
        # resampling for palette ("P") and bilevel ("1") images.
        rgb = source.convert("RGB")

    resized = ImageOps.fit(rgb, (new_width, new_height), Image.BICUBIC)
    return np.array(resized)


def image_grid(imgs, rows, cols):
    """
    Paste images into a single grid image with `rows` x `cols` cells.

    Images are placed left to right, top to bottom. The cell size is taken
    from the first image.

    Arguments:
        imgs (list): PIL images (their `.size` is read and they are pasted
            with `Image.paste`); numpy arrays are not accepted
        rows (int): Number of grid rows
        cols (int): Number of grid columns
    Returns:
        grid (PIL.Image.Image): RGB image of size (cols * w, rows * h)
    Raises:
        AssertionError: If `len(imgs)` differs from `rows * cols`

    Usage: grid = image_grid(all_images, rows=num_rows, cols=num_cols)
    """
    assert len(imgs) == rows*cols, (
        f"Expected {rows*cols} images for a {rows}x{cols} grid, got {len(imgs)}"
    )

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols*w, rows*h))

    for i, img in enumerate(imgs):
        # Row-major placement: index i lands in row i // cols, column i % cols
        row, col = divmod(i, cols)
        grid.paste(img, box=(col*w, row*h))
    return grid

def predict_caption(image_path):
    """
    Predict a caption for the image stored at `image_path`.

    Uses the module-level `feature_extractor`, `captioner` and `tokenizer`
    with the generation settings in `gen_kwargs`.

    Arguments:
        image_path (str): Path of the image to caption
    Returns:
        caption (str): Decoded caption with surrounding whitespace stripped
    """
    # The context manager closes the file handle; convert() returns a loaded copy
    with Image.open(image_path) as source:
        image = source.convert(mode="RGB")

    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    output_ids = captioner.generate(pixel_values, **gen_kwargs)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    # A single image was passed in, so only the first decoded caption is needed
    return preds[0].strip()

def predict_image(prompt):
    """
    Run the Stable Diffusion pipeline (`generator`) on a text prompt.

    Arguments:
        prompt (str): Text describing the image to generate
    Returns:
        The object returned by the `generator` call for a 512x512 image;
        the generated image(s) are read from its `.images` attribute.
    """
    import contextlib

    # `device` falls back to the CPU when CUDA is unavailable, so only ask
    # for CUDA autocast when the models actually live on the GPU.
    if device.type == "cuda":
        precision_ctx = autocast("cuda")
    else:
        precision_ctx = contextlib.nullcontext()

    with precision_ctx:
        image = generator(prompt, height=512, width=512, guidance_scale=7.5)
    return image

### Upload your image

In [None]:
# Switch into images/ before uploading (the caption cell reads from /content/images).
%cd images
from google.colab import files
# The result of the Colab upload call is kept in `uploaded`.
uploaded = files.upload()
# Go back up to the parent directory.
%cd ..

### Get caption

In [None]:
# Caption the first file (in sorted order) found in the upload folder.
image_dir = '/content/images'
candidates = sorted(os.listdir(image_dir))
if not candidates:
    # Fail with a clear message instead of an IndexError on an empty folder
    raise FileNotFoundError(f"No images found in {image_dir}; upload an image first.")

image_path = os.path.join(image_dir, candidates[0])
caption = predict_caption(image_path)
caption

### Generate new images

In [None]:
# Generate five images from the caption, each stored as an RGB numpy array.
# Do as many as you want until Colab breaks hahah!
all_images = []
for _ in range(5):
    result = predict_image(caption)
    first_image = result.images[0]
    all_images.append(np.array(first_image.convert("RGB")))

### Save Images

In [None]:
# Write every generated image to ./images as <index>.png
for index, pixels in enumerate(all_images):
    out_path = f"./images/{index}.png"
    picture = Image.fromarray(np.uint8(pixels))
    picture.convert('RGB').save(out_path)

### Show images

In [None]:
# Resize the input image to 512x512, the size requested from the generator.
input_image = read_resize_image(image_path, new_width=512, new_height=512)

# Lay up to five generated images side by side. They are already stored as
# RGB numpy arrays, so no round trip through PIL is needed.
images = np.concatenate(all_images[:5], axis=1)

# White spacer (512 high, 200 wide) between the input image and the generated ones.
white = np.full((512, 200, 3), 255, dtype=np.uint8)
imgs = np.concatenate([input_image, white, images], axis=1)


fig = plt.figure(figsize=(60, 5))
plt.axis("off")
# Centered title shows the caption; the left-aligned title labels the input image.
plt.title(f'"{caption}"', fontsize=30)
plt.title('Input Image', fontsize=30, loc='left')
plt.imshow(imgs)
plt.savefig("visualization.png", facecolor="white", bbox_inches='tight', dpi=300)
plt.show()

### Download your generated images

In [None]:
# Pack the whole images/ folder into images.zip.
!zip -r ./images.zip ./images

In [None]:
from google.colab import files
# Hand the archive created in the previous cell to Colab's download helper.
files.download("./images.zip")

In [None]:
# Clean up so the notebook can be re-run from the upload step:
# empty images/, then delete the archive and the saved figure.
!rm -rf images/*
!rm -rf images.zip
!rm -rf visualization.png

Now, you can upload another image and do the same thing.

### Acknowledgements 

* https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb
* https://huggingface.co/nlpconnect/vit-gpt2-image-captioning 