<a href="https://colab.research.google.com/github/hajarhomayouni/synthetic_multimodal_data_generation/blob/main/synthetic_image_and_text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Image to text transformer

### First image to text transformer model

In [None]:
#https://huggingface.co/docs/transformers/model_doc/visual_bert


In [None]:
!pip install transformers

In [None]:
# image to text transformer
import requests
from PIL import Image

from transformers import GPT2TokenizerFast, ViTFeatureExtractor, VisionEncoderDecoderModel

# A single fine-tuned captioning checkpoint provides all three components:
# the image feature extractor, the GPT-2 tokenizer and the encoder-decoder model.
CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

feature_extractor = ViTFeatureExtractor.from_pretrained(CAPTION_CHECKPOINT)
tokenizer = GPT2TokenizerFast.from_pretrained(CAPTION_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(CAPTION_CHECKPOINT)



In [None]:
# Caption a single local image with the captioning model loaded above.
# To try a remote sample instead of a local file:
#   image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
image_path = "/content/DATA/DATA/1/person1133_virus_1865.jpeg"

# The feature extractor expects three colour channels, so force RGB: greyscale
# files (e.g. X-ray scans) would otherwise fail or be misprocessed. `convert`
# returns a fully loaded copy, so the file handle can be closed right away.
with Image.open(image_path) as image_file:
    image = image_file.convert("RGB")
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values

# autoregressively generate caption (uses greedy decoding by default)
generated_ids = model.generate(pixel_values)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

### Second image to text transformer model

In [None]:
# image to text transformer
#https://huggingface.co/docs/transformers/model_doc/vision-encoder-decoder
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import requests
from PIL import Image
import torch

# Handwritten-text recognition with TrOCR: the processor bundles the image
# preprocessing and the tokenizer that belong to the checkpoint.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# load image from the IAM dataset
url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
# A timeout keeps the cell from hanging forever, and raise_for_status() fails
# here with a clear HTTP error instead of letting PIL choke on an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw).convert("RGB")

# training
# The decoder needs to know which token starts a sequence and which one pads it;
# these are taken from the processor's tokenizer.
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

pixel_values = processor(image, return_tensors="pt").pixel_values
text = "hello world"
labels = processor.tokenizer(text, return_tensors="pt").input_ids
# Single forward pass with labels: returns the loss of predicting `text` from the image.
outputs = model(pixel_values=pixel_values, labels=labels)
loss = outputs.loss

# inference (generation)
generated_ids = model.generate(pixel_values)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
generated_text

# Text to Text transformer

Note: We may need this later.

In [None]:
# Reuse the text produced by the previous image-to-text cell as the input
# of the text-to-text model below.
batch_sentences = generated_text

In [None]:
# text to text transformer (generate fake text)

from transformers import EncoderDecoderModel, BertTokenizer,BertConfig, EncoderDecoderConfig, AutoTokenizer

# --- Model initialization ----------------------------------------------------
# Option 1: generic BERT weights combined into a Bert2Bert encoder-decoder
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert

# Option 2 (active): pretrained sample Bert2Bert checkpoint
tokenizer = AutoTokenizer.from_pretrained('patrickvonplaten/bert2bert_cnn_daily_mail')
model = EncoderDecoderModel.from_pretrained('patrickvonplaten/bert2bert_cnn_daily_mail')

# Option 3: model built from default BERT configs (no pretrained weights)
#config_encoder = BertConfig()
#config_decoder = BertConfig()
#config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
#model = EncoderDecoderModel(config=config)

# Option 4: GPT2
#tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#model = GPT2Model.from_pretrained('gpt2')

# --- Model training (single forward pass that yields the loss) ---------------
encoded = tokenizer(batch_sentences, padding='max_length', max_length=15, truncation=True, return_tensors="pt")
input_ids = encoded.input_ids
# The input is padded to max_length, so the mask is needed to keep the model
# from attending to padding positions.
attention_mask = encoded.attention_mask

# Padding positions must not contribute to the loss; -100 is the index the
# cross-entropy loss ignores.
labels = input_ids.masked_fill(attention_mask == 0, -100)

# Only `labels` is passed: the model derives decoder_input_ids by shifting the
# labels one position to the right. Passing the unshifted ids as both decoder
# input and labels would let the decoder simply copy its current input token.
# NOTE(review): this relies on decoder_start_token_id / pad_token_id being set
# in the checkpoint's config — confirm for other checkpoints.
training_output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
loss, outputs = training_output.loss, training_output.logits

# --- Model evaluation --------------------------------------------------------
generated_ids = model.generate(input_ids, attention_mask=attention_mask)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
generated_text

# Image to 1D feature vector transformer

Note: We may need this later.

In [None]:
# Extract features from image
# https://huggingface.co/docs/transformers/model_doc/vit
# https://huggingface.co/google/vit-base-patch16-224

from transformers import ViTFeatureExtractor, ViTModel
import torch
from datasets import load_dataset

# Pretrained ViT backbone together with its matching image preprocessing.
VIT_CHECKPOINT = "google/vit-base-patch16-224-in21k"
feature_extractor = ViTFeatureExtractor.from_pretrained(VIT_CHECKPOINT)
model = ViTModel.from_pretrained(VIT_CHECKPOINT)

# Sample picture: first image of the test split.
dataset = load_dataset("huggingface/cats-image")
test_images = dataset["test"]["image"]
image = test_images[0]

inputs = feature_extractor(image, return_tensors="pt")

# Forward pass only; no gradients are recorded.
with torch.no_grad():
    outputs = model(**inputs)

# Hidden states of the final layer; the full model output is displayed below.
last_hidden_states = outputs.last_hidden_state
outputs

# Image to Image Transformer

### First image to image transformer model

Limitation: Low quality of output image

In [None]:
# Image to image transformer (image generation)
# https://huggingface.co/docs/transformers/model_doc/imagegpt
# https://github.com/NielsRogge/Transformers-Tutorials/tree/master/ImageGPT


# This is unconditional, I need the conditional approach

from transformers import ImageGPTFeatureExtractor, ImageGPTForCausalImageModeling
import torch
import matplotlib.pyplot as plt
import numpy as np

feature_extractor = ImageGPTFeatureExtractor.from_pretrained("openai/imagegpt-small")
model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# unconditional generation of 8 images
batch_size = 8
# Every sequence starts with the SOS token (the last vocabulary id). The tensor
# is created directly on the target device; re-wrapping an existing tensor with
# torch.tensor() only triggers a copy-construct warning.
context = torch.full((batch_size, 1), model.config.vocab_size - 1, dtype=torch.long, device=device)
output = model.generate(
    input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
)

# Colour clusters used to map generated tokens back to pixel values.
clusters = np.asarray(feature_extractor.clusters)

# `size` is a plain int in older transformers releases and a
# {"height": ..., "width": ...} dict in newer ones; support both.
size = feature_extractor.size
if isinstance(size, dict):
    height, width = size["height"], size["width"]
else:
    height = width = size

samples = output[:, 1:].cpu().detach().numpy()  # drop the SOS token
samples_img = [
    np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
]  # convert color cluster tokens back to pixels

# squeeze=False keeps `axes` a 2-D array even when batch_size is 1.
f, axes = plt.subplots(1, batch_size, dpi=300, squeeze=False)

for img, ax in zip(samples_img, axes.flat):
    ax.axis("off")
    ax.imshow(img)

### Second image-to-image transformer model (image generation based on StyleGAN)

In [None]:
!pip install stylegan2_pytorch

In [None]:
!pip install -q git+https://github.com/podgorskiy/dnnlib

In [None]:
# Image to image
# Image generation: https://heartbeat.comet.ml/stylegans-use-machine-learning-to-generate-and-customize-realistic-images-c943388dc672
# NOTE: this cell currently raises an error when run and still needs debugging.
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to
# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.

"""Minimal script for generating an image using pre-trained StyleGAN generator."""

import os
import pickle
import numpy as np
import PIL.Image
import dnnlib
import dnnlib.tflib as tflib
import config

def main():
    """Generate one image with the pre-trained StyleGAN generator and save it.

    Reads `cache_dir` and `result_dir` from the imported `config` module and
    writes the result to `example.png` inside `config.result_dir`.
    """
    # Set up TensorFlow through tflib before touching the networks.
    tflib.init_tf()

    # Download and unpickle the pre-trained networks.
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # downloaded file — only use this with a source you trust.
    weights_url = 'https://drive.google.com/uc?id=1MEGjdvVpUsu1jB4zrXZN7Y4kBBOzizDQ' # karras2019stylegan-ffhq-1024x1024.pkl
    with dnnlib.util.open_url(weights_url, cache_dir=config.cache_dir) as stream:
        # _G / _D: instantaneous generator / discriminator snapshots (mainly
        # useful for resuming training). Gs: long-term average of the
        # generator, which yields higher-quality results.
        _G, _D, Gs = pickle.load(stream)

    # Show the layer structure of the averaged generator.
    Gs.print_layers()

    # One latent vector drawn from a fixed-seed generator.
    rng = np.random.RandomState(5)
    latents = rng.randn(1, Gs.input_shape[1])

    # Run the generator; the output transform converts the images to uint8
    # and reorders them from NCHW to NHWC.
    output_transform = {"func": tflib.convert_images_to_uint8, "nchw_to_nhwc": True}
    images = Gs.run(latents, None, truncation_psi=0.7, randomize_noise=True, output_transform=output_transform)

    # Write the first (only) image to the result directory.
    os.makedirs(config.result_dir, exist_ok=True)
    target_file = os.path.join(config.result_dir, 'example.png')
    PIL.Image.fromarray(images[0], 'RGB').save(target_file)


if __name__ == "__main__":
    main()

### Third image-to-image transformer model (image generation based on StyleGAN)

Note: This model was successfully used by my student to generate COVID-19 X-ray images. Her code is available at: https://colab.research.google.com/drive/1Z_VHdxNcPlsMPCzmBFTvW0LUZQawbQ9u?usp=sharing

In [None]:
import getpass
import json
import os
from pathlib import Path


def write_kaggle_credentials(username, key, config_dir=None):
    """Write a Kaggle API token file readable only by the current user.

    Args:
        username: Kaggle account name.
        key: Kaggle API key.
        config_dir: Directory that receives `kaggle.json`; defaults to
            `~/.kaggle`.

    Returns:
        Path of the written `kaggle.json` file.
    """
    target_dir = Path(config_dir) if config_dir is not None else Path.home() / ".kaggle"
    target_dir.mkdir(parents=True, exist_ok=True)  # safe to re-run, unlike plain `mkdir`
    token_path = target_dir / "kaggle.json"
    # Create the file with owner-only permissions before the secret is written.
    fd = os.open(token_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as file:
        json.dump({"username": username, "key": key}, file)
    # An already existing file keeps its old mode, so enforce 600 explicitly.
    os.chmod(token_path, 0o600)
    return token_path


# SECURITY: never store the API key in the notebook. The key that used to be
# hard-coded in this cell has been published and must be revoked on Kaggle.
# Credentials are taken from the environment, or asked for interactively.
api_token = {
    "username": os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: "),
    "key": os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: "),
}
write_kaggle_credentials(api_token["username"], api_token["key"])

In [None]:
!git clone https://github.com/NVlabs/stylegan2-ada.git

In [None]:
# Enter the cloned repository; the dataset_tool.py and train.py commands in the
# following cells are invoked relative to this directory.
# NOTE(review): the path is relative, so re-running this cell from inside the
# repository fails.
%cd stylegan2-ada/

/content/stylegan2-ada


In [None]:
from PIL import Image
import os, sys

# Resize every file in `path` to 512x512 and store it as a JPEG in `output_dir`.
path = "/content/data/"
output_dir = "/content/pre/"
dirs = os.listdir(path)

# Image.save does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

# Modes Pillow's JPEG writer accepts; anything else (e.g. RGBA, P) is
# converted to RGB first instead of crashing on save.
JPEG_MODES = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}

for item in dirs:
    source_file = os.path.join(path, item)
    if not os.path.isfile(source_file):
        continue  # skip sub-directories
    # The context manager closes the source file once the resized copy exists.
    with Image.open(source_file) as im:
        prepared = im if im.mode in JPEG_MODES else im.convert("RGB")
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
        # under its current name.
        imResize = prepared.resize((512, 512), Image.LANCZOS)
    # The output name keeps the original file name and appends '.jpeg'.
    imResize.save(os.path.join(output_dir, item + '.jpeg'), 'JPEG', quality=90)

In [None]:
# Build the training dataset in /content/output from the resized images in /content/pre/.
# NOTE(review): must be run from inside the stylegan2-ada checkout; dataset_tool.py
# presumably needs the TensorFlow 1.x install performed in the later cells —
# confirm the cell order.
!python dataset_tool.py create_from_images /content/output /content/pre/

In [None]:
!pip uninstall tensorflow

In [None]:
# Install the pinned TensorFlow 1.14 release.
# NOTE(review): presumably pinned because the stylegan2-ada code targets
# TensorFlow 1.x; confirm this release is installable on the runtime's Python version.
!pip install tensorflow==1.14

In [None]:
# Install the GPU build matching the TensorFlow version pinned above.
!pip install tensorflow-gpu==1.14.0

In [None]:
!pip uninstall numpy

In [None]:
# Install the pinned NumPy 1.19.5 release.
# NOTE(review): presumably chosen for compatibility with TensorFlow 1.14 — confirm.
!pip install numpy==1.19.5

In [None]:
# Train on the dataset in /content/output and write results to /content/generated.
# --res=512 matches the 512x512 size the images were resized to earlier.
# NOTE(review): see the stylegan2-ada train.py help for the exact meaning of
# --snap and --augpipe; not verifiable from this notebook.
!python train.py --outdir /content/generated --snap=10 --data=/content/output --augpipe=bgcfnc --res=512