# Visual question-answering (VQA)

## VQA with the Vision-and-Language Transformer (ViLT)

In [None]:
import requests
from PIL import Image

# Download the sample photo and define the question to ask about it.
# The CDN image options are comma-separated "key=value" pairs and must not
# contain whitespace (a space would be sent as "%20" inside the option value).
url = "https://www.worldanimalprotection.org/cdn-cgi/image/width=1920,format=auto/globalassets/images/elephants/1033551-elephant.jpg"
# The timeout keeps the cell from hanging on an unresponsive host, and
# raise_for_status() reports HTTP errors here instead of letting PIL fail
# later while trying to decode an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
text = "What animal is in this photo?"

In [None]:
from transformers import ViltProcessor, ViltForQuestionAnswering

# Both the processor and the model are loaded from the same checkpoint.
vilt_checkpoint = "dandelin/vilt-b32-finetuned-vqa"
processor = ViltProcessor.from_pretrained(vilt_checkpoint)
model = ViltForQuestionAnswering.from_pretrained(vilt_checkpoint)

# Turn the image/question pair into PyTorch tensors and run a forward pass.
encoding = processor(image, text, return_tensors="pt")
outputs = model(**encoding)

# The index of the largest logit on the last axis is looked up in the
# model's id2label mapping to obtain the answer string.
idx = outputs.logits.argmax(-1).item()
predicted_answer = model.config.id2label[idx]
print("Predicted answer:", predicted_answer)

## Document VQA with LayoutLM

In [None]:
from datasets import load_dataset
from transformers import pipeline

import matplotlib.pyplot as plt

# "lmms-lab/DocVQA" publishes more than one configuration, so the
# configuration name has to be passed explicitly; "DocVQA" selects the
# document question-answering subset.
dataset = load_dataset("lmms-lab/DocVQA", "DocVQA")

# Preview the document page that is queried in the next cell.
plt.imshow(dataset["test"][2]["image"])
plt.show()

In [None]:
from transformers import pipeline

# Build a document question-answering pipeline around the LayoutLM checkpoint.
pipe = pipeline("document-question-answering", "impira/layoutlm-document-qa")

# Ask a question about the third document image of the test split.
doc_image = dataset["test"][2]["image"]
doc_question = "What was the gross income in 2011-2012?"
result = pipe(image=doc_image, question=doc_question)

print(result)

# Image editing with diffusion models

## Custom image editing

In [None]:
from diffusers.utils import load_image

# Load the reference picture whose edges will condition the generation.
image = load_image("http://301.nz/o81bf")

import cv2
from PIL import Image
import numpy as np

# Run Canny edge detection with hysteresis thresholds 100 and 200; the result
# is a 2-D, single-channel edge map.
image = cv2.Canny(np.array(image), 100, 200)
# Append a channel axis, then replicate the edge map exactly once into three
# identical channels. Concatenating a second time would yield nine channels,
# which Image.fromarray cannot turn into an image.
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)

canny_image = Image.fromarray(image)

In [None]:
from diffusers import StableDiffusionControlNetPipeline
from diffusers import ControlNetModel
import torch

# Canny-edge ControlNet weights, loaded in half precision.
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
# Stable Diffusion v1.5 is loaded from the "stable-diffusion-v1-5" organisation
# on the Hub, matching the inpainting pipeline later in this notebook; the
# former "runwayml/stable-diffusion-v1-5" repository is no longer published.
pipe = StableDiffusionControlNetPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16)
# Move the pipeline to the GPU.
pipe = pipe.to("cuda")

In [None]:
# One prompt, one negative prompt and one seeded CUDA generator, each wrapped
# in a single-element list.
prompt = ["Albert Einstein, best quality, extremely detailed"]
negative_prompt = ["monochrome, lowres, bad anatomy, worst quality, low quality"]
generator = [torch.Generator(device="cuda").manual_seed(2)]

# Run the ControlNet pipeline with the Canny edge map as the conditioning
# image, using 20 inference steps.
output = pipe(
  prompt=prompt,
  image=canny_image,
  negative_prompt=negative_prompt,
  generator=generator,
  num_inference_steps=20,
)

## Image inpainting

In [None]:
from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel

# Inpainting ControlNet weights and the Stable Diffusion v1.5 inpainting
# pipeline, both loaded in half precision from safetensors files.
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16, use_safetensors=True)
pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True)
# Half-precision weights are meant to run on the GPU, so move the pipeline
# there, as is done for the Canny ControlNet pipeline earlier in the notebook.
pipe = pipe.to("cuda")

In [None]:
def make_inpaint_condition(image, image_mask):
  """Build the control tensor for ControlNet inpainting.

  Args:
    image: Object whose ``convert("RGB")`` result converts to an
      ``(H, W, 3)`` array (e.g. a PIL image).
    image_mask: Object whose ``convert("L")`` result converts to an
      ``(H, W)`` array with the same height and width as ``image``.

  Returns:
    A float32 ``torch.Tensor`` of shape ``(1, 3, H, W)`` holding the image
    values divided by 255, in which every pixel whose scaled mask value
    exceeds 0.5 is replaced by -1.0 in all channels.

  Raises:
    ValueError: If the image and the mask differ in height or width.
  """
  pixels = np.array(image.convert("RGB")).astype(np.float32) / 255.0
  mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0

  # Fail early with a clear message; otherwise the boolean indexing below
  # raises an opaque numpy IndexError on mismatched sizes.
  if pixels.shape[:2] != mask.shape[:2]:
    raise ValueError(
      f"image and image_mask must have the same size, got "
      f"{pixels.shape[:2]} and {mask.shape[:2]}"
    )

  # -1.0 marks the pixels selected by the mask.
  pixels[mask > 0.5] = -1.0
  # Add a leading batch axis and reorder (1, H, W, C) -> (1, C, H, W).
  pixels = np.expand_dims(pixels, 0).transpose(0, 3, 1, 2)
  return torch.from_numpy(pixels)

# Build the control tensor that is passed to the inpainting pipeline.
# NOTE(review): init_image and mask_image are not assigned in any visible
# cell; they must be defined before this line runs. TODO confirm where they
# are loaded.
control_image = make_inpaint_condition(init_image, mask_image)

In [None]:
# Run the inpainting pipeline on the source image, its mask and the control
# tensor, then keep the first image of the returned result.
inpaint_prompt = "The head of the mona lisa in the same style and quality as the original mona lisa with a clear smile and a slightly smaller head size"
inpaint_result = pipe(
  inpaint_prompt,
  image=init_image,
  mask_image=mask_image,
  control_image=control_image,
  num_inference_steps=40,
  eta=1.0,
)
output = inpaint_result.images[0]