In [10]:
import torch
from transformers import pipeline
# Pick the best available accelerator instead of hard-coding one, so the
# notebook runs unchanged on Apple silicon ("mps"), NVIDIA GPUs ("cuda")
# and CPU-only machines.
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"

IMAGE_URL = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"

# Step 1: predict a caption for the image
# using https://huggingface.co/nlpconnect/vit-gpt2-image-captioning
captioner = pipeline("image-to-text",
                     model="nlpconnect/vit-gpt2-image-captioning",
                     device=device_name)
# The pipeline accepts the image URL directly; the caption text is read from
# the "generated_text" field of the first result.
result = captioner(IMAGE_URL)
image_caption = result[0]["generated_text"]
print(f"The image shows: {image_caption}")

# Step 2: classify the caption (text, not the image itself) with a
# zero-shot classifier
# using https://huggingface.co/facebook/bart-large-mnli
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli",
                      device=device_name)
candidate_labels = ["potential fire",
                    "pothole",
                    "animals",
                    "traffic jam",
                    "broken streetlight",
                    "graffiti"]
# Misspelled legacy name, kept as an alias in case other cells still use it.
candidate_lablels = candidate_labels
candidates = classifier(image_caption, candidate_labels)
# The first entry of "labels" is taken as the predicted category.
top_label = candidates["labels"][0]
print(f"The category is: {top_label}")

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


The image shows: two birds are standing next to each other 
The category is: animals


In [14]:
from transformers import ViltProcessor, ViltForQuestionAnswering
import requests
from PIL import Image

# Build a follow-up question from the category predicted in the previous cell.
clarifying_question = f"How many {top_label} are there?"

# Download the image. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() surfaces HTTP errors here rather than as an
# obscure image-decoding failure further down.
response = requests.get(IMAGE_URL, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw).convert("RGB")

# Processor and model are loaded from the same checkpoint.
VQA_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"
processor = ViltProcessor.from_pretrained(VQA_CHECKPOINT)
model = ViltForQuestionAnswering.from_pretrained(VQA_CHECKPOINT)

# prepare inputs
encoding = processor(image, clarifying_question, return_tensors="pt")

# forward pass; inference only, so skip gradient tracking
with torch.no_grad():
    outputs = model(**encoding)
logits = outputs.logits
# Highest-scoring answer index, mapped back to its answer string.
idx = logits.argmax(-1).item()
print("Predicted answer:", model.config.id2label[idx])

Predicted answer: 2
