In [2]:
from PIL import Image
import requests
import torch

"""
Step 1: Load images
"""
demo_image_one = Image.open(
    requests.get(
        "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True
    ).raw
)

demo_image_two = Image.open(
    requests.get(
        "http://images.cocodataset.org/test-stuff2017/000000028137.jpg",
        stream=True
    ).raw
)

query_image = Image.open(
    requests.get(
        "http://images.cocodataset.org/test-stuff2017/000000028352.jpg", 
        stream=True
    ).raw
)


"""
Step 2: Preprocessing images
Details: For OpenFlamingo, we expect the image to be a torch tensor of shape 
 batch_size x num_media x num_frames x channels x height x width. 
 In this case batch_size = 1, num_media = 3, num_frames = 1,
 channels = 3, height = 224, width = 224.
"""
vision_x = [image_processor(demo_image_one).unsqueeze(0), image_processor(demo_image_two).unsqueeze(0), image_processor(query_image).unsqueeze(0)]
vision_x = torch.cat(vision_x, dim=0)
vision_x = vision_x.unsqueeze(1).unsqueeze(0)

"""
Step 3: Preprocessing text
Details: In the text we expect an <image> special token to indicate where an image is.
 We also expect an <|endofchunk|> special token to indicate the end of the text 
 portion associated with an image.
"""
tokenizer.padding_side = "left" # For generation padding tokens should be on the left
lang_x = tokenizer(
    ["<image>An image of two cats.<|endofchunk|><image>An image of a bathroom sink.<|endofchunk|><image>An image of"],
    return_tensors="pt",
)


"""
Step 4: Generate text
"""
generated_text = model.generate(
    vision_x=vision_x,
    lang_x=lang_x["input_ids"],
    attention_mask=lang_x["attention_mask"],
    max_new_tokens=20,
    num_beams=3,
)

print("Generated text: ", tokenizer.decode(generated_text[0]))

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>An image of two cats.<|endofchunk|><image>An image of a bathroom sink.<|endofchunk|><image>An image of a buffet table.<|endofchunk|>


In [4]:
from PIL import Image
import requests
import torch

"""
Step 1: Load images
"""
demo_image_one = Image.open(
    requests.get(
        "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True
    ).raw
)

demo_image_two = Image.open(
    requests.get(
        "http://images.cocodataset.org/test-stuff2017/000000028137.jpg",
        stream=True
    ).raw
)

query_image = Image.open(
    requests.get(
        "http://images.cocodataset.org/test-stuff2017/000000028352.jpg", 
        stream=True
    ).raw
)


"""
Step 2: Preprocessing images
Details: For OpenFlamingo, we expect the image to be a torch tensor of shape 
 batch_size x num_media x num_frames x channels x height x width. 
 In this case batch_size = 1, num_media = 3, num_frames = 1,
 channels = 3, height = 224, width = 224.
"""
vision_x = [image_processor(demo_image_one).unsqueeze(0), image_processor(demo_image_two).unsqueeze(0), image_processor(query_image).unsqueeze(0)]
vision_x = torch.cat(vision_x, dim=0)
vision_x = vision_x.unsqueeze(1).unsqueeze(0)

"""
Step 3: Preprocessing text
Details: In the text we expect an <image> special token to indicate where an image is.
 We also expect an <|endofchunk|> special token to indicate the end of the text 
 portion associated with an image.
"""
tokenizer.padding_side = "left" # For generation padding tokens should be on the left
lang_x = tokenizer(
    ["""
     Think step by step to answer the question.
     <image>
     Question: Is the cat above a mat?
    Program:
    BOX0=LOC(image=IMAGE,object='cat')
    IMAGE0=CROP_BELOW(image=IMAGE,box=BOX0)
    BOX1=LOC(image=IMAGE0,object='mat')
    ANSWER0=COUNT(box=BOX1)
    ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 and else 'no'")
    FINAL_RESULT=RESULT(var=ANSWER1)
    <|endofchunk|>
        <image>Question: Are there trains or fences in this scene?
    Program:
    BOX0=LOC(image=IMAGE,object='train')
    BOX1=LOC(image=IMAGE,object='fence')
    ANSWER0=COUNT(box=BOX0)
    ANSWER1=COUNT(box=BOX1)
    ANSWER2=EVAL(expr="'yes' if {ANSWER0} + {ANSWER1} > 0 else 'no'")
    FINAL_RESULT=RESULT(var=ANSWER)
    <image>Question: Which place is it?"""],
    return_tensors="pt",
)


"""
Step 4: Generate text
"""
generated_text = model.generate(
    vision_x=vision_x,
    lang_x=lang_x["input_ids"],
    attention_mask=lang_x["attention_mask"],
    max_new_tokens=100,
    num_beams=3,
)

print("Generated text: ", tokenizer.decode(generated_text[0]))

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  
     Think step by step to answer the question.
     <image>
     Question: Is the cat above a mat?
    Program:
    BOX0=LOC(image=IMAGE,object='cat')
    IMAGE0=CROP_BELOW(image=IMAGE,box=BOX0)
    BOX1=LOC(image=IMAGE0,object='mat')
    ANSWER0=COUNT(box=BOX1)
    ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 and else 'no'")
    FINAL_RESULT=RESULT(var=ANSWER1)
    <|endofchunk|>
        <image>Question: Are there trains or fences in this scene?
    Program:
    BOX0=LOC(image=IMAGE,object='train')
    BOX1=LOC(image=IMAGE,object='fence')
    ANSWER0=COUNT(box=BOX0)
    ANSWER1=COUNT(box=BOX1)
    ANSWER2=EVAL(expr="'yes' if {ANSWER0} + {ANSWER1} > 0 else 'no'")
    FINAL_RESULT=RESULT(var=ANSWER)
    <image>Question: Which place is it?
    Program:
    BOX0=LOC(image=IMAGE,object='place')
    BOX1=LOC(image=IMAGE,object='house')
    ANSWER0=COUNT(box=BOX0)
    ANSWER1=COUNT(box=BOX1)
    ANSWER2=EVAL(expr="'yes' if {ANSWER0} + {ANSWER1} > 0 else 'no'")
    FIN


In [3]:
from PIL import Image
import requests
import torch

"""
Step 1: Load images
"""
demo_image_one = Image.open(
    requests.get(
        "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True
    ).raw
)

demo_image_two = Image.open(
    requests.get(
        "http://images.cocodataset.org/test-stuff2017/000000028137.jpg",
        stream=True
    ).raw
)

query_image = Image.open(
    requests.get(
        "http://images.cocodataset.org/test-stuff2017/000000028352.jpg", 
        stream=True
    ).raw
)


"""
Step 2: Preprocessing images
Details: For OpenFlamingo, we expect the image to be a torch tensor of shape 
 batch_size x num_media x num_frames x channels x height x width. 
 In this case batch_size = 1, num_media = 3, num_frames = 1,
 channels = 3, height = 224, width = 224.
"""
vision_x = [image_processor(demo_image_one).unsqueeze(0), image_processor(demo_image_two).unsqueeze(0), image_processor(query_image).unsqueeze(0)]
vision_x = torch.cat(vision_x, dim=0)
vision_x = vision_x.unsqueeze(1).unsqueeze(0)

"""
Step 3: Preprocessing text
Details: In the text we expect an <image> special token to indicate where an image is.
 We also expect an <|endofchunk|> special token to indicate the end of the text 
 portion associated with an image.
"""
tokenizer.padding_side = "left" # For generation padding tokens should be on the left
lang_x = tokenizer(
    ["""
     Think step by step to answer the question.
     <image>
     Question: Is the cat above a mat?
    Program:
    BOX0=LOC(image=IMAGE,object='cat')
    IMAGE0=CROP_BELOW(image=IMAGE,box=BOX0)
    BOX1=LOC(image=IMAGE0,object='mat')
    ANSWER0=COUNT(box=BOX1)
    ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 and else 'no'")
    FINAL_RESULT=RESULT(var=ANSWER1)
    <|endofchunk|>
        <image>Question: Where is the towel?
    Program:
    BOX0=LOC(image=IMAGE,object='towel')
    ANSWER0=VQA(image=IMAGE,question='What is near the towel?')
    BOX1=LOC(image=IMAGE,object='{ANSWER0}')
    ANSWER2=VQA(image={BOX1},question='What is this?')
    FINAL_RESULT=RESULT(var=ANSWER2)
    <image>Question: What is this?"""],
    return_tensors="pt",
)


"""
Step 4: Generate text
"""
generated_text = model.generate(
    vision_x=vision_x,
    lang_x=lang_x["input_ids"],
    attention_mask=lang_x["attention_mask"],
    max_new_tokens=100,
    num_beams=3,
)

print("Generated text: ", tokenizer.decode(generated_text[0]))

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  
     Think step by step to answer the question.
     <image>
     Question: Is the cat above a mat?
    Program:
    BOX0=LOC(image=IMAGE,object='cat')
    IMAGE0=CROP_BELOW(image=IMAGE,box=BOX0)
    BOX1=LOC(image=IMAGE0,object='mat')
    ANSWER0=COUNT(box=BOX1)
    ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 and else 'no'")
    FINAL_RESULT=RESULT(var=ANSWER1)
    <|endofchunk|>
        <image>Question: Where is the towel?
    Program:
    BOX0=LOC(image=IMAGE,object='towel')
    ANSWER0=VQA(image=IMAGE,question='What is near the towel?')
    BOX1=LOC(image=IMAGE,object='{ANSWER0}')
    ANSWER2=VQA(image={BOX1},question='What is this?')
    FINAL_RESULT=RESULT(var=ANSWER2)
    <image>Question: What is this?
    Program:
    BOX0=LOC(image=IMAGE,object='{ANSWER0}')
    ANSWER0=VQA(image=IMAGE,question='What is this?')
    BOX1=LOC(image=IMAGE,object='{ANSWER1}')
    ANSWER2=VQA(image=IMAGE,question='What is this?')
    FINAL_RESULT=RESULT(var=ANSWER


In [25]:
GQA_CURATED_EXAMPLES=[
"""
Think step by step to answer the question.
<image>
Question: Is the vehicle in the top of the image?
Program:
BOX0=LOC(image=IMAGE,object='TOP')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='vehicle')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
FINAL_RESULT=RESULT(var=ANSWER1)
<|endofchunk|>
<image>
Question: Who is carrying the umbrella?
Program:
BOX0=LOC(image=IMAGE,object='umbrella')
IMAGE0=CROP(image=IMAGE,box=BOX0)
ANSWER0=VQA(image=IMAGE0,question='Who is carrying the umbrella?')
FINAL_RESULT=RESULT(var=ANSWER0)
<|endofchunk|>
<image>
Question: Which place is it?
Program:
ANSWER0=VQA(image=IMAGE,question='Which place is it?')
FINAL_RESULT=RESULT(var=ANSWER0)
<|endofchunk|>
<image>
Question: Is the pillow in the top part or in the bottom of the picture?
Program:
BOX0=LOC(image=IMAGE,object='TOP')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='pillow')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'top' if {ANSWER0} > 0 else 'bottom'")
FINAL_RESULT=RESULT(var=ANSWER1)
<|endofchunk|>
<image>
Question: Which side is the food on?
Program:
BOX0=LOC(image=IMAGE,object='RIGHT')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='food')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'right' if {ANSWER0} > 0 else 'left'")
FINAL_RESULT=RESULT(var=ANSWER1)
<|endofchunk|>
<image>
Question: Do the post and the sign have a different colors?
Program:
BOX0=LOC(image=IMAGE,object='post')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE,object='sign')
IMAGE1=CROP(image=IMAGE,box=BOX1)
ANSWER0=VQA(image=IMAGE0,question='What color is the post?')
ANSWER1=VQA(image=IMAGE1,question='What color is the sign?')
ANSWER2=EVAL(expr="'yes' if {ANSWER0} != {ANSWER1} else 'no'")
FINAL_RESULT=RESULT(var=ANSWER2)
<|endofchunk|>
<image>
Question: Does the traffic cone have white color?
Program:
BOX0=LOC(image=IMAGE,object='traffic cone')
IMAGE0=CROP(image=IMAGE,box=BOX0)
ANSWER0=VQA(image=IMAGE0,question='What color is the traffic cone?')
ANSWER1=EVAL(expr="'yes' if {ANSWER0} == 'white' else 'no'")
FINAL_RESULT=RESULT(var=ANSWER1)
<|endofchunk|>
<image>
Question: Are these animals of different species?
Program:
ANSWER0=VQA(image=IMAGE,question='Are these animals of different species?')
FINAL_RESULT=RESULT(var=ANSWER0)
<|endofchunk|>
<image>
Question: Which side of the image is the chair on?
Program:
BOX0=LOC(image=IMAGE,object='RIGHT')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='chair')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'right' if {ANSWER0} > 0 else 'left'")
FINAL_RESULT=RESULT(var=ANSWER1)
<|endofchunk|>
 <image>Question: What is this?
 Program:
""",
]

In [26]:
import os 
import sys 
from functools import partial

# sys.path.append('/home/user2/code/WIL_DeepLearningProject_2/VisualProgramming')

# from prompt.gqa import create_prompt

image_id = ['n437038', 'n153818', 'n542565', 'n428090', 'n435687', 'n167620', 'n70342', 'n418470', 'n77875']

vision_x = []

image_path = '/data2/NS/GQA/images/images/'
for i in image_id:
    demo_image = Image.open(os.path.join(image_path,i+".jpg")) 
    vision_x.append(image_processor(demo_image).unsqueeze(0))
    
query_image = Image.open(
    requests.get(
        "http://images.cocodataset.org/test-stuff2017/000000028352.jpg", 
        stream=True
    ).raw
)
vision_x.append(image_processor(query_image).unsqueeze(0))

vision_x = torch.cat(vision_x, dim=0)
vision_x = vision_x.unsqueeze(1).unsqueeze(0)
question = "What is this?"

tokenizer.padding_side = "left" # For generation padding tokens should be on the left
lang_x = tokenizer(
   GQA_CURATED_EXAMPLES,
    return_tensors="pt",
)



generated_text = model.generate(
    vision_x=vision_x,
    lang_x=lang_x["input_ids"],
    attention_mask=lang_x["attention_mask"],
    max_new_tokens=100,
    num_beams=3,
)

print("Generated text: ", tokenizer.decode(generated_text[0]))


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  
Think step by step to answer the question.
<image>
Question: Is the vehicle in the top of the image?
Program:
BOX0=LOC(image=IMAGE,object='TOP')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='vehicle')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
FINAL_RESULT=RESULT(var=ANSWER1)
<|endofchunk|>
<image>
Question: Who is carrying the umbrella?
Program:
BOX0=LOC(image=IMAGE,object='umbrella')
IMAGE0=CROP(image=IMAGE,box=BOX0)
ANSWER0=VQA(image=IMAGE0,question='Who is carrying the umbrella?')
FINAL_RESULT=RESULT(var=ANSWER0)
<|endofchunk|>
<image>
Question: Which place is it?
Program:
ANSWER0=VQA(image=IMAGE,question='Which place is it?')
FINAL_RESULT=RESULT(var=ANSWER0)
<|endofchunk|>
<image>
Question: Is the pillow in the top part or in the bottom of the picture?
Program:
BOX0=LOC(image=IMAGE,object='TOP')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='pillow')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'