In [1]:
import argparse
import os
import os.path as osp
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn
# import gradio as gr
from PIL import Image
# import base64
# from io import BytesIO

from click4caption.common.config import Config
from click4caption.common.dist_utils import get_rank
from click4caption.common.registry import registry
from click4caption.conversation.conversation import Chat, CONV_VISION

# imports modules for registration
from click4caption.datasets.builders import *
from click4caption.models import *
from click4caption.processors import *
from click4caption.runners import *
from click4caption.tasks import *


In [2]:
class Args:
    """Notebook stand-in for the demo's command-line arguments.

    Every setting is a keyword parameter whose default is the value the
    notebook previously hard-coded, so ``Args()`` behaves exactly as before
    while ``Args(gpu_id=0, temperature=0.2)`` overrides a setting up front
    instead of mutating the instance in a later cell.

    Args:
        cfg_path: Path of the evaluation YAML config handed to ``Config``.
        gpu_id: CUDA device index; the notebook formats it into ``'cuda:<id>'``.
        options: Config overrides; ``None`` means nothing is overridden.
        num_beams: Beam count forwarded to ``chat.answer``.
        temperature: Sampling temperature forwarded to ``chat.answer``.
        tl_x, tl_y, br_x, br_y: Top-left / bottom-right corner of the region
            of interest. -1 stands for using the whole figure.
        input_text: Default single-image prompt.
    """

    def __init__(
        self,
        cfg_path='eval_configs/click4caption_eval.yaml',
        gpu_id=1,
        options=None,
        num_beams=1,
        temperature=1.0,
        tl_x=-1,
        tl_y=-1,
        br_x=-1,
        br_y=-1,
        input_text='image[IMG] Tell me what it is and write a description for it.',
    ):
        self.cfg_path = cfg_path
        self.gpu_id = gpu_id
        self.options = options  # None: no config options are overridden in this context
        self.num_beams = num_beams
        self.temperature = temperature
        self.tl_x = tl_x  # -1 stands for using the whole figure
        self.tl_y = tl_y
        self.br_x = br_x
        self.br_y = br_y
        self.input_text = input_text

# Initialize
print('Initializing')
args = Args()

Initializing


In [3]:
# Load the bboxes?
# Placeholder region written onto the shared settings object in one statement:
# top-left corner (50, 100), bottom-right corner (200, 200).
# NOTE(review): the upload cells visible in this notebook read their boxes from
# .npy files rather than from these attributes -- confirm these values are still needed.
args.tl_x, args.tl_y, args.br_x, args.br_y = 50, 100, 200, 200

In [4]:

def setup_seeds(config):
    """Seed the Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    The seed is ``config.run_cfg.seed`` plus the value returned by ``get_rank()``.

    Args:
        config: Object exposing ``run_cfg.seed``.
    """
    seed = config.run_cfg.seed + get_rank()

    # Same seed for every generator, applied in the order: random, numpy, torch.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)

    # Turn off cuDNN benchmark mode and request deterministic behaviour.
    cudnn.benchmark, cudnn.deterministic = False, True


def upload_img(chat, gr_img, chat_state, tl_x, tl_y, br_x, br_y, img_list, model_image_size):
    """Register an image region with the chat and return a copy with the box drawn in red.

    Args:
        chat: Object whose ``upload_img`` method receives the image, the
            conversation state, the image list and the rescaled box corners.
        gr_img: Image exposing ``.size`` as ``(width, height)`` and ``.copy()``;
            the notebook passes a PIL image converted to RGB. The drawing step
            writes 3-value pixels, so a 3-channel image is assumed.
        chat_state: Conversation state; ``None`` starts from ``CONV_VISION.copy()``.
        tl_x, tl_y, br_x, br_y: Box corners in pixels of ``gr_img``. A negative
            value selects the matching image border (whole image when all four
            are negative). Non-integer values are truncated for drawing only.
        img_list: List handed to ``chat.upload_img``; ``None`` starts a new list.
        model_image_size: Side length the coordinates are rescaled to before
            they are handed to ``chat.upload_img``.

    Returns:
        ``(chat_state, img_list, image_with_box)`` where ``image_with_box`` is
        built with ``Image.fromarray`` from the annotated pixel array.
    """
    if chat_state is None:
        chat_state = CONV_VISION.copy()
    if img_list is None:
        img_list = []

    img_w, img_h = gr_img.size

    # process the coords: negative values fall back to the image borders
    tl_x = 0 if tl_x < 0 else tl_x
    tl_y = 0 if tl_y < 0 else tl_y
    br_x = img_w if br_x < 0 else br_x
    br_y = img_h if br_y < 0 else br_y
    x_scale = model_image_size / img_w
    y_scale = model_image_size / img_h

    # upload (the value returned by chat.upload_img is not used here)
    chat.upload_img(gr_img, chat_state, img_list, tl_x*x_scale, tl_y*y_scale, br_x*x_scale, br_y*y_scale)

    # draw bbox on img
    _img = np.array(gr_img.copy())
    # At least one pixel: the plain ratio truncates to 0 when width + height < 224,
    # which used to leave the box invisible on small images.
    line_width = max(1, int((img_w + img_h) / 224))
    # Integer pixel indices clamped to the image, so slice starts can never go
    # negative (a negative start would wrap around to the opposite border).
    left = min(max(int(tl_x), 0), img_w - 1)
    top = min(max(int(tl_y), 0), img_h - 1)
    right = min(max(int(br_x), 0), img_w - 1)
    bottom = min(max(int(br_y), 0), img_h - 1)
    red = np.array([255, 0, 0])
    # All four edges include both end points so the corners are closed.
    _img[top:top+line_width, left:right+1] = red
    _img[max(bottom-line_width+1, 0):bottom+1, left:right+1] = red
    _img[top:bottom+1, left:left+line_width] = red
    _img[top:bottom+1, max(right-line_width+1, 0):right+1] = red
    _img = Image.fromarray(_img)

    return chat_state, img_list, _img



In [5]:


# Build the run configuration from the notebook-level settings object.
cfg = Config(args)
# Single device string shared by the model and the chat wrapper.
cuda_device = f'cuda:{args.gpu_id}'

# Look up the model class named by the config, build it, move it to the
# selected GPU and put it in eval mode.
model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to(cuda_device).eval()

# The visual processor is built from the `train` entry of the cc_sbu_align dataset config.
vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
chat = Chat(model, vis_processor, device=cuda_device)
# Side length later passed to upload_img for rescaling box coordinates.
model_image_size = vis_processor_cfg.image_size
print('Initialization Finished')


==> model image_size=224
Loading VIT
==> release the ln_vision param
Loading VIT Done
Loading Q-Former
Loading Q-Former Done
Loading LLAMA

Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading LLAMA Done
==> Add pe in the image embs
==> Delete the vit CLS token when feeding in Qformer
==> Use vit multi-block feats: block [9, 19, 29, 38]
Load Model Checkpoint: cached_model/click4caption_13b.pth
Unexpected_keys: []
Initialization Finished


In [18]:
# Directory holding the test images and their bounding-box arrays.
# Defaults to the original location; set the CLICK4CAPTION_TEST_IMAGES
# environment variable to run the notebook against another copy of the data.
# Joining with '' guarantees a trailing separator, so plain
# `BASEPATH + filename` concatenation keeps working for any override.
BASEPATH = os.path.join(
    os.environ.get('CLICK4CAPTION_TEST_IMAGES')
    or '/proj/ecole/team/keegan.stoner/Click4Caption/test_images/',
    '',
)
image_1_filename = '001.jpg'
image_2_filename = '002.jpg'

# NumPy files with the candidate boxes of each image.
image_1_boxes_filename = 'unique_img_1_bboxes.npy'
image_2_boxes_filename = 'unique_img_2_bboxes.npy'


In [19]:
# image 1 bounds

# Paths are built with osp.join so they stay valid whether or not BASEPATH
# ends with a separator.
image_1_bboxes = np.load(osp.join(BASEPATH, image_1_boxes_filename))

# Only the first box is used; it is unpacked as (tl_x, tl_y, br_x, br_y) and
# truncated to whole pixels because upload_img slices the pixel array with it.
tl_x, tl_y, br_x, br_y = (int(coord) for coord in image_1_bboxes[0])


# upload image
# The context manager closes the file handle even if decoding fails; the
# converted RGB copy stays usable after the block.
with Image.open(osp.join(BASEPATH, image_1_filename)) as image_file:
    image = image_file.convert("RGB")
# chat_state=None and img_list=None start a fresh conversation and image list.
chat_state, img_list, img_with_bbox = upload_img(chat, image, None, tl_x, tl_y, br_x, br_y, None, model_image_size)
# `__file__` is not defined inside a notebook kernel, so the output directory is spelled out.
image_save_path = osp.join('/proj/ecole/team/keegan.stoner/Click4Caption', "img_with_bbox1.jpg")
print(f"saving the img with drawn bbox in {image_save_path}")
img_with_bbox.save(image_save_path)


input bbox:
[[102.42909240722656, 6.269720077514648], [146.21090698242188, 81.79135131835938]]
saving the img with drawn bbox in /proj/ecole/team/keegan.stoner/Click4Caption/img_with_bbox1.jpg


In [20]:
# image 2 bounds

# Paths are built with osp.join so they stay valid whether or not BASEPATH
# ends with a separator.
image_2_bboxes = np.load(osp.join(BASEPATH, image_2_boxes_filename))

# Only the first box is used; it is unpacked as (tl_x, tl_y, br_x, br_y) and
# truncated to whole pixels because upload_img slices the pixel array with it.
tl_x, tl_y, br_x, br_y = (int(coord) for coord in image_2_bboxes[0])


# upload image
# The context manager closes the file handle even if decoding fails; the
# converted RGB copy stays usable after the block.
with Image.open(osp.join(BASEPATH, image_2_filename)) as image_file:
    image = image_file.convert("RGB")
# Reuses chat_state and img_list from the previous upload so both images end up
# in the same conversation. Re-running this cell on its own uploads the image again.
chat_state, img_list, img_with_bbox = upload_img(chat, image, chat_state, tl_x, tl_y, br_x, br_y, img_list, model_image_size)
# `__file__` is not defined inside a notebook kernel, so the output directory is spelled out.
image_save_path = osp.join('/proj/ecole/team/keegan.stoner/Click4Caption', "img_with_bbox2.jpg")
print(f"saving the img with drawn bbox in {image_save_path}")
img_with_bbox.save(image_save_path)


input bbox:
[[155.9040069580078, 152.67752075195312], [213.2480010986328, 207.21824645996094]]
saving the img with drawn bbox in /proj/ecole/team/keegan.stoner/Click4Caption/img_with_bbox2.jpg


In [21]:
# Sanity check: number of entries accumulated in img_list and the shape of the first entry.
len(img_list), img_list[0].shape

(2, torch.Size([1, 32, 5120]))

In [28]:

# ask and answer
user_message = 'image 1[IMG] image 2[IMG] Compare this part of image 1 with the part in image 2. Explain how the two images are similar or different using attributes of the region specified. '
# user_message = 'image 1[IMG] image 2[IMG] Write a story that combines image 1 and image 2.'

# The previous check looked for the literal "image[IMG]", which the multi-image
# form "image 1[IMG] image 2[IMG]" never contains, so it warned on valid prompts.
# Compare the number of placeholders with the number of uploaded images instead.
# NOTE(review): presumably each [IMG] is filled from one img_list entry -- confirm against Chat.answer.
placeholder_count = user_message.count("[IMG]")
if placeholder_count != len(img_list):
    print(f"Warning: the input text has {placeholder_count} [IMG] placeholder(s) but {len(img_list)} image(s) were uploaded; "
          "we recommend to use format 'image[IMG] question' as input text")


# Append the user turn to the conversation state, then generate the reply.
chat.ask(user_message, chat_state)

# chat.answer returns a sequence; its first element is the reply text printed below.
llm_message = chat.answer(conv=chat_state,
                            img_list=img_list,
                            num_beams=args.num_beams,
                            temperature=args.temperature,
                            max_new_tokens=300,  # 800,
                            max_length=2000)[0]
print(f"=====LLM reply=====\n{llm_message}")


The whole input prompt: '### Human: image 1[IMG] image 2[IMG] Compare this part of image 1 with the part in image 2. Explain how the two images are similar or different using attributes of the region specified. \n### Assistant:'
**************************************************
=> LLM output w/ special tokens:
<unk>The silver spoon in Image 2 is a utensil used to eat food with. The spoon in Image 1 is similar in that it is also a silver spoon, but the shape of the bowl is different. In Image 1, the spoon has a larger bowl and tines that are longer and more pronounced, while the spoon in Image 2 has a smaller bowl and tines that are shorter and more rounded. The reflection on the spoon in Image 2 is also visible, which is not present in Image 1. Overall, the two images are similar in that they both depict silver spoons, but they differ in the shape and size of the spoons.###


=====LLM reply=====
The silver spoon in Image 2 is a utensil used to eat food with. The spoon in Image 1 is si

In [None]:
# Results

# It doesn't really focus on the boxes; it seems to look only at the whole image.
# It's also inaccurate about a lot of things, e.g. it claimed "both utensils have four prongs".

# Actually I have kind of focused this


# Based on LLaMA