In [11]:
from datasets import load_dataset
import os

import uuid
from tqdm import tqdm
import json

In [12]:
def make_id():
    return str(uuid.uuid4())

def make_conversation(id,image_path,question,answer,image_folder_root=None):
    if image_folder_root is not None:
        image_path = os.path.join(image_folder_root, image_path)
    return_data =   {
        "id": id,
        "image": image_path,
        "conversations": [
        {
            "from": "human",
            "value": f"<image>\n{question}"
        },
        {
            "from": "gpt",
            "value": answer
        },
        ]
    }
    return return_data


def make_question(caption):
    return  f"Provide the bounding box coordinate of the region this sentence describes: {caption}"

def bbox_absolute_to_relative(absolute_bbox, image_width_height):
    width, height = image_width_height
    x1 = absolute_bbox[0] / width
    y1 = absolute_bbox[1] / height
    x2 = absolute_bbox[2] / width
    y2 = absolute_bbox[3] / height
    relative_bbox = [x1, y1, x2, y2]
    return relative_bbox

def make_answer(bbox, image_width_height):
    relative_bbox = bbox_absolute_to_relative(bbox, image_width_height)
    relative_bbox = [f"{coord:.3f}" for coord in relative_bbox]
    
    return f"[{relative_bbox[0]},{relative_bbox[1]},{relative_bbox[2]},{relative_bbox[3]}]"

In [13]:
data_split = "testB"

dataset_root_dir = "/data_ssd/huggingface_dataset"

cache_dir = "/data_ssd/huggingface_cache"
save_json_path = f"/data_ssd/refcoco/refcoco-{data_split}_llava-onevision.json"

dataset_id = os.path.join(dataset_root_dir,"jxu124/refcoco")

dataset = load_dataset(dataset_id, cache_dir=cache_dir, split=data_split)

In [14]:
print(dataset[0])
for key, value in dataset[0].items():
    print(f"{key}: {value}")

{'sent_ids': [71, 72, 73], 'file_name': 'COCO_train2014_000000581563_3.jpg', 'ann_id': 1345868, 'ref_id': 25, 'image_id': 581563, 'split': 'testB', 'sentences': [{'raw': 'lower left corner darkness', 'sent': 'lower left corner darkness', 'sent_id': 71, 'tokens': ['lower', 'left', 'corner', 'darkness']}, {'raw': 'bpttom left dark', 'sent': 'bpttom left dark', 'sent_id': 72, 'tokens': ['bpttom', 'left', 'dark']}, {'raw': 'black van in front of cab', 'sent': 'black van in front of cab', 'sent_id': 73, 'tokens': ['black', 'van', 'in', 'front', 'of', 'cab']}], 'category_id': 3, 'raw_anns': '{"segmentation": [[59.15, 500.0, 0.48, 500.0, 0.0, 375.26, 96.66, 373.89, 104.85, 380.71, 117.13, 384.81, 127.36, 395.04, 134.86, 403.91, 137.59, 410.04, 112.41, 433.82, 108.59, 435.48, 105.11, 441.62, 103.45, 447.26, 95.99, 447.09, 62.98, 457.54]], "area": 12101.243650000002, "iscrowd": 0, "image_id": 581563, "bbox": [0.0, 373.89, 137.59, 126.11], "category_id": 3, "id": 1345868}', 'raw_image_info': '{"

In [15]:
converted_data = []

for sample in tqdm(dataset):
    id = make_id()

    raw_image_info = json.loads(sample["raw_image_info"])
    image_file_name = raw_image_info["file_name"]
    original_image_width_height = (raw_image_info["width"], raw_image_info["height"])
    image_path = os.path.join("mscoco2014/train2014",image_file_name)
    bbox = sample["bbox"]
    caption_list = [sentences["raw"] for sentences in sample["sentences"]]
    
    for caption in caption_list:
        question = make_question(caption)
        answer = make_answer(bbox, original_image_width_height)
        conversation = make_conversation(id,image_path,question,answer)
        converted_data.append(conversation)


100%|██████████| 1810/1810 [00:00<00:00, 2452.03it/s]


In [16]:
print("len(converted_data):",len(converted_data))
for i in range(5):
    for key, value in converted_data[i].items():
        print(f"{key}: {value}")

len(converted_data): 5095
id: 939e6612-a879-4546-92df-54c01a92d081
image: mscoco2014/train2014/COCO_train2014_000000581563.jpg
conversations: [{'from': 'human', 'value': '<image>\nProvide the bounding box coordinate of the region this sentence describes: lower left corner darkness'}, {'from': 'gpt', 'value': '[0.000,0.748,0.413,1.000]'}]
id: 939e6612-a879-4546-92df-54c01a92d081
image: mscoco2014/train2014/COCO_train2014_000000581563.jpg
conversations: [{'from': 'human', 'value': '<image>\nProvide the bounding box coordinate of the region this sentence describes: bpttom left dark'}, {'from': 'gpt', 'value': '[0.000,0.748,0.413,1.000]'}]
id: 939e6612-a879-4546-92df-54c01a92d081
image: mscoco2014/train2014/COCO_train2014_000000581563.jpg
conversations: [{'from': 'human', 'value': '<image>\nProvide the bounding box coordinate of the region this sentence describes: black van in front of cab'}, {'from': 'gpt', 'value': '[0.000,0.748,0.413,1.000]'}]
id: ea2317a9-a06c-4270-b745-c5092800d0b4
im

In [17]:
with open(save_json_path, "w") as f:
    json.dump(converted_data, f, indent=4, ensure_ascii=False)

In [18]:
from datasets import load_dataset
import os

import uuid
from tqdm import tqdm
import json

In [19]:
with open(save_json_path, "r") as f:
    loaded_data = json.load(f)
print("len(loaded_data):",len(loaded_data))

len(loaded_data): 5095


In [20]:

image_folder_root = "/data_ssd"

exist_image_count = 0
for i in tqdm(range(len(loaded_data))):
    image_file_name = loaded_data[i]["image"]
    image_path = os.path.join(image_folder_root,image_file_name)
    if not os.path.exists(image_path):
        print(f"Image file does not exist: {image_path}")
        continue
    exist_image_count += 1
print("exist_image_count:", exist_image_count)
print("non_exist_image_count:", len(loaded_data) - exist_image_count)
print("exist_image_count / len(loaded_data):", exist_image_count / len(loaded_data))

100%|██████████| 5095/5095 [00:00<00:00, 24212.09it/s]

exist_image_count: 5095
non_exist_image_count: 0
exist_image_count / len(loaded_data): 1.0





In [None]:
import imgviz
import numpy as np
import matplotlib.pyplot as plt
import regex as re
from PIL import Image

def visualize_bbox(image, bbox, caption):
    bboxes = np.array([bbox[1],bbox[0],bbox[3],bbox[2]]).astype(np.int32).reshape(-1, 4)
    labels = [2]
    image = imgviz.instances2rgb(np.array(image), bboxes=bboxes, labels=labels,captions=[caption],font_size=16)

    plt.imshow(image)
    plt.show()
    
def extract_bbox_from_text(ans):
    pattern = re.compile(r'\[(((0|1)\.(\d){3}\,){3}((0|1)\.(\d){3}))\]')
    match_list = pattern.findall(ans)

    if len(match_list) > 0:
        answer = [list(map(float,match[0].split(","))) for match in match_list]
    else:
        answer = "FAILED"
    return answer

def bbox_relative_to_absolute(relative_bbox, image_width_height):
    width, height = image_width_height
    x1 = relative_bbox[0] * width
    y1 = relative_bbox[1] * height
    x2 = relative_bbox[2] * width
    y2 = relative_bbox[3] * height
    absolute_bbox = [x1, y1, x2, y2]
    return absolute_bbox



sample_data = loaded_data[-500]

image_path = sample_data["image"]
image_path = os.path.join(image_folder_root,image_path)
image = Image.open(image_path)
original_image_width_height = (image.width, image.height)

answer = sample_data["conversations"][1]["value"]
relative_bbox = extract_bbox_from_text(answer)[0]
absolute_bbox = bbox_relative_to_absolute(relative_bbox, original_image_width_height)

print(sample_data["conversations"][0]["value"])
visualize_bbox(image, absolute_bbox, sample_data["conversations"][0]["value"])
