In [5]:
from datasets import load_dataset
from transformers import AutoProcessor
from mantis.models.conversation import conv_mllava_v1
import numpy as np
from tqdm import tqdm
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
# subsets = ["llava_665k_multi", "nlvr2", "birds-to-words", "contrastive_caption", "dreamsim", "nextqa", "star", "spot-the-diff", "visual_story_telling", "lrv", "coinstruct", "dvqa", "docvqa", "chartqa"]
subsets = ["birds-to-words"]
default_conv = conv_mllava_v1.copy()
def get_conv_len(item):
    default_conv.messages = []
    roles = {"human": default_conv.roles[0], "gpt": default_conv.roles[1], "user": default_conv.roles[0], "assistant": default_conv.roles[1]}
    default_conv.messages = []
    source_key = "conversation" if "conversation" in item else "conversations"
    for j, sentence in enumerate(item[source_key]):
        role = roles[sentence.get("from", sentence.get("role"))]
        # assert role == default_conv.roles[j % 2], f"Role mismatch: {role} != {default_conv.roles[j % 2]}, {j}"
        default_conv.append_message(role, sentence.get("content", sentence.get("text", sentence.get("value", ""))))
    return len(processor.tokenizer.encode(default_conv.get_prompt()))

def get_question_len(question:str):
    return len(processor.tokenizer.encode(question))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
for subset in subsets:
    print(f"------ {subset} ------")
    dataset = load_dataset("TIGER-Lab/Mantis-Instruct", subset, split='train')
    lengths = [get_conv_len(item) for item in tqdm(dataset)]
    source_key = "conversation" if "conversation" in dataset[0] else "conversations"
    print(f"# Examples: {len(lengths)}")
    print(f"# Average images: {np.mean([len(item['images']) for item in dataset])}")
    print(f"# Max images: {np.max([len(item['images']) for item in dataset])}")
    print(f"# Avg Turns: {np.mean([len(item[source_key]) for item in dataset])}")
    print(f"# Max Turns: {np.max([len(item[source_key]) for item in dataset])}")
    print(f"# Avg Text Length: {np.mean(lengths)}")
    print(f"# Avg Length with image: {np.mean([lengths[i] + 576 * len(dataset[i]['images']) for i in range(len(lengths))])}")
    

In [2]:
"""
------ nlvr2 ------
/home/aiops/jiangdf/miniconda3/envs/miqa/lib/python3.9/site-packages/datasets/load.py:1454: FutureWarning: The repository for TIGER-Lab/Mantis-Instruct contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/TIGER-Lab/Mantis-Instruct
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
  warnings.warn(
100%|██████████████████████████████████████████████████████| 86373/86373 [00:21<00:00, 3952.76it/s]
# Examples: 86373
# Average images: 2.0
# Max images: 2
# Avg Turns: 2.0
# Max Turns: 2
# Avg Text Length: 104.96493117062045
# Avg Length with image: 1256.9649311706205
------ birds-to-words ------
/home/aiops/jiangdf/miniconda3/envs/miqa/lib/python3.9/site-packages/datasets/load.py:1454: FutureWarning: The repository for TIGER-Lab/Mantis-Instruct contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/TIGER-Lab/Mantis-Instruct
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
  warnings.warn(
100%|████████████████████████████████████████████████████████| 2649/2649 [00:00<00:00, 3447.90it/s]
# Examples: 2649
# Average images: 2.0
# Max images: 2
# Avg Turns: 2.0
# Max Turns: 2
# Avg Text Length: 100.68742921857304
# Avg Length with image: 1252.687429218573
------ contrastive_caption ------
100%|███████████████████████████████████████████████████████| 35984/35984 [01:03<00:00, 565.94it/s]
# Examples: 35984
# Average images: 3.812083148065807
# Max images: 8
# Avg Turns: 7.624166296131614
# Max Turns: 16
# Avg Text Length: 871.1644341929747
# Avg Length with image: 3066.9243274788796
------ dreamsim ------
100%|██████████████████████████████████████████████████████| 15941/15941 [00:04<00:00, 3404.04it/s]
# Examples: 15941
# Average images: 3.0
# Max images: 3
# Avg Turns: 2.0
# Max Turns: 2
# Avg Text Length: 102.54607615582461
# Avg Length with image: 1830.5460761558245
------ nextqa ------
100%|█████████████████████████████████████████████████████████| 3870/3870 [00:04<00:00, 935.30it/s]
# Examples: 3870
# Average images: 8.0
# Max images: 8
# Avg Turns: 17.639276485788113
# Max Turns: 38
# Avg Text Length: 572.2147286821705
# Avg Length with image: 5180.214728682171
------ star ------
100%|█████████████████████████████████████████████████████████| 3032/3032 [00:04<00:00, 680.98it/s]
# Examples: 3032
# Average images: 8.0
# Max images: 8
# Avg Turns: 30.1655672823219
# Max Turns: 232
# Avg Text Length: 961.259234828496
# Avg Length with image: 5569.259234828496
------ spot-the-diff ------
100%|████████████████████████████████████████████████████████| 8007/8007 [00:02<00:00, 2916.66it/s]
# Examples: 8007
# Average images: 2.0
# Max images: 2
# Avg Turns: 3.993755463969027
# Max Turns: 28
# Avg Text Length: 120.73398276508055
# Avg Length with image: 1272.7339827650806
------ visual_story_telling ------
100%|█████████████████████████████████████████████████████████| 6661/6661 [00:08<00:00, 803.68it/s]
# Examples: 6661
# Average images: 20.32772856928389
# Max images: 50
# Avg Turns: 9.709653205224441
# Max Turns: 20
# Avg Text Length: 529.7932742831407
# Avg Length with image: 12238.564930190661
------ lrv ------
100%|█████████████████████████████████████████████████████████| 8453/8453 [00:35<00:00, 237.13it/s]
# Examples: 8453
# Average images: 3.5011238613509996
# Max images: 9
# Avg Turns: 83.33656689932569
# Max Turns: 154
# Avg Text Length: 2234.463622382586
# Avg Length with image: 4251.110966520762
------ coinstruct ------
100%|████████████████████████████████████████████████████| 150918/150918 [01:48<00:00, 1395.81it/s]
# Examples: 150918
# Average images: 2.266906532024013
# Max images: 4
# Avg Turns: 7.4451026385189305
# Max Turns: 58
# Avg Text Length: 313.8353542983607
# Avg Length with image: 1619.5735167441921
------ dvqa ------
100%|████████████████████████████████████████████████████| 200000/200000 [02:23<00:00, 1390.70it/s]
# Examples: 200000
# Average images: 1.0
# Max images: 1
# Avg Turns: 23.25316
# Max Turns: 40
# Avg Text Length: 304.306895
# Avg Length with image: 880.306895
------ docvqa ------
100%|██████████████████████████████████████████████████████| 39463/39463 [00:07<00:00, 5190.59it/s]
# Examples: 39463
# Average images: 1.0
# Max images: 1
# Avg Turns: 2.0
# Max Turns: 2
# Avg Text Length: 61.71502419988344
# Avg Length with image: 637.7150241998835
------ chartqa ------
100%|██████████████████████████████████████████████████████| 28299/28299 [00:05<00:00, 5104.47it/s]
# Examples: 28299
# Average images: 1.0
# Max images: 1
# Avg Turns: 2.0
# Max Turns: 2
# Avg Text Length: 66.11519841690519
# Avg Length with image: 642.1151984169052
------ llava_665k_multi ------
# Examples: 312611
# Average images: 1.9994018124762085
# Max images: 4
# Avg Turns: 21.44195821644152
# Max Turns: 260
# Avg Text Length: 558.4972793663691
# Avg Length with image: 1710.1527233526651
------ imagecode ------
# Examples: 16594
# Average images: 10.0
# Max images: 10
# Avg Turns: 2.0
# Max Turns: 2
# Avg Text Length: 125.86386645775582
# Avg Length with image: 5885.863866457756
------ multi_vqa ------
# Examples: 4993
# Average images: 4.019026637292209
# Max images: 6
# Avg Turns: 19.709593430803125
# Max Turns: 22
# Avg Text Length: 1102.1363909473262
# Avg Length with image: 3417.0957340276386
------ iconqa ------
# Examples: 64462
# Average images: 2.401430300021718
# Max images: 6
# Avg Turns: 2.0
# Max Turns: 2
# Avg Text Length: 70.62872700195464
# Avg Length with image: 1453.8525798144644
"""

------ llava_665k_merged ------


100%|██████████| 312611/312611 [04:33<00:00, 1144.46it/s]


# Examples: 312611
# Average images: 1.9994018124762085
# Max images: 4
# Avg Turns: 21.44195821644152
# Max Turns: 260
# Avg Text Length: 558.4972793663691
# Avg Length with image: 1710.1527233526651


In [8]:
eval_subsets = ["mantis_eval", "birds-to-words", "nlvr2"]
from datasets import load_dataset
for subset in eval_subsets:
    print(f"------ {subset} ------")
    dataset = load_dataset("Mantis-VL/MIQA_eval", subset, split='test')
    lengths = [get_question_len(item["question"]) for item in tqdm(dataset)]
    print(f"# Examples: {len(lengths)}")
    print(f"# Average images: {np.mean([len(item['images']) for item in dataset])}")
    print(f"# Max images: {np.max([len(item['images']) for item in dataset])}")
    print(f"# Avg Text Length: {np.mean(lengths)}")
    print(f"# Avg Length with image: {np.mean([lengths[i] + 576 * len(dataset[i]['images']) for i in range(len(lengths))])}")

------ human_eval ------


100%|██████████| 217/217 [00:16<00:00, 13.03it/s]


# Examples: 217
# Average images: 2.488479262672811
# Max images: 5
# Avg Text Length: 21.967741935483872
# Avg Length with image: 1455.331797235023
------ birds-to-words ------


100%|██████████| 337/337 [00:06<00:00, 54.66it/s]


# Examples: 337
# Average images: 2.0
# Max images: 2
# Avg Text Length: 13.02373887240356
# Avg Length with image: 1165.0237388724036
------ nlvr2 ------


100%|██████████| 6967/6967 [08:00<00:00, 14.50it/s]


# Examples: 6967
# Average images: 2.0
# Max images: 2
# Avg Text Length: 37.554901679345484
# Avg Length with image: 1189.5549016793454


In [6]:
from datasets import get_dataset_config_names, load_dataset
from tqdm import tqdm
configs = get_dataset_config_names("BLINK-Benchmark/BLINK")
config_items = {}
for config in configs:
    config_items[config] = []
    config_dataset = load_dataset("BLINK-Benchmark/BLINK", config, split='val')
    for item in tqdm(config_dataset, desc=config):
        num_of_images = len([item[x] for x in [f"image_{i}" for i in range(1, 5)] if item[x] is not None])
        config_items[config].append({
            "text_length": get_question_len(item["question"]),
            "num_of_images": num_of_images
        })
    print(f"------ {config} ------")
    print(f"# Examples: {len(config_items[config])}")
    print(f"# Average images: {np.mean([item['num_of_images'] for item in config_items[config]])}")
    print(f"# Max images: {np.max([item['num_of_images'] for item in config_items[config]])}")
    print(f"# Avg Text Length: {np.mean([item['text_length'] for item in config_items[config]])}")
    print(f"# Avg Length with image: {np.mean([item['text_length'] + 576 * item['num_of_images'] for item in config_items[config]])}")

# all
all_items = [item for config in config_items for item in config_items[config]]
print(f"------ all ------")
print(f"# Examples: {len(all_items)}")
print(f"# Average images: {np.mean([item['num_of_images'] for item in all_items])}")
print(f"# Max images: {np.max([item['num_of_images'] for item in all_items])}")
print(f"# Avg Text Length: {np.mean([item['text_length'] for item in all_items])}")
print(f"# Avg Length with image: {np.mean([item['text_length'] + 576 * item['num_of_images'] for item in all_items])}")



Art_Style: 100%|██████████| 117/117 [00:05<00:00, 22.22it/s]


------ Art_Style ------
# Examples: 117
# Average images: 3.0
# Max images: 3
# Avg Text Length: 12.0
# Avg Length with image: 1740.0


Counting: 100%|██████████| 120/120 [00:00<00:00, 709.38it/s]


------ Counting ------
# Examples: 120
# Average images: 1.0
# Max images: 1
# Avg Text Length: 11.466666666666667
# Avg Length with image: 587.4666666666667


Forensic_Detection: 100%|██████████| 132/132 [00:00<00:00, 155.73it/s]


------ Forensic_Detection ------
# Examples: 132
# Average images: 4.0
# Max images: 4
# Avg Text Length: 12.0
# Avg Length with image: 2316.0


Functional_Correspondence: 100%|██████████| 130/130 [00:01<00:00, 100.89it/s]


------ Functional_Correspondence ------
# Examples: 130
# Average images: 2.0
# Max images: 2
# Avg Text Length: 10.0
# Avg Length with image: 1162.0


IQ_Test: 100%|██████████| 150/150 [00:00<00:00, 570.49it/s]


------ IQ_Test ------
# Examples: 150
# Average images: 1.0
# Max images: 1
# Avg Text Length: 16.1
# Avg Length with image: 592.1


Jigsaw: 100%|██████████| 150/150 [00:00<00:00, 685.78it/s]


------ Jigsaw ------
# Examples: 150
# Average images: 3.0
# Max images: 3
# Avg Text Length: 12.0
# Avg Length with image: 1740.0


Multi-view_Reasoning: 100%|██████████| 133/133 [00:00<00:00, 286.95it/s]


------ Multi-view_Reasoning ------
# Examples: 133
# Average images: 2.0
# Max images: 2
# Avg Text Length: 46.0
# Avg Length with image: 1198.0


Object_Localization: 100%|██████████| 122/122 [00:00<00:00, 439.72it/s]


------ Object_Localization ------
# Examples: 122
# Average images: 1.0
# Max images: 1
# Avg Text Length: 18.270491803278688
# Avg Length with image: 594.2704918032787


Relative_Depth: 100%|██████████| 124/124 [00:00<00:00, 714.60it/s]


------ Relative_Depth ------
# Examples: 124
# Average images: 1.0
# Max images: 1
# Avg Text Length: 9.0
# Avg Length with image: 585.0


Relative_Reflectance: 100%|██████████| 134/134 [00:00<00:00, 199.53it/s]


------ Relative_Reflectance ------
# Examples: 134
# Average images: 1.0
# Max images: 1
# Avg Text Length: 33.0
# Avg Length with image: 609.0


Semantic_Correspondence: 100%|██████████| 139/139 [00:01<00:00, 87.57it/s]


------ Semantic_Correspondence ------
# Examples: 139
# Average images: 2.0
# Max images: 2
# Avg Text Length: 10.0
# Avg Length with image: 1162.0


Spatial_Relation: 100%|██████████| 143/143 [00:00<00:00, 599.19it/s]


------ Spatial_Relation ------
# Examples: 143
# Average images: 1.0
# Max images: 1
# Avg Text Length: 10.258741258741258
# Avg Length with image: 586.2587412587412


Visual_Correspondence: 100%|██████████| 172/172 [00:02<00:00, 79.89it/s]


------ Visual_Correspondence ------
# Examples: 172
# Average images: 2.0
# Max images: 2
# Avg Text Length: 10.0
# Avg Length with image: 1162.0


Visual_Similarity: 100%|██████████| 135/135 [00:01<00:00, 114.35it/s]

------ Visual_Similarity ------
# Examples: 135
# Average images: 3.0
# Max images: 3
# Avg Text Length: 11.0
# Avg Length with image: 1739.0
------ all ------
# Examples: 1901
# Average images: 1.9331930562861652
# Max images: 4
# Avg Text Length: 15.689637033140452
# Avg Length with image: 1129.2088374539717





In [7]:
from pathlib import Path
# MVbench
import json
json_dir = Path("./mvbench/MVBench/json")
new_data = []
for json_file in json_dir.glob("*.json"):
    with open(json_file, "r") as f:
        subset_data = json.load(f)
    subset_new_data = []
    for item in tqdm(subset_data):
        subset_new_data.append({
            "text_length": get_question_len(item["question"]),
            "num_of_images": 8
        })
    print(f"------ {json_file} ------")
    print(f"# Examples: {len(subset_new_data)}")
    print(f"# Average images: {np.mean([item['num_of_images'] for item in subset_new_data])}")
    print(f"# Max images: {np.max([item['num_of_images'] for item in subset_new_data])}")
    print(f"# Avg Text Length: {np.mean([item['text_length'] for item in subset_new_data])}")
    print(f"# Avg Length with image: {np.mean([item['text_length'] + 576 * item['num_of_images'] for item in subset_new_data])}")
    new_data.extend(subset_new_data)
        
print(f"------ all ------")
print(f"# Examples: {len(new_data)}")
print(f"# Average images: {np.mean([item['num_of_images'] for item in new_data])}")
print(f"# Max images: {np.max([item['num_of_images'] for item in new_data])}")
print(f"# Avg Text Length: {np.mean([item['text_length'] for item in new_data])}")
print(f"# Avg Length with image: {np.mean([item['text_length'] + 576 * item['num_of_images'] for item in new_data])}")
    

100%|██████████| 200/200 [00:00<00:00, 16920.02it/s]


------ mvbench/MVBench/json/counterfactual_inference.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 11.665
# Avg Length with image: 4619.665


100%|██████████| 200/200 [00:00<00:00, 7711.04it/s]


------ mvbench/MVBench/json/egocentric_navigation.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 51.755
# Avg Length with image: 4659.755


100%|██████████| 200/200 [00:00<00:00, 24635.42it/s]


------ mvbench/MVBench/json/moving_count.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 10.16
# Avg Length with image: 4618.16


100%|██████████| 200/200 [00:00<00:00, 18131.65it/s]


------ mvbench/MVBench/json/object_shuffle.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 35.0
# Avg Length with image: 4643.0


100%|██████████| 200/200 [00:00<00:00, 24836.00it/s]


------ mvbench/MVBench/json/scene_transition.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 14.605
# Avg Length with image: 4622.605


100%|██████████| 200/200 [00:00<00:00, 30424.37it/s]


------ mvbench/MVBench/json/action_prediction.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 8.55
# Avg Length with image: 4616.55


100%|██████████| 200/200 [00:00<00:00, 14714.27it/s]


------ mvbench/MVBench/json/action_localization.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 21.135
# Avg Length with image: 4629.135


100%|██████████| 200/200 [00:00<00:00, 22692.15it/s]


------ mvbench/MVBench/json/state_change.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 20.07
# Avg Length with image: 4628.07


100%|██████████| 200/200 [00:00<00:00, 19344.19it/s]


------ mvbench/MVBench/json/moving_attribute.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 14.165
# Avg Length with image: 4622.165


100%|██████████| 200/200 [00:00<00:00, 25602.34it/s]


------ mvbench/MVBench/json/character_order.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 12.605
# Avg Length with image: 4620.605


100%|██████████| 200/200 [00:00<00:00, 23233.28it/s]


------ mvbench/MVBench/json/action_antonym.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 12.2
# Avg Length with image: 4620.2


100%|██████████| 200/200 [00:00<00:00, 15901.07it/s]


------ mvbench/MVBench/json/episodic_reasoning.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 18.78
# Avg Length with image: 4626.78


100%|██████████| 200/200 [00:00<00:00, 27622.27it/s]


------ mvbench/MVBench/json/fine_grained_pose.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 11.83
# Avg Length with image: 4619.83


100%|██████████| 200/200 [00:00<00:00, 17592.45it/s]


------ mvbench/MVBench/json/unexpected_action.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 14.795
# Avg Length with image: 4622.795


100%|██████████| 200/200 [00:00<00:00, 22657.83it/s]


------ mvbench/MVBench/json/object_existence.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 11.315
# Avg Length with image: 4619.315


100%|██████████| 200/200 [00:00<00:00, 23588.68it/s]


------ mvbench/MVBench/json/action_sequence.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 11.62
# Avg Length with image: 4619.62


100%|██████████| 200/200 [00:00<00:00, 23490.26it/s]


------ mvbench/MVBench/json/action_count.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 15.625
# Avg Length with image: 4623.625


100%|██████████| 200/200 [00:00<00:00, 27097.61it/s]


------ mvbench/MVBench/json/fine_grained_action.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 12.5
# Avg Length with image: 4620.5


100%|██████████| 200/200 [00:00<00:00, 22409.06it/s]


------ mvbench/MVBench/json/moving_direction.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 14.455
# Avg Length with image: 4622.455


100%|██████████| 200/200 [00:00<00:00, 28078.08it/s]

------ mvbench/MVBench/json/object_interaction.json ------
# Examples: 200
# Average images: 8.0
# Max images: 8
# Avg Text Length: 9.775
# Avg Length with image: 4617.775
------ all ------
# Examples: 4000
# Average images: 8.0
# Max images: 8
# Avg Text Length: 16.63025
# Avg Length with image: 4624.63025





In [8]:
# Qbench
import json
with open('./qbench2/data/q-bench2-a1-single-dev.json') as f:
    data = json.load(f)
for item in data:
    item["text_length"] = get_question_len(item["question"])
    item["num_of_images"] = len(item["images"])
print(f"------ Qbench A1-pair-dev ------")
print(f"# Examples: {len(data)}")
print(f"# Average images: {np.mean([item['num_of_images'] for item in data])}")
print(f"# Max images: {np.max([item['num_of_images'] for item in data])}")
print(f"# Avg Text Length: {np.mean([item['text_length'] for item in data])}")
print(f"# Avg Length with image: {np.mean([item['text_length'] + 576 * item['num_of_images'] for item in data])}")

------ Qbench A1-pair-dev ------
# Examples: 1000
# Average images: 2.0
# Max images: 2
# Avg Text Length: 14.836
# Avg Length with image: 1166.836
