In [1]:
import json

def load_json(file_path):
    """
    Read a UTF-8 encoded JSON file and return the decoded Python object.

    Parameters:
        file_path (str): Location of the JSON file to read.

    Returns:
        The decoded top-level JSON value. This is whatever the file holds
        (a dict for a JSON object, a list for a JSON array, and so on).
    """
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        return json.load(json_file)
def save_json(data, file_path):
    """
    Serialize a JSON-compatible Python object to a UTF-8 encoded file.

    The output is pretty-printed with a 4-space indent and non-ASCII
    characters are written as-is rather than as \\uXXXX escapes.

    Parameters:
        data: The JSON-serializable object (dict, list, ...) to write.
        file_path (str): Destination path; an existing file is overwritten.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(file_path, mode='w', encoding='utf-8') as json_file:
        json.dump(data, json_file, **dump_options)

In [2]:
import os
import glob

json_root_dir = "/data_ssd/LLaVA-OneVision-Data"
# Match "<root>/<dataset dir>/<name>_checked_image_tag.json", exactly one level deep.
# (recursive=True only matters for "**" patterns, so it is not passed here.)
# glob returns matches in filesystem-dependent order; sort them so the dataset
# order, and therefore the seeded sampling further down, is reproducible.
path_list = sorted(glob.glob(os.path.join(json_root_dir, "*", "*_checked_image_tag.json")))
print(len(path_list))

91


In [3]:
# Dataset name = file name with everything from "_checked_image_tag" onward dropped.
dataset_name_list = []
for json_path in path_list:
    file_name = os.path.basename(json_path)
    dataset_name_list.append(file_name.partition("_checked_image_tag")[0])

In [4]:
print(dataset_name_list)
from tqdm import tqdm

# Load every dataset once, keeping both its records and its record count.
dataset_dict = {}
# zip() has no len(), so tqdm needs an explicit total to draw a real progress bar.
pair_num = min(len(dataset_name_list), len(path_list))
for dataset_name, path in tqdm(zip(dataset_name_list, path_list), total=pair_num):
    data = load_json(path)
    data_num = len(data)
    if dataset_name in dataset_dict:
        # Same name found in two directories: the later file replaces the earlier one.
        print(f"Warning: duplicate dataset name {dataset_name}, overwriting with: {path}")
    dataset_dict[dataset_name] = {"data": data, "data_num": data_num}
    if data_num == 0:
        print(f"Warning: {dataset_name} has no data, please check the path: {path}")

# Count from the final dict so an overwritten duplicate is not counted twice
# and the weights below always sum to 1.
total_data_num = sum(dataset_info["data_num"] for dataset_info in dataset_dict.values())
if dataset_dict and total_data_num == 0:
    raise ValueError("All loaded datasets are empty; cannot compute sampling weights.")

# Weight of a dataset = its share of all loaded records.
for dataset_name, dataset_info in dataset_dict.items():
    dataset_info["weight"] = dataset_info["data_num"] / total_data_num

['CLEVR-Math(MathV360K)', 'FigureQA(MathV360K)', 'GEOS(MathV360K)', 'GeoQA+(MathV360K)', 'Geometry3K(MathV360K)', 'IconQA(MathV360K)', 'MapQA(MathV360K)', 'PMC-VQA(MathV360K)', 'Super-CLEVR(MathV360K)', 'TabMWP(MathV360K)', 'UniGeo(MathV360K)', 'VisualWebInstruct(filtered)', 'VizWiz(MathV360K)', 'ai2d(cauldron,llava_format)', 'ai2d(gpt4v)', 'ai2d(internvl)', 'allava_instruct_laion4v', 'allava_instruct_vflan4v', 'aokvqa(cauldron,llava_format)', 'chart2text(cauldron)', 'chartqa(cauldron,llava_format)', 'chrome_writting', 'clevr(cauldron,llava_format)', 'diagram_image_to_text(cauldron)', 'dvqa(cauldron,llava_format)', 'figureqa(cauldron,llava_format)', 'geo170k(align)', 'geo170k(qa)', 'geo3k', 'geomverse(cauldron)', 'hateful_memes(cauldron,llava_format)', 'hitab(cauldron,llava_format)', 'hme100k', 'iam(cauldron)', 'iconqa(cauldron,llava_format)', 'iiit5k', 'image_textualization(filtered)', 'infographic(gpt4v)', 'infographic_vqa', 'infographic_vqa_llava_format', 'intergps(cauldron,llava_fo

2it [00:00, 12.84it/s]

91it [01:25,  1.07it/s]


In [5]:
# One summary line per dataset: record count and share of the total.
for dataset_name in dataset_dict:
    dataset_info = dataset_dict[dataset_name]
    data_num = dataset_info['data_num']
    weight = dataset_info['weight']
    print(f"{dataset_name}: {data_num} samples, weight: {weight:.4f}")

CLEVR-Math(MathV360K): 5280 samples, weight: 0.0012
FigureQA(MathV360K): 17587 samples, weight: 0.0042
GEOS(MathV360K): 498 samples, weight: 0.0001
GeoQA+(MathV360K): 17162 samples, weight: 0.0041
Geometry3K(MathV360K): 9724 samples, weight: 0.0023
IconQA(MathV360K): 22589 samples, weight: 0.0053
MapQA(MathV360K): 5225 samples, weight: 0.0012
PMC-VQA(MathV360K): 35948 samples, weight: 0.0085
Super-CLEVR(MathV360K): 8642 samples, weight: 0.0020
TabMWP(MathV360K): 22452 samples, weight: 0.0053
UniGeo(MathV360K): 11949 samples, weight: 0.0028
VisualWebInstruct(filtered): 263584 samples, weight: 0.0623
VizWiz(MathV360K): 6604 samples, weight: 0.0016
ai2d(cauldron,llava_format): 2429 samples, weight: 0.0006
ai2d(gpt4v): 4864 samples, weight: 0.0012
ai2d(internvl): 12403 samples, weight: 0.0029
allava_instruct_laion4v: 49990 samples, weight: 0.0118
allava_instruct_vflan4v: 19990 samples, weight: 0.0047
aokvqa(cauldron,llava_format): 16534 samples, weight: 0.0039
chart2text(cauldron): 26956 s

In [6]:
# Overall record count, then the size of the smallest dataset.
print(total_data_num)
smallest_data_num = min(dataset_info['data_num'] for dataset_info in dataset_dict.values())
print(f"min data num: {smallest_data_num}")

4229131
min data num: 295


# 単なる重み付サンプル

In [7]:
# Total number of records to draw across all datasets.
sample_data_num = 200_000

In [8]:
import random
random.seed(42)
from collections import Counter

# Parallel lists: dataset names and their selection weights, in dataset_dict order.
dataset_name_list = list(dataset_dict.keys())
weights = [dataset_info["weight"] for dataset_info in dataset_dict.values()]

# Draw one dataset name per requested sample, with replacement, proportionally to weight.
sample_dataset_iter = random.choices(dataset_name_list, weights=weights, k=sample_data_num)

In [9]:
# Tally how many draws landed on each dataset.
sample_num_counter = Counter(sample_dataset_iter)

drawn_total = len(sample_dataset_iter)
print(f"Sampled {drawn_total} samples from the datasets.")
# Number of distinct datasets that received at least one draw.
print(len(sample_num_counter))

Sampled 200000 samples from the datasets.
91


In [10]:
# Compare the drawn count of each dataset with its ideal (weight * budget) count,
# listed from most-drawn to least-drawn.
sample_num_dict = {}
for name, drawn_num in sample_num_counter.most_common():
    target_num = round(dataset_dict[name]["weight"] * sample_data_num)
    sample_num_dict[name] = {"num": drawn_num, "target_sample_num": target_num}

for name, counts in sample_num_dict.items():
    print(f"{name}: {counts['num']} samples, target: {counts['target_sample_num']}")

magpie_pro(qwen2_72b_st): 14237 samples, target: 14186
magpie_pro(l3_80b_st): 14185 samples, target: 14187
magpie_pro(l3_80b_mt): 14044 samples, target: 14187
VisualWebInstruct(filtered): 12530 samples, target: 12465
ureader_qa: 12100 samples, target: 11962
k12_printing: 11993 samples, target: 12137
dvqa(cauldron,llava_format): 9442 samples, target: 9458
vision_flan(filtered): 8753 samples, target: 8799
Evol-Instruct-GPT4-Turbo: 6702 samples, target: 6762
mavis_math_rule_geo: 4752 samples, target: 4729
figureqa(cauldron,llava_format): 4719 samples, target: 4729
image_textualization(filtered): 4672 samples, target: 4709
tallyqa(cauldron,llava_format): 4662 samples, target: 4666
ureader_cap: 4500 samples, target: 4324
mavis_math_metagen: 4190 samples, target: 4131
robut_wikisql(cauldron): 3500 samples, target: 3546
hme100k: 3498 samples, target: 3523
clevr(cauldron,llava_format): 3370 samples, target: 3310
geo170k(qa): 3209 samples, target: 3207
geo170k(align): 2767 samples, target: 2849

# 最低保証サンプル

In [11]:
import random
random.seed(42)
from collections import Counter

# Total sampling budget (50000 and 20000 were also tried).
sample_data_num = 200000
dataset_name_list = []
weights = []
import numpy as np
sample_num_per_dataset = []
for dataset_name, dataset_info in dataset_dict.items():
    dataset_name_list.append(dataset_name)
    # Proportional target for this dataset; a zero-weight dataset still gets 1.
    sample_num_per_dataset.append(round(dataset_info["weight"] * sample_data_num) if dataset_info["weight"] > 0 else 1 )

# Every dataset is guaranteed this many samples before the weighted draw.
min_sample_num = 10

for dataset_info in dataset_dict.values():
    # The weight for the remaining draws is the proportional target minus the
    # guaranteed floor. It is clamped at 0: with a small budget the target can be
    # below the floor, and random.choices assumes non-negative weights.
    target_num = round(dataset_info["weight"] * sample_data_num)
    weights.append(max(target_num - min_sample_num, 0))

In [12]:
# Sum of the rounded per-dataset targets (rounding can make it differ slightly from the budget).
rounded_target_total = np.sum(sample_num_per_dataset)
print(rounded_target_total)
print(f"Minimum sample number per dataset: {min_sample_num}")

200001
Minimum sample number per dataset: 10


In [13]:
# Start every dataset at the guaranteed minimum allocation.
sample_num_per_dataset = dict.fromkeys(dataset_dict, min_sample_num)
guaranteed_total = sum(sample_num_per_dataset.values())
print(f"Sampled {guaranteed_total} samples from the datasets.")

Sampled 910 samples from the datasets.


In [14]:
# Rebuild the guaranteed base allocation here so that re-running this cell
# does not add the weighted draws on top of a previous run's counts.
sample_num_per_dataset = {k: min_sample_num for k in dataset_dict.keys()}

# Budget left after every dataset has received its guaranteed minimum.
remaining_sample_num = sample_data_num - len(dataset_name_list) * min_sample_num

# Spread the remaining budget over the datasets according to `weights`.
sample_dataset_iter = random.choices(dataset_name_list, weights=weights, k=remaining_sample_num)
sample_num_counter = Counter(sample_dataset_iter)
for k, v in sample_num_counter.items():
    sample_num_per_dataset[k] += v

In [15]:
# Final allocation per dataset next to its purely proportional target.
for name, allocated_num in sample_num_per_dataset.items():
    target_num = round(dataset_dict[name]['weight'] * sample_data_num)
    print(f"{name}: {allocated_num} samples, target: {target_num}")

allocated_total = sum(sample_num_per_dataset.values())
print(f"Total sampled data number: {allocated_total}")

CLEVR-Math(MathV360K): 233 samples, target: 250
FigureQA(MathV360K): 840 samples, target: 832
GEOS(MathV360K): 22 samples, target: 24
GeoQA+(MathV360K): 795 samples, target: 812
Geometry3K(MathV360K): 460 samples, target: 460
IconQA(MathV360K): 1049 samples, target: 1068
MapQA(MathV360K): 214 samples, target: 247
PMC-VQA(MathV360K): 1721 samples, target: 1700
Super-CLEVR(MathV360K): 369 samples, target: 409
TabMWP(MathV360K): 1135 samples, target: 1062
UniGeo(MathV360K): 544 samples, target: 565
VisualWebInstruct(filtered): 12540 samples, target: 12465
VizWiz(MathV360K): 343 samples, target: 312
ai2d(cauldron,llava_format): 132 samples, target: 115
ai2d(gpt4v): 240 samples, target: 230
ai2d(internvl): 613 samples, target: 587
allava_instruct_laion4v: 2338 samples, target: 2364
allava_instruct_vflan4v: 991 samples, target: 945
aokvqa(cauldron,llava_format): 754 samples, target: 782
chart2text(cauldron): 1288 samples, target: 1275
chartqa(cauldron,llava_format): 875 samples, target: 864


# 実際にサンプル

In [16]:
# Draw the allocated number of records (without replacement) from each dataset
# and pool them into a single list.
save_json_data = []

for dataset_name, sample_num in sample_num_per_dataset.items():
    data = dataset_dict[dataset_name]["data"]
    available_num = len(data)
    if available_num < sample_num:
        # Cannot draw more than the dataset holds; take everything instead.
        print(f"Warning: {dataset_name} has only {len(data)} samples, but requested {sample_num} samples.")
        sample_num = available_num
    save_json_data += random.sample(data, sample_num)

print(f"Total sampled data number: {len(save_json_data)}")

Total sampled data number: 200000


In [17]:
image_folder_root = "/data_ssd/llava-onevision-data-symbolic-link"
# Paths that were referenced by a sampled item but not found on disk.
missing_image_paths = []
for item in tqdm(save_json_data):
    if "image" in item:
        # "image" may be a single path or a list of paths; normalize to a list.
        image_list = item["image"] if isinstance(item["image"], list) else [item["image"]]
        image_path_list = [os.path.join(image_folder_root, img) for img in image_list]

        for image_path in image_path_list:
            if not os.path.exists(image_path):
                # Report only: the item is kept in save_json_data.
                print(f"Warning: Image path {image_path} does not exist.")
                missing_image_paths.append(image_path)

        # Total number of "<image>" placeholders across all conversation turns.
        image_count = sum(
            conversation["value"].count("<image>") for conversation in item["conversations"]
        )

        if image_count != len(image_list):
            # Placeholder count and image count disagree: show the first image and
            # stop, leaving `item` / `image_count` set to the offending record so
            # it can be inspected in the following cells.
            print(image_list[0])
            break
        
        

100%|██████████| 200000/200000 [00:19<00:00, 10020.50it/s] 


In [18]:
# Inspect the "<image>" placeholder count left over from the last item processed by the validation loop.
print(image_count)

1


In [19]:
# Dump every field of the last item visited by the validation loop.
for field_name in item:
    field_value = item[field_name]
    print(f"{field_name}: {field_value}")

id: 2-16406152-2
conversations: [{'from': 'human', 'value': '<image>\ninte award receive only 2 nomination, Yes or No?\nAnswer the question using a single word or phrase.'}, {'from': 'gpt', 'value': 'No'}]
data_source: ureader_qa
image: LLaVA-OneVision-Data/ureader_qa/ureader-instruction-1.0/DUE_Benchmark/TabFact/pngs/2-16406152-2.png


In [20]:
# Output file name embeds the sampling budget so different runs do not overwrite each other.
output_json_dir = "/data_ssd/LLaVA-OneVision-Data-Json"
output_json_name = f"llava-onevision-data_single-image_data_{sample_data_num}.json"
save_json_path = os.path.join(output_json_dir, output_json_name)

In [21]:
# Make sure the destination directory exists; open(..., 'w') does not create it.
save_json_dir = os.path.dirname(save_json_path)
if save_json_dir:
    os.makedirs(save_json_dir, exist_ok=True)
save_json(save_json_data, save_json_path)

In [22]:
# Read the file back to confirm it was written and is valid JSON.
loaded_data = load_json(save_json_path)
loaded_num = len(loaded_data)
print(f"Loaded {loaded_num} samples from {save_json_path}.")

Loaded 200000 samples from /data_ssd/LLaVA-OneVision-Data-Json/llava-onevision-data_single-image_data_200000.json.


In [23]:
# Show the first reloaded record as a quick check that the saved content looks right.
print(loaded_data[0])

{'id': 'identity_97897', 'conversations': [{'from': 'human', 'value': '<image>\nHint: Please answer the question and provide the final answer at the end.\nQuestion: How many cylinders are there in total?'}, {'from': 'gpt', 'value': 'The answer is 4'}], 'data_source': 'CLEVR-Math(MathV360K)', 'image': 'LLaVA-OneVision-Data/CLEVR-Math(MathV360K)/train/identity_97897.png'}
