In [1]:
import json

def load_json(file_path):
    """
    Read a UTF-8 encoded JSON file and return the decoded Python object.

    Parameters:
        file_path (str): Location of the JSON file to read.

    Returns:
        The deserialized content of the file. The concrete type follows the
        file's top-level JSON value (for example a dict for an object or a
        list for an array).
    """
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        return json.load(json_file)
def save_json(data, file_path):
    """
    Serialize a JSON-compatible Python object to a UTF-8 encoded file.

    The output is indented by four spaces and non-ASCII characters are
    written as-is rather than as \\uXXXX escapes.

    Parameters:
        data: The object to serialize (for example a dict or a list).
        file_path (str): Destination path; an existing file is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)

In [2]:
# Load the full source annotation file into memory.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
json_path = "/data_ssd/object365/object365_train_conversation_exist_image.json"
json_data = load_json(json_path)

In [3]:
# Number of top-level entries in the loaded JSON.
print(len(json_data))

1742289


In [4]:
import random
from sklearn.model_selection import train_test_split
import numpy as np

# Seed Python's and NumPy's global random number generators.
random.seed(42)
np.random.seed(42)

# Split sizes: half of the entries (rounded down) go to detection and the
# remainder to counting, so counting receives the extra entry when the total
# is odd.
for_detection_num = len(json_data) // 2
for_counting_num = len(json_data) - for_detection_num
print("for_detection_num:", for_detection_num)
print("for_counting_num:", for_counting_num)

for_detection_num: 871144
for_counting_num: 871145


In [5]:
# Partition the entries into a detection subset and a counting subset.
# random_state is passed explicitly so the shuffle no longer depends on the
# global NumPy RNG state left behind by whichever cells happened to run
# earlier (or on how many times this cell has been re-run).
# NOTE(review): when the seeding cell (np.random.seed(42)) is run immediately
# before this one, this is expected to yield the same partition as the
# previous implicit-seed call — confirm before regenerating the saved files.
detection_dataset, counting_dataset = train_test_split(
    json_data,
    test_size=for_counting_num,
    random_state=42,
)

In [6]:
# Report how many entries ended up in each subset.
print(f"len(detection_dataset): {len(detection_dataset)}")
print(f"len(counting_dataset): {len(counting_dataset)}")

len(detection_dataset): 871144
len(counting_dataset): 871145


In [7]:
# Write each subset to its own JSON file.
# NOTE(review): absolute, machine-specific paths; existing files at these
# locations are overwritten (save_json opens them in 'w' mode).
detection_json_path = "/data_ssd/object365/detection_object365_for_llava-onevision.json"
counting_json_path = "/data_ssd/object365/counting_object365_for_llava-onevision.json"
save_json(detection_dataset, detection_json_path)
save_json(counting_dataset, counting_json_path)

In [8]:
# Read both files back from disk to confirm they were written completely.
detection_loaded_data = load_json(detection_json_path)
counting_loaded_data = load_json(counting_json_path)

# Fail loudly on a truncated or mismatched file instead of relying on someone
# comparing the printed numbers by eye.
assert len(detection_loaded_data) == len(detection_dataset), (
    "reloaded detection file does not match the in-memory split size"
)
assert len(counting_loaded_data) == len(counting_dataset), (
    "reloaded counting file does not match the in-memory split size"
)

print(f"Loaded detection dataset length: {len(detection_loaded_data)}")
print(f"Loaded counting dataset length: {len(counting_loaded_data)}")

Loaded detection dataset length: 871144
Loaded counting dataset length: 871145


In [9]:
# Spot-check the first record of the reloaded detection subset.
print(detection_loaded_data[0])

{'id': '82adbac3-1246-4eb8-85d9-206418cd3c86', 'image': 'objects365/train/patch23/objects365_v2_01190195.jpg', 'conversations': [{'from': 'human', 'value': '<image>\nPlease output bbox coordinates and names of every item in this image.'}, {'from': 'gpt', 'value': 'Pickup Truck: [0.000,0.397,0.282,0.682]\nCar: [0.380,0.518,0.559,0.717]\nSUV: [0.474,0.460,0.543,0.542]\nSUV: [0.381,0.431,0.460,0.514]\nTraffic Light: [0.533,0.354,0.541,0.377]\nTraffic Light: [0.486,0.342,0.494,0.371]\nCar: [0.637,0.480,0.681,0.506]\nTraffic Light: [0.641,0.331,0.649,0.356]\nStreet Lights: [0.543,0.324,0.580,0.480]\nCar: [0.359,0.428,0.395,0.457]\nTraffic Light: [0.302,0.394,0.306,0.411]\nTraffic Light: [0.208,0.296,0.217,0.330]\nCar: [0.598,0.450,0.621,0.466]\nTraffic Light: [0.667,0.430,0.675,0.451]\nTraffic Light: [0.655,0.415,0.666,0.449]\nPickup Truck: [0.744,0.513,0.989,0.803]\nCar: [0.956,0.499,1.000,0.523]\nCar: [0.461,0.429,0.496,0.475]\nCar: [0.499,0.441,0.540,0.459]\nCar: [0.516,0.454,0.538,0.479

In [None]:
import numpy as np


# Scratch check of what train_test_split returns for a single input array.
a = np.arange(10)
print(a)
# [0 1 2 3 4 5 6 7 8 9]

# No random_state is passed, so every call shuffles differently; the arrays
# shown below are one sample run, not a fixed result.
print(train_test_split(a))
# e.g. [array([3, 9, 6, 1, 5, 0, 7]), array([2, 8, 4])]

print(type(train_test_split(a)))
# <class 'list'>

# Two elements: the train part and the test part.
print(len(train_test_split(a)))
# 2

[0 1 2 3 4 5 6 7 8 9]
[array([1, 3, 7, 9, 0, 5, 2]), array([6, 4, 8])]
<class 'list'>
2
