In [1]:
import json

def load_json(file_path):
    """
    Read the JSON document stored at ``file_path`` and return the parsed value.

    Parameters:
        file_path (str): Location of the JSON file; it is opened as UTF-8 text.

    Returns:
        The deserialized Python object. Its type mirrors the top-level JSON
        value (for example, a dict for a JSON object or a list for a JSON array).
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)
def save_json(data, file_path):
    """
    Serialize ``data`` as JSON and write it to ``file_path``.

    The destination is opened in write mode as UTF-8 text. Non-ASCII
    characters are written as-is instead of being escaped, and nested
    values are indented by four spaces.

    Parameters:
        data: The JSON-serializable object to write.
        file_path (str): The path where the JSON file will be written.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)

In [3]:
# Load the source annotation file that the following cells split into
# train/validation subsets. The location is a hardcoded absolute path.
json_path = "/data_ssd/mscoco-detection/original-split/original-train_bbox-UDLR_for-kosmos2_mscoco2017-detection.json"
json_data = load_json(json_path)

In [4]:
# Report how many top-level entries were loaded from the source file.
print(len(json_data))

117266


In [5]:
import random
from sklearn.model_selection import train_test_split
import numpy as np

# Seed the global RNGs of both the standard library and NumPy.
random.seed(42)
np.random.seed(42)

# Hold out a fixed number of entries for validation; everything else is
# counted as training data.
for_val_num = 5000
for_train_num = len(json_data) - for_val_num
print("for_train_num:", for_train_num)
print("for_val_num:", for_val_num)

for_train_num: 112266
for_val_num: 5000


In [6]:
# Split the loaded entries into a training part and a validation part of
# exactly `for_val_num` entries (an integer test_size is an absolute count).
# The RNG is pinned here instead of depending on global NumPy RNG state left
# by an earlier cell, so re-running this cell on its own yields the same
# partition every time.
train_dataset, val_dataset = train_test_split(
    json_data,
    test_size=for_val_num,
    random_state=42,
)

In [7]:
# Report the size of each subset.
print(f"train_dataset: {len(train_dataset)}")
print(f"val_dataset: {len(val_dataset)}")
# The two subsets together must account for every loaded entry.
assert len(train_dataset) + len(val_dataset) == len(json_data)

train_dataset: 112266
val_dataset: 5000


In [8]:
# Destination files for the two subsets (hardcoded absolute paths).
train_json_path = "/data_ssd/mscoco-detection/train_bbox-UDLR_for-kosmos2_mscoco2017-detection.json"
val_json_path = "/data_ssd/mscoco-detection/val_bbox-UDLR_for-kosmos2_mscoco2017-detection.json"
# Write each subset to its own JSON file; save_json opens the target in
# write mode, so an existing file at either path is replaced.
save_json(train_dataset, train_json_path)
save_json(val_dataset, val_json_path)

In [9]:
# Read both files back from disk to verify what was actually written.
train_loaded_data = load_json(train_json_path)
val_loaded_data = load_json(val_json_path)

# Cheap size checks first: they fail fast with an obvious cause.
assert len(train_loaded_data) == len(train_dataset)
assert len(val_loaded_data) == len(val_dataset)

# Matching lengths alone would not catch altered or reordered records, so
# compare the reloaded contents against the in-memory subsets as well.
# NOTE(review): assumes no record contains a NaN float (NaN != NaN would
# make this comparison fail even for a faithful round trip) - confirm.
assert train_loaded_data == train_dataset
assert val_loaded_data == val_dataset