In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

In [12]:
# Build one train/validation split of a COCO-style annotation file.
# Annotations are stratified by bounding-area bucket and grouped by image, so
# all annotations of one image land on the same side of the split.
# Only the first of the 5 folds is written to disk.

# Directory holding the source annotation file and receiving both split files.
DATASET_DIR = '/data/ephemeral/home/dataset'

# JSON is read explicitly as UTF-8 instead of relying on the locale default.
with open(f'{DATASET_DIR}/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One row per annotation.
df = pd.DataFrame(data['annotations'])

# X is a placeholder: the splitter only needs its length, not its values.
X = np.ones((len(df), 1))
y_area = df['area'].values
# Stratification label from the annotation area:
#   0 -> area <= 10000, 1 -> 10000 < area <= 100000, 2 -> area > 100000
y = np.where(y_area <= 10000.0, 0, np.where(y_area <= 100000.0, 1, 2))

# Grouping key: every annotation of an image shares that image's id.
groups = df['image_id'].values

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=411)

# Only the first fold is exported, so take it directly from the generator.
train_idx, val_idx = next(cv.split(X, y, groups))

train_image_ids = np.unique(groups[train_idx])
val_image_ids = np.unique(groups[val_idx])

# Plain Python sets give O(1) membership tests; `x in ndarray` rescans the
# whole array for every image.
train_id_set = set(train_image_ids.tolist())
val_id_set = set(val_image_ids.tolist())
assert train_id_set.isdisjoint(val_id_set), "an image appears in both train and val"

train_annotations = df[df['image_id'].isin(train_image_ids)]
val_annotations = df[df['image_id'].isin(val_image_ids)]


def _build_coco_subset(image_id_set, annotations_df):
    """Return a COCO-style dict restricted to the given images.

    image_id_set: set of image ids to keep from data['images'].
    annotations_df: DataFrame of the annotation rows to export.
    The category list is copied over unchanged.
    """
    return {
        "images": [img for img in data['images'] if img['id'] in image_id_set],
        "annotations": annotations_df.to_dict(orient='records'),
        "categories": data['categories']
    }


train_coco_format = _build_coco_subset(train_id_set, train_annotations)
val_coco_format = _build_coco_subset(val_id_set, val_annotations)

print('train image: ' , len(train_coco_format['images']) , 'val image: ' , len(val_coco_format['images']))
print('train annotations: ' , len(train_coco_format['annotations']) , 'val annotations: ' , len(val_coco_format['annotations']))

with open(f'{DATASET_DIR}/train_split_box.json', 'w') as train_file:
    json.dump(train_coco_format, train_file)

with open(f'{DATASET_DIR}/val_split_box.json', 'w') as val_file:
    json.dump(val_coco_format, val_file)



train image:  3897 val image:  986
train annotations:  18444 val annotations:  4700


In [11]:
# Count how many annotations carry each stratification label (0, 1, 2) in `y`.
# minlength=3 guarantees three bins: a bare np.bincount(y) returns only
# max(y) + 1 entries, so counts[2] would raise IndexError if no label 2 exists.
counts = np.bincount(y, minlength=3)
# Print the results
print(f"0의 개수: {counts[0]}")
print(f"1의 개수: {counts[1]}")
print(f"2의 개수: {counts[2]}")

0의 개수: 5789
1의 개수: 10632
2의 개수: 6723
