In [2]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

In [5]:
# Read the complete COCO-style annotation file into memory.
annotation_path = '/data/ephemeral/home/dataset/train.json'
with open(annotation_path, 'r') as src:
    data = json.load(src)

# One DataFrame row per entry of the "annotations" list.
df = pd.DataFrame(data['annotations'])

# Splitter: 5 shuffled folds, seeded so repeated runs give the same split.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

# Inputs for the splitter.
#   X      - dummy feature matrix; only its length is relevant here
#   y      - per-annotation class label used for stratification
#   groups - per-annotation image id used as the grouping key
n_annotations = df.shape[0]
X = np.ones((n_annotations, 1))
y = df['category_id'].values
groups = df['image_id'].values

def _coco_subset(coco, annotations, image_ids):
    """Build a COCO-format dict restricted to the given image ids.

    Parameters
    ----------
    coco : dict
        Source dataset; its 'images' and 'categories' entries are read.
    annotations : pandas.DataFrame
        Annotation rows that belong to this subset.
    image_ids : array-like
        Ids of the images to keep.

    Returns
    -------
    dict
        Dict with 'images', 'annotations' and 'categories' keys.
    """
    # A set of plain Python values gives O(1) membership tests; testing
    # `x in ndarray` scans the whole array for every image.
    id_set = set(np.asarray(image_ids).tolist())
    return {
        "images": [img for img in coco['images'] if img['id'] in id_set],
        "annotations": annotations.to_dict(orient='records'),
        "categories": coco['categories'],
    }


# Only the first fold is used as the train/validation split.
train_idx, val_idx = next(iter(cv.split(X, y, groups)))

train_image_ids = np.unique(groups[train_idx])
val_image_ids = np.unique(groups[val_idx])

# Guard against leakage: no image may end up on both sides of the split.
assert np.intersect1d(train_image_ids, val_image_ids).size == 0, \
    "image ids overlap between train and val"

train_annotations = df[df['image_id'].isin(train_image_ids)]
val_annotations = df[df['image_id'].isin(val_image_ids)]

# NOTE: image ids are taken from the annotations, so an image without any
# annotation appears in neither output file.
train_coco_format = _coco_subset(data, train_annotations, train_image_ids)
val_coco_format = _coco_subset(data, val_annotations, val_image_ids)

print('train image: ' , len(train_coco_format['images']) , 'val image: ' , len(val_coco_format['images']))
print('train annotations: ' , len(train_coco_format['annotations']) , 'val annotations: ' , len(val_coco_format['annotations']))

with open('/data/ephemeral/home/dataset/train_split_class.json', 'w') as train_file:
    json.dump(train_coco_format, train_file)

with open('/data/ephemeral/home/dataset/val_split_class.json', 'w') as val_file:
    json.dump(val_coco_format, val_file)



train image:  3914 val image:  969
train annotations:  18633 val annotations:  4511
