In [17]:
import json
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import StratifiedGroupKFold, train_test_split

In [18]:
# Data path
data_path = '/data/ephemeral/home/dataset/'

# Data load
with open(data_path + 'train.json', 'r') as f:
    data = json.load(f)

In [19]:
# Create a DataFrame from the JSON data
df = pd.DataFrame(data['annotations'])

In [23]:
# Get split output
def get_output(train_coco_format, val_coco_format, train_path, val_path):
    
    print('train image:', len(train_coco_format['images']), ', val image:', len(val_coco_format['images']))
    print('train annotations:', len(train_coco_format['annotations']), ', val annotations:', len(val_coco_format['annotations']))

    with open(train_path, 'w') as train_file:
        json.dump(train_coco_format, train_file)

    with open(val_path, 'w') as val_file:
        json.dump(val_coco_format, val_file)

In [21]:
# Split with category & bbox area

# Extract X, y, and groups
X = np.ones((len(df), 1))  # X is just placeholder data
y_area = df['area'].values
y_area_bins = np.digitize(y_area, bins=[10000.0, 100000.0])
y = df['category_id'].astype(str) + '_' + y_area_bins.astype(str)

groups = df['image_id'].values  # Groups

# StratifiedGroupKFold
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=411)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    if fold == 0:
        train_image_ids = np.unique(groups[train_idx])
        val_image_ids = np.unique(groups[val_idx])

        train_annotations = df[df['image_id'].isin(train_image_ids)]
        val_annotations = df[df['image_id'].isin(val_image_ids)]

        train_coco_format = {
            "images": [img for img in data['images'] if img['id'] in train_image_ids],
            "annotations": train_annotations.to_dict(orient='records'),
            "categories": data['categories']
        }

        val_coco_format = {
            "images": [img for img in data['images'] if img['id'] in val_image_ids],
            "annotations": val_annotations.to_dict(orient='records'),
            "categories": data['categories']
        }

        train_path = data_path + 'train_split.json'
        val_path = data_path + 'val_split.json'
        get_output(train_coco_format, val_coco_format, train_path, val_path)
        
        break

train image:  3909 , val image:  974
train annotations:  18642 , val annotations:  4502


In [22]:
# Split randomly

image_ids = np.array([img['id'] for img in data['images']])

# train_test_split을 이용해 train, validation 이미지 ID를 랜덤으로 나눔
train_image_ids_rand, val_image_ids_rand = train_test_split(image_ids, test_size=0.2, random_state=411)

# Annotations를 train, validation으로 나눔
train_annotations = [ann for ann in data['annotations'] if ann['image_id'] in train_image_ids_rand]
val_annotations = [ann for ann in data['annotations'] if ann['image_id'] in val_image_ids_rand]

train_coco_format = {
    "images": [img for img in data['images'] if img['id'] in train_image_ids_rand],
    "annotations": train_annotations,
    "categories": data['categories']
}

val_coco_format = {
    "images": [img for img in data['images'] if img['id'] in val_image_ids_rand],
    "annotations": val_annotations,
    "categories": data['categories']
}

train_path = data_path + 'train_split_random.json'
val_path = data_path + 'val_split_random.json'
get_output(train_coco_format, val_coco_format, train_path, val_path)

train image:  3906 , val image:  977
train annotations:  18653 , val annotations:  4491


In [25]:
# Check distribution
def get_distribution(y):
    y_distr = Counter(y)
    y_vals_sum = sum(y_distr.values())
    
    return [f'{y_distr[i]/y_vals_sum:.2%}'  for i in range(np.max(y) +1)]

In [26]:
# Check distribution of area in category & bbox dataset
y_area = df['area'].values
y_class = np.where(y_area <= 10000.0, 0, np.where(y_area <= 100000.0, 1, 2))
    
distrs = [get_distribution(y_class)]
index = ['training set']

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X,y, groups)):
    train_y, val_y = y_class[train_idx], y_class[val_idx]
    # train_gr, val_gr = groups[train_idx], groups[val_idx]
    
    # assert len(set(train_gr) & set(val_gr)) == 0
    
    distrs.append(get_distribution(train_y))
    distrs.append(get_distribution(val_y))
    
    index.append(f'train - fold{fold_ind}')
    index.append(f'val - fold{fold_ind}')
                 
categories = [d['name'] for d in data['categories']]

pd.DataFrame(distrs, index=index, columns = ['Area_1e4', 'Area_1e5', 'Area_over'])

Unnamed: 0,Area_1e4,Area_1e5,Area_over
training set,25.01%,45.94%,29.05%
train - fold0,24.77%,46.26%,28.97%
val - fold0,26.01%,44.62%,29.36%
train - fold1,25.23%,45.89%,28.89%
val - fold1,24.19%,46.13%,29.67%
train - fold2,25.16%,45.86%,28.98%
val - fold2,24.37%,46.29%,29.34%
train - fold3,24.97%,45.80%,29.23%
val - fold3,25.19%,46.46%,28.34%
train - fold4,24.93%,45.89%,29.18%


In [27]:
# Check distribution of category in category & bbox dataset

y_class = df['category_id'].values  # Labels
    
distrs = [get_distribution(y_class)]
index = ['training set']

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X,y, groups)):
    train_y, val_y = y_class[train_idx], y_class[val_idx]
    # train_gr, val_gr = groups[train_idx], groups[val_idx]
    
    # assert len(set(train_gr) & set(val_gr)) == 0
    
    distrs.append(get_distribution(train_y))
    distrs.append(get_distribution(val_y))
    
    index.append(f'train - fold{fold_ind}')
    index.append(f'val - fold{fold_ind}')
                 
categories = [d['name'] for d in data['categories']]

pd.DataFrame(distrs, index=index, columns = [categories[i] for i in range(np.max(y_class) + 1)])

Unnamed: 0,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing
training set,17.14%,27.45%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
train - fold0,17.01%,28.26%,3.54%,3.96%,4.45%,12.58%,5.49%,22.03%,0.70%,1.97%
val - fold0,17.66%,24.08%,5.29%,4.38%,3.38%,13.26%,5.31%,23.79%,0.62%,2.24%
train - fold1,17.11%,27.17%,4.05%,4.15%,3.92%,12.68%,5.79%,22.57%,0.55%,2.01%
val - fold1,17.25%,28.51%,3.21%,3.63%,5.48%,12.84%,4.19%,21.63%,1.21%,2.06%
train - fold2,16.95%,27.48%,4.07%,4.04%,4.13%,12.74%,5.33%,22.45%,0.77%,2.05%
val - fold2,17.95%,27.30%,3.06%,4.06%,4.74%,12.62%,5.99%,22.06%,0.32%,1.91%
train - fold3,17.33%,27.02%,3.84%,3.87%,4.35%,12.73%,5.52%,22.42%,0.80%,2.12%
val - fold3,16.36%,29.09%,4.02%,4.74%,3.83%,12.68%,5.23%,22.18%,0.23%,1.63%
train - fold4,17.29%,27.28%,3.89%,4.20%,4.37%,12.85%,5.16%,22.40%,0.60%,1.96%


In [28]:
# Check distribution of category in random dataset

y_class = df['category_id'].values  # Labels
    
distrs = [get_distribution(y_class)]
index = ['training set']

train_y, val_y = y_class[train_image_ids_rand], y_class[val_image_ids_rand]

distrs.append(get_distribution(train_y))
distrs.append(get_distribution(val_y))
    
index.append(f'train - rand')
index.append(f'val - rand')
                 
categories = [d['name'] for d in data['categories']]

pd.DataFrame(distrs, index=index, columns = [categories[i] for i in range(np.max(y_class) + 1)])

Unnamed: 0,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing
training set,17.14%,27.45%,3.88%,4.04%,4.24%,12.72%,5.46%,22.37%,0.69%,2.02%
train - rand,18.66%,25.96%,3.38%,4.94%,4.33%,12.29%,4.51%,23.02%,1.31%,1.61%
val - rand,16.89%,28.76%,3.38%,3.79%,3.07%,12.28%,4.50%,24.77%,0.92%,1.64%
