## Load and Save a COCO Dataset

In [1]:
import json
import requests
import pandas as pd
from io import BytesIO
from zipfile import ZipFile
from coco_utilities import get_image_to_class_count_dataframe
#------- Config ------------
# LOAD_FILE = True  -> read the annotations from the local "full_coco_data.json" file.
# LOAD_FILE = False -> download the annotations archive instead (the save cell
#                      further down then writes "full_coco_data.json").
# Set to False if a full_coco_data file has not been downloaded yet.

LOAD_FILE = True

# SAMPLE_DATA = True  -> run the stratification on a sub-sample of the dataframe.
# SAMPLE_DATA = False -> run the stratification on the entire COCO dataset.
SAMPLE_DATA = True

# -----------------------------------

# Step 1: Load the COCO 2017 validation annotations, either from the local
# cache file or by downloading the annotations archive.
if LOAD_FILE:
    with open("full_coco_data.json",'rb') as f:
        data = json.load(f)
else:
    url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
    ann_file = "annotations/instances_val2017.json"
    # Without a timeout a stalled connection would hang the kernel indefinitely.
    r = requests.get(url, timeout=60)
    # Surface HTTP errors here rather than as a confusing BadZipFile below.
    r.raise_for_status()
    # Context managers make sure the archive and the member handle are closed.
    with ZipFile(BytesIO(r.content)) as z, z.open(ann_file) as ann_handle:
        data = json.load(ann_handle)

# Convert the raw COCO annotation dict into a dataframe using the project helper.
# Judging by the sample displayed below, each row is one image: an image_id
# column followed by one count column per class.
df = get_image_to_class_count_dataframe(data)

# Show sample (first rows only, so the output stays small)
df.head()


Unnamed: 0,image_id,person,bicycle,car,motorcycle,airplane,bus,train,truck,boat,...,toaster,sink,refrigerator,book,clock,vase,scissors,teddy bear,hair drier,toothbrush
0,397133,2,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,37777,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
2,252219,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,87038,14,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,174482,0,1,5,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Persist the downloaded annotations so that later runs can set LOAD_FILE = True
# and read them from disk instead of downloading again.
# NOTE: with 'Run All' in VS Code this cell might error saying 'LOAD_FILE' is not
# defined even though the first cell defines it; restarting the kernel and
# rerunning resolves it.
if not LOAD_FILE:
    with open("full_coco_data.json", mode='w') as cache_file:
        json.dump(data, cache_file, indent=4)

## Extract an 80/20 Train/Val Assignment

In [3]:
import pandas as pd
from coco_utilities import sub_sample_count_dataframe

if SAMPLE_DATA:
    # Sample data: shrink the problem to a fixed number of images and classes.
    n_images = 400
    n_classes = 10

    df_subsample = sub_sample_count_dataframe(df, n_images, n_classes)
else:
    # Use the whole dataframe; work on a copy so `df` itself stays untouched.
    df_subsample = df.copy()
    n_images = len(df_subsample)
    # One column is not a class (presumably image_id -- confirm), hence the -1.
    n_classes = len(df_subsample.columns) - 1


In [4]:

import numpy as np
from coco_utilities import get_dataframe_count_datastructures

# Unpack the helper's five return values; the count matrix and the index lookups
# are what the split and reconstruction cells below consume.
(
    class_counts_per_image,
    index_to_image_id,
    index_to_class_name,
    class_names,
    image_ids,
) = get_dataframe_count_datastructures(df_subsample)

# Quick look at the first row/column to confirm the structures line up.
print(f"W shape: {class_counts_per_image.shape}")
print(f"w_ic[0]: {class_counts_per_image[0]}")
print(f"image_id[0]: {index_to_image_id[0]}")
print(f"class[0]: {index_to_class_name[0]}")
print(f"w_ic[0][0] (num of '{index_to_class_name[0]}' in image {index_to_image_id[0]}): {class_counts_per_image[0][0]}")


W shape: (400, 10)
w_ic[0]: [0 0 0 0 0 0 6 0 0 0]
image_id[0]: 395180
class[0]: couch
w_ic[0][0] (num of 'couch' in image 395180): 0


In [5]:
from optimization_routine import stratified_k_way_split_respecting_partition

# Split weights, one entry per split (must sum to one).
split_ratios = [0.8, 0.2]
# Position i in split_ratios belongs to the split named assignment_index_to_name[i].
assignment_index_to_name = dict(enumerate(["train", "val"]))

assignments = stratified_k_way_split_respecting_partition(
    n_images,
    n_classes,
    class_counts_per_image,
    split_ratios,
)


In [6]:
from coco_utilities import construct_coco_split_from_assignments


# Turn the per-image assignments back into one COCO-style object per split name.
coco_per_assignment = construct_coco_split_from_assignments(
    data,
    assignments,
    index_to_image_id,
    assignment_index_to_name,
)

# Report how many images and annotations ended up in each split.
for name, coco_obj in coco_per_assignment.items():
    print(f"{name}: {len(coco_obj['images'])} images, {len(coco_obj['annotations'])} annotations")


train: 320 images, 2727 annotations
val: 80 images, 772 annotations


## Example 60/20/20 Train/Val/Test Split

In [7]:
# Split weights, one entry per split (must sum to one).
split_ratios = [0.6, 0.2, 0.2]
# Position i in split_ratios belongs to the split named assignment_index_to_name[i].
assignment_index_to_name = dict(enumerate(["train", "val", "test"]))

assignments = stratified_k_way_split_respecting_partition(
    n_images,
    n_classes,
    class_counts_per_image,
    split_ratios,
)


In [8]:
# Turn the three-way assignments back into one COCO-style object per split name.
coco_per_assignment = construct_coco_split_from_assignments(
    data,
    assignments,
    index_to_image_id,
    assignment_index_to_name,
)

# Report how many images and annotations ended up in each split.
for name, coco_obj in coco_per_assignment.items():
    print(f"{name}: {len(coco_obj['images'])} images, {len(coco_obj['annotations'])} annotations")


test: 80 images, 674 annotations
train: 240 images, 2107 annotations
val: 80 images, 718 annotations
