# Prepare EPOXY data in COCO format


In [24]:
# https://github.com/akarazniewicz/cocosplit/blob/master/cocosplit.py
import json
import argparse
import funcy
from sklearn.model_selection import train_test_split
import shutil
import os 

def save_coco(file, info, licenses, categories, images, annotations):
    """Write a COCO-style dataset to ``file`` as UTF-8 encoded JSON.

    The output object has the top-level keys ``info``, ``licenses``,
    ``categories``, ``images`` and ``annotations`` (in that order), each
    holding the corresponding argument unchanged. The file is pretty-printed
    with a two-space indent and is overwritten if it already exists.
    """
    payload = {
        'info': info,
        'licenses': licenses,
        'categories': categories,
        'images': images,
        'annotations': annotations,
    }
    with open(file, 'wt', encoding='UTF-8') as out:
        json.dump(payload, out, indent=2, sort_keys=False)

def filter_annotations(annotations, images):
    """Return the annotations that belong to one of the given images.

    Args:
        annotations: iterable of annotation dicts, each with an ``image_id``
            entry convertible with ``int()``.
        images: iterable of image dicts, each with an ``id`` entry
            convertible with ``int()``.

    Returns:
        A new list holding, in their original order, the annotation dicts
        whose ``int(image_id)`` equals ``int(id)`` of some image.

    Raises:
        KeyError: if an image lacks ``id`` or an annotation lacks ``image_id``.
    """
    # A set gives constant-time membership tests; scanning a list of ids for
    # every annotation would be quadratic on large datasets.
    image_ids = {int(image['id']) for image in images}
    return [a for a in annotations if int(a['image_id']) in image_ids]


def copy_files(fns, src_dir, tgt_dir):
    """Copy each file named in ``fns`` from ``src_dir`` into ``tgt_dir``.

    Every name is joined onto both directories, so the file keeps its name
    in the target directory. ``shutil.copy2`` is used for the copy.
    """
    for name in fns:
        source_path = os.path.join(src_dir, name)
        target_path = os.path.join(tgt_dir, name)
        shutil.copy2(source_path, target_path)
        
def clean_annotation(annot):
    """Return stripped-down copies of the given annotation dicts.

    Each output dict carries over ``id``, ``image_id``, ``category_id``,
    ``bbox`` and ``iscrowd`` from the input entry and sets ``segmentation``
    to a new empty list; any other input keys are dropped. The input dicts
    are not modified. A ``KeyError`` is raised if a carried-over key is
    missing from an entry.
    """
    return [
        {
            "id": entry['id'],
            "image_id": entry['image_id'],
            "category_id": entry['category_id'],
            "bbox": entry['bbox'],
            "segmentation": [],
            "iscrowd": entry['iscrowd'],
        }
        for entry in annot
    ]

def clean_images_info(images):
    """Return stripped-down copies of the given image-info dicts.

    Each output dict keeps only ``id``, ``file_name``, ``width``, ``height``
    and ``product_id`` from the input entry; any other keys are dropped.
    The input dicts are not modified. A ``KeyError`` is raised if one of the
    kept keys is missing from an entry.
    """
    return [
        {
            "id": img['id'],
            "file_name": img['file_name'],
            "width": img['width'],
            "height": img['height'],
            "product_id": img['product_id'],
        }
        for img in images
    ]


# Inputs: the directory holding the images and the annotation file that
# describes all of them.
img_src_dir = '/mnt/sda1/intel_challenge/epoxy/defect_detection/img/'
fn = '/mnt/sda1/intel_challenge/epoxy/defect_detection/annotations_all.json'
#having_annotations = False 

# Outputs: one directory per split, created up front if missing.
train_dir, valid_dir, test_dir = (
    '/mnt/sda1/intel_challenge/epoxy/train',
    '/mnt/sda1/intel_challenge/epoxy/valid',
    '/mnt/sda1/intel_challenge/epoxy/test',
)
for _split_dir in (train_dir, valid_dir, test_dir):
    os.makedirs(_split_dir, exist_ok=True)

# Each split's annotation file lives inside that split's directory.
train_fn, valid_fn, test_fn = (
    os.path.join(train_dir, 'train.json'),
    os.path.join(valid_dir, 'valid.json'),
    os.path.join(test_dir, 'test.json'),
)


# Load the complete annotation file. It is read as UTF-8 explicitly, matching
# the encoding save_coco() uses when writing the per-split files, so the
# result does not depend on the platform's default encoding.
with open(fn, 'rt', encoding='UTF-8') as annot:
    coco = json.load(annot)

info = coco['info']
licenses = coco['licenses']
categories = coco['categories']
images = coco['images']
annotations = coco['annotations']

number_of_images = len(images)

# Optional filter (disabled): keep only images that have annotations.
#images_with_annotations = funcy.lmap(lambda a: int(a['image_id']), annotations)

#if having_annotations:
#    images = funcy.lremove(lambda i: i['id'] not in images_with_annotations, images)

# Split the image records (not the annotations) with a fixed seed so the
# partition is reproducible: 80% go to train, the rest is held out.
train, test = train_test_split(images, train_size=0.8, random_state= 42)
# NOTE(review): train_test_split returns the train_size portion first, so
# `test` receives about 5% of the held-out images (~1% of all images) and
# `valid` the remaining ~95% (~19% of all images) - confirm this is intended.
test, valid = train_test_split(test, train_size=0.05, random_state=42)

# Order each split by image id.
train = sorted(train, key = lambda i: i['id'])
valid = sorted(valid, key = lambda i: i['id'])
test  = sorted(test,  key = lambda i: i['id'])

# Write one COCO file per split: trimmed image records plus the trimmed
# annotations that reference those images.
save_coco(
    train_fn, info, licenses, categories,
    clean_images_info(train),
    clean_annotation(filter_annotations(annotations, train)),
)
save_coco(
    valid_fn, info, licenses, categories,
    clean_images_info(valid),
    clean_annotation(filter_annotations(annotations, valid)),
)
save_coco(
    test_fn, info, licenses, categories,
    clean_images_info(test),
    clean_annotation(filter_annotations(annotations, test)),
)


In [25]:
# Gather the distinct image file names of every split ...
train_imgs_fns = {img['file_name'] for img in train}
valid_imgs_fns = {img['file_name'] for img in valid}
test_imgs_fns = {img['file_name'] for img in test}

# ... and copy those images from the source directory into the split's folder.
copy_files(train_imgs_fns, img_src_dir, train_dir)
copy_files(valid_imgs_fns, img_src_dir, valid_dir)
copy_files(test_imgs_fns, img_src_dir, test_dir)