In [None]:
# Code to build up a simple 'class => images' mapping, on top of which we
# create customized datasets such as 'dog vs not dog'.

import os
import csv
import shutil

def _copy_files(images, dstfolder):
    for image in images:
        if os.path.exists(image):
            shutil.copy(image, dstfolder)
        else:
            print ('Image not found: %s' % image)
    print ('%d images copied to %s' % (len(images), dstfolder))

# images folder => CSV file with the image labels for that folder
IMAGE_LABELS = {
    '../openimages_v4/training': '../openimages_v4/train-annotations-human-imagelabels-boxable.csv',
    '../openimages_v4/validation': '../openimages_v4/validation-annotations-human-imagelabels-boxable.csv',
    '../openimages_v4/testing': '../openimages_v4/test-annotations-human-imagelabels-boxable.csv',
}

label_names = {} # class label => name
name_labels = {} # class name => label
with open('../openimages_v4/class-descriptions-boxable.csv', mode='r') as infile:
    for row in csv.reader(infile):
        label_names[row[0]] = row[1]
        name_labels[row[1]] = row[0]

labelled_images = {} # label => images
# .items() rather than the Python 2-only .iteritems(), so this cell runs on
# both Python 2 and Python 3.
for folder, csvfile in IMAGE_LABELS.items():
    with open(csvfile, mode='r') as infile:
        reader = csv.reader(infile)
        next(reader, None) # ignore first row (no error if the file is empty)
        for row in reader:
            if row[3] != '1': # ignore negative labels
                continue
            # The first column is the image file name without its '.jpg' extension.
            labelled_images.setdefault(row[2], set()).add(os.path.join(folder, row[0] + '.jpg'))

# make a 'dog' vs 'not dog' dataset.

# 0) prepare: start from an empty output tree on every run.
if os.path.exists('./output_dnd'):
    shutil.rmtree('./output_dnd')
os.makedirs('./output_dnd/dog')
os.makedirs('./output_dnd/not_dog')

# 1) copy 'dog' images.
label = name_labels['Dog']
images = labelled_images[label]
print ('%d images found with \'Dog\'' % len(images))
_copy_files(images, './output_dnd/dog')

# 2) copy 'not dog' images.
# Per-class quota: the dog image count spread over all other classes, plus 12.
# '//' keeps this an integer on Python 3 as well; a float here would make the
# slice below raise TypeError.
# NOTE(review): the extra 12 presumably compensates for images that get
# filtered out or are missing on disk -- confirm.
avg_not_dog = len(images) // (len(label_names) - 1) + 12
for lb, imgs in labelled_images.items():
    if lb == label: # dog, already handled above
        continue
    # Drop images that also carry the dog label. Sort before truncating so the
    # "first n" selection is reproducible (set iteration order is arbitrary).
    imgs = sorted(img for img in imgs if img not in images)
    imgs = imgs[:avg_not_dog] # take first n images
    _copy_files(imgs, './output_dnd/not_dog')

print ('Done.')

In [None]:
# Code to create a label file for each image. All these label files will be
# saved in a 'labels' subfolder inside corresponding images folder.

import os
import csv
import shutil

# images folder => CSV file with the image labels for that folder.
# Built from (folder name, CSV file prefix) pairs under '../openimages_v4/'.
IMAGE_LABELS = dict(
    ('../openimages_v4/' + folder,
     '../openimages_v4/' + prefix + '-annotations-human-imagelabels-boxable.csv')
    for folder, prefix in (
        ('training', 'train'),
        ('validation', 'validation'),
        ('testing', 'test'),
    )
)

def _create_label_files(csvfile, imagesfolder):
    
    # image => label dictionary
    labels = {}
    with open(csvfile, mode='r') as infile:
        reader = csv.reader(infile)
        firstRow = True
        for rows in reader:
            if firstRow: # ignore first row
                firstRow = False
                continue
            if rows[3] != '1': # ignore negative labels
                continue    
            labels.setdefault(rows[0], []).append(rows[2])
    print ('Images with positive labels: %d' % len(labels))
    
    # labels will be saved to a separate folder
    outfolder = os.path.join(imagesfolder, 'labels')
    if os.path.exists(outfolder):
        shutil.rmtree(outfolder)
    os.makedirs(outfolder)

    # loop through all images
    images = [x for x in os.listdir(imagesfolder) if os.path.isfile(os.path.join(imagesfolder, x))]
    for index, image in enumerate(images):
        name = os.path.splitext(image)[0] # name without extension
        if name in labels:
            with open(os.path.join(outfolder, image + '.txt'), mode='wb') as outfile:
                for label in labels[name]:
                    outfile.write(label + '\n')        
        if index % 10000 == 0:
            print ('Progress: %d' % index)
    print ('Progress: %d' % len(images))

# Create the label files for every images folder listed in IMAGE_LABELS.
# .items() rather than the Python 2-only .iteritems(), so this cell runs on
# both Python 2 and Python 3.
for imagesfolder, csvfile in IMAGE_LABELS.items():
    _create_label_files(csvfile, imagesfolder)

print ('Done.')

In [None]:
# Code to create a small dataset from openimagesv4, the label files will
# be copied as well.

import os
import csv
import shutil

def _copy_files(srcfolder, dstfolder, filecount):
    if os.path.exists(dstfolder):
        shutil.rmtree(dstfolder)
    os.makedirs(dstfolder)
    os.makedirs(os.path.join(dstfolder, 'labels'))

    images = [x for x in os.listdir(srcfolder) if os.path.isfile(os.path.join(srcfolder, x))]
    images = images[:filecount]
    for image in images:
        shutil.copy(os.path.join(srcfolder, image), dstfolder)
        label = os.path.join(srcfolder, 'labels', image + '.txt')
        if os.path.exists(label):
            shutil.copy(label, os.path.join(dstfolder, 'labels'))
    print ('%d images copied to %s' % (len(images), dstfolder))

# copy files: (source folder, destination folder, number of images to take)
for src, dst, count in (
    ('../openimages_v4/training', './small_dataset/training', 5000),
    ('../openimages_v4/validation', './small_dataset/validation', 500),
    ('../openimages_v4/testing', './small_dataset/testing', 500),
):
    _copy_files(src, dst, count)

print ('Done.')