In [None]:
# PROCESS FULL OPEN IMAGES V4 DATASET:
#
# Step #1 - parse the csv files and build a list of image ids for
# a specific class (e.g. dogs).

import os
import csv
import shutil

# Class name whose image ids are collected by this cell.
TARGET_LABEL = 'Dog' # modify this line

# Image-level label CSVs, one per dataset split.
IMAGE_LABELS = [
    '../openimages_v4/train-annotations-human-imagelabels.csv',
    '../openimages_v4/validation-annotations-human-imagelabels.csv',
    '../openimages_v4/test-annotations-human-imagelabels.csv',
]

# Two-way lookup built from the class description file
# (column 0 = class label, column 1 = class name).
label_names = {} # class label => name
name_labels = {} # class name => label
with open('../openimages_v4/class-descriptions.csv', mode='r') as infile:
    for record in csv.reader(infile):
        class_label, class_name = record[0], record[1]
        label_names[class_label] = class_name
        name_labels[class_name] = class_label

# Group image ids (column 0) by class label (column 2), keeping only
# rows whose column 3 equals '1', i.e. positive labels.
labelled_images = {} # label => images
for csvfile in IMAGE_LABELS:
    with open(csvfile, mode='r') as infile:
        reader = csv.reader(infile)
        next(reader, None) # ignore first row
        for record in reader:
            if record[3] == '1':
                labelled_images.setdefault(record[2], set()).add(record[0])

# Resolve the target class and report how many images carry it.
label = name_labels[TARGET_LABEL]
images = labelled_images[label]
print ('%d images found with \'%s\' label' % (len(images), TARGET_LABEL))


In [None]:
# PROCESS FULL OPEN IMAGES V4 DATASET:
#
# Step #2 - parse the csv files and build a list of image links for
# all matching image ids (e.g. dogs). Also, create a TSV file for use
# with Google Cloud Storage Transfer.

# Output URL list; relies on `images` and `TARGET_LABEL` from step #1.
TARGET_TSV_FILE = 'all-dog-images.tsv' # modify this line

# Per-split CSVs that map an image id (column 0) to its URL (column 2).
IMAGE_URLS = [
    '../openimages_v4/train-images-with-labels-with-rotation.csv',
    '../openimages_v4/validation-images-with-rotation.csv',
    '../openimages_v4/test-images-with-rotation.csv',
]

image_urls = {} # image id => url
# Write the TSV inside a `with` block so the handle is flushed and closed
# even when one of the CSV files is missing or contains a short row.
with open(TARGET_TSV_FILE, 'w') as tsvfile:
    tsvfile.write('TsvHttpData-1.0\n')
    for csvfile in IMAGE_URLS:
        with open(csvfile, mode='r') as infile:
            reader = csv.reader(infile)
            next(reader, None) # ignore first row
            for rows in reader:
                if rows[0] not in images:
                    continue
                image_urls[rows[0]] = rows[2]
                # Columns 8 and 9 follow the URL on each line -- presumably
                # the size and MD5 fields of the transfer list format;
                # TODO confirm against the CSV header.
                tsvfile.write('\t'.join([rows[2], rows[8], rows[9]]) + '\n')

print ('%d image urls found with \'%s\' label' % (len(image_urls), TARGET_LABEL))


In [None]:
# PROCESS FULL OPEN IMAGES V4 DATASET:
#
# Step #3 - process the downloaded images and reorganize them
# by their image ids.

# modify these lines
# modify these lines
DOWNLOADS_FOLDER = '/home/ubuntu/datasets/openimages_all_dogs/downloads'
PROCESSED_IMAGES = '/home/ubuntu/datasets/openimages_all_dogs/dog'

# shutil.copyfile does not create missing parent folders, so make sure the
# destination exists before copying anything into it.
if not os.path.exists(PROCESSED_IMAGES):
    os.makedirs(PROCESSED_IMAGES)

# `image_urls` (image id => url) comes from step #2. A downloaded file is
# looked up under DOWNLOADS_FOLDER at the URL with its scheme stripped.
# `.items()` is used because `.iteritems()` only exists on Python 2.
for k, v in image_urls.items():
    path = v.replace('http://', '').replace('https://', '')
    path = os.path.join(DOWNLOADS_FOLDER, path)
    if os.path.exists(path):
        # Store the image under its image id instead of its URL path.
        shutil.copyfile(path, os.path.join(PROCESSED_IMAGES, k + '.jpg'))
    else:
        print ('Image not found: %s' % k)
print ('Done.')


In [None]:
# PROCESS 601 CLASSES OPEN IMAGES V4 DATASET:
#
# Step #1 - build up a simple 'class => images' mapping and then we will
# create customized datasets like 'dog vs not dog' upon it.

import os
import csv
import shutil

def _copy_files(images, dstfolder):
    """Copy every existing file listed in `images` into `dstfolder`.

    `dstfolder` (including missing parents) is created when it does not
    exist yet. Paths that do not exist are reported on stdout and skipped.
    A summary line with the number of files that were actually copied is
    printed at the end.

    Args:
        images: iterable of source file paths (a list or a set both work).
        dstfolder: directory that receives the copies.
    """
    if not os.path.exists(dstfolder):
        os.makedirs(dstfolder)
    copied = 0
    for image in images:
        if os.path.exists(image):
            shutil.copy(image, dstfolder)
            copied += 1
        else:
            print ('Image not found: %s' % image)
    # Report real copies only; skipped paths were already listed above.
    print ('%d images copied to %s' % (copied, dstfolder))

# Image folder of each split => label CSV of that split.
IMAGE_LABELS = {
    '../openimages_v4/training': '../openimages_v4/train-annotations-human-imagelabels-boxable.csv',
    '../openimages_v4/validation': '../openimages_v4/validation-annotations-human-imagelabels-boxable.csv',
    '../openimages_v4/testing': '../openimages_v4/test-annotations-human-imagelabels-boxable.csv',
}

# Two-way lookup built from the class description file
# (column 0 = class label, column 1 = class name).
label_names = {} # class label => name
name_labels = {} # class name => label
with open('../openimages_v4/class-descriptions-boxable.csv', mode='r') as infile:
    reader = csv.reader(infile)
    for rows in reader:
        label_names[rows[0]] = rows[1]
        name_labels[rows[1]] = rows[0]

# Group image file paths by class label. Unlike the full-dataset cell, the
# values are paths ('<split folder>/<image id>.jpg'), not bare image ids.
labelled_images = {} # label => images
# `.items()` works on Python 2 and 3; `.iteritems()` is Python-2-only.
for folder, csvfile in IMAGE_LABELS.items():
    with open(csvfile, mode='r') as infile:
        reader = csv.reader(infile)
        next(reader, None) # ignore first row
        for rows in reader:
            if rows[3] != '1': # ignore negative labels
                continue
            labelled_images.setdefault(rows[2], set()).add(os.path.join(folder, rows[0] + '.jpg'))


In [None]:
# PROCESS 601 CLASSES OPEN IMAGES V4 DATASET:
#
# Step #2.1 - make a 'dog' vs 'not dog' dataset by using all
# reviewed dog images, and using class-weights when building the
# not dog images, and then split train/val using a pre-
# defined percentage.

import random
import math

# Output root and the fraction of every image list that goes to 'train'
# (the remainder goes to 'val').
DATASET_PATH = './output_dnd_69k_0918'
TRAIN_VAL_SPLIT = 0.8

# Start from an empty output folder.
if os.path.exists(DATASET_PATH):
    shutil.rmtree(DATASET_PATH)
os.makedirs(DATASET_PATH)

# 1) copy all reviewed 'dog' images.
total_dog_images = 0
dog_sources = [
    ('Google', './output_dnd_accepted/dog'),
    ('Kaggle', './output_dnd_accepted/kaggle_dog'),
]
for source_name, folder in dog_sources:
    images = [os.path.join(folder, x) for x in os.listdir(folder)]
    random.shuffle(images)
    print ('%d dog images in %s dataset.' % (len(images), source_name))
    split_at = int(len(images) * TRAIN_VAL_SPLIT)
    _copy_files(images[:split_at], os.path.join(DATASET_PATH, 'train', 'dog'))
    _copy_files(images[split_at:], os.path.join(DATASET_PATH, 'val', 'dog'))
    total_dog_images += len(images)

# 2) copy 'not dog' images.
# A weight is the share of the dog-image total that the class contributes.
# A string value of the form '<weight>|<folder>' additionally registers the
# files of <folder> as a new class named after the key.
class_weights = {
    'Cat': .01,
    'Horse': .01,
    'Cattle': .01,
    'Boy': .075,
    'Woman': .075,
    'Man': .075,
    'Girl': .075,
    'Baby': '.075|../openimages_all_babies/babies'
}
class_images = {} # weighted class name => number of images to copy
total_not_dog_images = total_dog_images # same as reviewed dog images.

# `.items()` works on Python 2 and 3; `.iteritems()` is Python-2-only.
for name, weight in class_weights.items():
    if isinstance(weight, str):
        weight, source = weight.split('|')
        labelled_images[name] = set(os.path.join(source, x) for x in os.listdir(source))
        # The external class has no label id, so its name doubles as label.
        label_names[name] = name
        name_labels[name] = name
    class_images[name] = int(math.ceil(float(weight) * total_dog_images))
    total_not_dog_images -= class_images[name]

dog_label = name_labels['Dog']
dog_images = labelled_images[dog_label]

# Spread what is left of the budget evenly over the classes that are neither
# 'Dog' nor explicitly weighted. They are counted directly so a weighted
# class without images cannot skew the share or cause a division by zero.
unweighted_labels = [
    lb for lb in labelled_images
    if lb != dog_label and label_names[lb] not in class_weights
]
if unweighted_labels:
    avg_not_dog_images = int(math.ceil(total_not_dog_images * 1.0 / len(unweighted_labels))) + 2
else:
    avg_not_dog_images = 0

for lb, imgs in labelled_images.items():
    if lb == dog_label:
        continue
    name = label_names[lb]
    if name in class_weights:
        copy_total = class_images[name]
    else:
        copy_total = avg_not_dog_images
    # Never use an image that is also labelled as a dog.
    imgs = [img for img in imgs if img not in dog_images]
    random.shuffle(imgs)
    imgs = imgs[:copy_total]
    print ('Copy \'%s\' images.' % name)
    split_at = int(len(imgs) * TRAIN_VAL_SPLIT)
    _copy_files(imgs[:split_at], os.path.join(DATASET_PATH, 'train', 'not_dog'))
    _copy_files(imgs[split_at:], os.path.join(DATASET_PATH, 'val', 'not_dog'))

print ('Done.')

In [None]:
# PROCESS 601 CLASSES OPEN IMAGES V4 DATASET:
#
# Step #2.2 (outdated) - make a 'dog' vs 'not dog' dataset
# by using all reviewed dog images, and split train/val/test
# using pre-defined percentages.

import random

# make a 'dog' vs 'not dog' dataset.
# Output root and the train/val/test shares of every image list.
DATASET_PATH = './output_dnd_22k'
TRAIN_SET = 0.7
VALID_SET = 0.2
TEST_SET  = 0.1

# 0) prepare.
if os.path.exists(DATASET_PATH):
    shutil.rmtree(DATASET_PATH)
os.makedirs(DATASET_PATH)

# 1) copy 'dog' images.
# The reviewed images in `folder` are used, not labelled_images[label].
label = name_labels['Dog']
folder = './output_dnd_accepted/dog'
images = [os.path.join(folder,x) for x in os.listdir(folder)]
random.shuffle(images)
print ('%d images found with \'Dog\'' % len(images))
# Cut points: [0, train_end) is train, [train_end, val_end) is val and the
# rest is test, so the validation share is whatever lies in between.
train_end = int(len(images) * TRAIN_SET)
val_end = int(len(images) * (1 - TEST_SET))
_copy_files(images[:train_end], os.path.join(DATASET_PATH, 'train', 'dog'))
_copy_files(images[train_end:val_end], os.path.join(DATASET_PATH, 'val', 'dog'))
_copy_files(images[val_end:], os.path.join(DATASET_PATH, 'test', 'dog'))

# 2) copy 'not dog' images.
dog_images = labelled_images[label]
# Floor division keeps the per-class quota an integer on Python 3 as well;
# a float here would make the slice below raise TypeError.
avg_not_dog = len(images) // (len(label_names) - 1) + 3
# `.items()` works on Python 2 and 3; `.iteritems()` is Python-2-only.
for lb, imgs in labelled_images.items():
    if lb == label: # dog
        continue
    imgs = [img for img in imgs if img not in dog_images] # not dog
    random.shuffle(imgs)
    imgs = imgs[:avg_not_dog] # take first n images
    train_end = int(len(imgs) * TRAIN_SET)
    val_end = int(len(imgs) * (1 - TEST_SET))
    _copy_files(imgs[:train_end], os.path.join(DATASET_PATH, 'train', 'not_dog'))
    _copy_files(imgs[train_end:val_end], os.path.join(DATASET_PATH, 'val', 'not_dog'))
    _copy_files(imgs[val_end:], os.path.join(DATASET_PATH, 'test', 'not_dog'))

print ('Done.')

In [None]:
# PROCESS 601 CLASSES OPEN IMAGES V4 DATASET:
#
# Step #2.3 (outdated) - make a 'dog' vs 'not dog' dataset 
# by using all dog images.

# 0) prepare.
# 0) prepare: start from empty 'dog' and 'not_dog' output folders.
if os.path.exists('./output_dnd'):
    shutil.rmtree('./output_dnd')
os.makedirs('./output_dnd/dog')
os.makedirs('./output_dnd/not_dog')

# 1) copy 'dog' images.
label = name_labels['Dog']
images = labelled_images[label]
print ('%d images found with \'Dog\'' % len(images))
_copy_files(images, './output_dnd/dog')

# 2) copy 'not dog' images.
# Floor division keeps the per-class quota an integer on Python 3 as well;
# a float here would make the slice below raise TypeError.
avg_not_dog = len(images) // (len(label_names) - 1) + 12
# `.items()` works on Python 2 and 3; `.iteritems()` is Python-2-only.
for lb, imgs in labelled_images.items():
    if lb == label: # dog
        continue
    imgs = [img for img in imgs if img not in images] # not dog
    imgs = imgs[:avg_not_dog] # take first n images
    _copy_files(imgs, './output_dnd/not_dog')

print ('Done.')

In [None]:
# PROCESS 601 CLASSES OPEN IMAGES V4 DATASET:
#
# Step #2.4 (outdated) - make a 'dog' vs 'not dog' dataset 
# by using only a few dog images, and a few not dog images
# from a limited set of classes.

import random

# make a small 'dog' vs 'not dog' dataset.

# Output root, total number of images to pick, and the train/val/test
# shares of every image list.
DATASET_PATH = './output_dnd_1k'
DATASET_SIZE = 1000
TRAIN_SET = 0.7
VALID_SET = 0.2
TEST_SET  = 0.1

# 0) prepare.
if os.path.exists(DATASET_PATH):
    shutil.rmtree(DATASET_PATH)
os.makedirs(DATASET_PATH)

# 1) copy 'dog' images.
label = name_labels['Dog']
images = list(labelled_images[label])
random.shuffle(images)
# Half of the dataset is dogs. Floor division keeps the slice bound an
# integer on Python 3 (a float bound raises TypeError).
images = images[:(DATASET_SIZE // 2)]
print ('%d images for \'Dog\'' % len(images))
# Cut points: [0, train_end) is train, [train_end, val_end) is val and the
# rest is test, so the validation share is whatever lies in between.
train_end = int(len(images) * TRAIN_SET)
val_end = int(len(images) * (1 - TEST_SET))
_copy_files(images[:train_end], os.path.join(DATASET_PATH, 'train', 'dog'))
_copy_files(images[train_end:val_end], os.path.join(DATASET_PATH, 'val', 'dog'))
_copy_files(images[val_end:], os.path.join(DATASET_PATH, 'test', 'dog'))

# 2) copy 'not_dog' images.
not_dog = ["Human beard","Towel","Home appliance","Boy","Cat","Door","Scarf","Pillow","Woman","Horse","Sofa bed","Human hair","Curtain","Man","Girl","Pig","Loveseat","Television","Vehicle","Human face","Kitchen & dining room table","Human body","Sock","Desk","Chair"]
# The other half is shared equally between the listed classes.
avg_not_dog = DATASET_SIZE // 2 // len(not_dog)
for cls in not_dog:
    lb = name_labels[cls]
    imgs = list(labelled_images[lb])
    random.shuffle(imgs)
    imgs = imgs[:avg_not_dog]
    train_end = int(len(imgs) * TRAIN_SET)
    val_end = int(len(imgs) * (1 - TEST_SET))
    _copy_files(imgs[:train_end], os.path.join(DATASET_PATH, 'train', 'not_dog'))
    _copy_files(imgs[train_end:val_end], os.path.join(DATASET_PATH, 'val', 'not_dog'))
    _copy_files(imgs[val_end:], os.path.join(DATASET_PATH, 'test', 'not_dog'))

print ('Done.')

In [None]:
# PROCESS 601 CLASSES OPEN IMAGES V4 DATASET:
#
# Step #2.5 (outdated) - make a 'cat', 'dog', 'not pet' dataset
# by using only a few images from a limited set of classes.

import random

# make a small 'cat', 'dog', 'not pet' dataset.

# Output root, total number of images to pick, and the train/val/test
# shares of every image list.
DATASET_PATH = './output_cdn_1k'
DATASET_SIZE = 1000
TRAIN_SET = 0.7
VALID_SET = 0.2
TEST_SET  = 0.1

# 0) prepare.
if os.path.exists(DATASET_PATH):
    shutil.rmtree(DATASET_PATH)
os.makedirs(DATASET_PATH)

# 1) copy 'dog' images, then 2) copy 'cat' images.
# Each pet class gets two tenths of the dataset. Floor division keeps the
# slice bound an integer on Python 3 (a float bound raises TypeError).
for pet_name, pet_folder in [('Dog', 'dog'), ('Cat', 'cat')]:
    label = name_labels[pet_name]
    images = list(labelled_images[label])
    random.shuffle(images)
    images = images[:(DATASET_SIZE // 10 * 2)]
    print ('%d images for \'%s\'' % (len(images), pet_name))
    # Cut points: [0, train_end) is train, [train_end, val_end) is val and
    # the rest is test, so the validation share is what lies in between.
    train_end = int(len(images) * TRAIN_SET)
    val_end = int(len(images) * (1 - TEST_SET))
    _copy_files(images[:train_end], os.path.join(DATASET_PATH, 'train', pet_folder))
    _copy_files(images[train_end:val_end], os.path.join(DATASET_PATH, 'val', pet_folder))
    _copy_files(images[val_end:], os.path.join(DATASET_PATH, 'test', pet_folder))

# 3) copy 'not_dog' images.
not_dog = ["Human beard","Towel","Home appliance","Boy","Door","Scarf","Pillow","Woman","Horse","Sofa bed","Human hair","Curtain","Man","Girl","Pig","Loveseat","Television","Vehicle","Human face","Human body"]
# The remaining six tenths are shared equally between the listed classes.
avg_not_dog = DATASET_SIZE // 10 * 6 // len(not_dog)
for cls in not_dog:
    lb = name_labels[cls]
    imgs = list(labelled_images[lb])
    random.shuffle(imgs)
    imgs = imgs[:avg_not_dog]
    train_end = int(len(imgs) * TRAIN_SET)
    val_end = int(len(imgs) * (1 - TEST_SET))
    _copy_files(imgs[:train_end], os.path.join(DATASET_PATH, 'train', 'not_dog'))
    _copy_files(imgs[train_end:val_end], os.path.join(DATASET_PATH, 'val', 'not_dog'))
    _copy_files(imgs[val_end:], os.path.join(DATASET_PATH, 'test', 'not_dog'))

print ('Done.')