# In [56]:
# %%writefile prepare_openimages_data.py
"""
Set up
0) curl https://sdk.cloud.google.com | bash
1) gcloud auth login [your_mail_account]
2) mkdir openimagesv4 && cd openimagesv4
3) at least the validation dataset is needed to create the dev test set
4) mkdir validation && gsutil -m rsync -r gs://open-images-dataset/validation ./validation
5) python prepare_openimages_data.py
"""
import pandas as pd
from pathlib import Path 
import os
from functools import partial
from keras.utils import get_file
import matplotlib.pyplot as plt
import time
import numpy as np 
import PIL
import shutil
from subprocess import call, Popen, PIPE


# processing the labels

# Root directory of the local Open Images copy; override with the
# OPENIMAGES_PATH environment variable.
PATH_TO_OPENIMAGES = Path(os.environ.get('OPENIMAGES_PATH','/input/all/openimages'))
# Dataset splits; substituted for the {kind} placeholder in the URL templates.
KINDS = ['train', 'test', 'validation']

# parents=True: the default root is nested several levels deep, and a plain
# mkdir() raises FileNotFoundError when the intermediate directories are missing.
PATH_TO_OPENIMAGES.mkdir(parents=True, exist_ok=True)

# url templates
bbox_annot_urls = 'https://storage.googleapis.com/openimages/2018_04/{kind}/{kind}-annotations-bbox.csv'
human_annot_urls = 'https://storage.googleapis.com/openimages/2018_04/{kind}/{kind}-annotations-human-imagelabels-boxable.csv'
# NOTE(review): rot_urls is only referenced from a commented-out call in this
# file; the explicit per-split URLs below are what actually gets downloaded.
rot_urls = 'https://storage.googleapis.com/openimages/2018_04/{kind}/{kind}-images-boxable-with-rotation.csv'
class_mappings_url = 'https://storage.googleapis.com/openimages/2018_04/class-descriptions-boxable.csv'

# The rotation CSVs do not follow a single template (the train file name
# contains "boxable", the test/validation ones do not), so they are listed
# explicitly.
rot_tr_url = 'https://storage.googleapis.com/openimages/2018_04/train/train-images-boxable-with-rotation.csv'
rot_test_url = 'https://storage.googleapis.com/openimages/2018_04/test/test-images-with-rotation.csv'
rot_val_url = 'https://storage.googleapis.com/openimages/2018_04/validation/validation-images-with-rotation.csv'


# Order is train, test, validation -- the same order as KINDS.
ROT_URLS = [rot_tr_url, rot_test_url, rot_val_url]
ROT_NAMES = ['train-images-boxable-with-rotation.csv','test-images-with-rotation.csv','validation-images-with-rotation.csv']



# annotations / labels dirs
bbox_labels_dir = PATH_TO_OPENIMAGES / 'bbox'
human_labels_dir = PATH_TO_OPENIMAGES /'image-labels'
rot_labels_dir = PATH_TO_OPENIMAGES / 'imageIds'


# image directories
train_images_dir = PATH_TO_OPENIMAGES / 'train'
val_images_dir = PATH_TO_OPENIMAGES  / 'validation'
test_images_dir = PATH_TO_OPENIMAGES / 'test'
challenge2018_images_dir = PATH_TO_OPENIMAGES / 'challenge2018'

train_images_dir.mkdir(exist_ok=True)
val_images_dir.mkdir(exist_ok=True)
test_images_dir.mkdir(exist_ok=True)
# bbox_labels_dir is created here as well so that all three label
# directories exist after import, not only after the first download() call.
bbox_labels_dir.mkdir(exist_ok=True)
human_labels_dir.mkdir(exist_ok=True)
rot_labels_dir.mkdir(exist_ok=True)
challenge2018_images_dir.mkdir(exist_ok=True)



# # download box labels 
def download(template_url, download_dir):
    """Download one annotation file per dataset split into ``download_dir``.

    ``template_url`` must contain a ``{kind}`` placeholder; it is expanded
    once for every entry of ``KINDS``. ``download_dir`` is a ``pathlib.Path``
    and is created if it does not exist yet (its parent must exist).

    Each file is fetched with ``get_file`` and tried at most 5 times, with a
    0.5 s pause between attempts. A file that still fails after the last
    attempt is reported and skipped, so the remaining files are still
    downloaded. Returns ``None``.
    """
    max_attempts = 5
    download_dir.mkdir(exist_ok=True)
    for kind in KINDS:
        url = template_url.format(kind=kind)
        fname = url.split('/')[-1]
        # The attempt counter advances on every failure. The previous version
        # only incremented it after a success, so a persistent error retried
        # forever.
        for attempt in range(1, max_attempts + 1):
            try:
                get_file(fname, url, cache_dir=str(download_dir), cache_subdir='')
            except Exception as e:
                # Deliberately broad: any download error is treated as
                # retryable; the error is printed rather than swallowed.
                print(e)
                print("Failed {}".format(attempt))
                if attempt < max_attempts:
                    print("...repeating")
                    time.sleep(0.5)
            else:
                break
        else:
            # Best-effort: report and move on to the next file.
            print("Giving up on {} after {} attempts".format(url, max_attempts))
                
                
def download_rot_labels():
    """Download the image-id/rotation CSV of every split into ``rot_labels_dir``.

    The URLs come from ``ROT_URLS``. Each file is fetched with ``get_file``
    and tried at most 5 times, with a 0.5 s pause between attempts. A file
    that still fails after the last attempt is reported and skipped, so the
    remaining files are still downloaded. Returns ``None``.
    """
    max_attempts = 5
    # Read the global at call time so a later rebinding of rot_labels_dir
    # is honoured.
    download_dir = rot_labels_dir

    for url in ROT_URLS:
        fname = url.split('/')[-1]
        # The attempt counter advances on every failure. The previous version
        # only incremented it after a success, so a persistent error retried
        # forever.
        for attempt in range(1, max_attempts + 1):
            try:
                get_file(fname, url, cache_dir=str(download_dir), cache_subdir='')
            except Exception as e:
                # Deliberately broad: any download error is treated as
                # retryable; the error is printed rather than swallowed.
                print(e)
                print("Failed {}".format(attempt))
                if attempt < max_attempts:
                    print("...repeating")
                    time.sleep(0.5)
            else:
                break
        else:
            # Best-effort: report and move on to the next file.
            print("Giving up on {} after {} attempts".format(url, max_attempts))
                

def get_boxes_dfs():
    """Load the bounding-box annotation CSV of every split.

    Files are read from ``PATH_TO_OPENIMAGES / 'bbox'``; the file names are
    the last path component of ``bbox_annot_urls`` expanded per split.
    Returns a list of DataFrames in ``KINDS`` order.
    """
    bbox_dir = PATH_TO_OPENIMAGES / 'bbox'
    csv_names = [bbox_annot_urls.format(kind=split).split('/')[-1] for split in KINDS]
    return [pd.read_csv(bbox_dir / csv_name) for csv_name in csv_names]

def get_images_ids_df():
    """Load the human-verified image-level label CSV of every split.

    Files are read from ``PATH_TO_OPENIMAGES / 'image-labels'``; the file
    names are the last path component of ``human_annot_urls`` expanded per
    split. Returns a list of DataFrames in ``KINDS`` order.
    """
    labels_dir = PATH_TO_OPENIMAGES / 'image-labels'
    frames = []
    for split in KINDS:
        csv_name = human_annot_urls.format(kind=split).split('/')[-1]
        frames.append(pd.read_csv(labels_dir / csv_name))
    return frames


def get_image_rot_labels_df():
    """Load the image-id/rotation CSV of every split.

    Files are read from ``PATH_TO_OPENIMAGES / 'imageIds'``; the file names
    are the last path component of each entry of ``ROT_URLS``.
    Returns a list of DataFrames in ``ROT_URLS`` order.
    """
    rot_dir = PATH_TO_OPENIMAGES / 'imageIds'
    return [pd.read_csv(rot_dir / rot_url.split('/')[-1]) for rot_url in ROT_URLS]


def enrich_boxes_df(boxes_df, base_path_to_images):
    """Add readable class names, image paths and integer label ids to ``boxes_df``.

    ``boxes_df`` must have ``LabelName`` and ``ImageID`` columns; it is
    modified in place and also returned. ``base_path_to_images`` is the
    directory holding the ``<ImageID>.jpg`` files (``str`` or ``Path``).

    Columns added:
      * ``RealName`` -- ``LabelName`` looked up in
        ``class-descriptions-boxable.csv`` (first column -> second column).
      * ``ImageURI`` -- ``base_path_to_images / (ImageID + '.jpg')`` as a string.
      * ``LabelID``  -- integer codes from ``LabelName.factorize()``.
    """
    images_root = Path(base_path_to_images)

    # The mapping file has no header row, so its columns are named 0 and 1.
    class_table = pd.read_csv(PATH_TO_OPENIMAGES / 'class-descriptions-boxable.csv', header=None)
    code_to_name = class_table.set_index(0)[1].to_dict()

    def image_uri(image_id):
        return str(images_root / (image_id + '.jpg'))

    boxes_df['RealName'] = boxes_df.LabelName.map(code_to_name)
    boxes_df['ImageURI'] = boxes_df.ImageID.apply(image_uri)
    label_codes, _ = boxes_df.LabelName.factorize()
    boxes_df['LabelID'] = label_codes

    return boxes_df

def dld_all_labels():
    """Download the bbox and image-level label CSVs plus the class-description file.

    The per-split CSVs go through ``download``; the class-description CSV is
    fetched directly into ``PATH_TO_OPENIMAGES``. Rotation CSVs are not
    handled here -- ``download_rot_labels`` fetches those.
    """
    print("downloading labels")
    per_split_targets = (
        (bbox_annot_urls, bbox_labels_dir),
        (human_annot_urls, human_labels_dir),
    )
    for template, target_dir in per_split_targets:
        download(template, target_dir)

    mapping_name = class_mappings_url.split('/')[-1]
    get_file(mapping_name, class_mappings_url, cache_dir=str(PATH_TO_OPENIMAGES), cache_subdir='')


      
if __name__ == '__main__':  
    # Script entry point: download the bbox / image-level label CSVs and the
    # class-description file, then the per-split rotation CSVs.
    dld_all_labels()
    download_rot_labels()
    # The commented-out lines below are scratch code kept from interactive use.
    # NOTE(review): get_boxes, create_dev_dataset and dev_labels_csv are not
    # defined anywhere in the visible part of this file -- confirm before
    # re-enabling.
#     train_boxes_df, test_boxes_df, val_boxes_df = get_boxes()
#     create_dev_dataset(val_boxes_df)
#     dev_sample_df = pd.read_csv(dev_labels_csv, index_col=[0])
    
    # These three loader calls produce the data frames that the sampling code
    # further down in this file reads.
#     train_boxes_df, test_boxes_df, val_boxes_df = get_boxes_dfs()
#     train_rot_df, test_rot_df, val_rot_df = get_image_rot_labels_df()
#     train_ids_df, test_ids_df, val_ids_df = get_images_ids_df()
#     train_boxes_df_sample.LabelName.unique()


# In [1]:
# sample
#
# Build a small subset of the dataset under PATH_TO_OPENIMAGES/small_openimages:
# sample some bounding-box rows per split, keep the matching rotation and
# image-level label rows, copy the referenced images and write the reduced CSVs
# under the same file names the full dataset uses.
#
# NOTE: this code expects the frames returned by get_boxes_dfs(),
# get_image_rot_labels_df() and get_images_ids_df() to be in scope already as
# {train,test,val}_boxes_df, {train,test,val}_rot_df and {train,test,val}_ids_df.


def _sample_rows(df, n):
    """Return ``n`` randomly chosen rows of ``df``, or all rows if it has fewer."""
    # DataFrame.sample raises when asked for more rows than exist
    # (without replacement), so clamp the request.
    return df.sample(min(n, len(df)))


def _copy_images(image_ids, kind):
    """Copy ``<image_id>.jpg`` for every id from the full ``kind`` split to the small one."""
    src_dir = PATH_TO_OPENIMAGES / kind
    dst_dir = SMALL_OPEN_IMAGES / kind
    for image in image_ids:
        fname = image + '.jpg'
        print(image)
        shutil.copy(str(src_dir / fname), str(dst_dir / fname))


print(train_rot_df.ImageID.unique().shape)
# was `train_labels_df`, which is not defined anywhere; the image-level label
# frame used throughout the rest of this code is `train_ids_df`.
print(train_ids_df.ImageID.unique().shape)
print(train_boxes_df.ImageID.unique().shape)

# boxes labels

train_boxes_df_sample = _sample_rows(train_boxes_df, 5000)
test_boxes_df_sample = _sample_rows(test_boxes_df, 500)
val_boxes_df_sample = _sample_rows(val_boxes_df, 500)

# rot labels: keep only rows whose image appears in the sampled boxes
train_rot_df_sample = train_rot_df[train_rot_df.ImageID.isin(train_boxes_df_sample.ImageID)]
test_rot_df_sample = test_rot_df[test_rot_df.ImageID.isin(test_boxes_df_sample.ImageID)]
val_rot_df_sample = val_rot_df[val_rot_df.ImageID.isin(val_boxes_df_sample.ImageID)]


# ids labels: keep only rows whose image appears in the sampled boxes

train_ids_df_sample = train_ids_df[train_ids_df.ImageID.isin(train_boxes_df_sample.ImageID)]
test_ids_df_sample = test_ids_df[test_ids_df.ImageID.isin(test_boxes_df_sample.ImageID)]
val_ids_df_sample = val_ids_df[val_ids_df.ImageID.isin(val_boxes_df_sample.ImageID)]


SMALL_OPEN_IMAGES = PATH_TO_OPENIMAGES / 'small_openimages'
SMALL_OPEN_IMAGES.mkdir(exist_ok=True)

# annotations / labels dirs
# NOTE: these rebind the module-level directory names to the small dataset.
bbox_labels_dir = SMALL_OPEN_IMAGES / 'bbox'
human_labels_dir = SMALL_OPEN_IMAGES /'image-labels'
rot_labels_dir = SMALL_OPEN_IMAGES / 'imageIds'

# image directories
train_images_dir = SMALL_OPEN_IMAGES / 'train'
val_images_dir = SMALL_OPEN_IMAGES  / 'validation'
test_images_dir = SMALL_OPEN_IMAGES / 'test'
challenge2018_images_dir = SMALL_OPEN_IMAGES / 'challenge2018'

# create every directory exactly once
for _dir in (bbox_labels_dir, human_labels_dir, rot_labels_dir,
             train_images_dir, val_images_dir, test_images_dir,
             challenge2018_images_dir):
    _dir.mkdir(exist_ok=True)


# copy the images referenced by the sampled boxes, split by split

_copy_images(train_boxes_df_sample.ImageID.unique(), 'train')
_copy_images(test_boxes_df_sample.ImageID.unique(), 'test')
_copy_images(val_boxes_df_sample.ImageID.unique(), 'validation')


ROT_NAMES = ['train-images-boxable-with-rotation.csv','test-images-with-rotation.csv','validation-images-with-rotation.csv']
HUMAN_NAMES = ['{kind}-annotations-human-imagelabels-boxable.csv'.format(kind=kind) for kind in ['train','test','validation']]

# All CSVs are written without the DataFrame index so they keep the column
# layout of the files they were read from. Previously the rot and ids files
# were written with the index, which adds an extra unnamed first column.
train_boxes_df_sample.to_csv(bbox_labels_dir / 'train-annotations-bbox.csv', index=False)
test_boxes_df_sample.to_csv(bbox_labels_dir / 'test-annotations-bbox.csv', index=False)
val_boxes_df_sample.to_csv(bbox_labels_dir / 'validation-annotations-bbox.csv', index=False)


# rot labels
train_rot_df_sample.to_csv(rot_labels_dir / ROT_NAMES[0], index=False)
test_rot_df_sample.to_csv(rot_labels_dir / ROT_NAMES[1], index=False)
val_rot_df_sample.to_csv(rot_labels_dir / ROT_NAMES[2], index=False)


# ids labels
train_ids_df_sample.to_csv(human_labels_dir / HUMAN_NAMES[0], index=False)
test_ids_df_sample.to_csv(human_labels_dir / HUMAN_NAMES[1], index=False)
val_ids_df_sample.to_csv(human_labels_dir / HUMAN_NAMES[2], index=False)
