# Initial file download

In [2]:
import torch
import ssl
import os
import pandas as pd
from openimages.download import download_dataset
from utils.file_utils import extract_specific_files_for_classes, combine_directories, delete_images_without_masks, read_annotations

In [3]:
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so code that relies on ssl's default context will not
# validate server certificates for the rest of the session. Presumably a
# workaround for certificate errors during the downloads below — confirm
# it is still needed before running on an untrusted network.
ssl._create_default_https_context = ssl._create_unverified_context

# Point the torch.hub directory at a fixed location.
# NOTE(review): hard-coded absolute path inside one user's Windows profile;
# it has to be edited when the notebook runs on another machine.
torch.hub.set_dir(d = "C:\\Users\\Ugne\\torch\\hub")

# Root folder that holds every downloaded image and mask.
data_dir = "images"
# Device string used elsewhere in the notebook.
device = "cuda"
# Class names requested from the dataset.
classes = ["Person", "Skyscraper", "Car"]

# Integer id assigned to each class name.
class_mapping = {'Skyscraper': 1, 'Car': 2, 'Person': 3}


def get_dir_for_class(class_name, masks_or_images="images"):
    """Return the folder path for one class under ``data_dir``.

    The result has the form ``<data_dir>/<lower-cased class>/<masks_or_images>``.
    """
    return f"{data_dir}/{class_name.lower()}/{masks_or_images}"


# One image folder per class, plus the shared "all" folder.
download_dirs = [*(get_dir_for_class(name) for name in classes), "images/all"]

In [None]:
# Create the data folder if needed. exist_ok=True avoids the separate
# "does it exist?" check, which can race with another process creating it.
os.makedirs(data_dir, exist_ok=True)

# Fetch images for the configured classes into data_dir via
# openimages.download.download_dataset, called with limit=1000.
download_dataset(data_dir, classes, limit=1000)

In [9]:
# Load the label-id -> class-name table. The CSV has no header row, so the
# column names are supplied here. os.path.join replaces the original
# '\class-...' literal, which contained an invalid "\c" escape and only
# resolved correctly on Windows.
all_label_names = pd.read_csv(
    os.path.join(data_dir, "class-descriptions-boxable.csv"),
    names=["label_name", "class"],
)
class_label_names = all_label_names[all_label_names["class"].isin(classes)]

# Stack the annotations of all three dataset splits, then keep only the rows
# whose LabelName belongs to one of the selected classes.
all_annotations = pd.concat(
    [read_annotations(dataset_type=t) for t in ["train", "validation", "test"]]
)
annotations = all_annotations.merge(
    class_label_names, how="inner", left_on="LabelName", right_on="label_name"
)

# Take an explicit copy before adding a column: assigning into a filtered
# view of `annotations` triggers pandas' SettingWithCopyWarning.
validation_test_annotations = annotations[
    annotations["type"].isin(["validation", "test"])
].copy()
# One "<type>/<ImageID>" entry per row, the format written to the list file.
validation_test_annotations["for_download"] = (
    validation_test_annotations["type"] + "/" + validation_test_annotations["ImageID"]
)

# Write the entries one per line, without header or index.
validation_test_annotations[["for_download"]].to_csv(
    "validation_test_list.txt", header=False, index=False
)

In [None]:
# Shell escape: run downloader.py on the list written to
# validation_test_list.txt, saving into images/all with --num_processes=5.
!python downloader.py validation_test_list.txt --download_folder="images/all" --num_processes=5

In [None]:
# Flatten the file listings of every download folder into one list.
# (A nested comprehension avoids the quadratic list copying of sum(..., []).)
downloaded_images = [name for d in download_dirs for name in os.listdir(d)]

# Strip only a trailing ".jpg"; str.replace would also remove the substring
# if it appeared elsewhere in a file name.
downloaded_image_ids = [
    name[:-len(".jpg")] if name.endswith(".jpg") else name
    for name in downloaded_images
]

# Keep annotations for images that are actually on disk. Copy before adding a
# column so pandas does not emit SettingWithCopyWarning for the filtered frame.
downloaded_annotations = annotations[
    annotations["ImageID"].isin(downloaded_image_ids)
].copy()
# First character of the image id; used later to pick the zip file groups.
downloaded_annotations["ImageID_first_character"] = (
    downloaded_annotations["ImageID"].str.slice(0, 1)
)

downloaded_annotations.to_csv("downloaded_annotations.csv", index=False)

In [None]:
# Distinct first characters of the downloaded image ids, passed on as `groups`.
zip_file_groups = set(downloaded_annotations["ImageID_first_character"])

# Run the extraction three times: once with the helper's default zip prefix,
# then once each for the validation and test mask archives.
for _prefix_kwargs in (
    {},
    {"zip_prefix": "validation-masks-"},
    {"zip_prefix": "test-masks-"},
):
    extract_specific_files_for_classes(
        annotation_df=downloaded_annotations,
        classes=classes,
        groups=zip_file_groups,
        **_prefix_kwargs,
    )

In [None]:
# Per image: number of distinct mask paths and number of distinct labels,
# ordered so images with the most distinct labels come first.
masks_per_image = (
    downloaded_annotations
    .groupby(["ImageID"], as_index=False)
    .agg(
        masks=("MaskPath", "nunique"),
        unique_classes=("LabelName", "nunique"),
    )
    .sort_values(by="unique_classes", ascending=False)
)

# Bare expression so the notebook displays the table.
masks_per_image

In [None]:
# Use the same forward-slash spelling as the "images/all" entry in
# download_dirs and the downloader's --download_folder. The original
# "images\\all" only names that folder on Windows; elsewhere the backslash
# is an ordinary file-name character and a different directory results.
target_dir = f"{data_dir}/all"

# Merge the download folders into target_dir.
# NOTE(review): download_dirs also contains target_dir itself — confirm that
# combine_directories tolerates the target appearing among the sources.
combine_directories(download_dirs, target_dir)

# Presumably removes images that have no mask for any of the selected
# classes — verify against utils.file_utils.
delete_images_without_masks(images_dir=target_dir, classes=classes)