## Use Annotations File to find and download masks and images from Open Images V6

In [1]:
import glob
import pandas as pd
import os
import wget
import cv2
import matplotlib.pyplot as plt
import shutil 

train_mask_dir = "train_masks"
train_images_dir = "train_images"

class_table = pd.read_csv("class-descriptions-boxable.csv", header=None)
class_to_label = class_table.set_index(1).to_dict()[0] # Friendly Name to ID
label_to_class = class_table.set_index(0).to_dict()[1] # ID to Friendly Name

annotations = pd.read_csv("train-annotations-object-segmentation.csv")
image_ids = pd.read_csv("train-images-boxable-with-rotation.csv")

In [2]:
# Get the Train-0 images
train_0 = annotations.loc[annotations.MaskPath.map(lambda x : x[0]) == '0']
# Add a Nice Label Column
train_0['NiceLabel'] = train_0['LabelName'].map(label_to_class)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [3]:
# Get only the car masks
selected_classes = ['Car']

df = train_0.loc[train_0.NiceLabel.isin(selected_classes)]

In [4]:
df.shape

(11093, 11)

In [5]:
# Choose a box that occupies between 10% and 80% of the screen 
min_perc = 10
max_perc = 80

# Helper Function to calculate the Area  occupied by a mask
def find_area(x):
    return 100*(x['BoxXMax'] - x['BoxXMin'])*(x['BoxYMax']-x['BoxYMin'])

df['Area'] = df.apply(lambda x : find_area(x), axis=1)
df = df.loc[(df['Area'] < max_perc)]
df = df.loc[(df['Area'] > min_perc)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [6]:
# Filter to items with a predicted IoU of 100
df = df.loc[df.PredictedIoU > 0.8]

In [7]:
# Remove Images with multiple masks
mask_counts = df.groupby("ImageID")['MaskPath'].count().reset_index()
single_mask = mask_counts[mask_counts['MaskPath'] < 2]
df = pd.merge(df, single_mask[['ImageID']], 'inner')
df['SavedMaskPath'] = ""

In [8]:
# Copy Files nested under their own labels

for index, row in df.iterrows():
    sub_directory = os.path.join(train_mask_dir, row['NiceLabel'])
    if(not os.path.exists(sub_directory)):
        os.mkdir(sub_directory)
    
    from_path = os.path.join(train_mask_dir, row['MaskPath'])
    extension = row['MaskPath'].split(".")[-1]
    to_path = os.path.join(sub_directory, row['ImageID'] + "." + extension)
    df.loc[index, 'SavedMaskPath'] =to_path # Record where this mask was copied to

    shutil.copy(from_path, to_path)    

In [9]:
# Download the corresponding Images
image_ids = image_ids[['ImageID', 'OriginalURL']]

In [10]:
df_url = pd.merge(df, image_ids, 'left')

In [11]:
df_url["SavedImagePath"] = ""

for index, row in df_url.iterrows():
    url = row['OriginalURL']
    
    sub_directory = os.path.join(train_images_dir, row['NiceLabel'])
    if(not os.path.exists(sub_directory)):
        os.makedirs(sub_directory)
        
    file_extension = url.split(".")[-1] 
    to_path = os.path.join(sub_directory, row['ImageID'] + "." + file_extension)
    df_url.loc[index,'SavedImagePath'] = to_path
    try:
        wget.download(url, to_path)
    except:
        print("Error downloading : {}. Skipping .. ".format(url))

Error downloading : https://farm1.staticflickr.com/7406/13009477864_98f92a82d9_o.jpg. Skipping .. 
Error downloading : https://c6.staticflickr.com/9/8679/16016377644_17b9f1e5c8_o.jpg. Skipping .. 
Error downloading : https://c4.staticflickr.com/4/3682/13926511676_83d4a35225_o.jpg. Skipping .. 
Error downloading : https://farm2.staticflickr.com/7381/13474706873_a480bc959a_o.jpg. Skipping .. 
Error downloading : https://c7.staticflickr.com/4/3861/14487317826_95f3f37a86_o.jpg. Skipping .. 
Error downloading : https://c2.staticflickr.com/6/5084/14054625264_51b110efa4_o.jpg. Skipping .. 
Error downloading : https://c8.staticflickr.com/8/7024/6477032823_7339f79b0c_o.jpg. Skipping .. 
Error downloading : https://farm7.staticflickr.com/2259/2200305022_e6ba827187_o.jpg. Skipping .. 
Error downloading : https://c7.staticflickr.com/3/2158/2199510313_4f805a762a_o.jpg. Skipping .. 
Error downloading : https://c6.staticflickr.com/9/8609/16087686589_f9141b0be5_o.jpg. Skipping .. 
Error downloading : 

In [12]:
# Consistency Check Remove the Image Mask if the corresponding Image failed to download
df_url['Downloaded'] = True

for index, row in df_url.iterrows():
    if(not os.path.exists(row['SavedImagePath'])):
        df_url.loc[index, 'Downloaded'] = False
        # Remove the corresponding Mask to free up space
        if(os.path.exists(row['SavedMaskPath'])):
            print("Purging: {}".format(row['SavedMaskPath']))
            os.remove(row['SavedMaskPath'])

Purging: train_masks/Car/001bebecea382500.png
Purging: train_masks/Car/07589a163639319d.png
Purging: train_masks/Car/0ce7c72a1884958b.png
Purging: train_masks/Car/03591babb5ec62e1.png
Purging: train_masks/Car/0a866aa23de6574e.png
Purging: train_masks/Car/08465bc25f701ad9.png
Purging: train_masks/Car/0ff98355dac98d8f.png
Purging: train_masks/Car/0b165612be073586.png
Purging: train_masks/Car/065d85325a3cdc7e.png
Purging: train_masks/Car/06bddf70429f2ecd.png
Purging: train_masks/Car/043c0cf286a78717.png
Purging: train_masks/Car/02cda91e1d91a854.png
Purging: train_masks/Car/010e19969436776c.png
Purging: train_masks/Car/03447c4a5209f9e1.png
Purging: train_masks/Car/089d32b910de47ee.png
Purging: train_masks/Car/0d5282dac7aeff9c.png
Purging: train_masks/Car/0ef25fa2dbf87b21.png
Purging: train_masks/Car/0c53a79917295396.png
Purging: train_masks/Car/05bac9358b5c6b98.png
Purging: train_masks/Car/0042df4b1aa3cfab.png
Purging: train_masks/Car/004b75d1299e653c.png
Purging: train_masks/Car/0a0138f4e

In [13]:
df_url.Downloaded.value_counts()

True     1766
False     121
Name: Downloaded, dtype: int64

In [14]:
df_url = df_url.loc[df_url.Downloaded] #  Keep downloaded images only
df_url = df_url[["SavedImagePath", "SavedMaskPath", "NiceLabel"]]

In [15]:
# Save Output DataFrame
df_url.to_csv("dataset_paths.csv", index=False)