# Sample new labeled images for the v4 dataset


In [1]:
%load_ext autoreload
%autoreload 2

import io
import math
import pickle
import os
import sys
import random
import json

import numpy as np

repo_root = os.path.join(os.getcwd(), '../code')
sys.path.append(repo_root)

import cifar10
import utils

def _read_json(path):
    """Return the parsed contents of the JSON file at *path*."""
    with open(path, 'r') as json_file:
        return json.load(json_file)


# Original CIFAR-10 data; only its labels are needed in this notebook.
cifar = cifar10.CIFAR10Data('../other_data/cifar10')
cifar_labels = cifar.all_labels

# Dataset version passed to the project loaders below.
version_string = 'v4'

# Project-provided inputs for this version.
all_new_imgs, img_data = utils.load_tinyimage_subset(
    version_string=version_string)
distances = utils.load_distances_to_cifar10(version_string)
cifar10_keywords = utils.load_cifar10_by_keyword(
    unique_keywords=True, version_string=version_string)

# Per-keyword candidate indices that the sampling code draws from.
tinyimage_good_indices = _read_json(
    '../other_data/tinyimage_good_indices_subselected_v4.json')
# Blacklist contains images that are near-duplicates of CIFAR-10 images.
blacklist = _read_json('../other_data/blacklist_v4.json')
# Keywords considered for the new dataset.
new_keywords = _read_json('../other_data/keywords_v4.json')

    

# CIFAR-10 keywords

Determine the number of images for each keyword in CIFAR-10. If the keyword belongs to multiple classes, we assign the keyword to the class where it occurs most frequently.

In [7]:
# For each keyword, the class labels that must be ignored when deciding
# which class the keyword belongs to.
skip_keywords = {
    'cruiser': [8],
    'sound_truck': [1],
    'cavalier': [1],
    'domestic_dog': [3],
    'persian_cat': [5],
    'trailer_truck': [1],
    'frog': [5],
    'cab': [9],
    'tractor_trailer': [8],
    'pet': [3],
    'ambulance': [9, 8],
    'gray': [6],
    'taxi': [0],
    'toy': [2, 8, 3],
    'automobile': [8],
    'sparrow': [8],
    'lark': [7],
    'ford': [9],
}

# cifar10_by_keyword: keyword -> number of CIFAR-10 entries with that keyword.
# keyword_to_class:   keyword -> the single class label the keyword maps to.
cifar10_by_keyword = {}
keyword_to_class = {}
for img_index, neighbor_entries in enumerate(cifar10_keywords):
    for neighbor in neighbor_entries:
        kw = neighbor['nn_keyword']
        if kw not in new_keywords:
            continue

        img_label = cifar_labels[img_index]
        # Labels on the keyword's skip list do not take part in the
        # keyword -> class assignment (but are still counted below).
        if img_label not in skip_keywords.get(kw, ()):
            if kw not in keyword_to_class:
                keyword_to_class[kw] = img_label
            else:
                assigned_label = keyword_to_class[kw]
                labels_agree = assigned_label == img_label
                if not labels_agree:
                    # Dump the conflicting entry before the assert fires.
                    print(kw)
                    print(img_index)
                    print(assigned_label)
                    print(img_label)
                assert labels_agree

        # Every matching entry is counted, skipped label or not.
        cifar10_by_keyword[kw] = cifar10_by_keyword.get(kw, 0) + 1

# Sampling code

In [8]:
# Fixed seed so that the sampled subset is reproducible.
random.seed(670725112)

# Number of images the buffers are preallocated for; they are trimmed to
# the number of images actually sampled before being saved.
expected_num_images = 2021
new_data = np.empty((expected_num_images, 32, 32, 3), float)
new_labels = np.empty(expected_num_images, int)

# These keywords need 0 new images in the new dataset
for excluded_keyword in ('sport_car', 'door', 'ford', 'opel', 'sports_car'):
    if excluded_keyword in new_keywords:
        new_keywords.remove(excluded_keyword)

i = 0  # next free row in new_data / new_labels
threshold = 1000
tiny_image_map = []
new_indices_dict = {}
for keyword_name in new_keywords:
    # Keep a candidate only if its distances[idx][0][1] value (presumably
    # the l2 distance to its nearest CIFAR-10 neighbor -- TODO confirm)
    # exceeds the threshold and it is not on the blacklist (the blacklist
    # mostly contains near duplicates with CIFAR-10).
    # The candidates are iterated through a set, exactly as before, so the
    # list order -- and therefore the seeded sample -- is unchanged.
    cur_good_indices = [
        idx
        for idx in set(tinyimage_good_indices[keyword_name])
        if distances[idx][0][1] > threshold and idx not in blacklist
    ]

    # One new image per 30 CIFAR-10 images of this keyword, rounded up;
    # keywords with fewer than 15 CIFAR-10 images get none.
    num_cifar10_indices = cifar10_by_keyword[keyword_name]
    if num_cifar10_indices / 30 < 0.5:
        num_new_images = 0
    else:
        # math.ceil returns a plain int (np.int was removed in NumPy 1.24).
        num_new_images = math.ceil(num_cifar10_indices / 30)

    # Not enough candidates: report the shortfall and skip this keyword.
    if len(cur_good_indices) < num_new_images:
        print(keyword_name)
        print(num_new_images)
        print(len(cur_good_indices))
        continue

    # Sample the correct number of new indices
    sampled_indices = random.sample(cur_good_indices, num_new_images)
    new_indices_dict[keyword_name] = list(sampled_indices)

    # Add the images and labels for this keyword
    for idx in sampled_indices:
        tiny_image_map.append(idx)
        new_data[i] = img_data[idx]
        new_labels[i] = int(keyword_to_class[keyword_name])
        i += 1

print('Got {} images'.format(i))

# np.empty leaves unfilled rows uninitialized; drop them so the saved arrays
# line up with tiny_image_map. This is a no-op when every row was filled.
new_data = new_data[:i]
new_labels = new_labels[:i]

# Save a map from index in the new dataset to TinyImage index
with open('../other_data/cifar10.1_v4_ti_indices_map.json', 'w') as f:
    json.dump(tiny_image_map, f, indent=2)

np.save('../datasets/cifar10.1_v4_data.npy', new_data.astype(np.uint8))
np.save('../datasets/cifar10.1_v4_labels.npy', new_labels.astype(np.int32))


Got 2021 images
