## A notebook for looking through the Tiny Images keywords corresponding to CIFAR-10

### Setup, data loading, and some checks

In [1]:
%load_ext autoreload
%autoreload 2

from collections import Counter
import json
import os
import sys

# Make the repository's code directory importable. The path is resolved
# relative to the notebook's current working directory.
repo_root = os.path.join(os.getcwd(), '../code')
# Only add the entry once: this setup cell is typically re-run several times in
# a session, and an unconditional append would grow sys.path with duplicates.
if repo_root not in sys.path:
    sys.path.append(repo_root)

import cifar10
import utils

# Per-image keyword records, loaded through the project helper (the recorded
# output shows it printing the JSON file it reads).
keywords = utils.load_cifar10_keywords(
    unique_keywords=True,
    lists_for_unique=True,
    version_string='',
)

# CIFAR-10 data wrapper; later cells read its `label_names` and `all_labels`.
cifar = cifar10.CIFAR10Data('../other_data/cifar10')

Loading keywords from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/other_data/cifar10_keywords_unique.json


Example entry

In [2]:
# Peek at the first three entries to see the layout of a keyword record.
keywords[:3]

[[{'nn_index': 40372906,
   'nn_keyword': 'leptodactylus_pentadactylus',
   'nn_l2_dst': 0.0}],
 [{'nn_index': 11334562, 'nn_keyword': 'camion', 'nn_l2_dst': 0.0}],
 [{'nn_index': 71913805, 'nn_keyword': 'tipper_truck', 'nn_l2_dst': 0.0}]]

Check how many indices have multiple associated keywords (should be 0 for a unique keywords file).

In [3]:
# Positions in `keywords` whose entry holds more than one keyword record.
indices_with_multiple = [
    idx for idx, entries in enumerate(keywords) if len(entries) > 1
]

# Bare last expression so the notebook displays the count.
len(indices_with_multiple)

0

### Compute the keyword counts per class.

In [4]:
# One keyword Counter per CIFAR-10 class name.
class_counters = {name: Counter() for name in cifar.label_names}

# Tally every keyword of every image under that image's class.
# NOTE: assumes `keywords` and `cifar.all_labels` are index-aligned.
for ii, kws in enumerate(keywords):
    label_name = cifar.label_names[cifar.all_labels[ii]]
    class_counters[label_name].update(kw['nn_keyword'] for kw in kws)

### Show the top k keywords for each class

In [5]:
# Number of most frequent keywords to report per class.
top_k = 100

# Also emit the keywords as comma-separated lists (per class and overall).
print_csv = True


all_keywords = []
for class_name, counter in class_counters.items():
    # Rank once and reuse the result for the sums, the listing and the CSV line.
    ranked = counter.most_common(top_k)
    total_count = sum(counter.values())
    top_k_count = sum(count for keyword, count in ranked)
    cur_keywords = [keyword for keyword, count in ranked]
    all_keywords += cur_keywords

    # Per-class header line followed by one "keyword: count" line per keyword.
    print('{}  (keyword count sum: {}, top {} keyword count sum: {}, number of keywords: {})'.format(class_name, total_count, top_k, top_k_count, len(counter)))
    for keyword, count in ranked:
        print('  {}: {}'.format(keyword, count))
    if print_csv:
        print('')
        print('  Comma-separated list of keywords: {}'.format(','.join(cur_keywords)))
    print('\n')

# Overall list: the per-class top-k keywords concatenated in class order.
if print_csv:
    print('Comma-separated list of all keywords: {}'.format(','.join(all_keywords)))

airplane  (keyword count sum: 6000, top 100 keyword count sum: 6000, number of keywords: 29)
  stealth_bomber: 587
  airbus: 537
  stealth_fighter: 474
  fighter_aircraft: 466
  biplane: 443
  attack_aircraft: 401
  airliner: 391
  jetliner: 345
  monoplane: 335
  dive_bomber: 314
  jumbo_jet: 314
  twinjet: 311
  jumbojet: 182
  propeller_plane: 167
  fighter: 133
  amphibious_aircraft: 119
  multiengine_airplane: 105
  plane: 92
  seaplane: 82
  reconnaissance_plane: 53
  airplane: 38
  floatplane: 31
  aeroplane: 23
  hangar_queen: 19
  bomber: 15
  multiengine_plane: 14
  kamikaze: 4
  interceptor: 4
  hydroplane: 1

  Comma-separated list of keywords: stealth_bomber,airbus,stealth_fighter,fighter_aircraft,biplane,attack_aircraft,airliner,jetliner,monoplane,dive_bomber,jumbo_jet,twinjet,jumbojet,propeller_plane,fighter,amphibious_aircraft,multiengine_airplane,plane,seaplane,reconnaissance_plane,airplane,floatplane,aeroplane,hangar_queen,bomber,multiengine_plane,kamikaze,interceptor,hydroplane

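### Print information for images with multiple keywords

This cell produces no output when the unique keywords file is loaded, because no index has more than one keyword (see the check above).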

In [6]:
# For (up to) the first 40 images that matched more than one keyword, print the
# image's class and how often each of its keywords occurs within that class.
for ii in indices_with_multiple[:40]:
    # Derive the class name from the CIFAR-10 labels, the same way the keys of
    # `class_counters` were built. The keyword records shown in the example
    # output only carry 'nn_index', 'nn_keyword' and 'nn_l2_dst', so reading a
    # 'cifar10_label' key from them would raise KeyError.
    cur_class = cifar.label_names[cifar.all_labels[ii]]
    kws = [entry['nn_keyword'] for entry in keywords[ii]]
    print('\nindex {}  class {}'.format(ii, cur_class))
    for kw in kws:
        # Counter lookups return 0 for keywords never seen in this class.
        print('  {}: {}'.format(kw, class_counters[cur_class][kw]))