# Let's map ImageNet (1000) labels to COCO (80) labels!

In [1]:
import operator
from collections import defaultdict
from nltk.corpus import wordnet as wn

### Read ImageNet labels
- Each label in imagenet may contain multiple entities, e.g., 'tench, Tinca tinca'. 
- These entities are all from wordnet synsets.
- We split them and normalize each entity to be queryable by wordnet synset, e.g., 'tench, Tinca tinca' -> ['tench', 'tinca_tinca'].

In [2]:
# Three parallel lists, one entry per ImageNet class:
#   imagenet_ids          - the text before the first ': ' on each line
#   imagenet_labels       - the raw label text after it, e.g. 'tench, Tinca tinca'
#   imagenet_labels_words - the label split into normalized entities,
#                           e.g. ['tench', 'tinca_tinca']
imagenet_ids = []
imagenet_labels = []
imagenet_labels_words = []

with open('./imagenet_1000_labels.txt', 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. an empty line at the end of the file)
            # instead of failing on the two-value unpack below.
            continue
        # Split on the first ': ' only, so a label that itself contains ': '
        # stays intact rather than breaking the unpack.
        id_, label = line.split(': ', 1)
        # Entities are comma-separated; inner whitespace becomes '_' and the
        # text is lower-cased so each entity can be used as a WordNet query.
        label_words = ['_'.join(s.split()).lower() for s in label.split(', ') if s]
        imagenet_ids.append(id_)
        imagenet_labels.append(label)
        imagenet_labels_words.append(label_words)

In [3]:
# Sanity check: the three lists are filled in lock-step, so their lengths should agree.
tuple(len(seq) for seq in (imagenet_ids, imagenet_labels, imagenet_labels_words))

(1000, 1000, 1000)

### Read COCO labels
We also normalize them to the WordNet synset format, e.g., 'traffic light' -> 'traffic_light'.

In [4]:
# COCO class names, one per line of the input file, normalized the same way as
# the ImageNet entities: whitespace runs become '_' and the text is lower-cased.
coco_labels = []

with open('coco_labels.txt', 'r') as file:
    # str.split() with no arguments already discards leading/trailing
    # whitespace (including the newline), so no separate strip() is needed.
    coco_labels.extend('_'.join(raw_line.split()).lower() for raw_line in file)

len(coco_labels)

80

### How do we map an ImageNet label to a COCO label?
1. For an ImageNet label, look up the WordNet synsets of every entity in that label and take the synset that occurs most often as the label's common synset.
2. With the common synset, we can get all its hypernyms.
3. If one of its hypernyms matches a COCO label, we map this ImageNet label to that COCO label (each ImageNet label maps to at most one COCO label).

In [5]:
def common_synset(words):
    """Return the WordNet synset shared by the most entries of `words`.

    Every word is looked up with ``wn.synsets``; the synset name that shows up
    most often across all lookups is selected. Ties are resolved in favour of
    the synset that was encountered first.

    Args:
        words: iterable of WordNet-style query strings (e.g. ``'tinca_tinca'``).

    Returns:
        The synset object obtained from ``wn.synset`` for the winning name.

    Raises:
        ValueError: if none of the words yields any synset. Without this check
            there would be no winning name to look up.
    """
    words = list(words)
    counts = defaultdict(int)
    for word in words:
        for synset in wn.synsets(word):
            counts[synset.name()] += 1

    if not counts:
        raise ValueError(f'no WordNet synset found for any of {words!r}')

    # max() keeps the first key reaching the highest count (dicts iterate in
    # insertion order), which is the same winner a stable descending sort picks.
    common_synset_name = max(counts, key=counts.get)
    return wn.synset(common_synset_name)

def get_hypernyms(synset):
    """Collect the bare names of the synsets reported by ``synset.hypernym_distances()``.

    Each entry's first element is taken as a synset, and its ``name()`` is cut
    at the first '.', so a name such as 'traffic_light.n.01' becomes
    'traffic_light'.

    Args:
        synset: object exposing ``hypernym_distances()``; presumably an NLTK
            WordNet synset whose entries are (synset, distance) pairs — confirm
            against the NLTK documentation.

    Returns:
        A list of name strings, in the iteration order of ``hypernym_distances()``.
    """
    names = []
    for entry in synset.hypernym_distances():
        full_name = entry[0].name()
        # Keep only the part before the first '.'.
        names.append(full_name.partition('.')[0])
    return names

In [6]:
# imagenet2coco[i] holds the COCO label matched for ImageNet class i,
# or None when none of its hypernym names is a COCO label.
imagenet2coco = [None] * len(imagenet_labels)

for idx, label_words in enumerate(imagenet_labels_words):
    hypernym_names = get_hypernyms(common_synset(label_words))
    matched = [coco_label for coco_label in coco_labels if coco_label in hypernym_names]
    if matched:
        # When several COCO labels match, the one appearing last in
        # coco_labels is kept.
        imagenet2coco[idx] = matched[-1]

In [7]:
# 725 ImageNet labels have no matching COCO label.
# `is None` is the identity test for the unmapped placeholder; summing the
# booleans counts them without building a temporary list.
sum(label is None for label in imagenet2coco)

725

### Save results
Format of each line: `{imagenet_folder_id}\t{imagenet_label}\t{coco_label}`. Note that `{coco_label}` is written as the literal string `None` when no mapping was found.

In [8]:
# Write one tab-separated record per ImageNet class. Records are joined with
# '\n', so the file does not end with a trailing newline.
with open('imagenet2coco.txt', 'w') as file:
    file.write('\n'.join(
        f'{id_}\t{label}\t{coco_label}'
        for id_, label, coco_label in zip(imagenet_ids, imagenet_labels, imagenet2coco)
    ))

### Reference
- [ImageNet download](http://image-net.org/download): Log in first, then download the original images (2012).
- ImageNet preprocessing:
    - [Extract nested training image tars](https://github.com/arundasan91/Deep-Learning-with-Caffe/blob/master/Imagenet/How-to-properly-set-up-Imagenet-Dataset.md)
    - [Move validation images to hashed-class-named folders ](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh)
- [ImageNet all class mapping](https://github.com/tensorflow/models/blob/master/research/inception/inception/data/imagenet_metadata.txt)
- [Imagenet 1000 class mapping](https://gist.github.com/yrevar/667fd94b94f1666137f45d1363f60910): We re-format it in our repo.
- [Simpler human-readable labels for ImageNet](https://github.com/anishathalye/imagenet-simple-labels)
- [ImageNet training in PyTorch](https://github.com/pytorch/examples/tree/master/imagenet)