# Generating a hypernyms tree
This notebook shows how to derive WordNet hypernym classes for the 1000 standard ImageNet (ILSVRC2012) classes, add them as extra label columns to the validation labels, and save a JSON file with the name and definition of every label.

In [32]:
import json
import pandas as pd

# Load the ImageNet class file into `data`. The hypernym-collection cell
# iterates `data.items()` and treats each value as a WordNet synset id
# (e.g. 'n01440764').
# A context manager is used so the file handle is closed after reading.
with open('../assets/imagenet/ILSVRC2012_classes_original.json') as classes_file:
    data = json.load(classes_file)

### Open file with image labels

In [88]:
# Copy the validation-label CSV into the repository's assets folder.
# NOTE(review): the source is an absolute path inside one user's home
# directory, so this cell only works on that machine; adjust the source path
# before running elsewhere.
!cp /Users/kirillbykov/Documents/Work/INVERT/data/imagenet_val/ILSVRC2012_val.csv ../assets/imagenet/ILSVRC2012_val.csv

In [80]:
# Read the validation labels (one row per image, one column per class synset)
# from the copy that the preceding `cp` cell places in the repository assets,
# instead of from a machine-specific absolute path.
df = pd.read_csv('../assets/imagenet/ILSVRC2012_val.csv')
df

Unnamed: 0,image_name,n01440764,n01443537,n01484850,n01491361,n01494475,n01496331,n01498041,n01514668,n01514859,...,n12768682,n12985857,n12998815,n13037406,n13040303,n13044778,n13052670,n13054560,n13133613,n15075141
0,ILSVRC2012_val_00000001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ILSVRC2012_val_00000002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ILSVRC2012_val_00000003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ILSVRC2012_val_00000004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ILSVRC2012_val_00000005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,ILSVRC2012_val_00049996,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49996,ILSVRC2012_val_00049997,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49997,ILSVRC2012_val_00049998,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49998,ILSVRC2012_val_00049999,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Collecting hypernyms

In [None]:
! pip install nltk
! pip install wn
! python -m wn download omw-en:1.4  # the Open English WordNet 2021

In [None]:
import nltk
from nltk.corpus import wordnet as wn

# The NLTK WordNet reader needs the 'wordnet' corpus on disk; the `wn` package
# download in the install cell does not provide it. Fetch it only when missing.
try:
    wn.ensure_loaded()
except LookupError:
    nltk.download('wordnet')


def hyper(s):
    """Return the direct hypernyms of synset `s`; used as the relation for `closure`."""
    return s.hypernyms()


# Accumulate one label Series per hypernym in a dict and build the frame once
# at the end. Inserting hundreds of columns one at a time into a DataFrame
# fragments it and triggers pandas' PerformanceWarning.
hypernym_columns = {}
for index, synset_id in data.items():
    # Synset ids look like 'n01440764': a POS letter followed by the offset.
    synset = wn.synset_from_pos_and_offset('n', int(synset_id[1:]))
    class_labels = df[synset_id]
    # `closure` walks every ancestor reachable through the hypernym relation.
    for hypernym in synset.closure(hyper):
        column_name = '%s%08d' % (hypernym.pos(), hypernym.offset())
        if column_name in hypernym_columns:
            # Summing is possible because ImageNet labels do not intersect.
            # A new Series is built (no in-place `+=`) so that the columns of
            # `df` stored in the dict are never mutated.
            hypernym_columns[column_name] = hypernym_columns[column_name] + class_labels
        else:
            hypernym_columns[column_name] = class_labels

hypernyms_df = pd.DataFrame(hypernym_columns)

#(optional) delete all hypernyms that contain only one class (i.e. 50 images)
#hypernyms_df = hypernyms_df.loc[:, (hypernyms_df.sum(axis=0) > 50)]

In [82]:
# Place the hypernym label columns to the right of the existing columns,
# keeping only the rows whose index is present in both frames.
df = pd.concat(objs=[df, hypernyms_df], join="inner", axis="columns")
df

Unnamed: 0,image_name,n01440764,n01443537,n01484850,n01491361,n01494475,n01496331,n01498041,n01514668,n01514859,...,n00017222,n12056217,n12041446,n12157056,n12156819,n12992868,n12997654,n15074962,n14974264,n14580897
0,ILSVRC2012_val_00000001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ILSVRC2012_val_00000002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ILSVRC2012_val_00000003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ILSVRC2012_val_00000004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ILSVRC2012_val_00000005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,ILSVRC2012_val_00049996,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49996,ILSVRC2012_val_00049997,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49997,ILSVRC2012_val_00049998,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49998,ILSVRC2012_val_00049999,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Create a JSON file with descriptions

In [83]:
# Show the column labels of `df`; the description cell below skips the first
# one and treats every remaining label as a WordNet synset id.
df.columns

Index(['image_name', 'n01440764', 'n01443537', 'n01484850', 'n01491361',
       'n01494475', 'n01496331', 'n01498041', 'n01514668', 'n01514859',
       ...
       'n00017222', 'n12056217', 'n12041446', 'n12157056', 'n12156819',
       'n12992868', 'n12997654', 'n15074962', 'n14974264', 'n14580897'],
      dtype='object', length=1861)

In [85]:
from tqdm import tqdm

# Build a mapping from every label column to its WordNet name, definition and
# offset id. The first column of `df` is skipped because it is not a label.
ILSVRC2012_label_description = {}

for column in tqdm(df.columns[1:]):
    offset_digits = column[1:]
    entry = wn.synset_from_pos_and_offset('n', int(offset_digits))
    ILSVRC2012_label_description[column] = dict(
        name=entry.name(),
        definition=entry.definition(),
        offset='n' + offset_digits,
    )

100%|██████████| 1860/1860 [00:00<00:00, 131642.63it/s]


In [87]:
import json

# Serialize the label descriptions and write them to the assets folder.
output_path = '../assets/imagenet/ILSVRC2012_classes.json'
with open(output_path, mode='w') as json_file:
    json_file.write(json.dumps(ILSVRC2012_label_description))