## 1. Download the preprocessed data file from Google Drive

https://drive.google.com/file/d/1gpRoUAUUmSHYL2a103FybDbufE6iX7sS/view?usp=sharing

In [32]:
import os
import gdown

def download_file_from_google_drive(file_id, destination):
    """Download a Google Drive file to ``destination`` unless it already exists.

    Args:
        file_id: Google Drive file ID, inserted into the ``uc?id=`` download URL.
        destination: Local path the file is written to.

    Returns:
        None. Prints "File already exists." when the download is skipped.
    """
    if os.path.exists(destination):
        print("File already exists.")
        return

    # Create the target folder up front so the download cannot fail merely
    # because the folder is missing (e.g. `data/` on a fresh checkout).
    # A bare file name has no parent folder to create.
    parent_dir = os.path.dirname(destination)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    url = f'https://drive.google.com/uc?id={file_id}'
    gdown.download(url, destination, quiet=False)

# Google Drive file ID and the local path the dataset is stored under.
file_id = "1gpRoUAUUmSHYL2a103FybDbufE6iX7sS"
destination = "data/data.pkl"

# The helper skips the download when the file is already on disk.
download_file_from_google_drive(file_id=file_id, destination=destination)

File already exists.


## 2. Load data

In [20]:
import pickle


def unpickle(file):
    """Load and return the object stored in the pickle file at ``file``.

    Note: unpickling can execute arbitrary code, so only call this on files
    obtained from a trusted source.
    """
    with open(file, mode='rb') as pickled_stream:
        return pickle.load(pickled_stream)


# Load the preprocessed dataset; this is the same path the download step
# writes to ('data/data.pkl').
training_data = unpickle('data/data.pkl')

Structure of the preprocessed data:
```python
{
    "Images": [...],
    "Labels": [...],
    "GeneralClass": [...]
}
```

- `training_data["Images"]` is a list of 32x32x3 ndarrays, each representing one image.
- `training_data["Labels"]` holds each image's subclass ID, ranging from 0 to 87 (see
  `subclass_mapping.csv`).
- `training_data["GeneralClass"]` holds each image's general-class ID, ranging from 0 to 3 (see
  `superclass_mapping.csv`).

## 3. Load labels mapping

In [23]:
import csv

# Build the label lookup tables from data/subclass_mapping.csv.
#
# NOTE(review): subclass_generalclass_map is documented as keyed by subclass
# ID, but the loop below keys it by row[4] -- the same column used as the key
# of imageNetId_taskId_map -- while subclassId_className_map is keyed by
# row[0]. Confirm against the CSV columns which one is intended.
subclass_generalclass_map = {}  # intended: {subclass ID: general class ID}
imageNetId_taskId_map = {}  # intended: {ImageNet ID: subclass ID}
subclassId_className_map = {}  # {subclass ID: subclass name}
# {general class ID: general class name}
generalclassId_className_map = {0: "bird", 1: "dog", 2: "reptile", 3: "novel"}
with open("data/subclass_mapping.csv", "r", newline="", encoding="utf-8") as f:
    csv_file = csv.reader(f)
    # Skip the first row (presumably the header -- confirm against the CSV).
    next(csv_file, None)

    for row in csv_file:
        try:
            subclass_generalclass_map[int(row[4])] = int(row[3])
            imageNetId_taskId_map[int(row[4])] = int(row[0])
            subclassId_className_map[int(row[0])] = row[1]
        except ValueError:
            # Rows with a non-integer field are skipped silently. Assignments
            # made before the failing statement are kept, so such a row can
            # leave the maps partially updated.
            # NOTE(review): a row with too few columns raises IndexError,
            # which is not caught here.
            pass
    # The "Novel" subclass is registered by hand under ID 87.
    # NOTE(review): presumably its row is not parsed from the CSV -- confirm.
    subclassId_className_map[87] = "Novel"

## 4. Some statistics

In [21]:
# Tally how many images belong to each general class (IDs 0 through 3).
generalclass_count = dict.fromkeys(range(4), 0)
for super_class in training_data["GeneralClass"]:
    generalclass_count[super_class] = generalclass_count[super_class] + 1

# Report the totals in the order the class names are declared.
for super_class in generalclassId_className_map:
    print(f"{generalclassId_className_map[super_class]}: "
          f"{generalclass_count[super_class]} Images")

bird: 36224 Images
dog: 36224 Images
reptile: 36224 Images
novel: 36224 Images


In [26]:
# Start every known subclass at zero so classes without images are still listed.
subclass_count = {subclass_id: 0 for subclass_id in subclassId_className_map}

# Count one image per entry of the label list.
for subclass in training_data["Labels"]:
    subclass_count[subclass] = subclass_count[subclass] + 1

# Print each subclass name followed by its image count on an indented line.
for subclass in subclassId_className_map:
    print(f"{subclassId_className_map[subclass]}\n\t\t\t\t\t\t "
          f"{subclass_count[subclass]} Images")

Scotch terrier, Scottish terrier, Scottie
						 1300 Images
African chameleon, Chamaeleo chamaeleon
						 1252 Images
standard schnauzer
						 1300 Images
terrapin
						 1246 Images
great grey owl, great gray owl, Strix nebulosa
						 1245 Images
bustard
						 1241 Images
ptarmigan
						 1256 Images
Pekinese, Pekingese, Peke
						 1300 Images
hen
						 1261 Images
Lhasa, Lhasa apso
						 1300 Images
Lakeland terrier
						 1300 Images
pelican
						 1234 Images
Tibetan terrier, chrysanthemum dog
						 1300 Images
agama
						 1255 Images
junco, snowbird
						 1253 Images
mud turtle
						 1244 Images
cock
						 1253 Images
cairn, cairn terrier
						 1300 Images
Blenheim spaniel
						 1300 Images
brambling, Fringilla montifringilla
						 1256 Images
king penguin, Aptenodytes patagonica
						 1248 Images
Chihuahua
						 1300 Images
Japanese spaniel
						 772 Images
Dandie Dinmont, Dandie Dinmont terrier
						 1156 Images
bald eagle, American eagle, Haliaeetus leucocep

## 5. Check that images and labels are correctly aligned

In [29]:
from PIL import Image

# Save the first ten images (or fewer, if the dataset is smaller) as PNG files
# whose names encode the general class and the subclass, so the image/label
# alignment can be checked by eye.
# The output folder is created first so saving cannot fail on a missing folder.
os.makedirs("samples", exist_ok=True)
for i in range(min(10, len(training_data['Images']))):
    img = training_data['Images'][i]
    general_name = generalclassId_className_map[training_data["GeneralClass"][i]]
    subclass_name = subclassId_className_map[training_data["Labels"][i]]
    file_name = f"samples/{i}-{general_name}-{subclass_name}.png"
    Image.fromarray(img, 'RGB').save(file_name)
    print(file_name)

samples/0-novel-Novel.png
samples/1-reptile-triceratops.png
samples/2-reptile-Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis.png
samples/3-dog-Maltese dog, Maltese terrier, Maltese.png
samples/4-dog-basset, basset hound.png
samples/5-dog-West Highland white terrier.png
samples/6-dog-Maltese dog, Maltese terrier, Maltese.png
samples/7-bird-goldfinch, Carduelis carduelis.png
samples/8-novel-Novel.png
samples/9-dog-West Highland white terrier.png
