In [1]:
import collections
import csv
import itertools
import json
import pathlib

import networkx as nx
import numpy as np

import util

In [2]:
# Base directory for all inputs and outputs of this notebook.  The path is
# relative, so it is resolved against the kernel's working directory.
RESOURCES_DIR = pathlib.Path('../resources')

In [3]:
# Category fields that make up one root-to-leaf path, listed from the level
# nearest the root down to the leaf.  Passed as `levels` to make_graph_inat.

# Used for the inat17 export.
INAT17_LEVELS = [
    'supercategory',
    'name',
]

# Used for the inat18, inat19 and inat21 exports.
INAT18_LEVELS = [
    'kingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'name',
]

In [4]:
# Disabled alternative inputs, kept for reference:
#
#   with open(RESOURCES_DIR / 'hierarchy_raw/inat18/train2018.json') as f:
#       categories = json.load(f)['categories']
#
#   with open(RESOURCES_DIR / 'hierarchy_raw/inat18/categories_anon.json') as f:
#       categories = json.load(f)

# Active input: the iNat18 category records used by the exploratory cells below.
with (RESOURCES_DIR / 'hierarchy_raw/inat18/categories.json').open() as f:
    categories = json.load(f)

In [5]:
def count_unique_values(categories):
    """Count the distinct values observed for each field across ``categories``.

    Args:
        categories: Iterable of mappings from field name to a hashable value.
            It is traversed exactly once, so a generator or other one-shot
            iterator works as well as a list.

    Returns:
        A dict mapping every field that occurs in at least one record to the
        number of distinct values seen for it.  Keys appear in order of first
        occurrence, so the result is deterministic across runs.

    Raises:
        TypeError: If a field value is unhashable.
    """
    unique_values = {}
    for record in categories:
        for field, value in record.items():
            # Create the per-field set lazily; records need not share keys.
            unique_values.setdefault(field, set()).add(value)
    return {field: len(values) for field, values in unique_values.items()}

# (count, field) pairs, ordered from the field with the fewest distinct values
# to the one with the most.
sorted((count, field) for field, count in count_unique_values(categories).items())

[(6, 'kingdom'),
 (14, 'supercategory'),
 (25, 'phylum'),
 (57, 'class'),
 (272, 'order'),
 (1118, 'family'),
 (4401, 'genus'),
 (8142, 'id'),
 (8142, 'name')]

In [6]:
def make_graph_inat(levels, categories, root_name='Life', exclude_na=False):
    """Build a directed graph of the hierarchy described by ``categories``.

    Every category contributes one path: ``root_name`` followed by the
    category's value for each entry of ``levels``, in order.  A name that is
    seen under more than one distinct parent name is rewritten as
    ``'<parent>/<name>'``, where ``<parent>`` is the (possibly already
    rewritten) preceding entry of the same path.

    Args:
        levels: Keys to read from each category, ordered from the level
            nearest the root to the leaf level.
        categories: Iterable of mappings; each must contain every key in
            ``levels``.
        root_name: Name placed at the start of every path.
        exclude_na: If true, entries equal to the string ``'NA'`` are dropped
            from each path before parents are computed.

    Returns:
        An ``nx.DiGraph`` with an edge from each path entry to its successor.

    Raises:
        KeyError: If a category lacks one of ``levels``.
    """
    paths = []
    for cat in categories:
        path = [root_name] + [cat[level] for level in levels]
        if exclude_na:
            path = [x for x in path if x != 'NA']
        paths.append(path)

    # Collect the distinct parent names seen for every non-root path entry.
    parents = collections.defaultdict(set)
    for path in paths:
        for u, v in zip(path, path[1:]):
            parents[v].add(u)
    ambiguous = {name for name, seen in parents.items() if len(seen) > 1}

    # Rename ambiguous entries in place, walking each path top-down so that a
    # renamed parent is reflected in the names of its renamed descendants.
    # Index 0 is skipped: the first entry has no parent within the path, and
    # `path[i - 1]` would otherwise wrap around to the last entry.
    for path in paths:
        for i in range(1, len(path)):
            if path[i] in ambiguous:
                path[i] = '{parent}/{node}'.format(parent=path[i - 1], node=path[i])

    g = nx.DiGraph()
    for path in paths:
        g.add_edges_from(zip(path, path[1:]))
    return g

In [7]:
# Build the iNat18 graph from the categories loaded above; 'NA' entries are kept.
g = make_graph_inat(levels=INAT18_LEVELS, categories=categories, exclude_na=False)

In [8]:
# Edge list from the project helper, given the category names in file order.
edges = util.dfs_edges_with_order(g, [category['name'] for category in categories])

In [9]:
# Preview the first ten edges.
edges[:10]

[('Life', 'Animalia'),
 ('Animalia', 'Annelida'),
 ('Annelida', 'Polychaeta'),
 ('Polychaeta', 'Phyllodocida'),
 ('Phyllodocida', 'Amphinomidae'),
 ('Amphinomidae', 'Hermodice'),
 ('Hermodice', 'Hermodice carunculata'),
 ('Polychaeta', 'Sabellida'),
 ('Sabellida', 'Sabellariidae'),
 ('Sabellariidae', 'Phragmatopoma')]

In [10]:
def print_summary(g: nx.DiGraph):
    """Print a one-line summary of the node counts and branching of ``g``.

    The line reports the total number of nodes, the number of leaves
    (out-degree 0), the number of internal nodes (out-degree > 0), how many of
    those are "non-trivial" (out-degree > 1), and the median out-degree taken
    over the internal nodes and over the non-trivial nodes.
    """
    out_degrees = np.array([g.out_degree[node] for node in g])
    is_internal = out_degrees > 0
    is_branching = out_degrees > 1

    template = 'nodes {}, leaf {}, internal {} (non-trivial {}), median branch {} (non-trivial {})'
    print(template.format(
        len(g),
        np.sum(out_degrees == 0),
        np.sum(is_internal),
        np.sum(is_branching),
        np.median(out_degrees[is_internal]),
        np.median(out_degrees[is_branching]),
    ))

In [11]:
# Node and branching statistics for the iNat18 graph built above.
print_summary(g)

nodes 14036, leaf 8142, internal 5894 (non-trivial 2111), median branch 1.0 (non-trivial 3.0)


In [12]:
# Raw category files are read from `input_dir`; CSV edge lists go to `output_dir`.
input_dir = RESOURCES_DIR.joinpath('hierarchy_raw')
output_dir = RESOURCES_DIR.joinpath('hierarchy')

# Create the output directory (and any missing ancestors); a no-op if it exists.
output_dir.mkdir(parents=True, exist_ok=True, mode=0o755)

In [13]:
# Export the iNat18 hierarchy as a CSV edge list, one edge per row.
with open(input_dir / 'inat18/categories.json') as f:
    categories = json.load(f)
g = make_graph_inat(INAT18_LEVELS, categories)
print_summary(g)
edges = util.dfs_edges_with_order(g, [x['name'] for x in categories])
# newline='' leaves line endings to the csv module; without it, platforms that
# translate '\n' on write end every row with '\r\r\n'.
with open(output_dir / 'inat18.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(edges)

nodes 14036, leaf 8142, internal 5894 (non-trivial 2111), median branch 1.0 (non-trivial 3.0)


In [14]:
# Export the hierarchy built from the iNat18 'categories_anon.json' file as a
# CSV edge list, one edge per row.
with open(input_dir / 'inat18/categories_anon.json') as f:
    categories = json.load(f)
g = make_graph_inat(INAT18_LEVELS, categories)
print_summary(g)
edges = util.dfs_edges_with_order(g, [x['name'] for x in categories])
# newline='' leaves line endings to the csv module; without it, platforms that
# translate '\n' on write end every row with '\r\r\n'.
with open(output_dir / 'inat18_anon.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(edges)

nodes 14036, leaf 8142, internal 5894 (non-trivial 2111), median branch 1.0 (non-trivial 3.0)


In [15]:
# Export the iNat19 hierarchy as a CSV edge list, one edge per row.
with open(input_dir / 'inat19/categories.json') as f:
    categories = json.load(f)
g = make_graph_inat(INAT18_LEVELS, categories)
print_summary(g)
edges = util.dfs_edges_with_order(g, [x['name'] for x in categories])
# newline='' leaves line endings to the csv module; without it, platforms that
# translate '\n' on write end every row with '\r\r\n'.
with open(output_dir / 'inat19.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(edges)

nodes 1190, leaf 1010, internal 180 (non-trivial 103), median branch 2.0 (non-trivial 11.0)


In [16]:
# Export the hierarchy built from the iNat19 'categories_anon.json' file as a
# CSV edge list, one edge per row.
with open(input_dir / 'inat19/categories_anon.json') as f:
    categories = json.load(f)
g = make_graph_inat(INAT18_LEVELS, categories)
print_summary(g)
edges = util.dfs_edges_with_order(g, [x['name'] for x in categories])
# newline='' leaves line endings to the csv module; without it, platforms that
# translate '\n' on write end every row with '\r\r\n'.
with open(output_dir / 'inat19_anon.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(edges)

nodes 1190, leaf 1010, internal 180 (non-trivial 103), median branch 2.0 (non-trivial 11.0)


In [17]:
# Export the iNat17 hierarchy (supercategory -> name only) as a CSV edge list,
# one edge per row.
with open(input_dir / 'inat17/categories.json') as f:
    categories = json.load(f)
g = make_graph_inat(INAT17_LEVELS, categories)
print_summary(g)
edges = util.dfs_edges_with_order(g, [x['name'] for x in categories])
# newline='' leaves line endings to the csv module; without it, platforms that
# translate '\n' on write end every row with '\r\r\n'.
with open(output_dir / 'inat17.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(edges)

nodes 5103, leaf 5089, internal 14 (non-trivial 14), median branch 104.0 (non-trivial 104.0)


In [18]:
# Export the iNat21 hierarchy as a CSV edge list, one edge per row.  'NA'
# entries are explicitly kept in the paths (exclude_na=False).
with open(input_dir / 'inat21/categories.json') as f:
    categories = json.load(f)
g = make_graph_inat(INAT18_LEVELS, categories, exclude_na=False)
print_summary(g)
edges = util.dfs_edges_with_order(g, [x['name'] for x in categories])
# newline='' leaves line endings to the csv module; without it, platforms that
# translate '\n' on write end every row with '\r\r\n'.
with open(output_dir / 'inat21.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(edges)

nodes 16344, leaf 10000, internal 6344 (non-trivial 2445), median branch 1.0 (non-trivial 3.0)
