In [1]:
import csv
import itertools
import json
import pathlib

import networkx as nx

In [2]:
# Category fields used as hierarchy levels, listed from the top of the
# hierarchy downwards. The last entry of each list is the leaf field.
INAT17_LEVELS = [
    'supercategory',
    'name',
]
INAT18_LEVELS = [
    'kingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'name',
]

In [3]:
# Disabled alternative sources for `categories`, kept for reference:
#
# with open('resources/hierarchy_raw/inat18/train2018.json') as f:
#     categories = json.load(f)['categories']
#
# with open('resources/hierarchy_raw/inat18/train2018_categories.json') as f:
#     categories = json.load(f)

# Active source: the list of category records used by the cells below.
categories = json.loads(
    pathlib.Path('resources/hierarchy_raw/inat18/categories.json').read_text()
)

In [4]:
def count_unique_values(categories):
    """Count the number of distinct values observed for each field.

    Args:
        categories: Iterable of dict records. Records may have different
            keys; a field is reported if it occurs in at least one record.
            Values must be hashable.

    Returns:
        Dict mapping each field name to the number of distinct values seen.
    """
    # Single pass, so one-shot iterables (generators, iterators) are counted
    # correctly instead of being exhausted before the values are collected.
    unique_values = {}
    for record in categories:
        for field, value in record.items():
            unique_values.setdefault(field, set()).add(value)
    return {field: len(values) for field, values in unique_values.items()}

# Display (count, field) pairs ordered by ascending count.
sorted(
    (count, field)
    for field, count in count_unique_values(categories).items()
)

[(6, 'kingdom'),
 (14, 'supercategory'),
 (25, 'phylum'),
 (57, 'class'),
 (272, 'order'),
 (1118, 'family'),
 (4401, 'genus'),
 (8142, 'id'),
 (8142, 'name')]

In [5]:
def make_graph_inat(levels, categories, root_name='Life', name_fn=None, exclude_na=False):
    """Build a directed parent -> child graph from flat category records.

    Args:
        levels: Field names ordered from the top of the hierarchy down.
            All but the last give internal nodes; the last gives the leaf.
        categories: Iterable of dict records indexed by the names in `levels`.
        root_name: Name of the node placed above every path.
        name_fn: Maps a non-empty tuple of (level, value) pairs to the
            identifier of the internal node at the end of that path.
            Defaults to the value of the last pair. Supplying a function of
            the whole path allows non-unique values to be disambiguated.
        exclude_na: If true, internal levels whose value is 'NA' are skipped.

    Returns:
        An nx.DiGraph whose edges point from parent to child.

    Raises:
        ValueError: If any node has more than one parent.
    """
    if name_fn is None:
        def name_fn(path):
            # Default naming: use the value at the node's own level.
            _level, value = path[-1]
            return value

    graph = nx.DiGraph()
    for record in categories:
        path = tuple((level, record[level]) for level in levels[:-1])
        if exclude_na:
            path = tuple(pair for pair in path if pair[1] != 'NA')

        # Walk down from the root, linking each prefix of the path to the next.
        parent = root_name
        for depth in range(1, len(path) + 1):
            child = name_fn(path[:depth])
            graph.add_edge(parent, child)
            parent = child
        # The leaf keeps its raw name; it is not passed through name_fn.
        graph.add_edge(parent, record[levels[-1]])

    # Check that each node has at most one parent.
    offenders = [node for node, degree in graph.in_degree() if degree > 1]
    if offenders:
        raise ValueError('nodes with multiple parents', offenders)

    return graph

In [6]:
# Naming conventions for internal nodes: each function maps a path of (level, name) pairs to a node identifier.

def full_path(path):
    """Name a node by joining the values along its path with '/'."""
    values = [value for _level, value in path]
    return '/'.join(values)

def node_with_level(path):
    """Name a node as '<level> <value>' using the last pair of its path."""
    level, value = path[-1]
    return '{} {}'.format(level, value)

In [7]:
# Build the hierarchy, naming internal nodes by their full path and keeping
# 'NA' levels in the path.
g = make_graph_inat(
    INAT18_LEVELS,
    categories,
    name_fn=full_path,
    exclude_na=False,
)

In [8]:
def dfs_edges_with_order(g, order):
    """List (parent, child) edges reached by walking up from each node in `order`.

    Nodes are processed in the sequence given by `order`. For each one, the
    not-yet-visited chain of ancestors is emitted top-down, so the edge into
    a node always appears after the edge into its parent. Nodes with no
    incoming edge contribute no edge.

    Args:
        g: Graph exposing `in_degree[node]` and `predecessors(node)`.
        order: Iterable of nodes (typically the leaves) fixing the output order.

    Returns:
        List of (parent, child) tuples.

    Raises:
        ValueError: If a visited node has more than one predecessor.
    """
    seen = set()
    result = []

    for start in order:
        # Climb towards the root, collecting edges bottom-up until reaching
        # a node that has already been handled or has no parent.
        chain = []
        current = start
        while current not in seen:
            seen.add(current)
            if not g.in_degree[current]:
                break
            preds = list(g.predecessors(current))
            if len(preds) > 1:
                raise ValueError('multiple parents', current, preds)
            (above,) = preds
            chain.append((above, current))
            current = above
        # Emit ancestors first so every parent edge precedes its children.
        result.extend(reversed(chain))

    return result

# Order the edges by the sequence in which the categories are listed.
edges = dfs_edges_with_order(
    g,
    [category['name'] for category in categories],
)

In [9]:
# Show the first ten edges as a quick check of the ordering.
edges[:10]

[('Life', 'Animalia'),
 ('Animalia', 'Animalia/Annelida'),
 ('Animalia/Annelida', 'Animalia/Annelida/Polychaeta'),
 ('Animalia/Annelida/Polychaeta', 'Animalia/Annelida/Polychaeta/Phyllodocida'),
 ('Animalia/Annelida/Polychaeta/Phyllodocida',
  'Animalia/Annelida/Polychaeta/Phyllodocida/Amphinomidae'),
 ('Animalia/Annelida/Polychaeta/Phyllodocida/Amphinomidae',
  'Animalia/Annelida/Polychaeta/Phyllodocida/Amphinomidae/Hermodice'),
 ('Animalia/Annelida/Polychaeta/Phyllodocida/Amphinomidae/Hermodice',
  'Hermodice carunculata'),
 ('Animalia/Annelida/Polychaeta', 'Animalia/Annelida/Polychaeta/Sabellida'),
 ('Animalia/Annelida/Polychaeta/Sabellida',
  'Animalia/Annelida/Polychaeta/Sabellida/Sabellariidae'),
 ('Animalia/Annelida/Polychaeta/Sabellida/Sabellariidae',
  'Animalia/Annelida/Polychaeta/Sabellida/Sabellariidae/Phragmatopoma')]

In [10]:
# Category files are read from input_dir; edge-list CSVs are written to output_dir.
input_dir = pathlib.Path('resources', 'hierarchy_raw')
output_dir = pathlib.Path('resources', 'hierarchy')

# Create the output directory (and any missing parents) if it does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True, mode=0o755)

In [11]:
# Export the hierarchy for inat18/categories.json as (parent, child) rows,
# naming internal nodes by their full path.
with open(input_dir / 'inat18/categories.json') as f:
    categories = json.load(f)
g = make_graph_inat(INAT18_LEVELS, categories, name_fn=full_path)
edges = dfs_edges_with_order(g, [x['name'] for x in categories])
# newline='' leaves line endings to the csv writer; without it, platforms that
# translate newlines would emit '\r\r\n' row terminators.
with open(output_dir / 'inat18.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(edges)

In [12]:
# Export the hierarchy for inat18/train2018_categories.json as (parent, child)
# rows, using the default node naming (the value at each level).
with open(input_dir / 'inat18/train2018_categories.json') as f:
    categories = json.load(f)
g = make_graph_inat(INAT18_LEVELS, categories)
edges = dfs_edges_with_order(g, [x['name'] for x in categories])
# newline='' leaves line endings to the csv writer; without it, platforms that
# translate newlines would emit '\r\r\n' row terminators.
with open(output_dir / 'inat18_anon.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(edges)

In [13]:
# Export the hierarchy for inat19/categories.json as (parent, child) rows,
# using the default node naming (the value at each level).
with open(input_dir / 'inat19/categories.json') as f:
    categories = json.load(f)
g = make_graph_inat(INAT18_LEVELS, categories)
edges = dfs_edges_with_order(g, [x['name'] for x in categories])
# newline='' leaves line endings to the csv writer; without it, platforms that
# translate newlines would emit '\r\r\n' row terminators.
with open(output_dir / 'inat19.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(edges)

In [14]:
# Export the hierarchy for inat19/train2019_categories.json as (parent, child)
# rows, using the default node naming (the value at each level).
with open(input_dir / 'inat19/train2019_categories.json') as f:
    categories = json.load(f)
g = make_graph_inat(INAT18_LEVELS, categories)
edges = dfs_edges_with_order(g, [x['name'] for x in categories])
# newline='' leaves line endings to the csv writer; without it, platforms that
# translate newlines would emit '\r\r\n' row terminators.
with open(output_dir / 'inat19_anon.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(edges)

In [15]:
# Export the two-level (supercategory -> name) hierarchy for
# inat17/train2017_categories.json as (parent, child) rows.
with open(input_dir / 'inat17/train2017_categories.json') as f:
    categories = json.load(f)
g = make_graph_inat(INAT17_LEVELS, categories)
edges = dfs_edges_with_order(g, [x['name'] for x in categories])
# newline='' leaves line endings to the csv writer; without it, platforms that
# translate newlines would emit '\r\r\n' row terminators.
with open(output_dir / 'inat17.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(edges)

In [16]:
# Export the hierarchy for inat21/train_categories.json as (parent, child)
# rows, naming internal nodes by their full path and keeping 'NA' levels.
with open(input_dir / 'inat21/train_categories.json') as f:
    categories = json.load(f)
g = make_graph_inat(INAT18_LEVELS, categories, name_fn=full_path, exclude_na=False)
edges = dfs_edges_with_order(g, [x['name'] for x in categories])
# newline='' leaves line endings to the csv writer; without it, platforms that
# translate newlines would emit '\r\r\n' row terminators.
with open(output_dir / 'inat21.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(edges)