In [22]:
import collections
import csv
import functools
import gzip
import itertools
import json
import math
import pathlib

import lmdb
import networkx as nx
import numpy as np

import util

RESOURCES_DIR = pathlib.Path('../resources')

In [163]:
ontology_path = RESOURCES_DIR / 'hierarchy_raw/audioset/ontology.json'
with ontology_path.open() as f:
    ontology = json.load(f)

# Index the ontology entries by tag id and by display name, and collect the
# ids of the entries whose restrictions include 'blacklist'.
tag_to_cat = {}
tag_to_name = {}
name_to_tag = {}
blacklist = set()
for entry in ontology:
    tag, name = entry['id'], entry['name']
    tag_to_cat[tag] = entry
    tag_to_name[tag] = name
    name_to_tag[name] = tag
    if 'blacklist' in entry['restrictions']:
        blacklist.add(tag)

In [164]:
# Directed graph with one parent -> child edge per 'child_ids' entry.
g = nx.DiGraph()
g.add_edges_from(
    (entry['id'], child_id)
    for entry in ontology
    for child_id in entry['child_ids'])

In [165]:
# Top-level categories: nodes that no other node lists as a child.
top_nodes = [node for node, n_parents in g.in_degree() if n_parents == 0]
[tag_to_name[node] for node in top_nodes]

['Human sounds',
 'Animal',
 'Music',
 'Natural sounds',
 'Sounds of things',
 'Source-ambiguous sounds',
 'Channel, environment and background']

In [166]:
# Add a synthetic root node (not present in the ontology) above every top-level category.
g.add_edges_from(('root', top) for top in top_nodes)
tag_to_name['root'] = 'ROOT'

len(g)

633

In [173]:
# The labels do not cover the entire hierarchy.
# Take the subgraph that is reachable from the labels.

labels_csv = RESOURCES_DIR / 'hierarchy_raw/audioset/class_labels_indices.csv'
with open(labels_csv) as f:
    rows = csv.reader(f)
    next(rows)  # Discard the header row.
    # Column 1 is used as the tag id; keep the tags in file order.
    label_order = [fields[1] for fields in rows]

label_tags = label_order.copy()

In [174]:
# Note: Some of the *labels* are themselves in the blacklist.
dict((tag, tag_to_name[tag]) for tag in label_tags if tag in blacklist)

{'/m/01jg1z': 'Heart murmur',
 '/m/07r4gkf': 'Patter',
 '/m/074ft': 'Song',
 '/m/01h82_': 'Engine knocking',
 '/m/02z32qm': 'Fusillade',
 '/m/0b_fwt': 'Electronic tuner',
 '/m/07s8j8t': 'Roll',
 '/m/07qmpdm': 'Clatter',
 '/m/0b9m1': 'Harmonic',
 '/m/08p9q4': 'Sidetone',
 '/m/01jwx6': 'Vibration',
 '/m/07hvw1': 'Field recording'}

In [176]:
# Take the subgraph that is reachable from non-blacklisted labels.

def subgraph_with_ancestors(g, nodes):
    """Return the subgraph of `g` induced by `nodes` plus all of their ancestors.

    Args:
        g: Directed graph (networkx).
        nodes: Iterable of nodes of `g` to keep.

    Returns:
        `g.subgraph(...)` over the given nodes and every ancestor of each one.
    """
    seeds = set(nodes)
    keep = set(seeds)
    for seed in seeds:
        keep.update(nx.ancestors(g, seed))
    return g.subgraph(keep)

# Drop blacklisted labels, then keep only the remaining labels and their ancestors.
label_tags = list(itertools.filterfalse(blacklist.__contains__, label_tags))
g = subgraph_with_ancestors(g, label_tags)
len(g)

544

In [177]:
# There still exist some nodes with multiple parents.
multi_parent_names = {}
for node in g:
    if g.in_degree(node) > 1:
        multi_parent_names[tag_to_name[node]] = [
            tag_to_name[p] for p in g.predecessors(node)]
multi_parent_names

{'Children shouting': ['Shout', 'Human group actions'],
 'Choir': ['Singing', 'Musical instrument'],
 'Chant': ['Singing', 'Vocal music'],
 'Clapping': ['Hands', 'Human group actions'],
 'Hubbub, speech noise, speech babble': ['Human group actions', 'Noise'],
 'Howl': ['Dog', 'Canidae, dogs, wolves'],
 'Growling': ['Dog',
  'Cat',
  'Roaring cats (lions, tigers)',
  'Canidae, dogs, wolves'],
 'Hiss': ['Cat', 'Snake', 'Steam', 'Onomatopoeia'],
 'Clip-clop': ['Horse', 'Clicking'],
 'Cowbell': ['Cattle, bovinae', 'Percussion', 'Bell'],
 'Bleat': ['Goat', 'Sheep'],
 'Chirp, tweet': ['Bird vocalization, bird call, bird song', 'Brief tone'],
 'Buzz': ['Fly, housefly', 'Bee, wasp, etc.', 'Brief tone'],
 'Rattle': ['Snake', 'Onomatopoeia'],
 'Bell': ['Musical instrument', 'Sounds of things'],
 'Bicycle bell': ['Bell', 'Bicycle', 'Alarm'],
 'Beatboxing': ['Hip hop music', 'Vocal music'],
 'Wind noise (microphone)': ['Wind', 'Microphone'],
 'Crackle': ['Fire', 'Onomatopoeia'],
 'Vehicle horn, ca

In [178]:
# Just in case, check whether every node is "required".
# Each node is said to be "required" if
#   i) it is a leaf node, or
#   ii) it has a child that is "required" and has no other parents
# In this case, all nodes are required.

def find_required(g):
    """Map every node of `g` to whether it is "required".

    A node is required if it has no children, or if at least one of its
    children is required and has exactly one parent.

    Args:
        g: Directed graph exposing `successors(node)`, `in_degree[node]`
            and iteration over its nodes.

    Returns:
        Dict from node to bool.
    """
    memo = {}

    def is_required(node):
        # Memoise per node so shared descendants are evaluated only once.
        if node not in memo:
            kids = list(g.successors(node))
            if not kids:
                memo[node] = True
            else:
                memo[node] = any(
                    g.in_degree[kid] == 1 and is_required(kid) for kid in kids)
        return memo[node]

    return {node: is_required(node) for node in g}

required = find_required(g)
# `required` maps node -> bool. Iterating a dict yields its keys, which are
# non-empty strings and therefore always truthy, so test the values instead.
all(required.values())

True

In [180]:
# Check which labels have multiple paths from root to node.

def find_num_paths(g):
    """Count, for every node of `g`, the distinct paths reaching it from a parentless node.

    A node without parents counts as having one path; any other node has the
    sum of its parents' path counts.

    Args:
        g: Directed graph exposing `predecessors(node)` and iteration over nodes.

    Returns:
        Dict from node to its number of paths.
    """
    counts = {}

    def count_paths(node):
        if node in counts:
            return counts[node]
        parents = list(g.predecessors(node))
        # A parentless node (the root) is reached by exactly one path.
        total = sum(count_paths(parent) for parent in parents) if parents else 1
        counts[node] = total
        return total

    return {node: count_paths(node) for node in g}

num_paths = find_num_paths(g)

# Histogram of path counts over the labels; most labels have a single path.
path_count_hist = collections.Counter(num_paths[tag] for tag in label_tags)
{n: path_count_hist[n] for n in sorted(path_count_hist)}

{1: 469, 2: 41, 3: 1, 4: 4}

In [181]:
# Remove the nodes that have multiple paths.
# Check which labels we are removing.

def print_multiple_paths(g, label_tags):
    """Print, for each label reachable by more than one path, where the paths split and merge.

    For every such label the output lists:
      * each node among the label and its ancestors that has several parents
        (printed as `node <- [parents]`), and
      * each ancestor with several children that are themselves ancestors
        (printed as `[children] <- ancestor`).

    Args:
        g: Directed graph (networkx).
        label_tags: Iterable of tags (nodes of `g`) to inspect.
    """
    path_counts = find_num_paths(g)
    line = '  ({} <- {})'

    for tag in label_tags:
        if path_counts[tag] <= 1:
            continue
        ancestors = nx.ancestors(g, tag)

        print(tag_to_name[tag])

        # Merge points: the label itself or an ancestor with multiple parents.
        for node in itertools.chain([tag], ancestors):
            if g.in_degree[node] > 1:
                parents = set(g.predecessors(node))
                print(line.format(
                    tag_to_name[node], [tag_to_name[p] for p in parents]))

        # Fork points: ancestors with multiple children that are also ancestors.
        for node in ancestors:
            branches = set(g.successors(node)).intersection(ancestors)
            if len(branches) > 1:
                print(line.format(
                    [tag_to_name[b] for b in branches], tag_to_name[node]))

# Show the split/merge points for every label that currently has several paths.
print_multiple_paths(g, label_tags)

Children shouting
  (Children shouting <- ['Human group actions', 'Shout'])
  (['Human group actions', 'Human voice'] <- Human sounds)
Choir
  (Choir <- ['Musical instrument', 'Singing'])
  (['Music', 'Human sounds'] <- ROOT)
Chant
  (Chant <- ['Vocal music', 'Singing'])
  (['Music', 'Human sounds'] <- ROOT)
Mantra
  (Chant <- ['Vocal music', 'Singing'])
  (['Music', 'Human sounds'] <- ROOT)
Clapping
  (Clapping <- ['Human group actions', 'Hands'])
  (['Human group actions', 'Hands'] <- Human sounds)
Hubbub, speech noise, speech babble
  (Hubbub, speech noise, speech babble <- ['Noise', 'Human group actions'])
  (['Channel, environment and background', 'Human sounds'] <- ROOT)
Howl
  (Howl <- ['Canidae, dogs, wolves', 'Dog'])
  (['Domestic animals, pets', 'Wild animals'] <- Animal)
Growling
  (Growling <- ['Roaring cats (lions, tigers)', 'Cat', 'Canidae, dogs, wolves', 'Dog'])
  (['Cat', 'Dog'] <- Domestic animals, pets)
  (['Roaring cats (lions, tigers)', 'Canidae, dogs, wolves'] <- W

In [208]:
def delete_node(g, x):
    """Return `g` without node `x` and without anything no longer reachable from 'root'.

    Args:
        g: Directed graph (networkx) containing a 'root' node.
        x: Node to remove.

    Returns:
        `g` unchanged if `x` is not in it; otherwise a subgraph of `g`
        restricted to 'root' and the nodes still reachable from it once
        `x` is gone.
    """
    if x in g:
        pruned = nx.DiGraph(g)
        pruned.remove_node(x)
        # Only what can still be reached from the root survives.
        reachable = nx.descendants(pruned, 'root')
        return g.subgraph(['root', *reachable])
    return g

def delete_edge(g, u, v):
    """Return a copy of `g` without the edge (u, v).

    Unlike `delete_node`, nodes that become unreachable are not removed.

    Args:
        g: Directed graph (networkx).
        u: Source node of the edge.
        v: Target node of the edge.

    Returns:
        `g` itself if the edge does not exist, otherwise a new `nx.DiGraph`
        with the edge removed.
    """
    if g.has_edge(u, v):
        trimmed = nx.DiGraph(g)
        trimmed.remove_edge(u, v)
        return trimmed
    return g

In [231]:
# Try to conform DAG to a tree by making classes mutually exclusive!
# Could simply delete nodes with more than 1 path.
# However, we often find that classes like onomatopoeia
# lead to multiple labels per example at test time.

# The hierarchy should be firstly a hierarchy of sources
# and then a hierarchy of types of noise from that source.
# source category -> source -> sound category

# Helpful visualization:
# http://www.jordipons.me/apps/audioset/

# Manual pruning of the DAG. Each statement rebinds `g`:
#   * delete_node(g, x) removes x and everything no longer reachable from 'root'
#     (and is a no-op when x is already gone);
#   * delete_edge(g, u, v) removes a single edge and does NOT prune.
# Because of that the statements are order-dependent; do not reorder casually.
# name_to_tag[...] raises KeyError if a name is missing from the ontology.

# Channel, environment and background:
g = delete_node(g, name_to_tag['Acoustic environment'])
g = delete_node(g, name_to_tag['Sound reproduction'])
# Noise:
g = delete_node(g, name_to_tag['Background noise'])
g = delete_node(g, name_to_tag['Hubbub, speech noise, speech babble'])
g = delete_node(g, name_to_tag['Cacophony'])
g = delete_node(g, name_to_tag['Throbbing'])
g = delete_node(g, name_to_tag['Vibration'])

# Animal:
# "Growling" is a sound category for cat, dog, wild cats, wild dogs.
g = delete_node(g, name_to_tag['Growling'])
# "Howl" is a sound category for dog, wild dogs.
g = delete_node(g, name_to_tag['Howl'])
# "Hiss" is a sound category for cat, snake.
g = delete_node(g, name_to_tag['Hiss'])
# "Buzz" is a sound category for bee, fly.
g = delete_node(g, name_to_tag['Buzz'])
# "Bleat" is a sound category for goat, sheep.
g = delete_node(g, name_to_tag['Bleat'])
# "Cowbell" is a sound made by the bell, not the cow.
# (Although cowbell can be both musical instrument and bell.)
g = delete_edge(g, name_to_tag['Cattle, bovinae'], name_to_tag['Cowbell'])

# Music:
# Only the musical instrument is the source.
g = delete_node(g, name_to_tag['Music mood'])
g = delete_node(g, name_to_tag['Music role'])
g = delete_node(g, name_to_tag['Musical concepts'])
g = delete_node(g, name_to_tag['Music genre'])
# The source of "Choir" is a group of people, not an object.
g = delete_edge(g, name_to_tag['Musical instrument'], name_to_tag['Choir'])

# Human sounds:
# The source is EITHER one human or a group of humans.
# Human voice:
# "Children shouting" is inherently a group of humans.
g = delete_edge(g, name_to_tag['Shout'], name_to_tag['Children shouting'])

# Onomatopoeia is a property of the word, it does not describe a source.
g = delete_node(g, name_to_tag['Onomatopoeia'])


# Silence is the absence of a sound.
g = delete_node(g, name_to_tag['Silence'])


# Remove source-ambiguous sounds.
g = delete_node(g, name_to_tag['Source-ambiguous sounds'])


# The source is EITHER a group or a single person.
# Detach human group actions from their other parents.
# NOTE(review): no statement below implements this yet; e.g. "Clapping" still
# has both 'Human group actions' and 'Hands' as parents after this cell.

# Often we have two classes that are a part and a whole.
# The part is what makes the sound.
# We eliminate the whole.
# The cowbell is part of the cow.
# The engine is part of the car, aeroplane, etc.
# The bell is part of the bicycle.
# NOTE(review): only the cowbell case is handled above; the bell-related
# labels still have multiple paths after this cell — TODO finish or drop.

# "Police car (siren)" belongs to "Siren" and not to "Emergency vehicle".
g = delete_node(g, name_to_tag['Emergency vehicle'])
# "Vehicle horn, car horn, honking" is not the sound of the car.
g = delete_edge(g, name_to_tag['Car'], name_to_tag['Vehicle horn, car horn, honking'])
g = delete_edge(g, name_to_tag['Truck'], name_to_tag['Air horn, truck horn'])

# Keep only the labels that survived the pruning.
label_tags = [x for x in label_tags if x in g]

# Report the labels that still have more than one path from the root.
print_multiple_paths(g, label_tags)

Clapping
  (Clapping <- ['Human group actions', 'Hands'])
  (['Human group actions', 'Hands'] <- Human sounds)
Cowbell
  (Cowbell <- ['Bell', 'Percussion'])
  (Bell <- ['Musical instrument', 'Sounds of things'])
  (['Bell', 'Percussion'] <- Musical instrument)
  (['Music', 'Sounds of things'] <- ROOT)
Bell
  (Bell <- ['Musical instrument', 'Sounds of things'])
  (['Music', 'Sounds of things'] <- ROOT)
Church bell
  (Bell <- ['Musical instrument', 'Sounds of things'])
  (['Music', 'Sounds of things'] <- ROOT)
Jingle bell
  (Bell <- ['Musical instrument', 'Sounds of things'])
  (['Music', 'Sounds of things'] <- ROOT)
Bicycle bell
  (Bicycle bell <- ['Alarm', 'Bell', 'Bicycle'])
  (Bell <- ['Musical instrument', 'Sounds of things'])
  (['Alarm', 'Bell', 'Vehicle'] <- Sounds of things)
  (['Music', 'Sounds of things'] <- ROOT)
Tuning fork
  (Bell <- ['Musical instrument', 'Sounds of things'])
  (['Music', 'Sounds of things'] <- ROOT)
Chime
  (Bell <- ['Musical instrument', 'Sounds of thing

(544, True)

In [None]:
# Check for nodes whose entire subtrees are blacklisted:
# the node itself and every one of its descendants must be in the blacklist.
blacklisted = []
for node in g:
    if node == 'root' or node not in blacklist:
        continue
    if blacklist.issuperset(nx.descendants(g, node)):
        blacklisted.append(node)

[tag_to_name[node] for node in blacklisted]

In [None]:
# Delete all blacklisted classes.
# Work on a mutable copy of the current graph. The name `g_present` that was
# used here is not defined anywhere in this notebook, so it failed on a fresh
# kernel. `blacklisted` was computed from `g`, so `g` is the graph to prune.
g = nx.DiGraph(g)
g.remove_nodes_from(blacklisted)
len(g)

In [None]:
# Leaves are the nodes without children.
leaf_nodes = {node for node, n_children in g.out_degree() if n_children == 0}
len(leaf_nodes)

In [None]:
# Count the nodes that have more than one parent.
len([node for node, n_parents in g.in_degree() if n_parents > 1])

In [None]:
# List every node that has several parents, by display name.
for node in g:
    parents = list(g.predecessors(node))
    if len(parents) <= 1:
        continue
    parent_names = [tag_to_name[p] for p in parents]
    print('{!r} has multiple parents: {!r}'.format(tag_to_name[node], parent_names))

In [None]:
# Find LCA of parents for nodes that have multiple parents.
for node in g:
    parents = list(g.predecessors(node))
    if len(parents) <= 1:
        continue
    # Fold the pairwise LCA over all parents, left to right.
    lca = parents[0]
    for parent in parents[1:]:
        lca = nx.lowest_common_ancestor(g, lca, parent)
    parent_names = [tag_to_name[p] for p in parents]
    print('{!r} <- {!r} <- ... <- {!r}'.format(
        tag_to_name[node], parent_names, tag_to_name[lca]))

In [None]:
# For every node, the leaves found in its subtree (the node itself included).
leaf_descendants = {}
for node in g:
    subtree = set(nx.descendants(g, node))
    subtree.add(node)
    leaf_descendants[node] = subtree.intersection(leaf_nodes)
num_leaf_descendants = dict(
    zip(leaf_descendants, map(len, leaf_descendants.values())))

In [None]:
# Smallest and largest number of leaves under any node.
leaf_counts = num_leaf_descendants.values()
min(leaf_counts), max(leaf_counts)

In [None]:
# Information content of a node: log(#leaves) - log(#leaves under the node).
information = {}
for node in g:
    information[node] = (
        math.log(len(leaf_nodes)) - math.log(num_leaf_descendants[node]))

In [None]:
# Compare amount of information in different parents.
lca_in_g = functools.partial(nx.lowest_common_ancestor, g)
for node in g:
    parents = list(g.predecessors(node))
    if len(parents) < 2:
        continue
    lca = functools.reduce(lca_in_g, parents)
    parent_info = [(tag_to_name[p], round(information[p], 2)) for p in parents]
    print('{!r} <- {!r} <- ... <- {!r}'.format(
        tag_to_name[node], parent_info, tag_to_name[lca]))

In [None]:
# Take subgraph containing problematic nodes (several parents) for visualization.
multi_parent_nodes = [node for node, n_parents in g.in_degree() if n_parents > 1]
subg = subgraph_with_ancestors(g, multi_parent_nodes)
len(subg)

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Request a left-to-right Graphviz layout (presumably picked up from the
# 'graph' attribute dict by the pydot conversion — TODO confirm).
subg.graph['graph'] = dict(rankdir='LR')
# Node positions are computed by Graphviz 'dot' through pydot.
pos = nx.nx_pydot.graphviz_layout(subg, prog='dot')
plt.figure(figsize=[20, 30])
# Draw with the human-readable names rather than the raw tag ids.
nx.draw(
    subg, pos,
    node_color='#ccccff',
    with_labels=True,
    labels={k: tag_to_name[k] for k in subg},
    nodelist=list(subg))

In [None]:
# Labels that are not blacklisted, in their original order.
whitelist_label_tags = []
for tag in label_tags:
    if tag not in blacklist:
        whitelist_label_tags.append(tag)
len(whitelist_label_tags), len(label_tags)

In [None]:
# What fraction of labels have multiple paths?
# `num_paths` was last computed before nodes and edges were removed from `g`,
# so recompute it here; otherwise every path-count check below would still
# describe the unpruned graph.
num_paths = find_num_paths(g)
sum(1 for x in whitelist_label_tags if num_paths[x] > 1), len(whitelist_label_tags)

In [None]:
# What's the distribution of number of paths?
dict(collections.Counter(map(num_paths.__getitem__, whitelist_label_tags)))

In [None]:
# Labels that are reachable along three or more paths.
{tag_to_name[tag]: n
 for tag, n in ((t, num_paths[t]) for t in whitelist_label_tags)
 if n >= 3}

In [None]:
# Labels that are reachable along exactly two paths.
{tag_to_name[tag]: n
 for tag, n in ((t, num_paths[t]) for t in whitelist_label_tags)
 if n == 2}

In [None]:
# Get subset of (whitelisted) labels that have a single path from the root.
tree_labels = {tag for tag in whitelist_label_tags if num_paths[tag] == 1}
len(tree_labels)

In [None]:
# Restrict the graph to the single-path labels and their ancestors.
tree = subgraph_with_ancestors(g, tree_labels)
len(tree)

In [None]:
# Write hierarchy with subset of labels, one edge per CSV row.
edges = util.dfs_edges_with_order(tree, [x for x in label_tags if x in tree_labels])
# newline='' lets the csv module control the line endings itself, as the csv
# documentation requires for files handed to csv.writer.
with open(RESOURCES_DIR / 'hierarchy/audioset_tree.csv', 'w', newline='') as f:
    csv.writer(f).writerows(edges)

In [None]:
# Write label subset: one tag per line, in label order.
with open(RESOURCES_DIR / 'hierarchy/audioset_tree_subset.txt', 'w') as f:
    for tag in label_tags:
        if tag in tree:
            f.write(tag + '\n')

In [None]:
# Number of labels kept in the tree vs. total number of tree nodes.
sum(tag in tree for tag in label_tags), len(tree)

In [None]:
# What happens if we remove trivial nodes (1 child)?
# NOTE(review): `util.remove_trivial` is defined outside this notebook;
# presumably it contracts single-child nodes — confirm against util.
tree_nontrivial = util.remove_trivial(tree, root='root')
len(tree_nontrivial)

In [None]:
# edges = util.dfs_edges_with_order(tree_nontrivial, tree_labels)
# with open(RESOURCES_DIR / 'hierarchy/audioset_tree_nontrivial.csv', 'w') as f:
#     w = csv.writer(f)
#     for edge in edges:
#         w.writerow(edge)

In [None]:
import tensorflow as tf

In [None]:
# Directory holding the TFRecord files (one sub-directory per split) and the split to process.
# NOTE(review): machine-specific location under the user's home directory.
data_dir = pathlib.Path('~/data/manual/audioset/audioset_v1_embeddings').expanduser()
split = 'bal_train'

def get_labels(record):
    """Return the 'labels' context feature of one serialized SequenceExample.

    Args:
        record: Tensor holding the serialized example (read via `.numpy()`).

    Returns:
        Tuple of the values in the feature's int64 list.
    """
    serialized = record.numpy()
    example = tf.train.SequenceExample.FromString(serialized)
    label_feature = example.context.feature['labels']
    return tuple(label_feature.int64_list.value)

split_dir = data_dir / split
# NOTE(review): iterdir() yields entries in arbitrary order, and the labels
# written at the end of this notebook are positional — confirm that whatever
# consumes them reads the files in the same order.
tfrecord_files = [str(path) for path in split_dir.iterdir()]
len(tfrecord_files)

In [None]:
# Read the label tuple of every example, then release the dataset object.
dataset = tf.data.TFRecordDataset(tfrecord_files)
labels = [get_labels(record) for record in dataset]
del dataset

In [None]:
# Number of examples read from the split.
len(labels)

In [None]:
# Convert from integers to tags.
# The integers index the rows of class_labels_indices.csv, so they must be
# looked up in the unfiltered `label_order`. `label_tags` has since had
# blacklisted and pruned labels removed, which shifts its positions and would
# map integers to the wrong tags (or raise IndexError).
original_tags = [tuple(label_order[i] for i in x) for x in labels]
example_tags = list(original_tags)

In [None]:
# Are all labels used?
used_tags = set().union(*example_tags)
len(used_tags), len(label_tags)

In [None]:
# Exclude labels that are not in the tree.
example_tags = [tuple(filter(tree.__contains__, tags)) for tags in example_tags]

In [None]:
# Exclude tags that are ancestors of other tags in the same example.
ancestors = {node: nx.ancestors(tree, node) for node in tree}
pruned_tags = []
for tags in example_tags:
    # Everything that is an ancestor of at least one tag of this example.
    covered = set().union(*(ancestors[v] for v in tags))
    pruned_tags.append(tuple(u for u in tags if u not in covered))
example_tags = pruned_tags

In [None]:
# What fraction of the examples have no labels left?
example_tags.count(()) / len(example_tags)

In [None]:
# Leaves of the tree, in node order.
leaf_nodes = [node for node, n_children in tree.out_degree() if n_children == 0]
len(leaf_nodes)

In [None]:
# Count the number of times that each tag occurs.
counts = collections.Counter()
for tags in example_tags:
    counts.update(tags)
dict((tag_to_name[leaf], counts[leaf]) for leaf in leaf_nodes)

In [None]:
# Check how many times each tag occurs by itself.
# Examples with zero or several tags contribute None.
singular_tags = []
for tags in example_tags:
    (only,) = tags if len(tags) == 1 else (None,)
    singular_tags.append(only)
singular_counts = collections.Counter(filter(None, singular_tags))

In [None]:
# What fraction were kept after excluding non-singular labels?
# Leaves that never occur in this split are skipped; dividing by their zero
# count would raise ZeroDivisionError.
sorted(
    (singular_counts[k] / counts[k], tag_to_name[k])
    for k in leaf_nodes if counts[k] > 0)

In [None]:
# Which labels appeared together the most?
# Each unordered pair is counted once per example, keyed as (smaller, larger).
pair_counts = collections.Counter()
for tags in example_tags:
    pair_counts.update((a, b) for a in tags for b in tags if a < b)

In [None]:
# Most frequent pairs first, shown by display name.
ranked_pairs = [
    (n, (tag_to_name[a], tag_to_name[b])) for (a, b), n in pair_counts.items()]
ranked_pairs.sort(reverse=True)
ranked_pairs

In [None]:
# Which leaf-node labels appeared together the most?
# `leaf_nodes` is a list at this point, so testing membership for every pair
# was needlessly quadratic. Filter each example once against a set instead;
# the resulting counts are unchanged.
leaf_node_set = set(leaf_nodes)
leaf_pair_counts = collections.Counter()
for tags in example_tags:
    leaf_tags = [tag for tag in tags if tag in leaf_node_set]
    leaf_pair_counts.update(
        (a, b) for a in leaf_tags for b in leaf_tags if a < b)

In [None]:
# Most frequent leaf pairs first, shown by display name.
ranked_leaf_pairs = [
    (n, (tag_to_name[a], tag_to_name[b])) for (a, b), n in leaf_pair_counts.items()]
ranked_leaf_pairs.sort(reverse=True)
ranked_leaf_pairs

In [None]:
# Number of leaf nodes in the tree (same value as displayed where `leaf_nodes` was built).
len(leaf_nodes)

In [None]:
# What fraction of examples contains only labels with 1 path?
single_path_only = [
    tags for tags in example_tags
    if tags and all(num_paths[tag] == 1 for tag in tags)]
len(single_path_only), len(example_tags)

In [None]:
# What fraction of examples has a single label?
sum(len(tags) == 1 for tags in example_tags), len(example_tags)

In [None]:
# What fraction of examples has a single label that has 1 path?
n_single_one_path = 0
for tags in example_tags:
    if len(tags) != 1:
        continue
    (tag,) = tags
    n_single_one_path += num_paths[tag] == 1
n_single_one_path, len(example_tags)

In [None]:
# Ancestor sets for every node of the full graph.
ancestors = {node: nx.ancestors(g, node) for node in g}

def remove_ancestors(nodes):
    """Return the members of `nodes` that are not an ancestor of any member.

    Uses the module-level `ancestors` lookup; input order is preserved.
    """
    kept = []
    for u in nodes:
        if any(u in ancestors[v] for v in nodes):
            continue
        kept.append(u)
    return kept

In [None]:
# Per example, drop every tag that is an ancestor of another tag.
without_ancestors = [remove_ancestors(tags) for tags in example_tags]

In [None]:
# Mean number of tags over the examples that have at least one tag.
np.mean([len(tags) for tags in filter(None, example_tags)])

In [None]:
# Did removing the ancestors have an effect?
np.mean([len(tags) for tags in filter(None, without_ancestors)])

In [None]:
# What fraction of examples contains one label (plus ancestors of that label)?
[len(tags) for tags in without_ancestors].count(1), len(example_tags)

In [None]:
# And if we further exclude examples whose label has multiple paths?
clean_single = [
    tags for tags in without_ancestors
    if len(tags) == 1 and num_paths[tags[0]] == 1]
len(clean_single), len(example_tags)

In [None]:
# One entry per example: the tag if the example has exactly one label with a
# single path, otherwise an empty string marking the example as excluded.
final_labels = []
for tags in without_ancestors:
    keep = len(tags) == 1 and num_paths[tags[0]] == 1
    final_labels.append(tags[0] if keep else '')

In [None]:
# What fraction of the kept (non-empty) labels are leaf nodes of the tree?
# (One minus this value is the fraction that are not leaves.)
kept_labels = [label for label in final_labels if label]
np.mean([tree.out_degree[label] == 0 for label in kept_labels])

In [None]:
# Write one label per line (empty line for excluded examples), gzip-compressed.
override_path = RESOURCES_DIR / f'override_labels/audioset-{split}-tree.txt.gz'
with gzip.open(override_path, 'wt') as f:
    for label in final_labels:
        f.write(label + '\n')