In [1]:
import csv
import json
import pathlib

import networkx as nx

In [2]:
# Load the ontology: a list of entries, each read below for its `id`, `name`
# and `child_ids`. JSON is UTF-8 by specification, so decode explicitly instead
# of relying on the platform's default text encoding.
with open('audioset/ontology.json', encoding='utf-8') as f:
    ontology = json.load(f)

# Directed graph with one edge per (parent id -> child id) pair.
g = nx.DiGraph()
# Register every entry first so that an entry with neither parents nor children
# is still a node; nx.descendants raises for nodes missing from the graph.
g.add_nodes_from(node['id'] for node in ontology)
for node in ontology:
    for child_id in node['child_ids']:
        g.add_edge(node['id'], child_id)

# Lookup tables keyed by ontology id.
kgid_to_node = {node['id']: node for node in ontology}
kgid_to_name = {node['id']: node['name'] for node in ontology}

In [3]:
# Case-insensitive search of ontology names for "instrument"; displays id -> name.
{
    entry['id']: entry['name']
    for entry in ontology
    if 'instrument' in entry['name'].lower()
}

{'/m/04szw': 'Musical instrument',
 '/m/0fx80y': 'Plucked string instrument',
 '/m/05r5wn': 'Rattle (instrument)',
 '/m/01kcd': 'Brass instrument',
 '/m/0l14_3': 'Bowed string instrument',
 '/m/085jw': 'Wind instrument, woodwind instrument',
 '/m/01vj9c': 'Bass (instrument role)'}

In [4]:
# Case-insensitive search of ontology names for "voice"; displays id -> name.
{
    entry['id']: entry['name']
    for entry in ontology
    if 'voice' in entry['name'].lower()
}

{'/m/09l8g': 'Human voice'}

In [5]:
# Case-insensitive search of ontology names for "speech"; displays id -> name.
{
    entry['id']: entry['name']
    for entry in ontology
    if 'speech' in entry['name'].lower()
}

{'/m/09x0r': 'Speech',
 '/m/05zppz': 'Male speech, man speaking',
 '/m/02zsn': 'Female speech, woman speaking',
 '/m/0ytgt': 'Child speech, kid speaking',
 '/m/0brhx': 'Speech synthesizer',
 '/m/07qfr4h': 'Hubbub, speech noise, speech babble'}

In [6]:
# Case-insensitive search of ontology names for "singing"; displays id -> name.
{
    entry['id']: entry['name']
    for entry in ontology
    if 'singing' in entry['name'].lower()
}

{'/m/015lz1': 'Singing',
 '/t/dd00003': 'Male singing',
 '/t/dd00004': 'Female singing',
 '/t/dd00005': 'Child singing',
 '/t/dd00006': 'Synthetic singing',
 '/m/0l14t7': 'Singing bowl'}

In [7]:
# Reverse index: lower-cased label name -> ontology id.
lower_name_to_kgid = {name.lower(): kgid for kgid, name in kgid_to_name.items()}

# Resolve the selected labels by exact (lower-cased) name and display id -> name.
# A name that is not in the ontology raises KeyError.
{
    lower_name_to_kgid[wanted]: kgid_to_name[lower_name_to_kgid[wanted]]
    for wanted in (
        'musical instrument',
        'human voice',
        'singing',
        'music',
        'music genre',
    )
}

{'/m/04szw': 'Musical instrument',
 '/m/09l8g': 'Human voice',
 '/m/015lz1': 'Singing',
 '/m/04rlf': 'Music',
 '/m/0kpv1t': 'Music genre'}

In [8]:
# Output subset name -> ontology id of the label that defines the subset.
subsets = dict(
    instruments='/m/04szw',  # Musical instrument
    voice='/m/09l8g',        # Human voice
    speech='/m/09x0r',       # Speech
    singing='/m/015lz1',     # Singing
    music='/m/04rlf',        # Music
    genre='/m/0kpv1t',       # Music genre
)

In [9]:
# Column names applied to the segment CSVs (the files' own '#' header lines are skipped).
FIELDNAMES = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels']
SPLITS = ['balanced_train', 'eval', 'unbalanced_train']

# Parse each split once up front; the rows do not depend on the subset, so
# re-reading the files inside the subset loop would only repeat the same work.
examples_by_split = {}
for split in SPLITS:
    # newline='' is what the csv module expects for files it reads or writes.
    with open(f'audioset/{split}_segments.csv', newline='') as f:
        examples_by_split[split] = list(csv.DictReader(
            (line for line in f if not line.startswith('#')),
            fieldnames=FIELDNAMES,
            skipinitialspace=True))

for subset_name, subset_kgid in subsets.items():
    # The subset's label plus every label reachable below it in the ontology graph.
    kgid_subset = {subset_kgid} | nx.descendants(g, subset_kgid)

    out_dir = pathlib.Path(f'audioset_{subset_name}')
    out_dir.mkdir(exist_ok=True)

    for split, examples in examples_by_split.items():
        # Keep a row when at least one of its comma-separated labels is in the subset.
        example_subset = [
            example for example in examples
            if any(kgid in kgid_subset for kgid in example['positive_labels'].split(','))
        ]
        print(f'{subset_name}/{split}: {len(example_subset)} / {len(examples)}')

        # Without newline='' the writer's '\r\n' terminator is translated again
        # on Windows, producing a blank line after every row.
        # No header row is written (writeheader is not called).
        with open(out_dir / f'{split}_segments.csv', 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writerows(example_subset)

    print()

instruments/balanced_train: 3447 / 22160
instruments/eval: 3327 / 20371
instruments/unbalanced_train: 226652 / 2041789

voice/balanced_train: 7210 / 22160
voice/eval: 6638 / 20371
voice/unbalanced_train: 1075362 / 2041789

speech/balanced_train: 5842 / 22160
speech/eval: 5422 / 20371
speech/unbalanced_train: 1004564 / 2041789

singing/balanced_train: 916 / 22160
singing/eval: 776 / 20371
singing/unbalanced_train: 69032 / 2041789

music/balanced_train: 8836 / 22160
music/eval: 7848 / 20371
music/unbalanced_train: 1063333 / 2041789

genre/balanced_train: 2490 / 22160
genre/eval: 2048 / 20371
genre/unbalanced_train: 193831 / 2041789

