### Preprocessing

This notebook merges the computed YAMNet embeddings with other metadata such as user tags, titles and the hierarchical labels of the FSD50K annotations (ground truth labels). Furthermore, we translate the class names to their corresponding ontology IDs to avoid comparison problems caused by different string (text) representations of the same class. The computed embeddings are saved in JSON format, one file per audio file, with three keys: the main class label, the class probabilities and the 1024-dimensional feature vector. Of the 10231 files in the evaluation set, 9093 embeddings were computed successfully, while 1138 were null embeddings.

In [6]:
import json
import os
import csv
import pandas as pd
import numpy as np

In [7]:
# YAMNet embeddings for the FSD50K evaluation set:
#   original size (10231), embeddings size (9093), null embeddings (1138)
# Every input/output path used by this notebook lives under DATA_DIR.
DATA_DIR = 'data'

# The left-hand names and the right-hand file names are matched by position.
(
    EMB_DIR,            # directory containing the per-clip embedding JSONs
    FSD50K_EVAL_CLIPS,  # clip metadata JSON of the evaluation set
    EVAL_HLABELS,       # presumably the hierarchical ground-truth labels -- TODO confirm
    ONTOLOGY_JSON,      # ontology JSON file
    FSD50K_VOCABULARY,  # FSD50K vocabulary CSV
    OUTPUT_FILE,        # merged JSON written at the end of the notebook
) = (
    os.path.join(DATA_DIR, entry)
    for entry in (
        'yamnet-embeddings-eval',
        'eval_clips_info_FSD50K.json',
        'eval.csv',
        'ontology.json',
        'vocabulary.csv',
        'eval_clips_info_FSD50K_analysis.json',
    )
)

In [10]:
# Merge each selected embedding JSON with its FSD50K clip metadata and write
# the combined records to OUTPUT_FILE as a single JSON object keyed by file ID.

# Output dictionary: file ID -> merged record
output_dict = {}

# Ontology translation tables (class name <-> ontology ID).
# NOTE(review): assumes columns 1 and 2 of the header-less vocabulary CSV hold
# the class name and the ontology ID respectively -- confirm against the file.
onto_df         = pd.read_json(ONTOLOGY_JSON)
fsdk_df         = pd.read_csv(FSD50K_VOCABULARY, header=None)
fsdk_name_to_id = dict(zip(fsdk_df[1], fsdk_df[2]))
onto_name_to_id = dict(zip(onto_df['name'], onto_df['id']))
fsdk_id_to_name = dict(zip(fsdk_df[2], fsdk_df[1]))
onto_id_to_name = dict(zip(onto_df['id'], onto_df['name']))

# Open evaluation set metadata (JSON is read explicitly as UTF-8 so the result
# does not depend on the platform's default encoding)
with open(FSD50K_EVAL_CLIPS, 'r', encoding='utf-8') as clips_file:
    fsd50k_dict = json.load(clips_file)

# Subset of files for better distribution of data.
# The path is relative to the current working directory, not DATA_DIR.
all_files = np.load('all_files_ids.npy')
# Build a set once so the per-file membership test below is a hash lookup
# instead of a full scan of the NumPy array on every iteration.
selected_files = set(all_files.ravel().tolist())

# Iterate over embedding JSONs; sorted() makes the processing order (and thus
# the key order of the output file) reproducible, since os.listdir returns
# entries in arbitrary order.
for emb_file in sorted(os.listdir(EMB_DIR)):
    emb_path = os.path.join(EMB_DIR, emb_file)

    # Skip dotfiles and anything that is not a regular file
    if emb_file.startswith('.') or not os.path.isfile(emb_path):
        continue

    # File ID: the part of the file name before the first '-'
    f_id = emb_file.split('-')[0]

    # Only keep clips that belong to the selected subset
    if f_id + '.wav' not in selected_files:
        continue

    # Open JSON
    with open(emb_path, 'r', encoding='utf-8') as jf:
        emb_json = json.load(jf)

    # Embeddings data: the first entry of 'classes' is used as the main class
    emb_class = str(emb_json['classes'][0])

    # FSD50K metadata; a clip missing from the metadata raises KeyError
    clip_info = fsd50k_dict[f_id]

    # Write data to output dictionary with the file ID as key.
    # The class name is translated to its ontology ID; a name missing from the
    # ontology raises KeyError.
    output_dict[f_id] = {
        'title':               clip_info['title'],
        'user_tags':           clip_info['tags'],
        'yamnet_class':        emb_class,
        'yamnet_class_id':     onto_name_to_id[emb_class],
        'yamnet_prob_classes': emb_json['top_25_classes_probabilities'],
        'yamnet_embedding':    emb_json['embeddings'],
    }

# Save to JSON
with open(OUTPUT_FILE, 'w', encoding='utf-8') as outfile:
    json.dump(output_dict, outfile)