In [1]:
%cd '/scratch/sk7898/l3embedding/classifier/sonyc_ust'
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import random
import csv
import json
import glob
import pandas as pd
import pickle as pk
import numpy as np
import librosa
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.preprocessing import StandardScaler
from keras.models import load_model
from matplotlib import cm
from metrics import parse_coarse_prediction, micro_averaged_auprc, macro_averaged_auprc, evaluate_df
from classify import load_embeddings, predict_mil, construct_mlp_mil
# New modules: oyaml and pandas
import oyaml as yaml

/scratch/sk7898/l3embedding/classifier/sonyc_ust


Using TensorFlow backend.


In [4]:
def get_file_targets(annotation_data, file_list, labels):
    """Build a binary presence matrix: one row per file, one column per label.

    For each file, the rows of ``annotation_data`` whose ``audio_filename``
    equals that file are collected. If one of them has ``annotator_id == 0``
    (treated by this code as the validated annotation), the first such row
    alone decides every label; otherwise the label values of all rows are
    summed. A label is marked present (1.0) when that value is > 0.

    Parameters
    ----------
    annotation_data : pandas.DataFrame
        Must contain ``audio_filename``, ``annotator_id`` and every column
        named in ``labels``.
    file_list : iterable
        File names to look up; defines the output row order.
    labels : iterable of str
        Label column names; defines the output column order.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(number of files, number of labels)`` holding
        0.0 / 1.0. A file without any annotation row yields an all-zero row.
        An empty ``file_list`` yields an array of shape ``(0, len(labels))``
        so that callers can still index it as 2-D.
    """
    labels = list(labels)
    target_list = []

    for filename in file_list:
        file_df = annotation_data[annotation_data['audio_filename'] == filename]

        # The validated row does not depend on the label, so look it up once
        # per file rather than re-iterating the rows for every label.
        validated_row = None
        for _, row in file_df.iterrows():
            if int(row['annotator_id']) == 0:
                validated_row = row
                break

        target = []
        for label in labels:
            if validated_row is not None:
                # If we have a validated annotation, just use that
                count = validated_row[label]
            else:
                # Built-in sum keeps NaN propagation (NaN > 0 is False),
                # unlike Series.sum() which skips NaN by default.
                count = sum(file_df[label])

            target.append(1.0 if count > 0 else 0.0)

        target_list.append(target)

    if not target_list:
        # np.array([]) would be 1-D and break `targets[:, i]` in callers.
        return np.zeros((0, len(labels)))

    return np.array(target_list)

In [8]:
# Annotation release to analyse; selects the taxonomy and annotation files.
version = 'v0.4'

# Absolute locations of the annotation releases and of this notebook's data.
SONYC_PATH = '/scratch/work/sonyc/sonyc/ust/annotations'
META_FOLDER = '/scratch/sk7898/l3embedding/notebooks/data'
DATA_FOLDER = os.path.join(META_FOLDER, version)

taxonomy_path = os.path.join(SONYC_PATH, version + '/dcase-ust-taxonomy.yaml')

# Release v2.2 reads its annotations from the 'latest' folder instead of a
# version-named one.
annotation_path = os.path.join(
    SONYC_PATH,
    'latest/annotations_w_test_anns.csv' if version == 'v2.2'
    else '{}/annotations.csv'.format(version),
)

annotation_data = pd.read_csv(annotation_path)
annotation_data = annotation_data.sort_values('audio_filename')

# NOTE(review): yaml.Loader is the full (non-safe) loader; acceptable only as
# long as the taxonomy file is trusted -- confirm.
with open(taxonomy_path, 'r') as taxonomy_file:
    taxonomy = yaml.load(taxonomy_file, Loader=yaml.Loader)

# Column names of the form '<id>_<name>_presence', one per coarse taxonomy entry.
coarse_target_labels = [
    "_".join((str(coarse_id), coarse_name, 'presence'))
    for coarse_id, coarse_name in taxonomy['coarse'].items()
]

In [11]:
# Display the coarse label column names built from the taxonomy.
coarse_target_labels

['1_engine_presence',
 '2_machinery-impact_presence',
 '3_non-machinery-impact_presence',
 '4_powered-saw_presence',
 '5_alert-signal_presence',
 '6_music_presence',
 '7_human-voice_presence',
 '8_dog_presence']

In [9]:
# Node metadata and the sensor split table for the selected release.
meta_df = pd.read_csv(os.path.join(META_FOLDER, 'node_meta.csv'))
sensor_df = pd.read_csv(
    os.path.join(DATA_FOLDER, 'sensor_split_ids_{}.csv'.format(version))
)

# Missing metadata values become 0; sensor_name is node_id without its first
# 10 and last 6 characters.
meta_df = meta_df.fillna(0).assign(
    sensor_name=lambda frame: frame['node_id'].map(lambda node_id: node_id[10:-6])
)

In [18]:
# Output CSV for the per-group class distribution.
grp_cls_path = os.path.join(META_FOLDER, 'grp_cls_dist.csv')

# Node metadata columns used as grouping keys.
meta_cols = ['near_construction', 'on_thoroughfare', 'near_park',
       'near_dogpark', 'near_highway', 'near_commercial', 'nyu_location',
       'nyu_surroundings', 'near_transporthub', 'near_touristspot',
       'bus_route']
# Coarse presence columns, in the column order of the resulting table.
coarse_columns = [
    '1_engine_presence', '2_machinery-impact_presence', 
    '3_non-machinery-impact_presence', '4_powered-saw_presence', 
    '5_alert-signal_presence', '6_music_presence',
    '7_human-voice_presence', '8_dog_presence'
]

# Attach sensor_id to each node's metadata, then metadata to each annotation.
df2 = sensor_df[['sensor_id', 'sensor_name']].drop_duplicates()
df = meta_df.merge(df2, how='left', on='sensor_name')
# how='right' keeps sensors that have no annotation rows; for those the
# audio_filename is NaN.
final = annotation_data.merge(df, how='right', on='sensor_id')

grouped_df = df.groupby(meta_cols)['sensor_id']

# grp_cls maps group name -> {coarse column -> number of positive files}.
grp_cls = {}
for grp, df_group in grouped_df:
    sensors = df_group.values
    # The name lists every attribute with a positive value, except nyu_location.
    grp_lst = [c for c, g in zip(meta_cols, grp) if g > 0 and c != 'nyu_location']
    grp_name = '| '.join(grp_lst)

    # Drop the NaN filename produced by the right merge for unannotated sensors.
    file_list = (
        final.loc[final['sensor_id'].isin(sensors), 'audio_filename']
        .dropna()
        .unique()
        .tolist()
    )

    if file_list:
        # Pass coarse_columns itself so targets[:, k] is guaranteed to describe
        # coarse_columns[k]; a missing column fails loudly instead of being
        # silently mislabelled.
        targets = get_file_targets(annotation_data, file_list, coarse_columns)
        pos_counts = (targets == 1).sum(axis=0)
    else:
        pos_counts = [0] * len(coarse_columns)

    # Several metadata groups can share one name (they may differ only in
    # nyu_location or in the magnitude of a positive value), so accumulate
    # into the existing entry instead of overwriting it.
    cls_counts = grp_cls.setdefault(grp_name, dict.fromkeys(coarse_columns, 0))
    for cls, n_pos in zip(coarse_columns, pos_counts):
        cls_counts[cls] += int(n_pos)

In [20]:
# Disabled sanity check: compare every class total in cls_df with the number of
# files whose per-file maximum for that class equals 1.
# NOTE(review): get_file_targets lets an annotator_id == 0 row override the
# other annotators, so a per-file max is not necessarily equivalent -- confirm
# before re-enabling.
# ann = annotation_data[annotation_data['sensor_id'].isin(final['sensor_id'].unique().tolist())]
# ann_test = ann.groupby('audio_filename', group_keys=False).max()
#
# for i, cls in enumerate(coarse_columns):
#     assert cls_df[cls].sum() == len(ann_test[ann_test[cls] == 1])

# One row per sensor group, one column per coarse class.
cls_df = pd.DataFrame.from_dict(grp_cls, orient='index', columns=coarse_columns)

# The group name only lives in the index, which is not written to the CSV,
# so copy it into a regular column first.
cls_df = cls_df.assign(sensor_group=cls_df.index.tolist())
cls_df.to_csv(grp_cls_path, index=False)