In [1]:
%cd '/scratch/sk7898/l3embedding/classifier/sonyc_ust'
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import random
import csv
import json
import glob
import pandas as pd
import pickle as pk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import resampy
import librosa
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.preprocessing import StandardScaler
from matplotlib import cm
from metrics import parse_coarse_prediction, micro_averaged_auprc, macro_averaged_auprc, evaluate_df
from classify import load_embeddings, predict_mil, construct_mlp_mil
# New modules: oyaml and pandas
import oyaml as yaml

/scratch/sk7898/l3embedding/classifier/sonyc_ust


Using TensorFlow backend.


In [62]:
np.random.seed(42)
version = 'v0.4'
#cls = 'engine'
cls = 'machinery-impact'
SONYC_PATH = '/scratch/work/sonyc/sonyc/ust/annotations'
META_FOLDER = '/scratch/sk7898/l3embedding/notebooks/data'
DATA_FOLDER = os.path.join('/scratch/sk7898/l3embedding/notebooks/data', version)
NEW_ANNOTATION_DIR = os.path.join(DATA_FOLDER, cls)

In [63]:
def equal_lists(l1, l2):
    l1.sort()
    l2.sort()
    return l1 == l2

def get_split(row, cols, test_grps):
    col_list = [c for c in cols if row[c] != 0]
    for test_col in test_grps:
        split = 'test' if equal_lists(col_list, test_col) else 'train_val'
        if split == 'test':
            break
    row['new_split'] = split
    return row

In [64]:
def get_meta_df(meta_df, sensor_df, cols, test_grps):
    meta_df['grp_id'] = None
    # Replace nan with 0
    meta_df = meta_df.fillna(0)
    meta_df['sensor_name'] = meta_df['node_id'].apply(lambda x: x[10:-6])
    meta_df['sensor_id'] = meta_df['sensor_name'].apply(
                                        lambda x: sensor_df[sensor_df['sensor_name']==x].iloc[0]['sensor_id'] 
                                                    if len(sensor_df['sensor_name']==x) > 0 else None
                                        )
    gp = meta_df.groupby(cols)
    keys = {k: i for i, k in enumerate(gp.groups.keys())}

    for i, (grp, grp_df) in enumerate(gp):
        idxs = grp_df.index.tolist()
        meta_df.loc[idxs, 'grp_id'] = keys[grp]

    meta_df = meta_df.apply(get_split, args=(cols, test_grps, ), axis=1)
    return meta_df

In [65]:
def get_coarse_targets(taxonomy_path, cls):
    with open(taxonomy_path, 'r') as f:
        taxonomy = yaml.load(f, Loader=yaml.Loader)

    # cls_dict = {k: v for k, v in taxonomy['coarse'].items() if v in cls_list}
    cls_dict = {k: v for k, v in taxonomy['coarse'].items() if v == cls} 
    coarse_target_labels = ["_".join([str(k), v, 'presence'])
                                for k, v in cls_dict.items()]
    return coarse_target_labels

In [66]:
def get_train_valid_files(annotation_data, file_list, positive=True, valid_ratio=0.15, n_neg_files=None):

    valid_files = []
    cls_data = annotation_data[annotation_data['audio_filename'].isin(file_list)][['new_split', 'audio_filename', 'grp_id']].drop_duplicates()
    
    if not positive and n_neg_files and n_neg_files < len(file_list):
        file_list = np.random.choice(file_list, n_neg_files, replace=False) 
        cls_data = cls_data[cls_data['audio_filename'].isin(file_list)]

    grouped_df = cls_data[cls_data['new_split'] == 'train_val'].groupby('grp_id')

    for i, (grp, df_group) in enumerate(grouped_df):
        files = df_group['audio_filename']
        n_valid = int(valid_ratio * len(files))
        s = np.random.choice(files, n_valid, replace=False)
        for i in range(n_valid):
            valid_files.append(s[i])
            
    train_files = [f for f in file_list if f not in valid_files]
 
    return train_files, valid_files

In [67]:
def get_file_targets(annotation_data, file_list, labels):
    target_list = []

    for filename in file_list:
        file_df = annotation_data[annotation_data['audio_filename'] == filename]
        target = []

        for label in labels:
            count = 0

            for _, row in file_df.iterrows():
                if int(row['annotator_id']) == 0:
                    # If we have a validated annotation, just use that
                    count = row[label]
                    break
                else:
                    count += row[label]

            if count > 0:
                target.append(1.0)
            else:
                target.append(0.0)

        target_list.append(target)

    return np.array(target_list)

In [68]:
# Meta data attributes
meta_cols = ['near_construction', 'on_thoroughfare', 'near_park',
       'near_dogpark', 'near_highway', 'near_commercial', 'nyu_location',
       'nyu_surroundings', 'near_transporthub', 'near_touristspot',
       'bus_route']

test_grp_dict = {
    'engine': [
        ['nyu_location', 'nyu_surroundings', 'near_touristspot'], 
        ['nyu_location', 'near_park', 'near_commercial', 'near_transporthub'],
        ['nyu_location', 'on_thoroughfare', 'near_park', 'near_touristspot', 'bus_route']
    ],
    'machinery-impact': [
        ['nyu_location', 'nyu_surroundings', 'near_touristspot'], 
        ['nyu_location', 'near_park', 'near_commercial', 'near_transporthub'],
        ['nyu_location', 'near_park', 'near_commercial', 'near_transporthub', 'near_touristspot'],
        ['nyu_location', 'on_thoroughfare', 'near_park', 'near_touristspot', 'bus_route']
    ]
}

test_grps = test_grp_dict[cls]
sensor_df = pd.read_csv(os.path.join(DATA_FOLDER, 'sensor_split_ids_{}.csv'.format(version)))
meta_df = pd.read_csv(os.path.join(META_FOLDER, 'node_meta.csv'))

meta_df = get_meta_df(meta_df, sensor_df, meta_cols, test_grps)

In [69]:
meta_df[meta_df['new_split'] == 'test']

Unnamed: 0,node_id,near_construction,on_thoroughfare,near_park,near_dogpark,near_highway,near_commercial,nyu_location,nyu_surroundings,near_transporthub,...,roadway_width_ft,height_ft,lat,lng,address,borough,grp_id,sensor_name,sensor_id,new_split
8,sonycnode-b827eb42bd4a.sonyc,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,100,40.694443,-73.985477,5 MetroTech Center,Brooklyn,4,b827eb42bd4a,14,test
12,sonycnode-b827eb1685c7.sonyc,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,50.0,25,40.729442,-73.996501,30 W 4th St,Manhattan,1,b827eb1685c7,4,test
14,sonycnode-b827eb5895e9.sonyc,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,50.0,20,40.776519,-73.963915,970 5th Ave,Manhattan,10,b827eb5895e9,20,test
19,sonycnode-b827eb29eb77.sonyc,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,30.0,12,40.736283,-73.991084,25 Union Square W,Manhattan,5,b827eb29eb77,6,test


In [70]:
taxonomy_path = os.path.join(SONYC_PATH, '{}/dcase-ust-taxonomy.yaml'.format(version))
coarse_target_labels = get_coarse_targets(taxonomy_path, cls)

if version == 'v2.2':
    annotation_path = os.path.join(SONYC_PATH, 'latest/annotations_w_test_anns.csv')
else:
    annotation_path = os.path.join(SONYC_PATH, '{}/annotations.csv'.format(version))

annotation_data = pd.read_csv(annotation_path).sort_values('audio_filename')
annotation_data = annotation_data.merge(meta_df, on='sensor_id')

file_list = annotation_data[annotation_data['new_split'] == 'train_val'].sort_values('audio_filename')['audio_filename'].unique().tolist()
targets = get_file_targets(annotation_data, file_list, coarse_target_labels)

In [71]:
test_list = annotation_data[annotation_data['new_split'] == 'test'].sort_values('audio_filename')['audio_filename'].unique().tolist()
targets = get_file_targets(annotation_data, test_list, coarse_target_labels)
test_pos_idxs = np.where(targets == 1)[0]
pos_test_files = np.array(test_list)[test_pos_idxs].tolist()
len(pos_test_files)

18

In [72]:
pos_idxs = np.where(targets == 1)[0]
neg_idxs = np.where(targets == 0)[0]
pos_files, neg_files = np.array(file_list)[pos_idxs].tolist(), np.array(file_list)[neg_idxs].tolist()
train_val = annotation_data[annotation_data['new_split'] == 'train_val']

pos_train_files, pos_valid_files = get_train_valid_files(
                                        train_val,
                                        pos_files,
                                        positive=True, 
                                        valid_ratio=0.15, 
                                        n_neg_files=None
                                    )

neg_train_files, neg_valid_files = get_train_valid_files(
                                        train_val,
                                        neg_files,
                                        positive=False, 
                                        valid_ratio=0.15, 
                                        n_neg_files=len(pos_train_files)+len(pos_valid_files)
                                    )
valid_files = pos_valid_files + neg_valid_files
train_files = pos_train_files + neg_train_files

In [73]:
annotation_data.loc[annotation_data['audio_filename'].isin(train_files), 'new_split'] = 'train'
annotation_data.loc[annotation_data['audio_filename'].isin(valid_files), 'new_split'] = 'validate'
l1 = len(annotation_data[annotation_data['new_split'] == 'train']['audio_filename'].unique().tolist())
l2 = len(annotation_data[annotation_data['new_split'] == 'validate']['audio_filename'].unique().tolist())
assert (l1 + l2) == (len(train_files) + len(valid_files))

In [74]:
annotation_data.drop(annotation_data[annotation_data['new_split'] == 'train_val'].index, inplace=True) 

In [75]:
if not os.path.exists(NEW_ANNOTATION_DIR):
    os.makedirs(NEW_ANNOTATION_DIR)
    
annotation_path = os.path.join(NEW_ANNOTATION_DIR, '{}_annotation.csv'.format(cls))
annotation_data.to_csv(annotation_path)

In [76]:
annotation_data[annotation_data['new_split'] == 'train_val']

Unnamed: 0,split,sensor_id,audio_filename,annotator_id,1-1_small-sounding-engine_presence,1-2_medium-sounding-engine_presence,1-3_large-sounding-engine_presence,1-X_engine-of-uncertain-size_presence,2-1_rock-drill_presence,2-2_jackhammer_presence,...,bus_route,roadway_width_ft,height_ft,lat,lng,address,borough,grp_id,sensor_name,new_split


In [77]:
annotation_data[annotation_data['new_split'] == 'train']

Unnamed: 0,split,sensor_id,audio_filename,annotator_id,1-1_small-sounding-engine_presence,1-2_medium-sounding-engine_presence,1-3_large-sounding-engine_presence,1-X_engine-of-uncertain-size_presence,2-1_rock-drill_presence,2-2_jackhammer_presence,...,bus_route,roadway_width_ft,height_ft,lat,lng,address,borough,grp_id,sensor_name,new_split
72,validate,0,00_000612.wav,-4,0.0,1.0,1.0,0.0,-1.0,-1.0,...,1.0,90.0,25,40.729432,-73.993588,721 Broadway,Manhattan,8,b827eb0d8af7,train
73,validate,0,00_000612.wav,7,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,90.0,25,40.729432,-73.993588,721 Broadway,Manhattan,8,b827eb0d8af7,train
74,validate,0,00_000612.wav,64,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,90.0,25,40.729432,-73.993588,721 Broadway,Manhattan,8,b827eb0d8af7,train
75,validate,0,00_000612.wav,152,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,90.0,25,40.729432,-73.993588,721 Broadway,Manhattan,8,b827eb0d8af7,train
76,validate,0,00_000612.wav,0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,90.0,25,40.729432,-73.993588,721 Broadway,Manhattan,8,b827eb0d8af7,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,train,2,02_000630.wav,52,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,60.0,20,40.731403,-73.994565,30 E 8th St,Manhattan,8,b827eb122f0f,train
1172,train,2,02_000630.wav,58,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,60.0,20,40.731403,-73.994565,30 E 8th St,Manhattan,8,b827eb122f0f,train
1176,train,2,02_000658.wav,76,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,60.0,20,40.731403,-73.994565,30 E 8th St,Manhattan,8,b827eb122f0f,train
1177,train,2,02_000658.wav,92,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,60.0,20,40.731403,-73.994565,30 E 8th St,Manhattan,8,b827eb122f0f,train


In [78]:
annotation_data[annotation_data['new_split'] == 'validate']

Unnamed: 0,split,sensor_id,audio_filename,annotator_id,1-1_small-sounding-engine_presence,1-2_medium-sounding-engine_presence,1-3_large-sounding-engine_presence,1-X_engine-of-uncertain-size_presence,2-1_rock-drill_presence,2-2_jackhammer_presence,...,bus_route,roadway_width_ft,height_ft,lat,lng,address,borough,grp_id,sensor_name,new_split
896,train,1,01_002494.wav,1367,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,35.0,20,40.730419,-73.998614,53 Washington Square S,Manhattan,6,b827eb0fedda,validate
897,train,1,01_002494.wav,1376,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,35.0,20,40.730419,-73.998614,53 Washington Square S,Manhattan,6,b827eb0fedda,validate
898,train,1,01_002494.wav,1621,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,35.0,20,40.730419,-73.998614,53 Washington Square S,Manhattan,6,b827eb0fedda,validate
938,train,1,01_002666.wav,1609,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,35.0,20,40.730419,-73.998614,53 Washington Square S,Manhattan,6,b827eb0fedda,validate
939,train,1,01_002666.wav,1613,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,35.0,20,40.730419,-73.998614,53 Washington Square S,Manhattan,6,b827eb0fedda,validate
940,train,1,01_002666.wav,1621,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,35.0,20,40.730419,-73.998614,53 Washington Square S,Manhattan,6,b827eb0fedda,validate
1007,train,1,01_003166.wav,64,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,35.0,20,40.730419,-73.998614,53 Washington Square S,Manhattan,6,b827eb0fedda,validate
1008,train,1,01_003166.wav,1376,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,35.0,20,40.730419,-73.998614,53 Washington Square S,Manhattan,6,b827eb0fedda,validate
1009,train,1,01_003166.wav,1623,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,35.0,20,40.730419,-73.998614,53 Washington Square S,Manhattan,6,b827eb0fedda,validate


In [79]:
annotation_data[annotation_data['new_split'] == 'test']

Unnamed: 0,split,sensor_id,audio_filename,annotator_id,1-1_small-sounding-engine_presence,1-2_medium-sounding-engine_presence,1-3_large-sounding-engine_presence,1-X_engine-of-uncertain-size_presence,2-1_rock-drill_presence,2-2_jackhammer_presence,...,bus_route,roadway_width_ft,height_ft,lat,lng,address,borough,grp_id,sensor_name,new_split
2225,validate,4,04_000030.wav,-2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,50.0,25,40.729442,-73.996501,30 W 4th St,Manhattan,1,b827eb1685c7,test
2226,validate,4,04_000030.wav,100,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,50.0,25,40.729442,-73.996501,30 W 4th St,Manhattan,1,b827eb1685c7,test
2227,validate,4,04_000030.wav,-3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,50.0,25,40.729442,-73.996501,30 W 4th St,Manhattan,1,b827eb1685c7,test
2228,validate,4,04_000030.wav,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,50.0,25,40.729442,-73.996501,30 W 4th St,Manhattan,1,b827eb1685c7,test
2229,validate,4,04_000030.wav,64,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,50.0,25,40.729442,-73.996501,30 W 4th St,Manhattan,1,b827eb1685c7,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4438,test,20,20_010950.wav,-3,0.0,0.0,1.0,0.0,-1.0,-1.0,...,1.0,50.0,20,40.776519,-73.963915,970 5th Ave,Manhattan,10,b827eb5895e9,test
4439,test,20,20_010950.wav,4083,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,50.0,20,40.776519,-73.963915,970 5th Ave,Manhattan,10,b827eb5895e9,test
4440,test,20,20_010950.wav,64,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,50.0,20,40.776519,-73.963915,970 5th Ave,Manhattan,10,b827eb5895e9,test
4441,test,20,20_010950.wav,-4,0.0,0.0,1.0,0.0,-1.0,-1.0,...,1.0,50.0,20,40.776519,-73.963915,970 5th Ave,Manhattan,10,b827eb5895e9,test
