In [1]:
%cd '/scratch/sk7898/l3embedding/classifier/sonyc_ust'
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import random
import csv
import json
import glob
import pandas as pd
import pickle as pk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import resampy
import librosa
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.preprocessing import StandardScaler
from matplotlib import cm
from metrics import parse_coarse_prediction, micro_averaged_auprc, macro_averaged_auprc, evaluate_df
from classify import load_embeddings, predict_mil, construct_mlp_mil
# New modules: oyaml and pandas
import oyaml as yaml

/scratch/sk7898/l3embedding/classifier/sonyc_ust


Using TensorFlow backend.


In [2]:
def equal_lists(l1, l2):
    """Return True when both sequences hold the same elements, ignoring order.

    The comparison is done on sorted copies, so neither argument is
    reordered as a side effect (duplicates still count: ``[1, 1]`` is not
    equal to ``[1]``).

    Parameters
    ----------
    l1, l2 : iterable of mutually comparable items

    Returns
    -------
    bool
    """
    return sorted(l1) == sorted(l2)

def get_split(row, cols, test_grps):
    """Tag a metadata row as 'test' or 'train_val' based on its active attributes.

    A row belongs to the test split when the set of columns in ``cols`` whose
    value is non-zero matches, regardless of order, one of the column groups
    listed in ``test_grps``.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series when used with ``DataFrame.apply``)
        Must be indexable by every name in ``cols``; receives a new
        ``'new_split'`` entry.
    cols : iterable of str
        Attribute columns to inspect.
    test_grps : iterable of list of str
        Column combinations that identify test rows. May be empty, in which
        case every row is labelled 'train_val'.

    Returns
    -------
    The same ``row`` object, with ``row['new_split']`` set.
    """
    # Sort copies only: the groups in test_grps are shared configuration and
    # must not be reordered as a side effect of the comparison.
    active_cols = sorted(c for c in cols if row[c] != 0)
    is_test = any(active_cols == sorted(grp) for grp in test_grps)
    row['new_split'] = 'test' if is_test else 'train_val'
    return row

In [3]:
def get_meta_df(meta_df, sensor_df, cols, test_grps):
    """Annotate node metadata with sensor ids, group ids and a train/test split.

    Parameters
    ----------
    meta_df : pandas.DataFrame
        Node metadata; must contain ``'node_id'`` and every column in ``cols``.
        The caller's frame is left untouched.
    sensor_df : pandas.DataFrame
        Lookup table with ``'sensor_name'`` and ``'sensor_id'`` columns.
    cols : list of str
        Attribute columns that define a group (rows sharing the same values
        in all of them get the same ``grp_id``).
    test_grps : list of list of str
        Column combinations marking test rows (forwarded to ``get_split``).

    Returns
    -------
    pandas.DataFrame
        A new frame with NaNs replaced by 0 and the extra columns
        ``grp_id``, ``sensor_name``, ``sensor_id`` and ``new_split``.
    """
    # assign() builds a new frame, so the caller's DataFrame does not gain a
    # 'grp_id' column as a side effect; NaNs (and the placeholder) become 0.
    meta_df = meta_df.assign(grp_id=None).fillna(0)

    # NOTE(review): the [10:-6] slice presumably strips a fixed-width
    # prefix/suffix from node_id -- confirm against the node_id format.
    meta_df['sensor_name'] = meta_df['node_id'].apply(lambda x: x[10:-6])

    # First sensor_id per sensor_name. Names without an entry map to None;
    # the previous guard measured the length of the whole boolean mask, so it
    # was always true and unknown names raised IndexError on .iloc[0].
    first_rows = sensor_df.drop_duplicates('sensor_name')
    sensor_ids = dict(zip(first_rows['sensor_name'], first_rows['sensor_id']))
    meta_df['sensor_id'] = meta_df['sensor_name'].apply(lambda name: sensor_ids.get(name))

    # Number the attribute groups in the order the groupby exposes them.
    gp = meta_df.groupby(cols)
    keys = {k: i for i, k in enumerate(gp.groups.keys())}
    for grp, grp_df in gp:
        meta_df.loc[grp_df.index.tolist(), 'grp_id'] = keys[grp]

    meta_df = meta_df.apply(get_split, args=(cols, test_grps, ), axis=1)
    return meta_df

In [4]:
def get_coarse_targets(taxonomy_path, cls):
    """Build the coarse presence-column names for one class of the taxonomy.

    Reads the YAML taxonomy at ``taxonomy_path`` and, for every entry of its
    ``'coarse'`` mapping whose value equals ``cls``, produces the string
    ``"<key>_<value>_presence"``.

    Parameters
    ----------
    taxonomy_path : str
        Path to the taxonomy YAML file.
    cls : str
        Coarse class name to look up.

    Returns
    -------
    list of str
        Matching column names (empty when ``cls`` is not in the taxonomy).
    """
    with open(taxonomy_path, 'r') as taxonomy_file:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects;
        # acceptable for a trusted local taxonomy file, but do not point this
        # at untrusted input.
        taxonomy = yaml.load(taxonomy_file, Loader=yaml.Loader)

    coarse_target_labels = []
    for coarse_id, coarse_name in taxonomy['coarse'].items():
        if coarse_name == cls:
            coarse_target_labels.append("_".join([str(coarse_id), coarse_name, 'presence']))
    return coarse_target_labels

In [5]:
def get_train_valid_files(annotation_data, file_list, positive=True, valid_ratio=0.15, n_neg_files=None):
    """Split a list of audio files into train and validation subsets.

    Validation files are drawn per ``grp_id`` (from rows whose ``new_split``
    is ``'train_val'``), taking ``int(valid_ratio * n_files_in_group)`` files
    from each group. Everything else in ``file_list`` is returned as training
    data. Sampling uses the global NumPy RNG, so seed it beforehand for
    reproducible splits.

    Parameters
    ----------
    annotation_data : pandas.DataFrame
        Must contain ``'new_split'``, ``'audio_filename'`` and ``'grp_id'``.
    file_list : sequence of str
        Candidate file names.
    positive : bool
        When False, ``file_list`` may first be randomly reduced to
        ``n_neg_files`` entries.
    valid_ratio : float
        Fraction of each group's files held out for validation.
    n_neg_files : int or None
        Target number of files kept when ``positive`` is False; ignored when
        falsy or not smaller than ``len(file_list)``.

    Returns
    -------
    (train_files, valid_files) : tuple of list of str
    """
    cls_data = annotation_data[annotation_data['audio_filename'].isin(file_list)][
        ['new_split', 'audio_filename', 'grp_id']
    ].drop_duplicates()

    if not positive and n_neg_files and n_neg_files < len(file_list):
        # .tolist() keeps the returned names plain Python strings whether or
        # not subsampling happened.
        file_list = np.random.choice(file_list, n_neg_files, replace=False).tolist()
        cls_data = cls_data[cls_data['audio_filename'].isin(file_list)]

    valid_files = []
    grouped_df = cls_data[cls_data['new_split'] == 'train_val'].groupby('grp_id')
    for _, df_group in grouped_df:
        files = df_group['audio_filename']
        n_valid = int(valid_ratio * len(files))
        # Always draw, even when n_valid == 0, so the sequence of RNG calls
        # (and therefore a seeded split) is unchanged.
        valid_files.extend(np.random.choice(files, n_valid, replace=False).tolist())

    # Set membership avoids a quadratic scan of valid_files per candidate.
    held_out = set(valid_files)
    train_files = [f for f in file_list if f not in held_out]

    return train_files, valid_files

In [6]:
def get_file_targets(annotation_data, file_list, labels):
    """Compute binary presence targets per file from its annotations.

    For each file and label: if the file has a row whose ``annotator_id``
    converts to 0, the label value of the first such row is used; otherwise
    the label values of all the file's rows are summed. The target is 1.0
    when that number is greater than 0 and 0.0 otherwise. Files without any
    annotation rows get 0.0 for every label.

    Parameters
    ----------
    annotation_data : pandas.DataFrame
        Must contain ``'audio_filename'``, ``'annotator_id'`` and every
        column named in ``labels``.
    file_list : sequence of str
        Files to compute targets for; output rows follow this order.
    labels : sequence of str
        Label column names; output columns follow this order.

    Returns
    -------
    numpy.ndarray
        Array with one row per file and one column per label.
    """
    labels = list(labels)

    # Group the relevant rows once instead of rescanning the whole table for
    # every file. Row order inside each group matches annotation_data.
    relevant = annotation_data[annotation_data['audio_filename'].isin(file_list)]
    rows_by_file = {name: rows for name, rows in relevant.groupby('audio_filename', sort=False)}

    target_list = []
    for filename in file_list:
        file_df = rows_by_file.get(filename)
        if file_df is None or not labels:
            target_list.append([0.0] * len(labels))
            continue

        # A validated annotation overrides the crowd annotations.
        validated = file_df[file_df['annotator_id'].astype(int) == 0]
        validated_row = validated.iloc[0] if len(validated) > 0 else None

        target = []
        for label in labels:
            if validated_row is not None:
                count = validated_row[label]
            else:
                # skipna=False: a missing value makes the sum NaN, which is
                # not > 0, matching plain accumulation of the row values.
                count = file_df[label].sum(skipna=False)
            target.append(1.0 if count > 0 else 0.0)

        target_list.append(target)

    return np.array(target_list)

In [163]:
# Seed the global NumPy RNG so the random splits below are repeatable.
np.random.seed(42)

version = 'v0.4'
cls_list = [
    'engine', 'machinery-impact', 'non-machinery-impact', 'powered-saw',
    'alert-signal', 'music', 'human-voice', 'dog',
]

SONYC_PATH = '/scratch/work/sonyc/sonyc/ust/annotations'
META_FOLDER = '/scratch/sk7898/l3embedding/notebooks/data'
# Versioned data lives in a sub-folder of the metadata folder.
DATA_FOLDER = os.path.join(META_FOLDER, version)

# Metadata attribute columns used to group sensors.
meta_cols = [
    'near_construction', 'on_thoroughfare', 'near_park',
    'near_dogpark', 'near_highway', 'near_commercial', 'nyu_location',
    'nyu_surroundings', 'near_transporthub', 'near_touristspot',
    'bus_route',
]

# For each class: the attribute combinations whose sensors form the test split.
test_grp_dict = {
    'engine': [
        ['nyu_location', 'nyu_surroundings', 'near_touristspot'],
        ['nyu_location', 'near_park', 'near_commercial', 'near_transporthub'],
        ['nyu_location', 'on_thoroughfare', 'near_park', 'near_touristspot', 'bus_route'],
    ],
    'machinery-impact': [
        ['nyu_location', 'nyu_surroundings', 'near_touristspot'],
        ['nyu_location', 'near_park', 'near_commercial', 'near_transporthub'],
        ['nyu_location', 'near_park', 'near_commercial', 'near_transporthub', 'near_touristspot'],
        ['nyu_location', 'on_thoroughfare', 'near_park', 'near_touristspot', 'bus_route'],
    ],
    'non-machinery-impact': [
        ['nyu_location', 'nyu_surroundings', 'near_touristspot'],
        ['nyu_location', 'near_park', 'near_commercial', 'near_transporthub'],
        ['nyu_location', 'near_park', 'near_commercial', 'near_transporthub', 'near_touristspot'],
        ['nyu_location', 'on_thoroughfare', 'near_park', 'near_touristspot', 'bus_route'],
    ],
    'powered-saw': [
        ['nyu_location', 'nyu_surroundings', 'near_touristspot'],
        ['nyu_location', 'near_park', 'near_commercial', 'near_transporthub', 'near_touristspot'],
        ['nyu_location', 'on_thoroughfare', 'near_commercial', 'bus_route'],
        ['nyu_location', 'on_thoroughfare', 'near_park', 'near_touristspot', 'bus_route'],
    ],
    'alert-signal': [
        ['nyu_location', 'nyu_surroundings', 'near_touristspot'],
        ['nyu_location', 'near_commercial'],
        ['nyu_location', 'on_thoroughfare', 'near_park', 'near_touristspot', 'bus_route'],
    ],
    'music': [
        ['nyu_location', 'nyu_surroundings', 'near_touristspot'],
        ['nyu_location', 'near_park', 'near_commercial', 'near_transporthub', 'near_touristspot'],
        ['nyu_location', 'on_thoroughfare', 'near_commercial', 'near_transporthub', 'bus_route'],
    ],
    'human-voice': [
        ['nyu_location', 'near_park', 'near_commercial', 'near_transporthub'],
        ['nyu_location', 'on_thoroughfare', 'near_commercial', 'bus_route'],
        ['nyu_location', 'on_thoroughfare', 'near_park', 'near_touristspot', 'bus_route'],
    ],
    'dog': [
        ['nyu_location', 'on_thoroughfare', 'near_commercial', 'near_transporthub', 'bus_route'],
        ['nyu_location', 'near_park', 'near_commercial', 'near_transporthub', 'near_touristspot'],
        ['nyu_location', 'near_construction', 'near_commercial'],
    ],
}

In [164]:
# Class processed by this run (index 7 of cls_list is 'dog').
cls = cls_list[7]
NEW_ANNOTATION_DIR = os.path.join(DATA_FOLDER, cls)
taxonomy_path = os.path.join(SONYC_PATH, '{}/dcase-ust-taxonomy.yaml'.format(version))
coarse_target_labels = get_coarse_targets(taxonomy_path, cls)
test_grps = test_grp_dict[cls]

# Build the per-sensor metadata FIRST: the merge further down needs meta_df,
# and on a fresh kernel it does not exist until this point.
sensor_df = pd.read_csv(os.path.join(DATA_FOLDER, 'sensor_split_ids_{}.csv'.format(version)))
meta_df = pd.read_csv(os.path.join(META_FOLDER, 'node_meta.csv'))
meta_df = get_meta_df(meta_df, sensor_df, meta_cols, test_grps)

print('Test sensor groups in metadata: ',meta_df[meta_df['new_split'] == 'test']['grp_id'].unique())
print('Train Valid sensor groups in metadata: ', meta_df[meta_df['new_split'] == 'train_val']['grp_id'].unique())

if version == 'v2.2':
    annotation_path = os.path.join(SONYC_PATH, 'latest/annotations_w_test_anns.csv')
else:
    annotation_path = os.path.join(SONYC_PATH, '{}/annotations.csv'.format(version))

# Attach grp_id / new_split (and the other metadata columns) to every
# annotation row through the shared sensor_id.
annotation_data = pd.read_csv(annotation_path).sort_values('audio_filename')
annotation_data = annotation_data.merge(meta_df, on='sensor_id')

Test sensor groups in metadata:  [ 8 11  5]
Train Valid sensor groups in metadata:  [ 3  0  6  7  4  2  1 10  9 12]


In [165]:
# ---- Train/validation pool: split files into positives and negatives ----
train_val = annotation_data[annotation_data['new_split'] == 'train_val']
file_list = train_val.sort_values('audio_filename')['audio_filename'].unique().tolist()
targets = get_file_targets(annotation_data, file_list, coarse_target_labels)

pos_idxs = np.where(targets == 1)[0]
neg_idxs = np.where(targets == 0)[0]
train_val_names = np.array(file_list)
pos_files = train_val_names[pos_idxs].tolist()
neg_files = train_val_names[neg_idxs].tolist()

# Positives are all kept; negatives are subsampled to the number of positives.
pos_train_files, pos_valid_files = get_train_valid_files(
    train_val, pos_files, positive=True, valid_ratio=0.15, n_neg_files=None
)
neg_train_files, neg_valid_files = get_train_valid_files(
    train_val, neg_files, positive=False, valid_ratio=0.15,
    n_neg_files=len(pos_train_files) + len(pos_valid_files)
)
valid_files = pos_valid_files + neg_valid_files
train_files = pos_train_files + neg_train_files

# ---- Test pool: same positive/negative separation ----
is_test = annotation_data['new_split'] == 'test'
test_list = annotation_data[is_test].sort_values('audio_filename')['audio_filename'].unique().tolist()
targets = get_file_targets(annotation_data, test_list, coarse_target_labels)
test_pos_idxs = np.where(targets == 1)[0]
test_neg_idxs = np.where(targets == 0)[0]
test_names = np.array(test_list)
pos_test_files = test_names[test_pos_idxs].tolist()
neg_test_files = test_names[test_neg_idxs].tolist()

# Balance the test set: keep at most as many negatives as positives and drop
# the annotation rows of the negatives that were not selected.
n_neg_files = min(len(pos_test_files), len(neg_test_files))
if n_neg_files < len(neg_test_files):
    reduced_neg_list = np.random.choice(neg_test_files, n_neg_files, replace=False)
    reduced_test_files = pos_test_files + reduced_neg_list.tolist()
    surplus_rows = annotation_data[
        is_test & ~annotation_data['audio_filename'].isin(reduced_test_files)
    ].index
    annotation_data = annotation_data.drop(surplus_rows)

print('Total Train:{} \t#positive: {} \t| #negative: {}'.format(len(train_files), len(pos_train_files), len(neg_train_files)))
print('Total Valid: {} \t#positive: {} \t| #negative: {}'.format(len(valid_files), len(pos_valid_files), len(neg_valid_files)))
print('Total Test : {} \t#positive: {} \t| #negative: {}'.format(len(pos_test_files)+n_neg_files, len(pos_test_files), n_neg_files))

Total Train:250 	#positive: 124 	| #negative: 126
Total Valid: 36 	#positive: 19 	| #negative: 17
Total Test : 30 	#positive: 15 	| #negative: 15


In [166]:
# Relabel the 'train_val' rows: files chosen for training become 'train',
# files chosen for validation become 'validate'. One mask per split replaces
# a full-table scan per file.
pending = annotation_data['new_split'] == 'train_val'
annotation_data.loc[pending & annotation_data['audio_filename'].isin(train_files), 'new_split'] = 'train'

# Recompute the mask so rows that were just labelled 'train' are not relabelled.
pending = annotation_data['new_split'] == 'train_val'
annotation_data.loc[pending & annotation_data['audio_filename'].isin(valid_files), 'new_split'] = 'validate'

# Sanity check: every selected file ended up in exactly one of the two splits.
l1 = len(annotation_data[annotation_data['new_split'] == 'train']['audio_filename'].unique().tolist())
l2 = len(annotation_data[annotation_data['new_split'] == 'validate']['audio_filename'].unique().tolist())
assert (l1 + l2) == (len(train_files) + len(valid_files))

# Rows still marked 'train_val' belong to negatives that were not sampled.
leftover_rows = annotation_data[annotation_data['new_split'] == 'train_val'].index
annotation_data = annotation_data.drop(leftover_rows)

print('Test sensor groups in annotation: ', annotation_data[annotation_data['new_split'] == 'test']['grp_id'].unique())
print('Train sensor groups in annotation: ', annotation_data[annotation_data['new_split'] == 'train']['grp_id'].unique())
# Report the validation rows here (filtering on 'train' just repeats the line above).
print('Valid sensor groups in annotation: ', annotation_data[annotation_data['new_split'] == 'validate']['grp_id'].unique())

Test sensor groups in annotation:  [ 8  5 11]
Train sensor groups in annotation:  [ 6  7  1  0 10  3  2]
Valid sensor groups in annotation:  [ 6  7  1  0 10  3  2]


In [167]:
# exist_ok=True creates the folder only when needed, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(NEW_ANNOTATION_DIR, exist_ok=True)

# Write the re-split annotations for this class.
annotation_path = os.path.join(NEW_ANNOTATION_DIR, 'annotations.csv')
annotation_data.to_csv(annotation_path, index=False)