In [374]:
import pandas as pd
import librosa
import numpy as np
import os,glob
from tqdm import tqdm

# Metrics  
The metrics are based on true positives (TP) and false positives (FP) determined not only by correct or wrong detections, but also based on if they are closer or further than a distance threshold T∘ (angular in our case) from the reference. For the evaluation of this challenge we take this threshold to be T=20∘.  
  
More specifically, for each class c∈[1,...,C] and each frame or segment:  
  
Pc predicted events of class c are associated with Rc reference events of class c  
false negatives are counted for misses: FNc=max(0,Rc−Pc)  
false positives are counted for extraneous predictions: FPc,∞=max(0,Pc−Rc)  
Kc predictions are spatially associated with references based on Hungarian algorithm: Kc=min(Pc,Rc). Those can also be considered as the unthresholded true positives TPc=Kc.  
the spatial threshold is applied which moves Lc≤Kc predictions further than threhold to false positives: FPc,≥20∘=Lc, and FPc=FPc,∞+FPc,≥20∘  
the remaining matched estimates per class are counted as true positives: TPc,≤20∘=Kc−FPc,≥20∘  
finally: predictions Pc=TPc,≤20∘+FPc, but references Rc=TPc,≤20∘+FPc,≥20∘+FNc  
Based on those, we form the location-dependent F1-score (F≤20∘) and Error Rate (ER≤20∘). Contrary to the previous challenges, in which F≤20∘ was micro-averaged, in this challenge we perform macro-averaging of the location-dependent F1-score: F≤20∘=∑cFc,≤20∘/C.  
  
Additionally, we evaluate localization accuracy through a class-dependent localization error LEc, computed as the mean angular error of the matched true positives per class, and then macro-averaged:
  
LEc=∑kθk/Kc=∑kθk/TPc for each frame or segment, with θk being the angular error between the kth matched prediction and re  ference,  
and after averaging across all frames that have any true positives, LECD=∑cLEc/C.
Complementary to the localization error, we compute a localization recall metric per class, also macro-averaged:  

LRc=Kc/Rc=TPc/(TPc+FNc), and
LRCD=∑cLRc/C.
Note that the localization error and recall are not thresholded in order to give more varied complementary information to the location-dependent F1-score, presenting localization accuracy outside of the spatial threshold.

# Ranking
Overall ranking will be based on the cumulative rank of the metrics mentioned above, sorted in ascending order. By cumulative rank we mean the following: if system A was ranked individually for each metric as ER:1,F1:1,LE:3,LR:1, then its cumulative rank is 1+1+3+1=6. Then if system B has ER:3,F1:2,LE:2,LR:3 (10), and system C has ER:2,F1:3,LE:1,LR:2 (8), then the overall rank of the systems is A,C,B. If two systems end up with the same cumulative rank, then they are assumed to have equal place in the challenge, even though they will be listed alphabetically in the ranking tables

In [394]:
root_eval = "/home/data/kbh/DCASE_eval/"

dir_label = "eval_label"
dir_audio = "foa_eval"

list_label = [x for x in glob.glob(os.path.join(root_eval,dir_label,"*.csv"))]

cat = [
    "Female speech, woman speaking", 
    "Male speech, man speaking", 
    "Clapping",
    "Telephone",
    "Laughter",
    "Domestic sounds",
    "Walk, footsteps",
    "Door, open or close",
    "Music",
    "Musical instrument",
    "Water tap, faucet",
    "Bell",
    "Knock"
]

In [395]:
def eval(path_target,dir_sub):
    name_target = path_target.split('/')[-1]
    id_target = name_target.split('.')[0]
    
    path_sub = os.path.join(root_eval,dir_sub,name_target)
    path_aud = os.path.join(root_eval,dir_audio,id_target+".wav")    

    raw,_ = librosa.load(path_aud,sr=24000)
    len_target = len(raw)
    n_label = int(len_target/24000)
    
    #print("Target :  {}".format(id_target))
    #print(n_label)
    
    # Label
    csv_label = pd.read_csv(
        path_target,
        names=["idx","1","2","3","4","5"],
        index_col="idx",
        #dtype=np.int32,
        keep_default_na=False
    )
    label = csv_label[:n_label]
    
    csv_sub   = pd.read_csv(path_sub,names=["idx","1","2","3","4","5"], keep_default_na=False)
    #print("csv_sub")
    #display(csv_sub[15:])
    # submission compression
    
    sub = pd.DataFrame(np.empty((n_label, 5),dtype=str) ,columns = ["1","2","3","4","5"])
    
    #display(csv_sub)
    for i_sub in csv_sub.index:
        idx = csv_sub.loc[i_sub][0]
        cat = csv_sub.loc[i_sub][1]
        
        j_sub = int(idx/10)
        
        if j_sub >= n_label:
            break
        
        # print("{} {} {}".format(idx,j_sub,cat))
        i_src = 0
        
        inserted = False
        for i_src in range(5) : 
            if sub.iloc[j_sub][i_src] == "" :
                sub.iloc[j_sub][i_src] = cat
                inserted = True
                break
            # dup
            elif sub.iloc[j_sub][i_src] == cat:
                inserted = True
                break
                
        if not inserted :
            print("{} sub[{}] more than 5 soruce".format(id_target,j_sub))
    
    #print("sub")
    #display(sub)
    
    ## Eval
    
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    
    TP_cat = np.zeros(13)
    FP_cat = np.zeros(13)
    FN_cat = np.zeros(13)
    TN_cat = np.zeros(13)
    
    for i_label in range(n_label) : 
        #print("=====")
        s_label = label.iloc[i_label]
        s_sub   = sub.iloc[i_label]
        
        #print("label s: \n{}".format(s_label))
        #print("submt s: \n{}".format(s_sub))
        
        l_label = list(filter(None, s_label))
        #l_sub = list(filter(None, sub.iloc[i_label])) => also remove 0 value
        
        l_sub = [i for i in s_sub if i != ""]
        
        #print("label l: {}".format(l_label))
        #print("submt l: {}".format(l_sub))
        
        # Exception
        
        if " " in l_label : 
            l_label = l_label.remove(" ")
        if l_label is None : 
            l_label = []
        if "  " in l_label :
            l_label = l_label.remove("  ")  
        if l_label is None : 
            l_label = []
        
        if len(l_label) == 0 and len(l_sub) == 0 :
            TN +=1
            TN_cat[:] +=1
            continue
            
        l_label = list(map(int, l_label))

        ## True
        for iter_sub in l_sub : 
            if iter_sub in l_label : 
                TP+=1
                TP_cat[iter_sub] +=1
            else : 
                FP+=1
                FP_cat[iter_sub] +=1
                
        for iter_label in l_label :
            if iter_label not in l_sub:
                FN+=1
                FN_cat[iter_label] +=1

    recall = TP/(TP+FN)
    f1 = (2*TP)/(2*TP+FP+FN)
    acc = (TP+TN)/(TP+TN+FP+FN)
                
    """       
    print("    |  PP   |   PN ")
    print(" P  | {:2d}    |   {:2d}".format(TP,FN))
    print(" N  | {:2d}    |   {:2d}".format(FP, TN))
    print("f1-score {:.3f}".format(f1))
    print("accuracy {:.3f}".format(acc))
    print("recall   {:.3f}".format(recall))
    """

    return recall,f1,acc, TP_cat,FP_cat,FN_cat,TN_cat

In [391]:
t_recall, t_f1, t_acc,t_TP,t_FP,t_FN,t_TN = eval("/home/data/kbh/DCASE_eval/eval_label/mix020.csv","submission-665")
print(t_TP)

[54.  0.  0.  3.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [396]:
def eval_model(dir_sub) : 
    list_label = [x for x in glob.glob(os.path.join(root_eval,dir_label,"*.csv"))]

    recall = 0.0
    f1 = 0.0
    acc = 0.0
    
    min_f1 = 10
    max_f1 = 0
    min_path = ""
    max_path = ""
    
    TP_cat = np.zeros(13)
    FP_cat = np.zeros(13)
    FN_cat = np.zeros(13)
    TN_cat = np.zeros(13)
        
    
    for path in tqdm(list_label) : 
        t_recall, t_f1, t_acc,t_TP,t_FP,t_FN,t_TN = eval(path,dir_sub)
        
        if t_f1 > max_f1 : 
            max_f1 = t_f1
            max_path = path
        
        if t_f1 < min_f1 : 
            min_f1 = t_f1
            min_path = path
        
        TP_cat += t_TP
        FP_cat += t_FP
        FN_cat += t_FN
        TN_cat += t_TN
        
        
        recall += t_recall
        f1 += t_f1
        acc += t_acc

    n_target = len(list_label)

    print(dir_sub)
    print("n_target : {}".format(n_target))
    print("f1-score {:.3f}".format(f1/n_target))
    print("accuracy {:.3f}".format(acc/n_target))
    print("recall   {:.3f}".format(recall/n_target))
    print("max f1 {:.3f} at {}".format(max_f1,max_path))
    print("min f1 {:.3f} at {}".format(min_f1,min_path))
    print("--------------------------")
    for i in range(13):
        print("f1[{}] : {:.6f}".format(cat[i],(2*TP_cat[i])/(2*TP_cat[i]+FP_cat[i]+FN_cat[i])))
    print("--------------------------")
    for i in range(13):
        print("acc[{}] : {:.6f}".format(cat[i],(TP_cat[i]+TN_cat[i])/(TP_cat[i]+TN_cat[i]+FP_cat[i]+FN_cat[i])))
    print("--------------------------")
    for i in range(13):
        print("recall[{}] : {:.6f}".format(cat[i],(TP_cat[i]/(TP_cat[i]+FN_cat[i]))))

In [402]:
eval_model("13_627")

100%|███████████████████████████████████████████████| 52/52 [00:18<00:00,  2.84it/s]

13_627
n_target : 52
f1-score 0.697
accuracy 0.582
recall   0.706
max f1 1.000 at /home/data/kbh/DCASE_eval/eval_label/mix036.csv
min f1 0.042 at /home/data/kbh/DCASE_eval/eval_label/mix041.csv
--------------------------
f1[Female speech, woman speaking] : 0.704913
f1[Male speech, man speaking] : 0.793070
f1[Clapping] : 0.597222
f1[Telephone] : 0.460317
f1[Laughter] : 0.397638
f1[Domestic sounds] : 0.696477
f1[Walk, footsteps] : 0.319728
f1[Door, open or close] : 0.202899
f1[Music] : 0.730315
f1[Musical instrument] : 0.300000
f1[Water tap, faucet] : 0.021978
f1[Bell] : 0.673740
f1[Knock] : 0.238095
--------------------------
acc[Female speech, woman speaking] : 0.620387
acc[Male speech, man speaking] : 0.694240
acc[Clapping] : 0.896057
acc[Telephone] : 0.877256
acc[Laughter] : 0.645833
acc[Domestic sounds] : 0.646316
acc[Walk, footsteps] : 0.715909
acc[Door, open or close] : 0.894027
acc[Music] : 0.656355
acc[Musical instrument] : 0.523020
acc[Water tap, faucet] : 0.837294
acc[Bell] : 




In [403]:
#eval_model("submission-665")
#eval_model("3track-submission652")
#eval_model("7_ER641_submissions")
#eval_model("new_7_641_submissions")

eval_model("new_13_627")

100%|███████████████████████████████████████████████| 52/52 [00:18<00:00,  2.84it/s]

new_13_627
n_target : 52
f1-score 0.697
accuracy 0.581
recall   0.705
max f1 1.000 at /home/data/kbh/DCASE_eval/eval_label/mix036.csv
min f1 0.042 at /home/data/kbh/DCASE_eval/eval_label/mix041.csv
--------------------------
f1[Female speech, woman speaking] : 0.704746
f1[Male speech, man speaking] : 0.793197
f1[Clapping] : 0.597222
f1[Telephone] : 0.460317
f1[Laughter] : 0.392857
f1[Domestic sounds] : 0.689373
f1[Walk, footsteps] : 0.319728
f1[Door, open or close] : 0.202899
f1[Music] : 0.730315
f1[Musical instrument] : 0.301221
f1[Water tap, faucet] : 0.021978
f1[Bell] : 0.670241
f1[Knock] : 0.238095
--------------------------
acc[Female speech, woman speaking] : 0.620249
acc[Male speech, man speaking] : 0.694405
acc[Clapping] : 0.896057
acc[Telephone] : 0.877256
acc[Laughter] : 0.645012
acc[Domestic sounds] : 0.640000
acc[Walk, footsteps] : 0.715909
acc[Door, open or close] : 0.894027
acc[Music] : 0.656355
acc[Musical instrument] : 0.524469
acc[Water tap, faucet] : 0.837294
acc[Bell




In [404]:
eval_model("16_616")

100%|███████████████████████████████████████████████| 52/52 [00:19<00:00,  2.71it/s]

16_616
n_target : 52
f1-score 0.691
accuracy 0.580
recall   0.706
max f1 1.000 at /home/data/kbh/DCASE_eval/eval_label/mix036.csv
min f1 0.227 at /home/data/kbh/DCASE_eval/eval_label/mix018.csv
--------------------------
f1[Female speech, woman speaking] : 0.644556
f1[Male speech, man speaking] : 0.786447
f1[Clapping] : 0.478632
f1[Telephone] : 0.427350
f1[Laughter] : 0.445026
f1[Domestic sounds] : 0.704770
f1[Walk, footsteps] : 0.204380
f1[Door, open or close] : 0.322581
f1[Music] : 0.715291
f1[Musical instrument] : 0.346032
f1[Water tap, faucet] : 0.000000
f1[Bell] : 0.491909
f1[Knock] : 0.265060
--------------------------
acc[Female speech, woman speaking] : 0.572932
acc[Male speech, man speaking] : 0.687585
acc[Clapping] : 0.895369
acc[Telephone] : 0.885666
acc[Laughter] : 0.731985
acc[Domestic sounds] : 0.660631
acc[Walk, footsteps] : 0.705405
acc[Door, open or close] : 0.923077
acc[Music] : 0.635676
acc[Musical instrument] : 0.594089
acc[Water tap, faucet] : 0.847341
acc[Bell] : 




In [406]:
eval_model("ER615_real")

100%|███████████████████████████████████████████████| 52/52 [00:18<00:00,  2.74it/s]

ER615_real
n_target : 52
f1-score 0.686
accuracy 0.582
recall   0.682
max f1 1.000 at /home/data/kbh/DCASE_eval/eval_label/mix036.csv
min f1 0.138 at /home/data/kbh/DCASE_eval/eval_label/mix020.csv
--------------------------
f1[Female speech, woman speaking] : 0.587963
f1[Male speech, man speaking] : 0.777210
f1[Clapping] : 0.597015
f1[Telephone] : 0.432000
f1[Laughter] : 0.397820
f1[Domestic sounds] : 0.684355
f1[Walk, footsteps] : 0.277966
f1[Door, open or close] : 0.140845
f1[Music] : 0.776935
f1[Musical instrument] : 0.293548
f1[Water tap, faucet] : 0.000000
f1[Bell] : 0.686016
f1[Knock] : 0.200000
--------------------------
acc[Female speech, woman speaking] : 0.530343
acc[Male speech, man speaking] : 0.677860
acc[Clapping] : 0.911765
acc[Telephone] : 0.884740
acc[Laughter] : 0.727833
acc[Domestic sounds] : 0.647239
acc[Walk, footsteps] : 0.724093
acc[Door, open or close] : 0.895548
acc[Music] : 0.709830
acc[Musical instrument] : 0.581662
acc[Water tap, faucet] : 0.853377
acc[Bell




In [409]:
eval_model("ER616_Real")

100%|███████████████████████████████████████████████| 52/52 [00:19<00:00,  2.64it/s]

ER616_Real
n_target : 52
f1-score 0.693
accuracy 0.581
recall   0.708
max f1 1.000 at /home/data/kbh/DCASE_eval/eval_label/mix036.csv
min f1 0.227 at /home/data/kbh/DCASE_eval/eval_label/mix018.csv
--------------------------
f1[Female speech, woman speaking] : 0.645625
f1[Male speech, man speaking] : 0.786203
f1[Clapping] : 0.478632
f1[Telephone] : 0.427350
f1[Laughter] : 0.451948
f1[Domestic sounds] : 0.714605
f1[Walk, footsteps] : 0.204380
f1[Door, open or close] : 0.322581
f1[Music] : 0.715090
f1[Musical instrument] : 0.344937
f1[Water tap, faucet] : 0.000000
f1[Bell] : 0.498403
f1[Knock] : 0.265060
--------------------------
acc[Female speech, woman speaking] : 0.573844
acc[Male speech, man speaking] : 0.687273
acc[Clapping] : 0.895369
acc[Telephone] : 0.885666
acc[Laughter] : 0.733586
acc[Domestic sounds] : 0.669426
acc[Walk, footsteps] : 0.705405
acc[Door, open or close] : 0.923077
acc[Music] : 0.635447
acc[Musical instrument] : 0.592920
acc[Water tap, faucet] : 0.847341
acc[Bell




In [412]:
eval_model("616_postprocess")

100%|███████████████████████████████████████████████| 52/52 [00:20<00:00,  2.56it/s]

616_postprocess
n_target : 52
f1-score 0.702
accuracy 0.590
recall   0.730
max f1 1.000 at /home/data/kbh/DCASE_eval/eval_label/mix036.csv
min f1 0.227 at /home/data/kbh/DCASE_eval/eval_label/mix018.csv
--------------------------
f1[Female speech, woman speaking] : 0.700055
f1[Male speech, man speaking] : 0.786203
f1[Clapping] : 0.478632
f1[Telephone] : 0.427350
f1[Laughter] : 0.467337
f1[Domestic sounds] : 0.714605
f1[Walk, footsteps] : 0.204380
f1[Door, open or close] : 0.322581
f1[Music] : 0.715090
f1[Musical instrument] : 0.363309
f1[Water tap, faucet] : 0.000000
f1[Bell] : 0.498403
f1[Knock] : 0.171429
--------------------------
acc[Female speech, woman speaking] : 0.617906
acc[Male speech, man speaking] : 0.686846
acc[Clapping] : 0.894281
acc[Telephone] : 0.884483
acc[Laughter] : 0.732661
acc[Domestic sounds] : 0.668396
acc[Walk, footsteps] : 0.702997
acc[Door, open or close] : 0.922222
acc[Music] : 0.634657
acc[Musical instrument] : 0.624602
acc[Water tap, faucet] : 0.845754
acc




In [415]:
eval_model("ER615_real")

100%|███████████████████████████████████████████████| 52/52 [00:19<00:00,  2.71it/s]

ER615_real
n_target : 52
f1-score 0.686
accuracy 0.582
recall   0.682
max f1 1.000 at /home/data/kbh/DCASE_eval/eval_label/mix036.csv
min f1 0.138 at /home/data/kbh/DCASE_eval/eval_label/mix020.csv
--------------------------
f1[Female speech, woman speaking] : 0.587963
f1[Male speech, man speaking] : 0.777210
f1[Clapping] : 0.597015
f1[Telephone] : 0.432000
f1[Laughter] : 0.397820
f1[Domestic sounds] : 0.684355
f1[Walk, footsteps] : 0.277966
f1[Door, open or close] : 0.140845
f1[Music] : 0.776935
f1[Musical instrument] : 0.293548
f1[Water tap, faucet] : 0.000000
f1[Bell] : 0.686016
f1[Knock] : 0.200000
--------------------------
acc[Female speech, woman speaking] : 0.530343
acc[Male speech, man speaking] : 0.677860
acc[Clapping] : 0.911765
acc[Telephone] : 0.884740
acc[Laughter] : 0.727833
acc[Domestic sounds] : 0.647239
acc[Walk, footsteps] : 0.724093
acc[Door, open or close] : 0.895548
acc[Music] : 0.709830
acc[Musical instrument] : 0.581662
acc[Water tap, faucet] : 0.853377
acc[Bell




In [416]:
eval_model("616 postprocess_v2")

100%|███████████████████████████████████████████████| 52/52 [00:21<00:00,  2.45it/s]

616 postprocess_v2
n_target : 52
f1-score 0.705
accuracy 0.592
recall   0.739
max f1 1.000 at /home/data/kbh/DCASE_eval/eval_label/mix036.csv
min f1 0.246 at /home/data/kbh/DCASE_eval/eval_label/mix041.csv
--------------------------
f1[Female speech, woman speaking] : 0.700055
f1[Male speech, man speaking] : 0.786203
f1[Clapping] : 0.478632
f1[Telephone] : 0.427350
f1[Laughter] : 0.467337
f1[Domestic sounds] : 0.710425
f1[Walk, footsteps] : 0.204380
f1[Door, open or close] : 0.322581
f1[Music] : 0.715090
f1[Musical instrument] : 0.363309
f1[Water tap, faucet] : 0.000000
f1[Bell] : 0.626703
f1[Knock] : 0.171429
--------------------------
acc[Female speech, woman speaking] : 0.616283
acc[Male speech, man speaking] : 0.685988
acc[Clapping] : 0.892035
acc[Telephone] : 0.882042
acc[Laughter] : 0.728553
acc[Domestic sounds] : 0.658919
acc[Walk, footsteps] : 0.698061
acc[Door, open or close] : 0.920455
acc[Music] : 0.633067
acc[Musical instrument] : 0.619764
acc[Water tap, faucet] : 0.842478





In [417]:
eval_model("615_postprocess_v1")

 73%|██████████████████████████████████▎            | 38/52 [00:16<00:06,  2.08it/s]

mix052 sub[61] more than 5 soruce


100%|███████████████████████████████████████████████| 52/52 [00:21<00:00,  2.43it/s]

615_postprocess_v1
n_target : 52
f1-score 0.703
accuracy 0.593
recall   0.737
max f1 1.000 at /home/data/kbh/DCASE_eval/eval_label/mix036.csv
min f1 0.202 at /home/data/kbh/DCASE_eval/eval_label/mix018.csv
--------------------------
f1[Female speech, woman speaking] : 0.695264
f1[Male speech, man speaking] : 0.777210
f1[Clapping] : 0.600000
f1[Telephone] : 0.432000
f1[Laughter] : 0.393443
f1[Domestic sounds] : 0.684355
f1[Walk, footsteps] : 0.359897
f1[Door, open or close] : 0.139535
f1[Music] : 0.776935
f1[Musical instrument] : 0.273077
f1[Water tap, faucet] : 0.000000
f1[Bell] : 0.686016
f1[Knock] : 0.202899
--------------------------
acc[Female speech, woman speaking] : 0.608474
acc[Male speech, man speaking] : 0.677137
acc[Clapping] : 0.913189
acc[Telephone] : 0.882838
acc[Laughter] : 0.723192
acc[Domestic sounds] : 0.645427
acc[Walk, footsteps] : 0.698912
acc[Door, open or close] : 0.932482
acc[Music] : 0.708680
acc[Musical instrument] : 0.605016
acc[Water tap, faucet] : 0.850921





In [418]:
eval_model("615_v2")

 73%|██████████████████████████████████▎            | 38/52 [00:17<00:06,  2.02it/s]

mix052 sub[61] more than 5 soruce


100%|███████████████████████████████████████████████| 52/52 [00:21<00:00,  2.40it/s]

615_v2
n_target : 52
f1-score 0.704
accuracy 0.593
recall   0.741
max f1 1.000 at /home/data/kbh/DCASE_eval/eval_label/mix036.csv
min f1 0.202 at /home/data/kbh/DCASE_eval/eval_label/mix018.csv
--------------------------
f1[Female speech, woman speaking] : 0.695264
f1[Male speech, man speaking] : 0.777210
f1[Clapping] : 0.600000
f1[Telephone] : 0.432000
f1[Laughter] : 0.393443
f1[Domestic sounds] : 0.684355
f1[Walk, footsteps] : 0.359897
f1[Door, open or close] : 0.139535
f1[Music] : 0.776935
f1[Musical instrument] : 0.273077
f1[Water tap, faucet] : 0.000000
f1[Bell] : 0.707965
f1[Knock] : 0.202899
--------------------------
acc[Female speech, woman speaking] : 0.608474
acc[Male speech, man speaking] : 0.677137
acc[Clapping] : 0.913189
acc[Telephone] : 0.882838
acc[Laughter] : 0.723192
acc[Domestic sounds] : 0.645427
acc[Walk, footsteps] : 0.698912
acc[Door, open or close] : 0.932482
acc[Music] : 0.708680
acc[Musical instrument] : 0.605016
acc[Water tap, faucet] : 0.850921
acc[Bell] : 


