In [210]:
import pandas as pd
import librosa
import numpy as np
import os,glob
from tqdm import tqdm

# Metrics  
The metrics are based on true positives (TP) and false positives (FP) determined not only by correct or wrong detections, but also based on if they are closer or further than a distance threshold T∘ (angular in our case) from the reference. For the evaluation of this challenge we take this threshold to be T=20∘.  
  
More specifically, for each class c∈[1,...,C] and each frame or segment:  
  
Pc predicted events of class c are associated with Rc reference events of class c  
false negatives are counted for misses: FNc=max(0,Rc−Pc)  
false positives are counted for extraneous predictions: FPc,∞=max(0,Pc−Rc)  
Kc predictions are spatially associated with references based on Hungarian algorithm: Kc=min(Pc,Rc). Those can also be considered as the unthresholded true positives TPc=Kc.  
the spatial threshold is applied which moves Lc≤Kc predictions further than threhold to false positives: FPc,≥20∘=Lc, and FPc=FPc,∞+FPc,≥20∘  
the remaining matched estimates per class are counted as true positives: TPc,≤20∘=Kc−FPc,≥20∘  
finally: predictions Pc=TPc,≤20∘+FPc, but references Rc=TPc,≤20∘+FPc,≥20∘+FNc  
Based on those, we form the location-dependent F1-score (F≤20∘) and Error Rate (ER≤20∘). Contrary to the previous challenges, in which F≤20∘ was micro-averaged, in this challenge we perform macro-averaging of the location-dependent F1-score: F≤20∘=∑cFc,≤20∘/C.  
  
Additionally, we evaluate localization accuracy through a class-dependent localization error LEc, computed as the mean angular error of the matched true positives per class, and then macro-averaged:
  
LEc=∑kθk/Kc=∑kθk/TPc for each frame or segment, with θk being the angular error between the kth matched prediction and re  ference,  
and after averaging across all frames that have any true positives, LECD=∑cLEc/C.
Complementary to the localization error, we compute a localization recall metric per class, also macro-averaged:  

LRc=Kc/Rc=TPc/(TPc+FNc), and
LRCD=∑cLRc/C.
Note that the localization error and recall are not thresholded in order to give more varied complementary information to the location-dependent F1-score, presenting localization accuracy outside of the spatial threshold.

# Ranking
Overall ranking will be based on the cumulative rank of the metrics mentioned above, sorted in ascending order. By cumulative rank we mean the following: if system A was ranked individually for each metric as ER:1,F1:1,LE:3,LR:1, then its cumulative rank is 1+1+3+1=6. Then if system B has ER:3,F1:2,LE:2,LR:3 (10), and system C has ER:2,F1:3,LE:1,LR:2 (8), then the overall rank of the systems is A,C,B. If two systems end up with the same cumulative rank, then they are assumed to have equal place in the challenge, even though they will be listed alphabetically in the ranking tables

In [262]:
root_eval = "/home/data/kbh/DCASE_eval/"

dir_label = "eval_label"
#dir_sub = "submission-665"
dir_sub = "3track-submission652"
dir_audio = "foa_eval"

list_label = [x for x in glob.glob(os.path.join(root_eval,dir_label,"*.csv"))]

In [263]:
def eval(path_target):
    name_target = path_target.split('/')[-1]
    id_target = name_target.split('.')[0]
    
    path_sub = os.path.join(root_eval,dir_sub,name_target)
    path_aud = os.path.join(root_eval,dir_audio,id_target+".wav")
    
    raw,_ = librosa.load(path_aud,sr=24000)
    len_target = len(raw)
    n_label = int(len_target/24000)
    
    #print("Target :  {}".format(id_target))
    #print(n_label)
    
    # Label
    csv_label = pd.read_csv(
        path_target,
        names=["idx","1","2","3","4","5"],
        index_col="idx",
        #dtype=np.int32,
        keep_default_na=False,
    )
    label = csv_label[:n_label]
    
    csv_sub   = pd.read_csv(path_sub)
    # submission compression
    
    sub = pd.DataFrame(np.empty((n_label, 5),dtype=str) ,columns = ["1","2","3","4","5"])
    
    #display(csv_sub)
    for i_sub in csv_sub.index:
        idx = csv_sub.loc[i_sub][0]
        cat = csv_sub.loc[i_sub][1]
        
        j_sub = int(idx/10)
        
        if j_sub >= n_label:
            break
        
        # print("{} {} {}".format(idx,j_sub,cat))
        i_src = 0
        
        inserted = False
        for i_src in range(5) : 
            if sub.iloc[j_sub][i_src] == "" :
                sub.iloc[j_sub][i_src] = cat
                inserted = True
                break
            # dup
            elif sub.iloc[j_sub][i_src] == cat:
                inserted = True
                break
                
        if not inserted :
            print("{} sub[{}] more than 5 soruce".format(id_target,j_sub))
    
    
    ## Eval
    
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    
    for i_label in range(n_label) : 
        s_label = label.iloc[i_label]
        s_sub   = sub.iloc[i_label]
        
        l_label = list(filter(None, label.iloc[i_label]))
        l_sub = list(filter(None, sub.iloc[i_label]))
        
        # Exception
        
        if " " in l_label : 
            l_label = l_label.remove(" ")
            if l_label is None : 
                l_label = []
        
        if len(l_label) == 0 and len(l_sub) == 0 :
            TN +=1
            continue
            
        l_label = list(map(int, l_label))

        #print("=====")
        #print("label : {}".format(l_label))
        #print("submt : {}".format(l_sub))
        
        ## True
        for iter_sub in l_sub : 
            if iter_sub in l_label : 
                TP+=1
            else : 
                FP+=1
                
        for iter_label in l_label :
            if iter_label not in l_sub:
                FN+=1

    recall = TP/(TP+FN)
    f1 = (2*TP)/(2*TP+FP+FN)
    acc = (TP+TN)/(TP+TN+FP+FN)
                
    """       
    print("    |  PP   |   PN ")
    print(" P  | {:2d}    |   {:2d}".format(TP,FN))
    print(" N  | {:2d}    |   {:2d}".format(FP, TN))
    print("f1-score {:.3f}".format(f1))
    print("accuracy {:.3f}".format(acc))
    print("recall   {:.3f}".format(recall))
    """
    
    return recall,f1,acc

In [266]:
list_label = [x for x in glob.glob(os.path.join(root_eval,dir_label,"*.csv"))]

recall = 0.0
f1 = 0.0
acc = 0.0

#print(eval("/home/data/kbh/DCASE_eval/eval_label/mix025.csv"))


for path in tqdm(list_label) : 
    t_recall, t_f1, t_acc = eval(path)
    recall += t_recall
    f1 += t_f1
    acc += t_acc

        
n_target = len(list_label)

print(dir_sub)
print("n_target : {}".format(n_target))
    
print("f1-score {:.3f}".format(f1/n_target))
print("accuracy {:.3f}".format(acc/n_target))
print("recall   {:.3f}".format(recall/n_target))

100%|██████████████████████████████████████████████████████| 29/29 [00:08<00:00,  3.43it/s]

3track-submission652
n_target : 29
f1-score 0.625
accuracy 0.542
recall   0.587



