In [1]:
import pandas as pd
import os
from pyannote.core import Timeline, Segment, Annotation

from inaGVAD.vad_metrics import VadEval, csv2annot, annot2vad


In [2]:
class VadFaEval(VadEval):
    """VAD evaluator that measures false alarms inside annotated non-speech regions.

    The prediction is scored against an empty reference, once per non-speech
    category, with the evaluation restricted to the segments flagged with
    that category in the reference CSV.
    """

    def compare_csv(self, ref_csv, pred_csv, reset=False, categories=None):
        """Score one prediction file against one reference file, per category.

        Parameters
        ----------
        ref_csv : path of the reference CSV. It is read with ``pd.read_csv`` and
            must provide ``start`` and ``stop`` columns plus one column per
            category, whose rows are selected with ``== True``.
        pred_csv : path of the prediction CSV, loaded through ``csv2annot`` and
            ``annot2vad``.
        reset : accepted for signature compatibility; not used by this method.
        categories : iterable of category column names to evaluate. Defaults to
            the module-level ``nonspeech`` list, looked up at call time.

        Returns
        -------
        dict keyed by ``(category, 'total')`` and ``(category, 'fa')``:
        ``'total'`` is the duration of the category's merged segments and
        ``'fa'`` is the ``'false positive'`` value returned by the evaluation
        restricted to those segments (presumably in the same time unit as the
        ``start``/``stop`` columns -- TODO confirm).
        """
        if categories is None:
            # Resolved lazily so the list may be defined after this class,
            # as long as it exists before the first call.
            categories = nonspeech

        predannot = annot2vad(csv2annot(pred_csv))
        # Empty reference: nothing is annotated as speech, so whatever the
        # system detects inside the evaluated regions counts as false positive.
        refannot = Annotation()

        ref = pd.read_csv(ref_csv)
        dret = {}
        for cat in categories:
            flagged = ref[ref[cat] == True]
            # Evaluation window: merged extent of every segment flagged with `cat`.
            uem = Timeline(
                Segment(t.start, t.stop) for t in flagged.itertuples()
            ).support()
            ret = self(refannot, predannot, uem)
            dret[(cat, 'total')] = uem.duration()
            dret[(cat, 'fa')] = ret['false positive']
        return dret


# Shared evaluator instance used by the cells below; `collar` is forwarded to
# the VadEval constructor.
ve = VadFaEval(collar=0.3)

In [3]:
# Non-speech categories: column names of the reference CSV files, each one
# reported as a column of the false-alarm table.
nonspeech = [
    'applause',
    'noise',
    'hubbub',
    'jingle',
    'fg_music',
    'bg_music',
    'respiration',
    'laughers',
    'other',
    'empty',
]

# Evaluated systems: names of the sub-directories of 'automatic_baselines/'
# holding each system's predictions.
lsystems = [
    'inaspeechsegmenter',
    'liumspkdiarization_csv',
    'pyannote_vad',
    'rvad_csv',
    'silero_vad',
    'speechbrain_vad',
]

In [4]:
def myeval(rdir, pdir):
    """Compute per-category false-alarm percentages for one system.

    Every file found in `rdir` is compared with the file of the same name in
    `pdir` through ``ve.compare_csv``; the per-file results are summed before
    the ratio is taken, so the percentages are computed over the whole corpus.

    Parameters
    ----------
    rdir : directory containing the reference CSV files.
    pdir : directory containing the predictions of one system, using the same
        file names as `rdir`.

    Returns
    -------
    dict with key ``'system'`` (set to `pdir`) and one key per entry of
    ``nonspeech`` holding ``100 * fa / total`` for that category.
    """
    ld = []
    # Sorted so files are processed (and summed) in a reproducible order;
    # os.listdir returns entries in arbitrary order.
    for f in sorted(os.listdir(rdir)):
        ref_path = os.path.join(rdir, f)
        if not os.path.isfile(ref_path):
            # Skip sub-directories (e.g. notebook checkpoint folders).
            continue
        # os.path.join works whether or not the directories end with a separator.
        ld.append(ve.compare_csv(ref_path, os.path.join(pdir, f)))
    stats = pd.DataFrame(ld).sum()
    d = {'system': pdir}
    for cat in nonspeech:
        d[cat] = 100 * stats[(cat, 'fa')] / stats[(cat, 'total')]
    return d

In [8]:
# One row of false-alarm percentages per baseline system. A comprehension is
# used so that no loop variable (previously named `sys`, like the stdlib
# module) is left behind in the notebook namespace.
ld = [
    myeval('./annotations/detailed_csv/', 'automatic_baselines/' + system + '/')
    for system in lsystems
]
df = pd.DataFrame(ld).set_index('system')
df

Unnamed: 0_level_0,applause,noise,hubbub,jingle,fg_music,bg_music,respiration,laughers,other,empty
system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
automatic_baselines/inaspeechsegmenter/,15.43371,35.589857,35.950409,8.105043,1.509349,25.895633,95.051145,77.006912,21.973769,45.742934
automatic_baselines/liumspkdiarization_csv/,28.01382,33.957232,35.218788,26.209908,31.051727,32.741995,95.993088,65.912187,22.174443,66.476826
automatic_baselines/pyannote_vad/,7.506705,21.893214,30.829815,5.001881,27.709654,21.222281,88.955646,63.072211,35.913848,41.539049
automatic_baselines/rvad_csv/,33.438827,33.771815,60.816923,83.775627,91.478572,53.774784,41.12048,85.735711,68.293557,21.737731
automatic_baselines/silero_vad/,10.681225,25.862606,29.770845,20.121973,41.123387,25.681731,63.446152,76.040794,33.359612,35.384863
automatic_baselines/speechbrain_vad/,30.329397,35.107573,39.993123,11.892817,9.293187,28.98036,97.73686,72.700005,21.237961,65.962379


In [16]:
# Summary rows computed column-wise over the per-system table `df`:
#   MEAN - average across systems
#   DIFF - spread across systems (max - min)
ld = [
    {'system': 'MEAN', **df.mean()},
    {'system': 'DIFF', **(df.max() - df.min())},
]
dmean = pd.DataFrame(ld).set_index('system')

# Per-system rows followed by the summary rows.
dall = pd.concat([df, dmean])
dall

Unnamed: 0_level_0,applause,noise,hubbub,jingle,fg_music,bg_music,respiration,laughers,other,empty
system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
automatic_baselines/inaspeechsegmenter/,15.43371,35.589857,35.950409,8.105043,1.509349,25.895633,95.051145,77.006912,21.973769,45.742934
automatic_baselines/liumspkdiarization_csv/,28.01382,33.957232,35.218788,26.209908,31.051727,32.741995,95.993088,65.912187,22.174443,66.476826
automatic_baselines/pyannote_vad/,7.506705,21.893214,30.829815,5.001881,27.709654,21.222281,88.955646,63.072211,35.913848,41.539049
automatic_baselines/rvad_csv/,33.438827,33.771815,60.816923,83.775627,91.478572,53.774784,41.12048,85.735711,68.293557,21.737731
automatic_baselines/silero_vad/,10.681225,25.862606,29.770845,20.121973,41.123387,25.681731,63.446152,76.040794,33.359612,35.384863
automatic_baselines/speechbrain_vad/,30.329397,35.107573,39.993123,11.892817,9.293187,28.98036,97.73686,72.700005,21.237961,65.962379
MEAN,20.900614,31.030383,38.763317,25.851208,33.694313,31.382797,80.383895,73.411303,33.825532,46.14063
DIFF,25.932122,13.696643,31.046078,78.773746,89.969222,32.552503,56.61638,22.663501,47.055596,44.739095


In [14]:
# Render the combined table as LaTeX with one decimal per value. Every
# underscore is removed from the generated text, so labels such as 'fg_music'
# are printed as 'fgmusic'.
print(
    dall
    .to_latex(float_format='%.1f')
    .replace('_', '')
)

\begin{tabular}{lrrrrrrrrrr}
\toprule
 & applause & noise & hubbub & jingle & fgmusic & bgmusic & respiration & laughers & other & empty \\
system &  &  &  &  &  &  &  &  &  &  \\
\midrule
automaticbaselines/inaspeechsegmenter/ & 15.4 & 35.6 & 36.0 & 8.1 & 1.5 & 25.9 & 95.1 & 77.0 & 22.0 & 45.7 \\
automaticbaselines/liumspkdiarizationcsv/ & 28.0 & 34.0 & 35.2 & 26.2 & 31.1 & 32.7 & 96.0 & 65.9 & 22.2 & 66.5 \\
automaticbaselines/pyannotevad/ & 7.5 & 21.9 & 30.8 & 5.0 & 27.7 & 21.2 & 89.0 & 63.1 & 35.9 & 41.5 \\
automaticbaselines/rvadcsv/ & 33.4 & 33.8 & 60.8 & 83.8 & 91.5 & 53.8 & 41.1 & 85.7 & 68.3 & 21.7 \\
automaticbaselines/silerovad/ & 10.7 & 25.9 & 29.8 & 20.1 & 41.1 & 25.7 & 63.4 & 76.0 & 33.4 & 35.4 \\
automaticbaselines/speechbrainvad/ & 30.3 & 35.1 & 40.0 & 11.9 & 9.3 & 29.0 & 97.7 & 72.7 & 21.2 & 66.0 \\
MEAN & 20.9 & 31.0 & 38.8 & 25.9 & 33.7 & 31.4 & 80.4 & 73.4 & 33.8 & 46.1 \\
\bottomrule
\end{tabular}

