In [1]:
import pandas as pd
import glob
import os
from pyannote.metrics.base import BaseMetric
from inaGVAD.gender_metrics import WstpErr, IdentificationErrorRateLabel
from pyannote.metrics.identification import IdentificationErrorRate

In [2]:
from pyannote.core import Annotation, Timeline, Segment

def df2annot(df, col, rmnan=True, uri=None):
    an = Annotation(uri=uri)
#    print('df2annot', df)
    for start, stop, val in zip(df.start, df.stop, df[col]):
        if rmnan and val != val:
            continue
        seg = Segment(start, stop)
        an[seg] = val
    return an.support()


def init_uem(df):
    uem = Timeline()
    uem.add(Segment(df.start[0], df.stop[len(df) - 1]))
    return uem


def rm_uem(uem, df, col, rmlist):
    for start, stop, val in zip(df.start, df.stop, df[col]):
        if val in rmlist:
            uem = uem.extrude(Segment(start, stop))
    return uem

def keep_uem(uem, df, col, keeplist):
    for start, stop, val in zip(df.start, df.stop, df[col]):
        if val not in keeplist:
            uem = uem.extrude(Segment(start, stop))
    return uem


class GenderEval:
    def __init__(self, collar=.3):
        self.wstp = WstpErr()
        self.ier = IdentificationErrorRate(collar=collar)        
        self.ierF = IdentificationErrorRateLabel('female', collar=collar)
        self.ierM = IdentificationErrorRateLabel('male', collar=collar)
        
    def __call__(self, fref, fpred):
        
        uri, _ = os.path.splitext(os.path.basename(fref))
        
        # parse reference
        dfref = pd.read_csv(fref)
        anref = df2annot(dfref, 'speaker_gender', uri = uri)
        uem = init_uem(dfref)
        uem = rm_uem(uem, dfref, 'speaker_gender', ['undefgender'])
        #print('main uem', uem)
        #print(uem)
        
        # parse prediction
        dfpred = pd.read_csv(fpred)
        #print('dfpred', dfpred)
        if len(dfpred) > 0:
            dfpred = dfpred[dfpred.label.map(lambda x: x in ['male', 'female'])]
        anpred = df2annot(dfpred, 'label', uri = uri)
            
        wstp = self.wstp(anref, anpred, uem=None)
        
        ier = self.ier(anref, anpred, uem=uem)
        errF = self.ierF(anref,anpred,uem=uem)
        errM = self.ierM(anref,anpred,uem=uem)
        
        return wstp, ier, errF, errM
    
    def reset(self):
        self.wstp.reset()
        self.ier.reset()
        self.ierF.reset()
        self.ierM.reset()
        
    def report(self):
        wstp = self.wstp.report()
        wstp.columns = [('WSTP', e1, e2) for e1, e2 in wstp.columns]
        ier = self.ier.report()
        ier.columns = [('IER', e1, e2) for e1, e2 in ier.columns]
        ierF = self.ierF.report()
        ierF.columns = [('IER Female', e1, e2) for e1, e2 in ierF.columns]
        ierM = self.ierM.report()
        ierM.columns = [('IER Male', e1, e2) for e1, e2 in ierM.columns]
        return wstp.join(ier).join(ierF).join(ierM)
        
#        if onlypercent:
#            ret = ret[[k for k in ret if k[2] == '%']]
#            ret.columns = [(e1, e2) for (e1, e2, e3) in ret]
#        return ret

    def compare_lfiles(self, lref, lhyp, simplify=None):
#        assert(simplify in [None, 'onlypercent', 'genderdetail', 'gender', ''])

        for ref, hyp in zip(lref, lhyp):
            self(ref, hyp)
        ret = self.report()
        self.reset()
        return ret
        
#        if simplify == 'only percent':
#            ret = ret[[k for k in ret if k[2] == '%']]
#            ret.columns = [(e1, e2) for (e1, e2, e3) in ret]

    
    def compare_category(self, pred_dir, criterion, csvfname='./annotations/filesplit/testset.csv'):

        df = pd.read_csv(csvfname)
        lret = []
        #ldf = []
        #dret = {}
        ltotal = []
        
        for k, sdf in df.groupby(criterion):
            src = sdf.fileid.map(lambda x: './annotations/detailed_csv/%s.csv' % x)
            dst = sdf.fileid.map(lambda x: '%s/%s.csv' % (pred_dir, x))
            
            ret = self.compare_lfiles(src, dst)
            #print(ret)
            dtotal = ret.tail(1).to_dict(orient='records')[0]
            #print(dtotal)
            
            if isinstance(criterion, list):
                for i, key in enumerate(criterion):
                    dtotal[key] = k[i]
            else:
#                criterion = str(criterion)
                dtotal[criterion] = k
            #print(dtotal)
            
            lret.append(ret[:(len(ret) - 1)])
            dtotal['nbfile'] = len(ret)
            ltotal.append(dtotal)

        details = df.join(pd.concat(lret), on='fileid')
        return details, pd.DataFrame.from_dict(ltotal).set_index(criterion)

def tuple3cols(df):
    return [k for k in df.columns if isinstance(k, tuple) and len(k) == 3]

def simplify_results(df, add_speech_percent=True, rmdur=True, iergbias = True, ierdetail=False, genderdetail=False):
    #assert strategy in ['percents', 'wstp_ier', 'wstp_iergender', 'wstp_iergenderdetail']
    
#    if add_speech_percent:
#        
#        if 'nbfile' in df:
#            nbfile = df.nbfile
#            df = df.drop(['nbfile'], axis=1)
        
    ldrop = []
    if rmdur:
        ldrop += [k for k in df.columns if isinstance(k, tuple) and len(k) == 3 and k[2] != '%']
        
    if iergbias:
        df['IER gender bias'] = df[('IER Female', 'identification error rate', '%')] - df[('IER Male', 'identification error rate', '%')]
        
    if ierdetail == False:
        ldrop += [k for k in tuple3cols(df) if k[0] == 'IER' and k[1] != 'identification error rate']

    if genderdetail == False:
        ldrop += [k for k in tuple3cols(df) if k[0] in ['IER Female', 'IER Male'] and k[1] != 'identification error rate']
        
    return df.drop(ldrop, axis=1)    

In [3]:
ref = './annotations/detailed_csv/'
lref = sorted(glob.glob('./annotations/detailed_csv/*.csv'))
iss = './automatic_baselines/inaspeechsegmenter/'
liss = sorted(glob.glob('./automatic_baselines/inaspeechsegmenter/*.csv'))
lium = './automatic_baselines/liumspkdiarization_csv/'
llium = sorted(glob.glob('./automatic_baselines/liumspkdiarization_csv/*.csv'))

In [4]:
# init evaluator and COLLAR (zone to ignore around boundaries)
ge = GenderEval(collar=0.3)

In [5]:
# evaluation performed on dev split : return results per file in details and per category in category
details, categories = ge.compare_category(iss, ['channel_category'])#, 'vad_difficulty'])

In [6]:
# human readable simplication of results per category
simplify_results(categories)

Unnamed: 0_level_0,"(WSTP, WSTP error, %)","(IER, identification error rate, %)","(IER Female, identification error rate, %)","(IER Male, identification error rate, %)",nbfile,IER gender bias
channel_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
generalist_radio,0.670746,5.851559,6.088145,5.800402,35,0.287743
generalist_tv,-1.437044,25.517894,26.107704,25.120865,94,0.986839
music_radio,3.610936,16.286951,5.260995,23.403875,47,-18.142881
news_tv,1.042652,6.466632,5.425766,7.250946,45,-1.82518


In [7]:
# simplified view of results per file
simplify_results(details)

Unnamed: 0,fileid,media,channel_code,channel_name,channel_category,broadcast_hour,vad_difficulty,"(WSTP, WSTP error, %)","(IER, identification error rate, %)","(IER Female, identification error rate, %)","(IER Male, identification error rate, %)",IER gender bias
0,radio-FBL-011904,radio,FBL,France Bleu,music_radio,1,EASY,0.000000,0.000000,0.000000,0.000000,0.000000
1,radio-FBL-025455,radio,FBL,France Bleu,music_radio,2,MEDIUM,0.000000,0.000000,0.000000,0.000000,0.000000
2,radio-FBL-035452,radio,FBL,France Bleu,music_radio,3,EASY,0.000000,0.000000,0.000000,0.000000,0.000000
3,radio-FBL-044522,radio,FBL,France Bleu,music_radio,4,MEDIUM,0.000000,0.000000,0.000000,0.000000,0.000000
4,radio-FBL-051706,radio,FBL,France Bleu,music_radio,5,EASY,0.000000,2.633636,2.633636,0.000000,2.633636
...,...,...,...,...,...,...,...,...,...,...,...,...
212,tv-TF1-194413,tv,TF1,TF1,generalist_tv,19,HARD,7.479113,77.934416,41.005938,100.189447,-59.183509
213,tv-TF1-215118,tv,TF1,TF1,generalist_tv,21,HARD,-4.688474,10.443075,14.007484,2.263856,11.743628
214,tv-TF1-225026,tv,TF1,TF1,generalist_tv,22,HARD,-5.115000,32.358499,63.022585,25.079845,37.942740
215,tv-TFX-230837,tv,TFX,TFX,generalist_tv,23,HARD,25.488273,81.186036,46.626476,1745.129225,-1698.502748


In [8]:
# detailled metrics on categories (hard 2 read)
categories


Unnamed: 0_level_0,"(WSTP, WSTP error, %)","(WSTP, ref_male_dur, )","(WSTP, ref_female_dur, )","(WSTP, hyp_male_dur, )","(WSTP, hyp_female_dur, )","(IER, identification error rate, %)","(IER, total, )","(IER, correct, )","(IER, correct, %)","(IER, false alarm, )",...,"(IER Male, correct, )","(IER Male, correct, %)","(IER Male, false alarm, )","(IER Male, false alarm, %)","(IER Male, missed detection, )","(IER Male, missed detection, %)","(IER Male, confusion, )","(IER Male, confusion, %)",nbfile,IER gender bias
channel_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
generalist_radio,0.670746,1491.856,334.215,1630.3,348.98,5.851559,1699.547,1681.382,98.931186,81.285,...,1388.608,99.371827,67.717,4.845977,3.95,0.282671,9.387,0.671754,35,0.287743
generalist_tv,-1.437044,1971.82,1367.672,2158.52,1588.36,25.517894,2823.646,2496.775,88.423797,393.664,...,1471.996,87.222932,272.873,16.169054,103.127,6.110777,47.946,2.841034,94,0.986839
music_radio,3.610936,218.5525,141.8625,265.16,147.54,16.286951,322.522,313.365,97.160814,43.372,...,193.371,98.655653,40.98,20.907523,2.635,1.344347,2.258,1.152006,47,-18.142881
news_tv,1.042652,1287.691,975.546,1437.84,1043.82,6.466632,2042.114,2019.279,98.881796,109.221,...,1157.665,99.406309,66.289,5.6921,6.871,0.589999,11.283,0.968848,45,-1.82518


In [9]:
# LIUM results are les convincing
details, categories = ge.compare_category(lium, ['channel_category'])#, 'vad_difficulty'])

In [10]:
simplify_results(categories)

Unnamed: 0_level_0,"(WSTP, WSTP error, %)","(IER, identification error rate, %)","(IER Female, identification error rate, %)","(IER Male, identification error rate, %)",nbfile,IER gender bias
channel_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
generalist_radio,-2.404443,9.757482,27.929481,5.828096,35,22.101385
generalist_tv,-10.125963,47.260492,70.070008,31.906318,94,38.163689
music_radio,-3.552042,219.117456,247.118151,201.043846,47,46.074305
news_tv,-2.053974,15.927465,21.493958,11.732995,45,9.760962


In [11]:
#ina speech segmenter again : harder VAD means strong gender biases and low IER
details, categories = ge.compare_category(iss, ['vad_difficulty'])
simplify_results(categories)

Unnamed: 0_level_0,"(WSTP, WSTP error, %)","(IER, identification error rate, %)","(IER Female, identification error rate, %)","(IER Male, identification error rate, %)",nbfile,IER gender bias
vad_difficulty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
EASY,-0.829049,4.93339,7.175926,3.582428,96,3.593498
HARD,-0.81912,55.8561,46.418721,62.566244,45,-16.147523
MEDIUM,0.929599,15.962817,16.286284,15.818216,79,0.468069


In [12]:
# multi criterion categories
details, categories = ge.compare_category(iss, ['channel_category', 'vad_difficulty'])
simplify_results(categories)

Unnamed: 0_level_0,Unnamed: 1_level_0,"(WSTP, WSTP error, %)","(IER, identification error rate, %)","(IER Female, identification error rate, %)","(IER Male, identification error rate, %)",nbfile,IER gender bias
channel_category,vad_difficulty,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
generalist_radio,EASY,1.016144,2.432442,1.714421,2.611223,26,-0.896802
generalist_radio,HARD,2.540544,35.483808,30.055415,37.800322,4,-7.744907
generalist_radio,MEDIUM,-1.451433,11.638845,77.481359,10.105248,7,67.376111
generalist_tv,EASY,-6.609124,15.989948,23.455468,8.103532,18,15.351936
generalist_tv,HARD,-3.376755,54.486051,48.74182,58.900198,28,-10.158377
generalist_tv,MEDIUM,0.869468,18.293169,17.229385,18.868623,50,-1.639238
music_radio,EASY,-2.263857,2.54583,2.347034,2.679484,26,-0.33245
music_radio,HARD,71.434134,652.633362,62.151487,2240.037477,14,-2177.88599
music_radio,MEDIUM,4.54346,10.021808,6.496364,11.82206,9,-5.325696
news_tv,EASY,0.435185,3.082951,2.725143,3.419553,29,-0.69441


In [13]:
# results per channel
details, categories = ge.compare_category(iss, ['channel_name'])
simplify_results(categories)

Unnamed: 0_level_0,"(WSTP, WSTP error, %)","(IER, identification error rate, %)","(IER Female, identification error rate, %)","(IER Male, identification error rate, %)",nbfile,IER gender bias
channel_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Arte,3.903907,26.765495,17.727759,29.174302,7,-11.446543
BFM TV,0.717261,5.463057,5.583742,5.390309,9,0.193432
CNEWS,0.243643,4.704669,5.343231,4.346259,12,0.996972
Canal+,1.677911,42.855639,28.477517,52.482291,11,-24.004775
Chérie 25,6.900205,6.25051,0.049898,21.353777,2,-21.303879
FIP,0.0,0.0,0.0,0.0,5,0.0
France 2,2.47806,13.974871,13.293729,14.51586,9,-1.222131
France 24,0.425246,9.256023,9.472003,9.116537,12,0.355467
France 3,-4.280538,26.249833,49.209521,18.497738,9,30.711783
France 4,-21.490039,40.313481,157.508364,19.454062,13,138.054302


In [14]:
# possibilité de comparer des listes de fichier
# la derniere ligne correspond aux stats globales
results = ge.compare_lfiles(lref, liss)
results

Unnamed: 0_level_0,"(WSTP, WSTP error, %)","(WSTP, ref_male_dur, )","(WSTP, ref_female_dur, )","(WSTP, hyp_male_dur, )","(WSTP, hyp_female_dur, )","(IER, identification error rate, %)","(IER, total, )","(IER, correct, )","(IER, correct, %)","(IER, false alarm, )",...,"(IER Male, identification error rate, %)","(IER Male, total, )","(IER Male, correct, )","(IER Male, correct, %)","(IER Male, false alarm, )","(IER Male, false alarm, %)","(IER Male, missed detection, )","(IER Male, missed detection, %)","(IER Male, confusion, )","(IER Male, confusion, %)"
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
radio-FBL-011904,0.000000,0.000,0.000,0.00,0.00,0.000000,0.000,0.000,,0.000,...,0.000000,0.000,0.000,,0.000,,0.000,,0.000,
radio-FBL-012717,0.000000,0.000,0.000,0.00,0.00,0.000000,0.000,0.000,,0.000,...,0.000000,0.000,0.000,,0.000,,0.000,,0.000,
radio-FBL-025455,0.000000,0.000,0.000,0.00,0.00,0.000000,0.000,0.000,,0.000,...,0.000000,0.000,0.000,,0.000,,0.000,,0.000,
radio-FBL-035452,0.000000,0.000,0.000,0.00,0.00,0.000000,0.000,0.000,,0.000,...,0.000000,0.000,0.000,,0.000,,0.000,,0.000,
radio-FBL-044522,0.000000,0.000,0.000,0.00,0.00,0.000000,0.000,0.000,,0.000,...,0.000000,0.000,0.000,,0.000,,0.000,,0.000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tv-TF1-215118,-4.688474,5.724,13.731,5.10,15.52,10.443075,16.882,15.965,94.568179,0.846,...,2.263856,5.124,4.516,88.134270,0.116,2.263856,0.000,0.00000,0.000,0.000000
tv-TF1-225026,-5.115000,37.768,8.924,46.10,14.74,32.358499,43.392,40.673,93.733868,11.322,...,25.079845,35.068,32.379,92.332041,8.765,24.994297,0.000,0.00000,0.030,0.085548
tv-TFX-230837,25.488273,2.828,31.343,13.58,26.64,81.186036,24.721,16.571,67.032078,11.920,...,1745.129225,0.503,0.460,91.451292,8.304,1650.894632,0.000,0.00000,0.474,94.234592
tv-TFX-231251,0.000000,10.775,0.000,21.04,0.00,95.639871,7.775,7.775,100.000000,7.436,...,95.639871,7.775,7.775,100.000000,7.436,95.639871,0.000,0.00000,0.000,0.000000


In [15]:
simplify_results(results)

Unnamed: 0_level_0,"(WSTP, WSTP error, %)","(IER, identification error rate, %)","(IER Female, identification error rate, %)","(IER Male, identification error rate, %)",IER gender bias
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
radio-FBL-011904,0.000000,0.000000,0.000000,0.000000,0.000000
radio-FBL-012717,0.000000,0.000000,0.000000,0.000000,0.000000
radio-FBL-025455,0.000000,0.000000,0.000000,0.000000,0.000000
radio-FBL-035452,0.000000,0.000000,0.000000,0.000000,0.000000
radio-FBL-044522,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...
tv-TF1-215118,-4.688474,10.443075,14.007484,2.263856,11.743628
tv-TF1-225026,-5.115000,32.358499,63.022585,25.079845,37.942740
tv-TFX-230837,25.488273,81.186036,46.626476,1745.129225,-1698.502748
tv-TFX-231251,0.000000,95.639871,0.000000,95.639871,-95.639871
