# Comparing the segmentation performances

This notebook compares the L-measure-based evaluations of the structural segmentations produced by the different methods under study. Every segmentation produced by each method was evaluated against both SALAMI's annotations and their extended counterparts.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def bold_max(s):
    """Build the CSS strings that render the largest value(s) of ``s`` in bold.

    Intended to be passed to ``DataFrame.style.apply``.

    Parameters
    ----------
    s : pandas.Series
        The values to inspect.

    Returns
    -------
    list of str
        One entry per element of ``s``: ``'font-weight: bold'`` where the
        element equals ``s.max()``, and an empty string everywhere else.
        Ties are all marked.
    """
    bold, plain = 'font-weight: bold', ''
    at_peak = (s == s.max())
    styles = []
    for flag in at_peak:
        styles.append(bold if flag else plain)
    return styles

## Loading the evaluations

In [3]:
# NOTE: res_dir is defined here but not referenced by any read in this cell.
res_dir = '../experiments/results/'
measures = ['l_measure', 'l_precision', 'l_recall']
use_cols = measures + ['track_id']

# --- Evaluations against the non-extended (original) reference hierarchies.
# Naming: *_ao holds the scores against annotator one, *_at against annotator two.
inter_anno = pd.read_csv('evaluations/reference_eval.csv', index_col=0)

lsd_ao, lsd_at = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('evaluations/LSD_annotator_one.csv',
                     'evaluations/LSD_annotator_two.csv')
)
lsdm_ao, lsdm_at = map(pd.read_csv, ('evaluations/LSD_truncated_annotator_one.csv',
                                     'evaluations/LSD_truncated_annotator_two.csv'))
olda_ao, olda_at = (
    pd.read_csv(csv_path, usecols=use_cols)
    for csv_path in ('evaluations/OLDA_annotator_one.csv',
                     'evaluations/OLDA_annotator_two.csv')
)
mscom_ao, mscom_at = (
    pd.read_csv(csv_path, usecols=use_cols)
    for csv_path in ('evaluations/mscom_annotator_one.csv',
                     'evaluations/mscom_annotator_two.csv')
)
dmscom_ao, dmscom_at = (
    pd.read_csv(csv_path, usecols=use_cols)
    for csv_path in ('evaluations/dmscom_annotator_one.csv',
                     'evaluations/dmscom_annotator_two.csv')
)

# --- Evaluations against the extended reference hierarchies (all read with default options).
inter_anno_ext = pd.read_csv('evaluations/SALAMI-interanno-expanded.csv')
lsd_ext_ao, lsd_ext_at = map(pd.read_csv, ('evaluations/LSD-a1-expanded.csv',
                                           'evaluations/LSD-a2-expanded.csv'))
lsdm_ext_ao, lsdm_ext_at = map(pd.read_csv, ('evaluations/LSD-mono-a1-expanded.csv',
                                             'evaluations/LSD-mono-a2-expanded.csv'))
olda_ext_ao, olda_ext_at = map(pd.read_csv, ('evaluations/OLDA-a1-expanded.csv',
                                             'evaluations/OLDA-a2-expanded.csv'))
mscom_ext_ao, mscom_ext_at = map(pd.read_csv, ('evaluations/MSCOM-a1-expanded.csv',
                                               'evaluations/MSCOM-a2-expanded.csv'))
dmscom_ext_ao, dmscom_ext_at = map(pd.read_csv, ('evaluations/DMSCOM-a1-expanded.csv',
                                                 'evaluations/DMSCOM-a2-expanded.csv'))


In [4]:
# Extracting statistics from the evaluation data [ORIGINAL hierarchies, both annotators].
# For every method, keep the mean and standard deviation of each L-measure and
# relabel the two rows as '<METHOD>' and '<METHOD>-std' so the per-method tables
# can be stacked later on.
# The 'mean' / 'std' rows are selected by label rather than by position in the
# describe() output, and rename() returns a new frame instead of mutating a slice.

# Scores against annotator one
(lsd_ao_d, lsdm_ao_d, olda_ao_d, mscom_ao_d, dmscom_ao_d) = (
    evaluation.describe()
              .loc[['mean', 'std'], measures]
              .rename(index={'mean': method, 'std': method + '-std'})
    for method, evaluation in [('LSD', lsd_ao), ('LSDM', lsdm_ao), ('OLDA', olda_ao),
                               ('MSCOM', mscom_ao), ('DMSCOM', dmscom_ao)]
)

# Scores against annotator two
(lsd_at_d, lsdm_at_d, olda_at_d, mscom_at_d, dmscom_at_d) = (
    evaluation.describe()
              .loc[['mean', 'std'], measures]
              .rename(index={'mean': method, 'std': method + '-std'})
    for method, evaluation in [('LSD', lsd_at), ('LSDM', lsdm_at), ('OLDA', olda_at),
                               ('MSCOM', mscom_at), ('DMSCOM', dmscom_at)]
)


In [5]:
# Extracting statistics from the evaluation data [EXTENDED hierarchies, both annotators].
# For every method, keep the mean and standard deviation of each L-measure and
# relabel the two rows as '<METHOD>' and '<METHOD>-std' so the per-method tables
# can be stacked later on.
# The 'mean' / 'std' rows are selected by label rather than by position in the
# describe() output, and rename() returns a new frame instead of mutating a slice.

# Scores against annotator one (extended)
(lsd_ext_ao_d, lsdm_ext_ao_d, olda_ext_ao_d, mscom_ext_ao_d, dmscom_ext_ao_d) = (
    evaluation.describe()
              .loc[['mean', 'std'], measures]
              .rename(index={'mean': method, 'std': method + '-std'})
    for method, evaluation in [('LSD', lsd_ext_ao), ('LSDM', lsdm_ext_ao),
                               ('OLDA', olda_ext_ao), ('MSCOM', mscom_ext_ao),
                               ('DMSCOM', dmscom_ext_ao)]
)

# Scores against annotator two (extended)
(lsd_ext_at_d, lsdm_ext_at_d, olda_ext_at_d, mscom_ext_at_d, dmscom_ext_at_d) = (
    evaluation.describe()
              .loc[['mean', 'std'], measures]
              .rename(index={'mean': method, 'std': method + '-std'})
    for method, evaluation in [('LSD', lsd_ext_at), ('LSDM', lsdm_ext_at),
                               ('OLDA', olda_ext_at), ('MSCOM', mscom_ext_at),
                               ('DMSCOM', dmscom_ext_at)]
)

## Inter-annotator agreement with and without hierarchical expansion

We can only compute inter-annotator agreement on those tracks for which we have two different annotations made by two different human experts. Therefore, only 490 evaluations are available for this purpose.

In [6]:
# Inter-annotator agreement: mean and std of each L-measure, taken from rows 1-2
# of describe(), for the original and the extended hierarchies.
inter_anno_d = inter_anno.describe()[measures].iloc[1:3]
inter_anno_ext_d = inter_anno_ext.describe()[measures].iloc[1:3]

# The difference must be taken BEFORE relabelling, while both frames still
# share the same row labels; otherwise the rows would not align.
inter_anno_impro = inter_anno_ext_d.sub(inter_anno_d)

# Give the rows descriptive labels for the comparison table.
inter_anno_d = inter_anno_d.rename(
    index={'mean': 'Inter-annotator mean', 'std': 'Inter-annotator std'}
)
inter_anno_ext_d = inter_anno_ext_d.rename(
    index={'mean': 'Inter-annotator mean (extended)',
           'std': 'Inter-annotator std (extended)'}
)


In [7]:
# INTER-ANNOTATOR comparison: original rows first, extended rows after,
# with the row labels preserved.
inter_anno_res = pd.concat((inter_anno_d, inter_anno_ext_d), axis=0)
inter_anno_res

Unnamed: 0,l_measure,l_precision,l_recall
Inter-annotator mean,0.639968,0.641177,0.662037
Inter-annotator std,0.198592,0.197409,0.199918
Inter-annotator mean (extended),0.678013,0.683426,0.694269
Inter-annotator std (extended),0.167881,0.17543,0.174753


In [8]:
# Extended minus original inter-annotator statistics: positive entries in the
# 'mean' row indicate that the L-measures increase when the hierarchies are expanded.
inter_anno_impro

Unnamed: 0,l_measure,l_precision,l_recall
mean,0.038045,0.04225,0.032233
std,-0.030711,-0.021979,-0.025166


## Evaluation of the automatic procedures

#### Annotator 1 (original hierarchies)

In [9]:
# ANNOTATOR 1 comparison: stack the per-method mean/std rows into one table
ann_one_res = pd.concat(
    [lsd_ao_d, lsdm_ao_d, olda_ao_d, mscom_ao_d, dmscom_ao_d],
    axis=0,
)
# Render the table with the largest value of every column in bold
ann_one_res.style.apply(bold_max, axis=0)

Unnamed: 0,l_measure,l_precision,l_recall
LSD,0.462849,0.394006,0.584487
LSD-std,0.128759,0.120836,0.150918
LSDM,0.301402,0.377699,0.28968
LSDM-std,0.179654,0.158068,0.205535
OLDA,0.398065,0.325148,0.536181
OLDA-std,0.101791,0.0987637,0.111689
MSCOM,0.460557,0.382006,0.598875
MSCOM-std,0.112485,0.102639,0.13572
DMSCOM,0.4799,0.403425,0.611376
DMSCOM-std,0.111272,0.103347,0.133467


#### Annotator 1 (extended hierarchies)

In [10]:
# ANNOTATOR 1 comparison [EXTENDED]: stack the per-method mean/std rows into one table
ann_ext_one_res = pd.concat(
    [lsd_ext_ao_d, lsdm_ext_ao_d, olda_ext_ao_d, mscom_ext_ao_d, dmscom_ext_ao_d],
    axis=0,
)
# Render the table with the largest value of every column in bold
ann_ext_one_res.style.apply(bold_max, axis=0)

Unnamed: 0,l_measure,l_precision,l_recall
LSD,0.480098,0.420436,0.577297
LSD-std,0.123615,0.12002,0.143063
LSDM,0.30901,0.402956,0.282848
LSDM-std,0.179479,0.15811,0.194113
OLDA,0.415223,0.348403,0.531461
OLDA-std,0.0975342,0.0984237,0.104666
MSCOM,0.4787,0.407986,0.593825
MSCOM-std,0.105515,0.0989488,0.129567
DMSCOM,0.498434,0.430661,0.60687
DMSCOM-std,0.104375,0.100034,0.127108


#### Annotator 1 (improvement)

In [11]:
# Change in every statistic when moving from the original to the extended
# hierarchies (annotator 1): extended minus original, aligned on row/column labels.
ann_one_ext_impro = ann_ext_one_res.sub(ann_one_res)

# dmscom has the highest improvement over the other methods
ann_one_ext_impro.style.apply(bold_max, axis=0)

Unnamed: 0,l_measure,l_precision,l_recall
LSD,0.0172487,0.0264298,-0.00718976
LSD-std,-0.00514451,-0.000816039,-0.00785507
LSDM,0.00760824,0.0252569,-0.00683286
LSDM-std,-0.000174819,4.15445e-05,-0.0114219
OLDA,0.0171578,0.0232543,-0.00471922
OLDA-std,-0.00425722,-0.000339913,-0.00702295
MSCOM,0.0181429,0.0259796,-0.00505015
MSCOM-std,-0.00697073,-0.00368989,-0.00615247
DMSCOM,0.0185337,0.0272362,-0.00450538
DMSCOM-std,-0.00689634,-0.00331276,-0.0063585


#### Annotator 2 (original hierarchies)

In [12]:
# ANNOTATOR 2 comparison: stack the per-method mean/std rows into one table
ann_two_res = pd.concat(
    [lsd_at_d, lsdm_at_d, olda_at_d, mscom_at_d, dmscom_at_d],
    axis=0,
)
# Render the table with the largest value of every column in bold
ann_two_res.style.apply(bold_max, axis=0)

Unnamed: 0,l_measure,l_precision,l_recall
LSD,0.462027,0.396296,0.577835
LSD-std,0.136007,0.126853,0.157131
LSDM,0.304704,0.384078,0.290827
LSDM-std,0.178436,0.162119,0.205337
OLDA,0.402286,0.330392,0.536009
OLDA-std,0.108019,0.10329,0.118989
MSCOM,0.455899,0.38175,0.583448
MSCOM-std,0.117616,0.107452,0.139656
DMSCOM,0.476361,0.403794,0.59769
DMSCOM-std,0.12276,0.113229,0.141229


#### Annotator 2 (extended hierarchies)

In [13]:
# ANNOTATOR 2 comparison [EXTENDED]: stack the per-method mean/std rows into one table
ann_ext_two_res = pd.concat(
    [lsd_ext_at_d, lsdm_ext_at_d, olda_ext_at_d, mscom_ext_at_d, dmscom_ext_at_d],
    axis=0,
)
# Render the table with the largest value of every column in bold
ann_ext_two_res.style.apply(bold_max, axis=0)

Unnamed: 0,l_measure,l_precision,l_recall
LSD,0.478554,0.4195,0.5742
LSD-std,0.121486,0.118911,0.139633
LSDM,0.311427,0.405658,0.283352
LSDM-std,0.17228,0.155434,0.187813
OLDA,0.418412,0.350962,0.53433
OLDA-std,0.0970478,0.0995481,0.102004
MSCOM,0.474519,0.406034,0.584935
MSCOM-std,0.102921,0.0999369,0.124216
DMSCOM,0.496077,0.42946,0.601033
DMSCOM-std,0.104875,0.102511,0.123799


#### Annotator 2 (improvement)

In [14]:
# Change in every statistic when moving from the original to the extended
# hierarchies (annotator 2): extended minus original, aligned on row/column labels.
ann_two_ext_impro = ann_ext_two_res.sub(ann_two_res)

# again, dmscom has the highest improvement over the other methods
ann_two_ext_impro.style.apply(bold_max, axis=0)

Unnamed: 0,l_measure,l_precision,l_recall
LSD,0.0165266,0.0232041,-0.00363499
LSD-std,-0.0145206,-0.0079415,-0.0174979
LSDM,0.00672252,0.0215795,-0.0074752
LSDM-std,-0.00615564,-0.00668492,-0.0175244
OLDA,0.0161264,0.0205694,-0.00167922
OLDA-std,-0.0109716,-0.00374162,-0.0169848
MSCOM,0.0186202,0.0242844,0.00148692
MSCOM-std,-0.0146955,-0.00751537,-0.0154406
DMSCOM,0.0197168,0.0256655,0.00334271
DMSCOM-std,-0.017885,-0.0107178,-0.0174294
