# Inter-Rater Agreement
#### Measure the agreement between pairs of raters of the HFC dataset (to simplify the analysis, only 5 of the 12 raters are considered).

In [1]:
from typing import Optional, List

import numpy as np
import pandas as pd
import tqdm

import peyes
import analysis.utils as u
from analysis._article_results.hfc._helpers import *

# Labelers whose annotations are compared in this notebook; used as the default
# `gt_labelers` of `calc_sample_level_agreement`.
# NOTE(review): GT1..GT5 are not defined in this notebook - presumably they are
# provided by the star-import from `_helpers` above; confirm.
GT_LABELERS = [GT1, GT2, GT3, GT4, GT5]

In [2]:
# Load the HFC dataset via `peyes.datasets.hfc`, using the project's datasets directory.
# NOTE(review): `save=False` presumably means no local copy is written - confirm against the loader.
dataset = peyes.datasets.hfc(directory=u.DATASETS_DIR, save=False, verbose=True)

In [3]:
def calc_sample_level_agreement(
        dataframe: pd.DataFrame,
        metrics: Optional[List[str]] = None,
        gt_labelers: List[str] = GT_LABELERS,
        pos_labels: Optional[np.ndarray] = None
) -> pd.DataFrame:
    """
    Compute pairwise sample-level agreement between labelers, separately for each trial.

    For every trial in `dataframe` and every unordered pair of labelers (paired in the order they
    appear in `gt_labelers`), the two labelers' label columns are passed to
    `peyes.sample_metrics.calculate` together with the requested metrics.

    :param dataframe: table holding a trial-id column (named `peyes.constants.TRIAL_ID_STR`) and one
        label column per labeler in `gt_labelers`.
    :param metrics: metric names forwarded to `peyes.sample_metrics.calculate`. If None or empty,
        defaults to balanced accuracy, Cohen's kappa, MCC and complement-NLD.
    :param gt_labelers: names of the label columns to compare. Repeated names are considered once.
    :param pos_labels: forwarded unchanged to `peyes.sample_metrics.calculate`.

    :return: a DataFrame with one row per (trial, first labeler, second labeler), indexed by a
        three-level MultiIndex named (`peyes.constants.TRIAL_ID_STR`, `u.GT_STR`, `u.PRED_STR`).
        Columns are whatever `peyes.sample_metrics.calculate` returns (presumably one per metric).
        If no pair could be evaluated (no trials, or fewer than two labelers), an empty DataFrame
        with the same index levels and one column per requested metric is returned.
    :raises KeyError: if the trial-id column or one of the labeler columns is missing.
    """
    trial_id_col = peyes.constants.TRIAL_ID_STR
    index_names = [trial_id_col, u.GT_STR, u.PRED_STR]
    metrics = metrics or ["balanced_accuracy", "cohen's_kappa", "mcc", "complement_nld"]
    # de-duplicate while keeping the caller's order, so that pairs are always (earlier, later)
    labelers = list(dict.fromkeys(gt_labelers))
    trial_ids = dataframe[trial_id_col].unique()

    results = {}
    for trial_id in tqdm.tqdm(trial_ids, total=len(trial_ids)):
        # use the same column-name constant as above (was a hard-coded "trial_id")
        trial_data = dataframe[dataframe[trial_id_col] == trial_id]
        labels_per_labeler = {lblr: trial_data[lblr].values for lblr in labelers}
        for first, lblr1 in enumerate(labelers):
            # only pair each labeler with those that come after it -> every unordered pair once
            for lblr2 in labelers[first + 1:]:
                results[(trial_id, lblr1, lblr2)] = peyes.sample_metrics.calculate(
                    labels_per_labeler[lblr1], labels_per_labeler[lblr2], *metrics, pos_labels=pos_labels
                )

    if not results:
        # nothing to compare: return an empty, correctly-indexed frame instead of failing when
        # assigning three level names to a single-level index
        empty_index = pd.MultiIndex.from_arrays([[], [], []], names=index_names)
        return pd.DataFrame(index=empty_index, columns=list(metrics))

    results = pd.DataFrame(results).T
    results.index.names = index_names
    return results

### Entire Dataset
#### (A) All Annotators

In [4]:
# Per-trial agreement between every pair of the default GT labelers
overall_agreement = calc_sample_level_agreement(dataset)

# Descriptive statistics across trials, one row per (labeler pair, metric)
overall_agreement_summary = (
    overall_agreement
    .groupby(level=[u.GT_STR, u.PRED_STR])
    .describe()
    .stack(0, future_stack=True)
    .rename_axis(index=[u.GT_STR, u.PRED_STR, "metric"])
)

overall_agreement_summary

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
100%|██████████| 70/70 [00:01<00:00, 57.43it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
gt,pred,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MN,JV,balanced_accuracy,70.0,0.955918,0.048932,0.629904,0.954283,0.963527,0.977482,0.999172
MN,JV,cohen's_kappa,70.0,0.872403,0.141078,0.0,0.854058,0.907146,0.937909,0.987264
MN,JV,mcc,70.0,0.876826,0.13493,0.0,0.857048,0.908465,0.939086,0.987273
MN,JV,complement_nld,70.0,0.965791,0.028674,0.829352,0.957331,0.974005,0.983006,0.999172
RA,JV,balanced_accuracy,70.0,0.938831,0.068404,0.624408,0.941198,0.963105,0.973171,0.999172
RA,JV,cohen's_kappa,70.0,0.844764,0.159539,0.0,0.829196,0.893013,0.924774,0.977122
RA,JV,mcc,70.0,0.850913,0.151512,0.0,0.833857,0.897106,0.925568,0.977236
RA,JV,complement_nld,70.0,0.955963,0.043709,0.768848,0.948066,0.970109,0.980948,0.999172
RA,MN,balanced_accuracy,70.0,0.940937,0.064205,0.583651,0.929881,0.95834,0.976835,1.0
RA,MN,cohen's_kappa,69.0,0.883802,0.118583,0.18479,0.861581,0.907979,0.945959,0.986193


#### Mean agreement over the subset of GT annotators
GT annotators = ["IH", "DN", "JV", "MN", "RA"]. Each value below is a per-pair summary statistic averaged over all annotator pairs.

In [5]:
# Average each per-pair summary statistic over all labeler pairs, separately for every metric
overall_agreement_mean = (
    overall_agreement_summary
    .loc[:, ["mean", "std", "min", "50%", "max"]]
    .groupby(level="metric")
    .mean()
)
overall_agreement_mean

Unnamed: 0_level_0,mean,std,min,50%,max
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
balanced_accuracy,0.945229,0.060513,0.612654,0.961657,0.999448
cohen's_kappa,0.86699,0.139733,0.061597,0.902713,0.983526
complement_nld,0.962406,0.038885,0.776312,0.973339,0.999448
mcc,0.867526,0.146833,0.0,0.904899,0.983599


#### (B) GT1-GT2 Agreement

In [6]:
# Per-trial agreement restricted to the GT1-GT2 pair
gt1_gt2_overall_agreement = calc_sample_level_agreement(dataset, gt_labelers=[GT1, GT2])

# Descriptive statistics across trials, one row per metric for this single pair
gt1_gt2_overall_agreement_summary = (
    gt1_gt2_overall_agreement
    .groupby(level=[u.GT_STR, u.PRED_STR])
    .describe()
    .stack(0, future_stack=True)
    .rename_axis(index=[u.GT_STR, u.PRED_STR, "metric"])
)

gt1_gt2_overall_agreement_summary

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
100%|██████████| 70/70 [00:00<00:00, 164.58it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
gt,pred,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
RA,MN,balanced_accuracy,70.0,0.940937,0.064205,0.583651,0.929881,0.95834,0.976835,1.0
RA,MN,cohen's_kappa,69.0,0.883802,0.118583,0.18479,0.861581,0.907979,0.945959,0.986193
RA,MN,mcc,70.0,0.874839,0.154055,0.0,0.865653,0.909126,0.946398,0.986287
RA,MN,complement_nld,70.0,0.965465,0.044272,0.730737,0.966521,0.975904,0.985951,1.0


### Free-Viewing Subset
#### (A) All Annotators

In [7]:
# Keep only the rows whose stimulus-type column equals "free_viewing"
is_free_viewing = dataset[peyes.constants.STIMULUS_TYPE_STR] == "free_viewing"
fv_dataset = dataset.loc[is_free_viewing]

# Per-trial pairwise agreement, then descriptive statistics per (labeler pair, metric)
fv_agreement = calc_sample_level_agreement(fv_dataset)
fv_agreement_summary = (
    fv_agreement
    .groupby(level=[u.GT_STR, u.PRED_STR])
    .describe()
    .stack(0, future_stack=True)
    .rename_axis(index=[u.GT_STR, u.PRED_STR, "metric"])
)

# Average each per-pair summary statistic over all labeler pairs, separately for every metric
fv_agreement_mean = (
    fv_agreement_summary
    .loc[:, ["mean", "std", "min", "50%", "max"]]
    .groupby(level="metric")
    .mean()
)
fv_agreement_mean

100%|██████████| 10/10 [00:00<00:00, 22.33it/s]


Unnamed: 0_level_0,mean,std,min,50%,max
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
balanced_accuracy,0.907145,0.089126,0.671922,0.941149,0.966662
cohen's_kappa,0.800166,0.160526,0.377229,0.858747,0.909639
complement_nld,0.947264,0.049298,0.814477,0.964347,0.977868
mcc,0.80366,0.155922,0.392023,0.860203,0.910077


#### (B) GT1-GT2 Agreement

In [8]:
# Per-trial agreement of the GT1-GT2 pair on the free-viewing subset
gt1_gt2_fv_agreement = calc_sample_level_agreement(fv_dataset, gt_labelers=[GT1, GT2])

# Descriptive statistics across trials, one row per metric for this single pair
gt1_gt2_fv_agreement_summary = (
    gt1_gt2_fv_agreement
    .groupby(level=[u.GT_STR, u.PRED_STR])
    .describe()
    .stack(0, future_stack=True)
    .rename_axis(index=[u.GT_STR, u.PRED_STR, "metric"])
)

gt1_gt2_fv_agreement_summary

100%|██████████| 10/10 [00:00<00:00, 63.72it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
gt,pred,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
RA,MN,balanced_accuracy,10.0,0.880287,0.112928,0.583651,0.887296,0.914027,0.938763,0.973741
RA,MN,cohen's_kappa,10.0,0.774451,0.218439,0.18479,0.796618,0.84791,0.883487,0.927445
RA,MN,mcc,10.0,0.776534,0.215308,0.194232,0.797772,0.848371,0.88353,0.927873
RA,MN,complement_nld,10.0,0.940278,0.069263,0.750834,0.936417,0.968791,0.974736,0.981572


#### (C) RA-MN Agreement
(Raters _RA_ and _MN_ are the ground-truth raters of the _lund2013_ dataset, so let's check their agreement in this dataset too.)

In [9]:
# Per-trial agreement of raters RA and MN on the free-viewing subset.
# NOTE(review): the table displayed for this cell is identical to the one of the GT1-GT2
# free-viewing cell, so GT1/GT2 appear to be the same two raters - confirm whether this
# cell is still needed.
ra_mn_fv_agreement = calc_sample_level_agreement(fv_dataset, gt_labelers=["RA", "MN"])

# Descriptive statistics across trials, one row per metric for this single pair
ra_mn_fv_agreement_summary = (
    ra_mn_fv_agreement
    .groupby(level=[u.GT_STR, u.PRED_STR])
    .describe()
    .stack(0, future_stack=True)
    .rename_axis(index=[u.GT_STR, u.PRED_STR, "metric"])
)

ra_mn_fv_agreement_summary

100%|██████████| 10/10 [00:00<00:00, 65.10it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
gt,pred,metric,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
RA,MN,balanced_accuracy,10.0,0.880287,0.112928,0.583651,0.887296,0.914027,0.938763,0.973741
RA,MN,cohen's_kappa,10.0,0.774451,0.218439,0.18479,0.796618,0.84791,0.883487,0.927445
RA,MN,mcc,10.0,0.776534,0.215308,0.194232,0.797772,0.848371,0.88353,0.927873
RA,MN,complement_nld,10.0,0.940278,0.069263,0.750834,0.936417,0.968791,0.974736,0.981572
