In [1]:
from typing import Optional, List

import numpy as np
import pandas as pd
import tqdm

import peyes
import analysis.utils as u

## Load Data

In [2]:
# Load the Lund2013 dataset from the shared datasets directory without writing a local copy.
dataset = peyes.datasets.lund2013(
    directory=u.DATASETS_DIR,
    save=False,
    verbose=True,
)
dataset.head()

Unnamed: 0,trial_id,subject_id,stimulus_type,stimulus_name,t,x,y,pupil,pixel_size,viewer_distance,MN,RA
0,1,TH20,moving_dot,1,0.0,123.2532,22.6264,,0.037824,67.0,1.0,1.0
1,1,TH20,moving_dot,1,2.0,123.5395,22.9064,,0.037824,67.0,1.0,1.0
2,1,TH20,moving_dot,1,4.0,123.223,21.9909,,0.037824,67.0,1.0,1.0
3,1,TH20,moving_dot,1,6.0,123.1883,21.774,,0.037824,67.0,1.0,1.0
4,1,TH20,moving_dot,1,8.0,125.054,21.1805,,0.037824,67.0,1.0,1.0


## Sample-Level Analysis
### Label Distribution
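Extract the distribution of labels in the dataset, as a percentage of the labeled samples, for each type of stimulus (image, video, moving dot) and for both human annotators ("_RA_" and "_MN_"). Alongside the distribution, report the number of samples, subjects, and trials, and add a "total" row that aggregates across all stimuli.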

In [3]:
def labeler_stimulus_stats(dataframe: pd.DataFrame, labeler: Optional[str]) -> pd.DataFrame:
    """
    Summarize sample/subject/trial counts per stimulus type and, optionally, one labeler's label distribution.

    :param dataframe: samples table; must contain the columns "stimulus_type", "subject_id" and "trial_id",
        plus a column named `labeler` when one is given.
    :param labeler: name of the column holding one annotator's labels. When falsy (e.g. None), every row
        is used and only the counts are returned.
    :return: DataFrame indexed by stimulus type, with an extra "total" row aggregating all stimulus types.
        The columns "num_samples", "num_subjects" and "num_trials" are always present. When `labeler` is
        given, only rows whose `labeler` value is non-null are counted, and one additional column per
        label value holds the percentage (0-100) of those rows carrying that label.
    """
    group_col = "stimulus_type"
    subset = dataframe[dataframe[labeler].notnull()] if labeler else dataframe
    by_stimulus = subset.groupby(group_col)

    counts = pd.concat([
        by_stimulus.size().rename("num_samples"),
        by_stimulus["subject_id"].nunique().rename("num_subjects"),
        by_stimulus["trial_id"].nunique().rename("num_trials"),
    ], axis=1)
    # The "total" row is computed on the whole subset (not summed over groups), so a subject
    # that appears under several stimulus types is counted once.
    counts.loc["total"] = pd.Series(
        [len(subset), subset["subject_id"].nunique(), subset["trial_id"].nunique()],
        index=counts.columns, name="total"
    )
    if not labeler:
        return counts

    overall = subset[labeler].value_counts(dropna=True, normalize=True).sort_index().rename("total")
    # Labels that never occur for a stimulus type come out of `unstack` as NaN; report them as 0%.
    per_stimulus = by_stimulus[labeler].value_counts(dropna=True, normalize=True).unstack().fillna(0).T
    stats = pd.concat([overall, per_stimulus], axis=1).T * 100
    # After the transpose, rows are stimulus types and columns are label values: name each axis
    # accordingly so the row index agrees with `counts`.
    stats = stats.rename_axis(index=group_col, columns=peyes.constants.LABEL_STR)
    return pd.concat([counts, stats], axis=1).rename_axis(index=group_col)

In [4]:
# Counts over all samples, then counts + label percentages restricted to each rater's labeled samples.
global_counts = labeler_stimulus_stats(dataset, None)
ra_stats, mn_stats = (labeler_stimulus_stats(dataset, rater) for rater in ("RA", "MN"))

# Stack the three tables, make stimulus type the outer index level, and fix the display order of stimuli.
full_counts = (
    pd.concat([global_counts, ra_stats, mn_stats], keys=["ALL", "RA", "MN"], axis=0)
    .reorder_levels([1, 0])
    .reindex(labels=["image", "video", "moving_dot", "total"], axis=0, level=0)
)
full_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,num_samples,num_subjects,num_trials,0.0,1.0,2.0,3.0,4.0,5.0
stimulus_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
image,ALL,87790,18,20,,,,,,
image,RA,87790,18,20,0.144663,76.455177,9.181,4.759084,4.777309,4.682766
image,MN,63849,13,14,0.198907,79.597175,8.592147,5.243622,0.853576,5.514573
video,ALL,274096,18,19,,,,,,
video,RA,274096,18,19,0.079899,33.62654,4.413417,2.635573,57.883734,1.360837
video,MN,29029,9,9,0.055117,42.974267,5.174136,3.382824,46.381205,2.03245
moving_dot,ALL,21326,19,24,,,,,,
moving_dot,RA,20000,19,23,0.95,12.845,4.72,1.425,79.53,0.53
moving_dot,MN,11867,10,11,1.415691,8.99132,4.533581,2.005562,81.638156,1.415691
total,ALL,383212,30,63,,,,,,


## Rater Agreement
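Calculate the sample-level agreement between the two human annotators ("_RA_" and "_MN_"), globally and for each type of stimulus. Agreement is computed separately for each trial, and only trials in which every sample was labeled by both annotators are included.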

In [5]:
def calc_sample_level_agreement(
        dataframe: pd.DataFrame, labeler1: str, labeler2: str,
        metrics: Optional[List[str]] = None,
        pos_labels: Optional[np.ndarray] = None
) -> pd.DataFrame:
    """
    Compute per-trial sample-level agreement metrics between two labelers.

    Only trials in which every sample has a non-null label from both labelers are evaluated;
    trials with any missing label from either labeler are dropped entirely.

    :param dataframe: samples table containing the trial-id column (`peyes.constants.TRIAL_ID_STR`)
        and one column of labels per labeler.
    :param labeler1: name of the first labeler's column.
    :param labeler2: name of the second labeler's column.
    :param metrics: names of the metrics forwarded to `peyes.sample_metrics.calculate`. When None or
        empty, defaults to balanced accuracy, Cohen's kappa, MCC and complement-NLD.
    :param pos_labels: forwarded unchanged to `peyes.sample_metrics.calculate`.
    :return: DataFrame indexed by trial id (in order of first appearance) with one row per evaluated
        trial, built from the per-trial results of `peyes.sample_metrics.calculate`
        (presumably one column per requested metric - confirm against that function).
    """
    metrics = metrics or ["balanced_accuracy", "cohen's_kappa", "mcc", "complement_nld"]
    trial_col = peyes.constants.TRIAL_ID_STR
    fully_labeled = dataframe.groupby(trial_col).filter(
        lambda trial: bool(trial[labeler1].notnull().all() and trial[labeler2].notnull().all())
    )
    # A single grouped pass replaces re-masking the whole frame once per trial; sort=False keeps
    # trials in order of first appearance.
    trials = fully_labeled.groupby(trial_col, sort=False)
    results = {}
    for trial_id, trial_data in tqdm.tqdm(trials, total=trials.ngroups):
        results[trial_id] = peyes.sample_metrics.calculate(
            trial_data[labeler1].values, trial_data[labeler2].values, *metrics, pos_labels=pos_labels
        )
    results = pd.DataFrame(results).T
    results.index.name = trial_col
    return results

In [6]:
# Per-trial agreement between the two human raters (default metric set), summarized across trials.
rater_agreement = calc_sample_level_agreement(dataset, labeler1="RA", labeler2="MN")
rater_agreement.describe()

100%|██████████| 33/33 [00:00<00:00, 106.65it/s]


Unnamed: 0,balanced_accuracy,cohen's_kappa,mcc,complement_nld
count,33.0,33.0,33.0,33.0
mean,0.789322,0.746069,0.766235,0.889908
std,0.113567,0.197002,0.164724,0.117654
min,0.548912,0.221206,0.354871,0.533183
25%,0.706671,0.677382,0.685639,0.883343
50%,0.805947,0.836427,0.838001,0.929044
75%,0.877022,0.872848,0.876027,0.956413
max,0.968041,0.962272,0.962441,0.988962
