# General Analysis
#### Information on dataset features, distribution, rater agreement, etc.

In [1]:
from typing import Optional, Set

import numpy as np
import pandas as pd

import tqdm
import plotly.express as px

import peyes
import analysis.utils as u
from analysis._article_results.hfc._helpers import *

## Load Dataset

In [2]:
# Load the HFC dataset via `peyes.datasets.hfc` and report its dimensions.
dataset = peyes.datasets.hfc(directory=u.DATASETS_DIR, save=False, verbose=True)
print(f"Dataset shape: {dataset.shape}")

# Last expression of the cell: display the frame to inspect its columns.
dataset

Dataset shape: (105641, 23)


Unnamed: 0,trial_id,subject_id,stimulus_type,t,x,y,pupil,pixel_size,viewer_distance,subject_group,...,JB,JF,JV,KH,MN,MS,PZ,RA,RH,TC
0,1,4591219856350558064,free_viewing,0.000,966.1,543.6,,0.02652,65,adult,...,1,1,1,0,1,0,1,1,1,1
1,1,4591219856350558064,free_viewing,3.333,967.3,546.8,,0.02652,65,adult,...,1,1,1,1,1,1,1,1,1,1
2,1,4591219856350558064,free_viewing,6.667,967.9,537.5,,0.02652,65,adult,...,1,1,1,1,1,1,1,1,1,1
3,1,4591219856350558064,free_viewing,10.000,969.8,538.8,,0.02652,65,adult,...,1,1,1,1,1,1,1,1,1,1
4,1,4591219856350558064,free_viewing,13.333,971.5,548.6,,0.02652,65,adult,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105636,70,4607158533597356135,search_task,4000.000,,,,0.02652,65,infant,...,0,0,0,0,0,0,0,0,0,0
105637,70,4607158533597356135,search_task,4003.333,,,,0.02652,65,infant,...,0,0,0,0,0,0,0,0,0,0
105638,70,4607158533597356135,search_task,4006.667,,,,0.02652,65,infant,...,0,0,0,0,0,0,0,0,0,0
105639,70,4607158533597356135,search_task,4010.000,,,,0.02652,65,infant,...,0,0,0,0,0,0,0,0,0,0


### Dataset Features
#### (1) Pixel Size, Viewer Distance

In [3]:
# Distinct recording-setup values present in the dataset.
# The labels printed below state the units as centimeters.
viewer_distances = dataset["viewer_distance"].unique()
pixel_sizes = dataset["pixel_size"].unique()

print(f"Viewer Distances (cm):\t{viewer_distances}")
print(f"Pixel Sizes (cm):\t\t{pixel_sizes}")

Viewer Distances (cm):	[65]
Pixel Sizes (cm):		[0.02652028]


In [4]:
# The constants below are only meaningful if the whole dataset shares one recording setup.
# Fail loudly instead of silently using whichever value happens to come first.
assert len(viewer_distances) == 1, f"expected a single viewer distance, got {viewer_distances}"
assert len(pixel_sizes) == 1, f"expected a single pixel size, got {pixel_sizes}"

VIEWER_DISTANCE_CM = viewer_distances[0]
PIXEL_SIZE_CM = pixel_sizes[0]

# Visual angle (degrees) covered by a single pixel.
PIXEL_SIZE_DEG = peyes._utils.pixel_utils.pixels_to_visual_angle(1, VIEWER_DISTANCE_CM, PIXEL_SIZE_CM)
# Number of pixels covering one degree of visual angle (note: pixels-per-degree, despite the name).
DEG_IN_PIXEL = peyes._utils.pixel_utils.visual_angle_to_pixels(1, VIEWER_DISTANCE_CM, PIXEL_SIZE_CM)

# Values are stored in cm; multiply by 10 to report them in mm.
print(f"Viewer distance:\t\t{10 * VIEWER_DISTANCE_CM}mm")
print(f"Pixel size:\t\t\t\t{10 * PIXEL_SIZE_CM:.3f}mm")
print(f"Pixel size:\t\t\t\t{PIXEL_SIZE_DEG:.3f}° (DVA)")
print(f"1° (DVA) in pixels:\t\t{DEG_IN_PIXEL:.2f}px")

Viewer distance:		650mm
Pixel size:				0.265mm
Pixel size:				0.023° (DVA)
1° (DVA) in pixels:		42.78px


#### (2) Sampling Rates, Types of Stimuli

In [5]:
# Per-trial view of the time and stimulus-type columns.
trials_grouped = dataset.groupby(peyes.constants.TRIAL_ID_STR)[
    [peyes.constants.T, peyes.constants.STIMULUS_TYPE_STR]
]

# Sampling rate of each trial (rounded so that equal rates compare equal) and its stimulus type.
sampling_rates = trials_grouped.apply(
    lambda sub: peyes._utils.event_utils.calculate_sampling_rate(sub[peyes.constants.T].values)
).rename(peyes.constants.SAMPLING_RATE_STR).round(6)
stim_types = trials_grouped.first()[peyes.constants.STIMULUS_TYPE_STR]

# One row per (stimulus type, sampling rate) combination, holding the flat array of its trial IDs.
trials_data = pd.concat([sampling_rates, stim_types], axis=1).reset_index().groupby(
    [peyes.constants.STIMULUS_TYPE_STR, peyes.constants.SAMPLING_RATE_STR]
).apply(lambda sub: sub.values, include_groups=False).rename(peyes.constants.TRIAL_ID_STR).to_frame().map(
    lambda arr: np.asarray(arr).flatten()
)

# Number of trials in each combination.
trials_data[peyes.constants.COUNT_STR] = trials_data[peyes.constants.TRIAL_ID_STR].map(len)

# Number of samples in each combination: sum the per-trial sample counts of its trials
# (computed once) instead of re-scanning the whole dataset for every row.
sample_counts = dataset[peyes.constants.TRIAL_ID_STR].value_counts().sort_index()
trials_data['num_samples'] = trials_data[peyes.constants.TRIAL_ID_STR].map(
    lambda ids: sample_counts[sample_counts.index.isin(ids)].sum()
)

trials_data

Unnamed: 0_level_0,Unnamed: 1_level_0,trial_id,count,num_samples
stimulus_type,sampling_rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
free_viewing,300.030003,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",10,45018
search_task,300.030003,"[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 2...",60,60623


**To conclude:**  
The sampling rate is the same for all trials (_~300Hz_).  
We have 2 types of stimuli:  
- _free viewing_ - recordings of 10 **adults** freely viewing color images.  
- _search task_ - recordings of 60 **infants** performing a visual search task.

## Label Counts

In [6]:
# Annotator subset used for the "Five Annotators Subset" tables below.
# GT1..GT5 are not defined in this notebook; presumably they are provided by the star-import of
# `analysis._article_results.hfc._helpers` — TODO confirm.
# NOTE(review): the saved outputs of the subset tables list only JV, MN and RA (MN and RA twice),
# which does not match the "IH, DN, JV, RA, MN" stated in the markdown — verify GT1..GT5.
GT_LABELERS = [GT1, GT2, GT3, GT4, GT5]


def sample_stats(dataframe: pd.DataFrame, labeler: Optional[str]) -> pd.DataFrame:
    """
    Summarize sample, subject and trial counts per stimulus type, optionally with label shares.

    :param dataframe: sample-level data with "stimulus_type", "subject_id" and "trial_id" columns
        (and a column named `labeler`, when one is given).
    :param labeler: name of a label column. When given, only rows where this column is not null
        are considered, and the percentage of each label value is appended. When falsy, all rows
        are used and only the counts are returned.
    :return: DataFrame indexed by stimulus type plus a final "total" row, with the columns
        "num_samples", "num_subjects", "num_trials" and, if `labeler` is given, one column per
        label value holding its share of the samples in percent.
    """
    subset = dataframe[dataframe[labeler].notnull()] if labeler else dataframe
    by_stimulus = subset.groupby("stimulus_type")

    counts = pd.concat(
        [
            by_stimulus.size().rename("num_samples"),
            by_stimulus["subject_id"].nunique().rename("num_subjects"),
            by_stimulus["trial_id"].nunique().rename("num_trials"),
        ],
        axis=1,
    )
    # Extra row aggregating over all stimulus types.
    counts.loc["total"] = pd.Series(
        [len(subset), subset["subject_id"].nunique(), subset["trial_id"].nunique()],
        index=counts.columns, name="total"
    )
    if not labeler:
        return counts

    # Share of each label value: overall ("total") and within each stimulus type.
    overall_share = subset[labeler].value_counts(dropna=True, normalize=True).sort_index().rename("total")
    share_by_stimulus = by_stimulus[labeler].value_counts(dropna=True, normalize=True).unstack().fillna(0).T
    label_percentages = pd.concat([overall_share, share_by_stimulus], axis=1).T * 100
    label_percentages.index.name = peyes.constants.LABEL_STR
    return pd.concat([counts, label_percentages], axis=1)


def labels_to_events(dataframe: pd.DataFrame, annotators: Optional[Set[str]] = None) -> pd.DataFrame:
    """
    Convert per-sample labels into event objects, separately for every trial and annotator.

    For each trial (rows sharing a trial ID) and each annotator column, the annotator's label
    sequence is passed to `peyes.create_events` together with the trial's time, gaze, pupil,
    pixel-size and viewer-distance data.

    :param dataframe: sample-level data holding the trial ID, stimulus type, "subject_group",
        time, gaze, pupil, pixel-size and viewer-distance columns, plus one column per annotator.
    :param annotators: names of the label columns to convert. If None or empty, every column
        that is not a known metadata/gaze column is used (in column order).
    :return: DataFrame with one row per (trial, stimulus type, subject group, annotator) and one
        column per position in the sequence returned by `peyes.create_events`; rows without any
        event are dropped.
    """
    if not annotators:
        # Only label columns may be interpreted as annotations. Feeding metadata or gaze
        # columns (e.g. the trial ID) to `create_events` yields meaningless "annotators".
        non_label_columns = {
            peyes.constants.TRIAL_ID_STR, peyes.constants.STIMULUS_TYPE_STR, "subject_id", "subject_group",
            peyes.constants.T, peyes.constants.X, peyes.constants.Y, peyes.constants.PUPIL,
            peyes.constants.PIXEL_SIZE_STR, peyes.constants.VIEWER_DISTANCE_STR,
        }
        annotators = [col for col in dataframe.columns if col not in non_label_columns]

    event_dict = {}
    # sort=False keeps the trials in order of first appearance.
    trials = dataframe.groupby(peyes.constants.TRIAL_ID_STR, sort=False)
    for trial_id, trial_data in tqdm.tqdm(trials, total=trials.ngroups):
        # Trial-level metadata is read from the trial's first sample.
        stim_type, subj_group = trial_data[[peyes.constants.STIMULUS_TYPE_STR, "subject_group"]].values[0]
        t = trial_data[peyes.constants.T].values
        x = trial_data[peyes.constants.X].values
        y = trial_data[peyes.constants.Y].values
        pupil = trial_data[peyes.constants.PUPIL].values
        ps = trial_data[peyes.constants.PIXEL_SIZE_STR].values[0]
        vd = trial_data[peyes.constants.VIEWER_DISTANCE_STR].values[0]
        for annotator in annotators:
            evnts = peyes.create_events(
                labels=trial_data[annotator].values,
                t=t, x=x, y=y, pupil=pupil, pixel_size=ps, viewer_distance=vd,
            )
            event_dict[(trial_id, stim_type, subj_group, annotator)] = pd.Series(
                evnts, name=(trial_id, annotator)
            )

    # Keys become the rows; trial/annotator combinations that produced no events are dropped.
    event_df = pd.DataFrame(event_dict).T.dropna(axis=0, how='all')
    # NOTE(review): the third level holds the subject group, although it is named with
    # STIMULUS_NAME_STR; the name is kept so existing level lookups keep working — confirm intent.
    event_df.index.names = [
        peyes.constants.TRIAL_ID_STR, peyes.constants.STIMULUS_TYPE_STR, peyes.constants.STIMULUS_NAME_STR, "annotator"
    ]
    return event_df


def events_df_to_series(events_df: pd.DataFrame, min_num_samples: int = 2) -> pd.Series:
    """
    Flatten a frame of events into a single series, keeping only sufficiently long events.

    :param events_df: frame whose cells hold event objects (exposing `num_samples`) or NaN.
    :param min_num_samples: events with fewer samples than this are discarded.
    :return: the non-null events of every index group, filtered by `min_num_samples`.
    """
    every_level = list(range(events_df.index.nlevels))

    def _non_null_events(group: pd.DataFrame) -> pd.Series:
        # All cells of the group as one flat sequence, without the NaN padding.
        return pd.Series(group.values.flatten()).dropna()

    flat_events = events_df.groupby(level=every_level).apply(_non_null_events)
    long_enough = flat_events.map(lambda event: event.num_samples >= min_num_samples)
    return flat_events[long_enough]


def count_events(series: pd.Series) -> pd.DataFrame:
    """
    Count events per label, for every (stimulus type, annotator) pair.

    :param series: series of event objects, as consumed by `_count_events_for_label`.
    :return: DataFrame with one row per (stimulus type, annotator) — including the "mean" entry
        added by `_count_events_for_label` — and one column per event-label name. Rows whose
        stimulus type is "ALL" hold the sums over all stimulus types.
    """
    per_label = {
        label.name: _count_events_for_label(series, label)
        for label in peyes._DataModels.EventLabelEnum.EventLabelEnum
    }
    # Stack the per-label tables; drop rows that hold no count at all.
    stacked = pd.concat(per_label, axis=0, keys=per_label.keys()).dropna(axis=0, how='all')
    # Move the labels to the columns and the annotators to the rows.
    per_stimulus = stacked.unstack(0).stack(0, future_stack=True)

    # Totals across stimulus types, keyed by the second index level (the annotator).
    across_stimuli = per_stimulus.groupby(level=1).sum()
    across_stimuli.index = [("ALL", annotator) for annotator in across_stimuli.index]
    return pd.concat([per_stimulus, across_stimuli], axis=0)


def _count_events_for_label(series: pd.Series, lbl) -> pd.DataFrame:
    """
    Count the events carrying a given label, per stimulus type and annotator.

    :param series: series of event objects (exposing `label`) whose index has a stimulus-type
        level and an "annotator" level.
    :param lbl: the label an event must equal in order to be counted.
    :return: DataFrame indexed by stimulus type with one column per annotator, plus a "mean"
        column averaging the annotators' counts.
    """
    has_label = series.map(lambda event: event.label == lbl)
    group_levels = [peyes.constants.STIMULUS_TYPE_STR, "annotator"]
    per_annotator = series[has_label].groupby(level=group_levels).size().unstack(1)
    per_annotator["mean"] = per_annotator.mean(axis=1)
    return per_annotator

### (1) Sample-Label Counts
#### (A) All Annotators

In [7]:
# Sample statistics of every annotator of the 'hfc' dataset, indexed by (stimulus type, labeler).
hfc_annotators = u.DATASET_ANNOTATORS['hfc']
stats_index_names = [peyes.constants.STIMULUS_TYPE_STR, peyes.constants.LABELER_STR]

label_counts_all = pd.concat(
    [sample_stats(dataset, annotator) for annotator in hfc_annotators],
    keys=hfc_annotators, axis=0,
)
label_counts_all = label_counts_all.reorder_levels([1, 0]).sort_index()
label_counts_all.index.names = stats_index_names

# Average over the annotators within each stimulus type, filed under the labeler name "mean".
label_mean_all = label_counts_all.groupby(peyes.constants.STIMULUS_TYPE_STR).mean()
label_mean_all.index = pd.MultiIndex.from_product(
    [label_mean_all.index, ["mean"]], names=stats_index_names
)
label_counts_all = pd.concat([label_counts_all, label_mean_all], axis=0).sort_index()

label_counts_all

Unnamed: 0_level_0,Unnamed: 1_level_0,num_samples,num_subjects,num_trials,0,1
stimulus_type,labeler,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
free_viewing,DN,45018.0,10.0,10.0,12.292861,87.707139
free_viewing,IH,45018.0,10.0,10.0,10.14261,89.85739
free_viewing,JB,45018.0,10.0,10.0,16.317917,83.682083
free_viewing,JF,45018.0,10.0,10.0,18.73695,81.26305
free_viewing,JV,45018.0,10.0,10.0,17.12204,82.87796
free_viewing,KH,45018.0,10.0,10.0,16.271269,83.728731
free_viewing,MN,45018.0,10.0,10.0,15.975832,84.024168
free_viewing,MS,45018.0,10.0,10.0,15.709272,84.290728
free_viewing,PZ,45018.0,10.0,10.0,22.657604,77.342396
free_viewing,RA,45018.0,10.0,10.0,17.981696,82.018304


#### (B) Five Annotators Subset
Annotators: IH, DN, JV, RA, MN

In [8]:
# Sample statistics restricted to the GT_LABELERS subset, indexed by (stimulus type, labeler).
# Unlike the all-annotators table, no per-stimulus "mean" rows are appended here.
label_counts = pd.concat(
    [sample_stats(dataset, gt) for gt in GT_LABELERS],
    keys=GT_LABELERS, axis=0,
).reorder_levels([1, 0]).sort_index()
label_counts.index.names = [peyes.constants.STIMULUS_TYPE_STR, peyes.constants.LABELER_STR]

label_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,num_samples,num_subjects,num_trials,0,1
stimulus_type,labeler,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
free_viewing,JV,45018,10,10,17.12204,82.87796
free_viewing,MN,45018,10,10,15.975832,84.024168
free_viewing,MN,45018,10,10,15.975832,84.024168
free_viewing,RA,45018,10,10,17.981696,82.018304
free_viewing,RA,45018,10,10,17.981696,82.018304
search_task,JV,60623,60,60,37.738152,62.261848
search_task,MN,60623,60,60,36.687396,63.312604
search_task,MN,60623,60,60,36.687396,63.312604
search_task,RA,60623,60,60,37.619385,62.380615
search_task,RA,60623,60,60,37.619385,62.380615


### (2) Event-Label Counts
#### (A) All Annotators

In [9]:
# Events of all annotators; events shorter than 2 samples are discarded.
all_events = labels_to_events(dataset)
all_events_series = events_df_to_series(all_events, min_num_samples=2)

# Sort once: naming the index levels afterwards does not change the row order.
all_event_counts = count_events(all_events_series).sort_index()
all_event_counts.index.names = [peyes.constants.STIMULUS_TYPE_STR, peyes.constants.LABELER_STR]

all_event_counts

100%|██████████| 70/70 [00:02<00:00, 27.48it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,FIXATION,SACCADE,PSO,SMOOTH_PURSUIT,BLINK
stimulus_type,labeler,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ALL,DN,757.0,0.0,0.0,0.0,0.0
ALL,IH,786.0,0.0,0.0,0.0,0.0
ALL,JB,755.0,0.0,0.0,0.0,0.0
ALL,JF,831.0,0.0,0.0,0.0,0.0
ALL,JV,787.0,0.0,0.0,0.0,0.0
ALL,KH,779.0,0.0,0.0,0.0,0.0
ALL,MN,750.0,0.0,0.0,0.0,0.0
ALL,MS,718.0,0.0,0.0,0.0,0.0
ALL,PZ,849.0,0.0,0.0,0.0,0.0
ALL,RA,753.0,0.0,0.0,0.0,0.0


#### (B) Five Annotators Subset
Annotators: IH, DN, JV, RA, MN

In [10]:
# Events of the GT_LABELERS subset only; events shorter than 2 samples are discarded.
events = labels_to_events(dataset, set(GT_LABELERS))
events_series = events_df_to_series(events, min_num_samples=2)

# Sort once: naming the index levels afterwards does not change the row order.
event_counts = count_events(events_series).sort_index()
event_counts.index.names = [peyes.constants.STIMULUS_TYPE_STR, peyes.constants.LABELER_STR]

event_counts

100%|██████████| 70/70 [00:00<00:00, 187.31it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,FIXATION
stimulus_type,labeler,Unnamed: 2_level_1
ALL,JV,787.0
ALL,MN,750.0
ALL,RA,753.0
ALL,mean,763.333333
free_viewing,JV,423.0
free_viewing,MN,411.0
free_viewing,RA,416.0
free_viewing,mean,416.666667
search_task,JV,364.0
search_task,MN,339.0
