# Introduction to statistical features and features with scanpaths

In [2]:
import os
from tqdm import tqdm

import requests
import numpy as np
import pandas as pd

import warnings
# Suppresses every warning category for the rest of the session, so any deprecation
# or numerical warnings raised by later cells are not displayed.
warnings.simplefilter("ignore")

In [3]:
import os
import sys

# Location of the local EyeFeatures checkout that provides the `eyetracking` package.
# Set the EYEFEATURES_ROOT environment variable to point at your own checkout instead of
# editing the notebook; the fallback is the path the notebook was originally run with.
EYEFEATURES_ROOT = os.environ.get(
    'EYEFEATURES_ROOT', '/home/perkyfever/study/projects/eyelib/EyeFeatures'
)
if EYEFEATURES_ROOT not in sys.path:  # re-running the cell must not append duplicates
    sys.path.append(EYEFEATURES_ROOT)

## Getting simple dataset to work with. 

In [5]:
def get_paris_dataset():
    '''
    Download (on first use) and load the Paris experiment dataset from Zenodo.

    The dataset contains scanpaths data from 15 participants reading approximately 180 texts.
    Detailed description of variables and task can be found at: https://zenodo.org/records/4655840

    The CSV is cached as ``data/em-y35-fasttext.csv`` relative to the current working
    directory and is downloaded only when that file does not exist yet. The download is
    written to a ``.part`` file and moved into place only after it completed, so a failed
    or interrupted download is never picked up as a valid cache on the next call.

    Returns:
        pandas.DataFrame with the columns of the CSV, where
        * ``X`` / ``Y`` are divided by their column maximum and renamed to
          ``norm_pos_x`` / ``norm_pos_y``;
        * ``FDUR`` is renamed to ``duration`` (values are not rescaled);
        * ``dispersion`` is added as a copy of ``duration``;
        * ``timestamp`` is added as the running sum of ``duration`` over the whole
          file (it is not reset per participant or text), divided by 1000;
        * the saved index column ``Unnamed: 0`` is dropped when present.

    Raises:
        requests.HTTPError: if the server answers the download request with an error status.
    '''
    csv_path = "data/em-y35-fasttext.csv"

    if not os.path.exists(csv_path):
        url = "https://zenodo.org/records/4655840/files/em-y35-fasttext.csv?download=1"
        os.makedirs("data", exist_ok=True)

        partial_path = csv_path + ".part"
        try:
            with requests.get(url, stream=True, timeout=60) as response:
                # Fail loudly instead of caching an HTML error page as the dataset.
                response.raise_for_status()
                with open(partial_path, "wb") as handle:
                    for data in tqdm(response.iter_content(chunk_size=1024)):
                        handle.write(data)
            # Publish the file only once it has been fully written.
            os.replace(partial_path, csv_path)
        finally:
            # Still present only if the download failed part-way through.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    df = pd.read_csv(csv_path)
    df = df.assign(X=df.X / df.X.max(), Y=df.Y / df.Y.max())
    df = df.rename(columns={'FDUR': 'duration', 'X': 'norm_pos_x', 'Y': 'norm_pos_y'})
    df['dispersion'] = df['duration']  # presumably a placeholder: no separate dispersion is derived here
    # Running sum of fixation durations, divided by 1e3
    # (presumably milliseconds -> seconds; confirm against the dataset description).
    df['timestamp'] = df.duration.cumsum() / 1e3

    return df.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Load the fixation table; the CSV is downloaded only if it is not already cached under data/.
data = get_paris_dataset()
data

Unnamed: 0,SUBJ,SUBJ_NAME,TEXT_NO,TEXT,ANSWER,FIX_NUM,FIX_LATENCY,norm_pos_x,norm_pos_y,duration,...,WFREQ_RANK_FASTTEXT_2016,COS_INST_FASTTEXT_2018,COS_CUM_FASTTEXT_2018,WFREQ_RANK_FASTTEXT_2018,WFREQ_RANK_FASTTEXT_1618,COS_INST_FASTTEXT_1618,COS_CUM_FASTTEXT_1618,TEXT_TYPE_2,dispersion,timestamp
0,1,s01,1,chasse_oiseaux-a1,1,1,202,0.376268,0.384969,96,...,8205.0,0.186901,0.186901,5590.0,6897.5,0.185782,0.185782,a,96,0.096
1,1,s01,1,chasse_oiseaux-a1,1,2,321,0.437754,0.383532,129,...,8205.0,0.186901,0.186901,5590.0,6897.5,0.185782,0.185782,a,129,0.225
2,1,s01,1,chasse_oiseaux-a1,1,3,477,0.546146,0.382957,280,...,12071.0,0.221362,0.228615,18406.0,15238.5,0.214195,0.225632,a,280,0.505
3,1,s01,1,chasse_oiseaux-a1,1,4,792,0.706643,0.399626,278,...,1217.0,0.256207,0.254959,2094.0,1655.5,0.213694,0.247522,a,278,0.783
4,1,s01,1,chasse_oiseaux-a1,1,5,1085,0.724645,0.397615,266,...,1217.0,0.256207,0.268313,2094.0,1655.5,0.213694,0.254901,a,266,1.049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39559,15,s21,57,conflit_israelo_palestinien-f2,1,10,2268,0.420385,0.796091,142,...,200185.0,0.522610,0.331133,3706.0,101945.5,0.426449,0.445799,f+,142,7279.520
39560,15,s21,57,conflit_israelo_palestinien-f2,1,11,2442,0.536004,0.806581,171,...,8832.0,0.251470,0.340826,11554.0,10193.0,0.263324,0.455220,f+,171,7279.691
39561,15,s21,57,conflit_israelo_palestinien-f2,1,12,2638,0.526749,0.882885,152,...,15043.0,0.127237,0.345502,11359.0,13201.0,0.167958,0.457360,f+,152,7279.843
39562,15,s21,57,conflit_israelo_palestinien-f2,1,13,2827,0.757860,0.875126,276,...,1245.0,0.741338,0.395152,2263.0,1754.0,0.720622,0.511603,f+,276,7280.119


##### In order to extract features using the EyeFeatures methods, we only need the following columns: coordinates of fixations on the screen (that is x, y coordinates) and columns that identify the unique objects in the dataset. You can preprocess a dataset of raw gazes into the required format using a preprocessing module.

In [7]:
# Keep only the columns used by the feature extractors below and add a constant
# 'group' column. .assign() returns a new DataFrame, so the column is never written
# onto a slice of the original frame (the pattern behind pandas' SettingWithCopyWarning).
data = (
    data[['SUBJ', 'norm_pos_x', 'norm_pos_y', 'timestamp', 'duration', 'dispersion', 'ANSWER']]
    .assign(group=1)                        # dummy column for grouping purposes (we operate with single group)
)
data

Unnamed: 0,SUBJ,norm_pos_x,norm_pos_y,timestamp,duration,dispersion,ANSWER,group
0,1,0.376268,0.384969,0.096,96,96,1,1
1,1,0.437754,0.383532,0.225,129,129,1,1
2,1,0.546146,0.382957,0.505,280,280,1,1
3,1,0.706643,0.399626,0.783,278,278,1,1
4,1,0.724645,0.397615,1.049,266,266,1,1
...,...,...,...,...,...,...,...,...
39559,15,0.420385,0.796091,7279.520,142,142,1,1
39560,15,0.536004,0.806581,7279.691,171,171,1,1
39561,15,0.526749,0.882885,7279.843,152,152,1,1
39562,15,0.757860,0.875126,7280.119,276,276,1,1


## Statistical Features

##### The `stats` module of `eyetracking.features` computes statistical features of saccades and fixations, as well as of microsaccades and regressions, such as max length, mean acceleration, and others.

##### **Note**: One can calculate statistics using any aggregation function supported by `pandas`.

In [8]:
import eyetracking.features.stats as eye_stats

##### Here's an example of how saccades can be computed: desired features should be represented as a dictionary, with saccade properties as keys and lists of statistics as values.

##### In this example we prepare to calculate saccade length, speed and acceleration features, where

$$
\text{Length(Saccade}_i\text{)} = ||\text{Fixation}_{i+1} - \text{Fixation}_{i} ||_{2}\, \quad \text{Speed(Saccade}_i\text{)} = \frac{\text{Length(Saccade}_i\text{)}}{\text{Time}_{i+1} - \text{Time}_{i}}, \quad 
\text{Acceleration(Saccade}_i\text{)} = \frac{1}{2} \frac{\text{Speed(Saccade}_i\text{)} }{\text{Time}_{i+1} - \text{Time}_{i}}
$$

In [9]:
# Saccade property -> list of aggregation statistics to compute for it.
sac_feats_stats = dict(
    length=['min', 'max'],
    speed=['mean', 'kurtosis'],
    acceleration=['mean'],
)

##### Also, one would like to see the similarity of object and its group. We have people ('SUBJ') which are divided into groups ('group', here we have a single group, but there could be many groups, for example, age-based). Thus, we can calculate shift features, which are a difference of object's feature value and its group's mean value. Shift features are requested in the same way as ordinary ones; only the way these statistics are computed differs.

In [10]:
# Saccade property -> statistics for which the shift from the group mean is computed.
sac_feats_stats_shift = dict(
    length=['max'],
    acceleration=['mean'],
)

##### Finally, define a transformer and get the desired features.

In [11]:
# Saccade feature transformer. Keyword arguments are grouped by role:
# keys first, then the input columns, then the requested statistics.
sf = eye_stats.SaccadeFeatures(
    # columns identifying one scanpath / the group used for the shift features
    pk=['SUBJ', 'group'],
    shift_pk=['group'],
    # fixation coordinates and timestamps
    x='norm_pos_x',
    y='norm_pos_y',
    t='timestamp',
    # which statistics to compute
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
)

##### Here, each row represents the features of one of 15 different `SUBJ` groups. 

In [12]:
# Fit the transformer and compute the saccade features in one step; keep the result
# under a name so it can be inspected again without recomputing.
saccade_features = sf.fit_transform(data)
saccade_features

Unnamed: 0,sac_length_min,sac_length_max,sac_length_max_shift,sac_acceleration_mean,sac_acceleration_mean_shift,sac_speed_mean,sac_speed_kurtosis
1_1,0.006033,0.735187,0.0,3.200016,-0.412047,1.036293,4.289506
2_1,0.004946,0.695758,-0.039428,4.411436,0.799373,1.18899,3.949788
3_1,0.002494,0.691093,-0.044094,3.351796,-0.260267,1.073377,2.275953
4_1,0.004396,0.637963,-0.097223,3.246972,-0.36509,0.999678,4.342265
5_1,0.004814,0.693513,-0.041673,3.874696,0.262634,1.111982,4.78879
6_1,0.014949,0.659511,-0.075676,4.518683,0.906621,1.423984,2.408947
7_1,0.00388,0.579964,-0.155222,4.047229,0.435167,1.190862,2.696679
8_1,0.012126,0.661617,-0.073569,4.614863,1.002801,1.307412,1.579665
9_1,0.006906,0.656098,-0.079089,2.537365,-1.074698,0.87249,6.404846
10_1,0.002115,0.656778,-0.078408,1.736126,-1.875936,0.707411,6.400686


## Scanpath Measures

##### This module offers classes and methods which calculate various measures of scanpaths. 

In [4]:
import eyetracking.features.measures as eye_measures

##### Let's calculate some basic measures in order to demonstrate the usecase process.

##### The HurstExponent class estimates the Hurst exponent of a time series using Rescaled Range (R/S) analysis. The Hurst exponent is a measure of the long-term memory of time series data, indicating whether the data is trending, mean-reverting, or behaving in a random walk. Parameter `n_iters` regulates the number of iterations for the R/S analysis, while `fill_strategy` is the strategy to adjust data to the power of 2.

##### The algorithm is derived from the Rescaled Range (R/S) Analysis and works as follows:

* Time series is divided into segments (blocks) of equal size
* The mean is subtracted from each segment to center the data
* Compute the cumulative sum of the mean-adjusted data and determine the range (maximum - minimum) of the cumulative deviation
* Calculate the standard deviation of the original segment and the ratio of the range to the standard deviation
* The slope of the linear fit of the log of the R/S ratio against the log of the block size estimates the Hurst Exponent

$$
\log(R/S) = \text{HurstExponent} \cdot \log(n) + C, \, \text{where } R/S \text{ is the rescaled range}, \, n \text{ is the block size and } C \text{ is some constant}
$$

In [13]:
# Hurst exponent estimator (R/S analysis) over the two coordinate columns.
hurst_exponent = eye_measures.HurstExponent(
    pk=['SUBJ', 'group'],
    var=['norm_pos_x', 'norm_pos_y'],
    n_iters=10,
    fill_strategy='reduce',
    return_df=True,
)

hurst_features = hurst_exponent.fit_transform(data)
hurst_features

Unnamed: 0,"he_['norm_pos_x', 'norm_pos_y']"
0,0.00159
1,0.001716
2,0.000698
3,0.001475
4,0.001473
5,0.002331
6,0.001375
7,0.001384
8,0.001553
9,0.001521


##### We can also calculate the entropy of a 2D spatial distribution. 

* Given a set of 2D points $\left\{(x_i, y_i) \right\}_{i=1}^N$ algorithm partitions the space into a grid consisting of $g \times g$ cells
* Define the edges of the cells for each dimension: $\text{Edges}_x = \left\{x_0, \dots, x_g \right\}, \, \text{Edges}_y = \left\{y_0, \dots, y_g \right\}$
* Then, each bin is basically a $B_{jk} = \left\{ (x, y): x_{j-1} \leq x < x_j, \, y_{k-1} \leq y < y_k \right\}$
* Construct a **multi-dimensional histogram** $H$ where each element $H_{jk}$ represents the count of data points falling to the $B_{jk}\,$:
$$
H_{jk} = \sum_{i=1}^N \mathbb{I}\left\{(x_i, y_i) \in B_{jk} \right\}
$$
* Normalize the histogram to obtain a probability distribution $P \sim P_{jk} = \frac{H_{jk}}{N}$ and calculate its entropy 
$$
S = -\sum_{j=1}^g\sum_{k=1}^g P_{jk}\log(P_{jk})
$$

In [14]:
# Entropy of the 2D fixation distribution, computed per scanpath.
gridded_entropy = eye_measures.GriddedDistributionEntropy(
    pk=['SUBJ', 'group'],
    x='norm_pos_x',
    y='norm_pos_y',
    return_df=True,
)

grid_entropy_features = gridded_entropy.fit_transform(data)
grid_entropy_features

Unnamed: 0,grid_entropy
1_1,3.960535
2_1,3.829708
3_1,4.002632
4_1,3.99743
5_1,3.972908
6_1,4.002764
7_1,4.139917
8_1,4.158252
9_1,3.79989
10_1,4.022404


##### There are also some more complicated features.


##### One of them is RQA (Recurrence Quantification Analysis) for time-series or spatial data. 
The metrics calculated include Recurrence (REC), Determinism (DET), Laminarity (LAM), and Center of Recurrence Mass (CORM). These measures help to quantify the complexity and structure of the recurrence patterns within the data. In this example we pass the Euclidean metric as `metric`. Parameters `rho` and `min_length` set the threshold radius of the recurrence matrix and the minimum line length, respectively. In `measures` we specify the required features to calculate.

Recurrence matrix $R$ is defined as $R_{ij} = \mathbb{I}\left\{d(x_i, x_j) \leq \rho \right\}$:

- Recurrence Rate is the percentage of recurrence points above the main diagonal of $R$ among all pairs of points:
$$
\text{REC} = \frac{100 \cdot 2}{n(n-1)} \sum_{i=1}^n \sum_{j=i+1}^n R_{ij}
$$
- Determinism measures the percentage of recurrence points forming diagonal lines of length at least $L_{min}$:
$$
\text{DET} = \frac{100 \cdot \sum_{l \geq L_{min}} l \cdot P(l)}{\sum_{i=1}^n \sum_{j=i+1}^n R_{ij}},
$$
$$
\text{ where } L_{min} - \text{ minimum line length}, \, P(l) - \text{probability of diagonal lines of length } l
$$
- Laminarity measures the percentage of recurrence points forming vertical or horizontal lines of length at least $L_{min}$:
$$
\text{LAM} = \frac{50 \left( \sum_{\text{HL}} \text{HL} + \sum_{\text{VL}} \text{VL}\right)}{\sum_{i=1}^n \sum_{j=i+1}^n R_{ij}},
$$
$$
\text{where HL and VL represents the sums of horizontal and vertical lines of length at least } L_{min}
$$
- Center of Recurrence Mass measures the weighted average of the distances between recurrence points, emphasizing the central tendency of recurrences in the matrix:
$$
\text{CORM} = \frac{100 \cdot \sum_{i=1}^{n-1} \sum_{j=i+1}^n (j-i) R_{ij}}{(n-1) \cdot \sum_{i=1}^n \sum_{j=i+1}^n R_{ij}}
$$

In [15]:
# Recurrence quantification: recurrence rate and center of recurrence mass per scanpath.
rqa_measures = eye_measures.RQAMeasures(
    pk=['SUBJ', 'group'],
    x='norm_pos_x',
    y='norm_pos_y',
    metric=lambda p, q: np.linalg.norm(p - q),   # Euclidean distance between two points
    rho=0.10,
    min_length=1,
    measures=["rec", "corm"],
    return_df=True,
)

rqa_features = rqa_measures.fit_transform(data)
rqa_features

Unnamed: 0,rec,corm
1_1,11.811201,33.640767
2_1,12.874153,34.33096
3_1,11.805156,33.379707
4_1,12.986185,33.76707
5_1,12.815115,34.050548
6_1,12.493029,34.067843
7_1,13.414963,33.543729
8_1,10.81608,33.598875
9_1,13.173418,33.557892
10_1,13.326421,33.787375


## Scanpath Distances

##### Let us describe the core idea of the extractor classes in this module. Each class calculates the "expected paths" for each path-group, which are further used in distance functions. That is, the resulting features for each group are simply the distances between two scanpaths: the expected one and the given one.

In [16]:
import eyetracking.features.scanpath_dist as eye_dist

##### As for now, there are two ways to compute the expected path. 
- The first one simply aligns the paths by time and takes the pointwise mean at each timestamp. This method is used by passing `'mean'` to `expected_paths_method`
- The second algorithm seeks to find the so-called Fermat-Weber point (geometric median) of the series at each timestamp. This point basically minimizes the sum of distances from each observation. Use it by passing `'fwp'` to `expected_paths_method`.

##### See the example of calculating some basic distances. 
**Note:** primary key `path_pk` is set to be `'group'` so there is a separate expected path for each unique group. Primary key `pk` is also set to `['SUBJ', 'group']` which determines the way to distinguish between unique paths.

##### Explanation of distances used in this illustration:
- Euclidean distance is simply the sum of pairwise distances of two sequences at each timestamp:
$$
\text{EUC}(p, q) = \sum_{i=1}^n ||p_i - q_i||_2 \quad \text{ ($p$ and $q$ are aligned)}
$$
- EyeDist distance is calculated as follows:
$$
\text{EYE}(p, q) = \frac{1}{\max\{n, m\}} \left(\sum_{i=1}^n \min_{1 \leq j \leq m} ||p_i - q_j||_2^2 + \sum_{j=1}^m \min_{1 \leq i \leq n} ||q_j - p_i||_2^2\right)
$$
- Mannan distance is somewhat a more complex version of EyeDist since it considers the weighted distance:
$$
\text{MAN}(p, q) = \frac{1}{4 \cdot n \cdot m} \left(m \cdot \sum_{i=1}^n \min_{1 \leq j \leq m} ||p_i - q_j||_2^2 + n \cdot \sum_{j=1}^m \min_{1 \leq i \leq n} ||q_j - p_i||_2^2  \right)
$$

In [17]:
# Distances between every scanpath (pk) and the expected path of its group (path_pk).
transformer = eye_dist.SimpleDistances(
    pk=['SUBJ', 'group'],
    path_pk=['group'],
    x='norm_pos_x',
    y='norm_pos_y',
    expected_paths_method="fwp",
    methods=["euc", "eye", "man"],
    return_df=True,
)

simple_distances = transformer.fit_transform(data)
simple_distances

100%|██████████| 15/15 [00:00<00:00, 68.10it/s]
100%|██████████| 15/15 [00:12<00:00,  1.21it/s]
100%|██████████| 15/15 [00:10<00:00,  1.39it/s]


Unnamed: 0,euc_dist,eye_dist,man_dist
1_1,200.623121,0.058312,0.015358
2_1,122.518147,0.064941,0.017263
3_1,1162.275204,0.071679,0.01792
4_1,412.36612,0.058348,0.015121
5_1,261.495549,0.066644,0.017448
6_1,72.612316,0.051765,0.0142
7_1,112.304684,0.067186,0.01775
8_1,198.17411,0.068108,0.018178
9_1,255.661709,0.061571,0.016137
10_1,143.557332,0.050267,0.013282


##### Note that the expected paths are recalculated for each new distance class when the `fit` method is called, which takes up most of the runtime. To speed up the process, one can grab the expected paths from the previous class and reuse them with the new class. It is important that the primary keys for the new class match those of the previous class from which you obtained the expected paths.

In [18]:
# Expected paths stored on the fitted transformer; in the output shown below this is a
# dict with one DataFrame of (x_est, y_est) per group.
expected_paths = transformer.expected_paths
expected_paths

{'1':          x_est     y_est
 0     0.362373  0.408229
 1     0.449865  0.404311
 2     0.547887  0.407606
 3     0.623107  0.411228
 4     0.616489  0.429458
 ...        ...       ...
 4497  0.037297  0.037889
 4498  0.047262  0.038253
 4499  0.053338  0.037946
 4500  0.027975  0.042276
 4501  0.041751  0.043311
 
 [4502 rows x 2 columns]}

##### The same logic can be applied to the filling path (`fill_path`), which is used when no expected path is found for a particular group whenever `transform` is called. It is calculated as the mean of all known expected paths (basically the expected path over the expected paths with `expected_paths_method` set to `'mean'`).

In [19]:
# Fallback path stored on the fitted transformer (a DataFrame of x_est / y_est in the output below).
fill_path = transformer.fill_path
fill_path

Unnamed: 0,x_est,y_est
0,0.362373,0.408229
1,0.449865,0.404311
2,0.547887,0.407606
3,0.623107,0.411228
4,0.616489,0.429458
...,...,...
4497,0.037297,0.037889
4498,0.047262,0.038253
4499,0.053338,0.037946
4500,0.027975,0.042276


##### There are also methods in `scanpath_complex` module that return features not in the form of numbers that can be used for inference. This is usually some kind of structure (for instance, a matrix) that can be used for analyzing data or further feature extracting.

In [20]:
import eyetracking.features.scanpath_complex as eye_complex

##### Let's see one of the possible usecases. 

##### We get the list of scanpaths of form `(x_coord, y_coord)` in order to calculate the pairwise distance matrix. One can use custom metric or those already implemented in `scanpath_dist`.

In [21]:
# One scanpath per participant: the (x, y) fixation coordinates with a fresh 0..n-1 index.
list_of_scanpaths = []
for subject_id, subject_fixations in data.groupby('SUBJ'):
    coordinates = subject_fixations[['norm_pos_x', 'norm_pos_y']]
    list_of_scanpaths.append(coordinates.reset_index(drop=True))
len(list_of_scanpaths)

15

In [22]:
# Pairwise distance matrix between all scanpaths, using the library's Euclidean scanpath distance.
euc_matrix = eye_complex.get_dist_matrix(list_of_scanpaths, dist_metric=eye_dist.calc_euc_dist)
euc_matrix

100%|██████████| 15/15 [00:00<00:00, 972.63it/s]


q,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0.0,216.891445,241.739752,236.518306,253.582253,157.548907,195.508599,237.403918,245.282704,224.949497,263.351295,166.859744,206.992784,80.401605,167.280796
1,216.891445,0.0,212.105848,213.882556,226.860971,158.614094,202.045338,224.791482,227.851853,212.250657,232.041838,177.053103,218.198329,83.403611,176.055411
2,241.739752,212.105848,0.0,294.448064,273.577526,148.392904,189.853201,235.524909,281.726542,239.65329,316.759147,174.36723,206.031155,84.577833,162.951707
3,236.518306,213.882556,294.448064,0.0,272.753285,154.945192,192.192167,242.065233,266.397787,226.71647,298.556201,169.659725,209.946643,76.707918,173.897121
4,253.582253,226.860971,273.577526,272.753285,0.0,165.565762,208.633328,253.776842,280.771001,231.457743,290.333419,184.852322,220.774162,84.520301,172.443492
5,157.548907,158.614094,148.392904,154.945192,165.565762,0.0,144.433993,162.285554,158.155702,151.596783,167.261651,150.152013,157.527088,79.499392,160.638707
6,195.508599,202.045338,189.853201,192.192167,208.633328,144.433993,0.0,203.690246,206.412659,193.93133,208.8258,159.639914,199.382172,81.846284,158.035375
7,237.403918,224.791482,235.524909,242.065233,253.776842,162.285554,203.690246,0.0,264.77415,239.876712,252.577683,177.639797,225.430046,92.829167,168.7743
8,245.282704,227.851853,281.726542,266.397787,280.771001,158.155702,206.412659,264.77415,0.0,232.927866,282.382338,180.432626,229.559072,78.292232,178.308103
9,224.949497,212.250657,239.65329,226.71647,231.457743,151.596783,193.93133,239.876712,232.927866,0.0,247.045637,169.114434,208.02316,77.574107,173.81733


In [23]:
# Same pairwise matrix with the EyeDist metric; noticeably slower than the Euclidean one
# (about 9 s vs. under 1 s in the progress bars recorded below).
eye_matrix = eye_complex.get_dist_matrix(list_of_scanpaths, dist_metric=eye_dist.calc_eye_dist)
eye_matrix

100%|██████████| 15/15 [00:09<00:00,  1.52it/s]


q,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
p,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0.0,9.3e-05,9.5e-05,9e-05,0.000188,0.00013,0.000138,0.000114,0.000103,0.000106,8.8e-05,7.1e-05,0.000106,0.000325,0.00014
1,9.3e-05,0.0,9.5e-05,5.6e-05,8.6e-05,0.000149,0.00018,0.000109,9.8e-05,0.00013,0.000116,0.000101,0.000174,0.000417,0.000224
2,9.5e-05,9.5e-05,0.0,7.6e-05,9.4e-05,0.000112,0.0001,6.5e-05,0.000117,0.000122,0.000106,8.9e-05,0.000105,0.00033,0.000117
3,9e-05,5.6e-05,7.6e-05,0.0,7.4e-05,9.6e-05,0.000123,0.00011,0.000102,0.000129,0.000121,7e-05,0.000166,0.000376,0.000144
4,0.000188,8.6e-05,9.4e-05,7.4e-05,0.0,0.000126,0.000239,0.000126,0.000116,0.000144,0.000131,0.000114,0.000172,0.000383,0.00021
5,0.00013,0.000149,0.000112,9.6e-05,0.000126,0.0,0.000129,0.000153,9.8e-05,0.000108,9.2e-05,0.00013,0.000141,0.000345,0.000188
6,0.000138,0.00018,0.0001,0.000123,0.000239,0.000129,0.0,0.000133,0.000156,0.000167,0.00014,0.00015,0.000208,0.000431,0.000217
7,0.000114,0.000109,6.5e-05,0.00011,0.000126,0.000153,0.000133,0.0,0.000131,0.000118,9.2e-05,0.000101,0.000115,0.000391,0.000155
8,0.000103,9.8e-05,0.000117,0.000102,0.000116,9.8e-05,0.000156,0.000131,0.0,8.6e-05,8.6e-05,8.8e-05,0.000103,0.000294,0.000174
9,0.000106,0.00013,0.000122,0.000129,0.000144,0.000108,0.000167,0.000118,8.6e-05,0.0,9.2e-05,8.9e-05,9.9e-05,0.000245,0.00013


##### Now we can calculate the compromise matrix using these pairwise distances matrices. 

##### The compromise matrix serves as a summary that captures the most common structure or pattern shared across all the individual distance matrices. It is used in applications where you need to summarize information from multiple sources (in our case these are different metrics), providing a single matrix that best represents the consensus of all input matrices.

##### Some elaboration on how compromise matrix is built:

- Given a weight vector $\mathbf{w} = [w_1, w_2, \dots, w_n]$ where $w_i \geq 0$ and $\sum_{i=1}^{n} w_i = 1$, the **centering matrix** $\Theta$ which centers the data is defined as:

$$
\Theta = I_n - \mathbf{1}_n \mathbf{w}^T
$$
$$
\text{where } I_n \text{ is the identity matrix of size } n \times n,\, \mathbf{1}_n \text{ is a column vector of ones of size } n \times 1
$$

- For a given distance matrix $D$ and weight vector $\mathbf{w}$, the **cross-product matrix** $S$ is calculated as:

$$
S = -\frac{1}{2} \Theta D \Theta^T
$$
$$
\text{where } D \text{ is the distance matrix}, \, \Theta \text{ is the centering matrix from previous step}
$$

- The **RV coefficient** measures the similarity between two cross-product matrices $S_1$ and $S_2$. It is defined as:

$$
\text{RV}(S_1, S_2) = \frac{\text{Tr}(S_1 S_2^T)}{\sqrt{\text{Tr}(S_1 S_1^T) \cdot \text{Tr}(S_2 S_2^T)}}
$$
$$
\text{where } \text{Tr}(\cdot) \text{ is the trace (sum of diagonal elements) of a matrix}
$$

- Finally, to obtain the compromise matrix, we first compute the **similarity matrix** using the RV coefficients between all pairs of cross-product matrices. Then, we perform **eigen-decomposition** on this similarity matrix to find the principal eigenvector $\mathbf{w}\,$:

$$
S_{\text{compromise}} = \sum_{i=1}^{k} w_i S_i
$$
$$
\text{where } w_i \text{ is the weight from the principal eigenvector}, \, S_i \text{ is the cross-product matrix for the $i$-th distance matrix}
$$

In [24]:
# Combine the two pairwise distance matrices (Euclidean and EyeDist) into one compromise matrix.
eye_complex.get_compromise_matrix([euc_matrix, eye_matrix])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,73.176261,-6.152585,-8.346088,-7.278371,-11.161435,-0.856152,-4.193279,-8.7051,-8.3833,-7.896376,-11.57678,0.682575,-3.485281,4.086666,0.089247
1,-6.152585,67.884046,-0.515029,-1.92152,-4.360107,-3.878867,-9.150487,-6.892036,-4.866669,-6.052774,-3.153333,-5.567439,-10.093171,0.379157,-5.659185
2,-8.346088,-0.515029,81.067447,-23.814035,-14.285205,6.326584,1.751813,-4.095159,-17.322554,-9.149364,-26.51372,1.973866,0.8003,6.555738,5.565413
3,-7.278371,-1.92152,-23.814035,79.51076,-14.772128,3.231662,0.146512,-7.185873,-12.681359,-5.353854,-20.856356,2.859883,-1.3624,8.559813,0.917271
4,-11.161435,-4.360107,-14.285205,-14.772128,83.810733,1.6267,-3.51637,-9.176571,-15.613076,-4.880165,-15.799181,-0.36154,-3.040521,7.947703,3.58117
5,-0.856152,-3.878867,6.326584,3.231662,1.6267,36.515428,-4.466092,-0.477178,4.090333,-0.292692,4.065622,-11.740787,-4.326944,-13.924776,-15.892853
6,-4.193279,-9.150487,1.751813,0.146512,-3.51637,-4.466092,56.682736,-5.032286,-2.887445,-5.176582,-0.545887,-5.011619,-9.041321,-4.670904,-4.888792
7,-8.7051,-6.892036,-4.095159,-7.185873,-9.176571,-0.477178,-5.032286,77.28354,-13.220937,-11.120308,-5.714095,-1.075119,-7.9502,1.746477,1.614849
8,-8.3833,-4.866669,-17.322554,-12.681359,-15.613076,4.090333,-2.887445,-13.220937,83.498276,-5.55614,-13.144262,1.044839,-6.30266,9.993461,1.351501
9,-7.896376,-6.052774,-9.149364,-5.353854,-4.880165,-0.292692,-5.176582,-11.120308,-5.55614,70.094378,-7.352802,-1.655526,-5.390513,3.545425,-3.762704


##### One can also calculate a similarity matrix of the scanpaths. See the example of using it with a custom metric. 

In [25]:
def sim(p, q) -> float:
    """Similarity of two scanpaths: the reciprocal of their Euclidean scanpath distance.

    Args:
        p, q: scanpaths in the form accepted by `eye_dist.calc_euc_dist`.

    Returns:
        `1 / distance`; `inf` when the distance is exactly zero (identical scanpaths).
    """
    dist = eye_dist.calc_euc_dist(p, q)
    if dist == 0:
        # Dividing a plain Python float by zero raises ZeroDivisionError; return the
        # limiting value explicitly instead.
        return float('inf')
    return 1 / dist

sim_matrix = eye_complex.get_sim_matrix(list_of_scanpaths, sim_metric=sim)
pd.DataFrame(sim_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,0.502305,0.502068,0.502114,0.501972,0.503174,0.502557,0.502106,0.502038,0.502223,0.501899,0.502997,0.502416,0.506219,0.502989
1,0.502305,1.0,0.502357,0.502338,0.502204,0.503152,0.502475,0.502224,0.502194,0.502356,0.502155,0.502824,0.502291,0.505995,0.50284
2,0.502068,0.502357,1.0,0.501698,0.501828,0.503369,0.502634,0.502123,0.501775,0.502086,0.501578,0.502868,0.502427,0.505912,0.503068
3,0.502114,0.502338,0.501698,1.0,0.501833,0.503227,0.502602,0.502066,0.501877,0.502205,0.501675,0.502947,0.502382,0.506518,0.502875
4,0.501972,0.502204,0.501828,0.501833,1.0,0.50302,0.502397,0.50197,0.501781,0.50216,0.501722,0.502705,0.502265,0.505916,0.5029
5,0.503174,0.503152,0.503369,0.503227,0.50302,1.0,0.503462,0.503081,0.503161,0.503298,0.502989,0.50333,0.503174,0.506289,0.503113
6,0.502557,0.502475,0.502634,0.502602,0.502397,0.503462,1.0,0.502455,0.502422,0.502578,0.502394,0.503132,0.502508,0.506109,0.503164
7,0.502106,0.502224,0.502123,0.502066,0.50197,0.503081,0.502455,1.0,0.501888,0.502084,0.50198,0.502815,0.502218,0.505386,0.502963
8,0.502038,0.502194,0.501775,0.501877,0.501781,0.503161,0.502422,0.501888,1.0,0.502147,0.501771,0.502771,0.502178,0.506386,0.502804
9,0.502223,0.502356,0.502086,0.502205,0.50216,0.503298,0.502578,0.502084,0.502147,1.0,0.502024,0.502957,0.502404,0.506445,0.502877


##### There are 4 methods implemented in `scanpath_complex` for similarity matrix reordering. For example, we can use `dimensionality_reduction_order` for the matrix calculated above. This function applies a dimensionality reduction technique, such as Multi-Dimensional Scaling (MDS), to the input similarity matrix. The goal is to project the items into a lower-dimensional space (typically 1D) where the order of items reflects their dissimilarities as closely as possible. The indices of items are then reordered according to their positions in this lower-dimensional space, resulting in an ordering that preserves the structure of the original similarities.

In [26]:
# Reorder the similarity matrix with the dimensionality-reduction based ordering and show it as a table.
mds_reordered_matrix = eye_complex.dimensionality_reduction_order(sim_matrix)
pd.DataFrame(mds_reordered_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,0.502338,0.502824,0.502204,0.502475,0.503152,0.502357,0.505995,0.502291,0.50284,0.502155,0.502194,0.502356,0.502305,0.502224
1,0.502338,1.0,0.502947,0.501833,0.502602,0.503227,0.501698,0.506518,0.502382,0.502875,0.501675,0.501877,0.502205,0.502114,0.502066
2,0.502824,0.502947,1.0,0.502705,0.503132,0.50333,0.502868,0.506218,0.502775,0.503145,0.502601,0.502771,0.502957,0.502997,0.502815
3,0.502204,0.501833,0.502705,1.0,0.502397,0.50302,0.501828,0.505916,0.502265,0.5029,0.501722,0.501781,0.50216,0.501972,0.50197
4,0.502475,0.502602,0.503132,0.502397,1.0,0.503462,0.502634,0.506109,0.502508,0.503164,0.502394,0.502422,0.502578,0.502557,0.502455
5,0.503152,0.503227,0.50333,0.50302,0.503462,1.0,0.503369,0.506289,0.503174,0.503113,0.502989,0.503161,0.503298,0.503174,0.503081
6,0.502357,0.501698,0.502868,0.501828,0.502634,0.503369,1.0,0.505912,0.502427,0.503068,0.501578,0.501775,0.502086,0.502068,0.502123
7,0.505995,0.506518,0.506218,0.505916,0.506109,0.506289,0.505912,1.0,0.505733,0.505565,0.505436,0.506386,0.506445,0.506219,0.505386
8,0.502291,0.502382,0.502775,0.502265,0.502508,0.503174,0.502427,0.505733,1.0,0.502828,0.502266,0.502178,0.502404,0.502416,0.502218
9,0.50284,0.502875,0.503145,0.5029,0.503164,0.503113,0.503068,0.505565,0.502828,1.0,0.502705,0.502804,0.502877,0.502989,0.502963


## Extractor Class

##### Finally, we can combine several extractor classes into one `Extractor` class to calculate all the features at once.

In [27]:
from eyetracking.features.extractor import Extractor

# Combine several feature extractors; the column names and keys are given once on the
# Extractor instead of on each individual extractor.
#
# NOTE(review): in the output recorded below, sac_speed_mean and sac_acceleration_mean are
# of order 1e16 and sac_speed_kurtosis is in the hundreds, whereas the standalone
# SaccadeFeatures run earlier in this notebook reported values around 1, 3 and 4 for the
# same keys (the sac_length_* columns agree). Presumably passing `duration` here changes
# how the time delta between fixations is computed - TODO confirm before relying on
# these columns.
extractor = Extractor(
    features=[
        eye_dist.SimpleDistances(
            methods=["euc", "eye", "man"],
            expected_paths_method="fwp",
        ),
        eye_measures.GriddedDistributionEntropy(),
        eye_stats.SaccadeFeatures(
            features_stats=sac_feats_stats,
            shift_features=sac_feats_stats_shift,
            shift_pk=['group']
        )
    ],
    x='norm_pos_x',
    y='norm_pos_y',
    t='timestamp',
    duration='duration',
    dispersion='dispersion',
    path_pk=['group'],
    pk=['SUBJ', 'group'],
    return_df=True
)

extractor.fit_transform(data)

100%|██████████| 15/15 [00:00<00:00, 1597.34it/s]
100%|██████████| 15/15 [00:04<00:00,  3.01it/s]
100%|██████████| 15/15 [00:05<00:00,  2.80it/s]


Unnamed: 0,euc_dist,eye_dist,man_dist,grid_entropy,sac_length_min,sac_length_max,sac_length_max_shift,sac_acceleration_mean,sac_acceleration_mean_shift,sac_speed_mean,sac_speed_kurtosis
1_1,200.623121,0.058312,0.015358,3.960535,0.006033,0.735187,0.0,1.736375e+16,-4.088451e+16,1.93762e+16,1409.816333
2_1,122.518147,0.064941,0.017263,3.829708,0.004946,0.695758,-0.039428,5.750563e+16,-742628300000000.0,8.341979e+16,623.452215
3_1,1162.275204,0.071679,0.01792,4.002632,0.002494,0.691093,-0.044094,4.692676e+16,-1.13215e+16,7.542848e+16,679.47253
4_1,412.36612,0.058348,0.015121,3.99743,0.004396,0.637963,-0.097223,8.420396e+16,2.59557e+16,1.207275e+17,375.534401
5_1,261.495549,0.066644,0.017448,3.972908,0.004814,0.693513,-0.041673,3.597491e+16,-2.227334e+16,6.661781e+16,1494.364756
6_1,72.612316,0.051765,0.0142,4.002764,0.014949,0.659511,-0.075676,6.432294e+16,6074690000000000.0,7.196951e+16,567.907161
7_1,112.304684,0.067186,0.01775,4.139917,0.00388,0.579964,-0.155222,8.772772e+16,2.947947e+16,1.261946e+17,392.914133
8_1,198.17411,0.068108,0.018178,4.158252,0.012126,0.661617,-0.073569,8.43099e+16,2.606164e+16,1.068958e+17,376.451656
9_1,255.661709,0.061571,0.016137,3.79989,0.006906,0.656098,-0.079089,9964382000000000.0,-4.828387e+16,1.150047e+16,1618.912271
10_1,143.557332,0.050267,0.013282,4.022404,0.002115,0.656778,-0.078408,3.178135e+16,-2.64669e+16,4.584361e+16,1447.565585


##### Extractor class can be easily integrated into the `sklearn Pipeline` as it fully follows the sklearn API.

##### In this example, the pipeline calls the extractor to calculate the desired features first and then passes them to the model. Note that the extractor can save additional features from the input `DataFrame` before passing them to the model, if needed.

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Two-stage pipeline: feature extraction followed by a logistic-regression classifier.
pipeline_steps = [
    ('extractor', extractor),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [29]:
# One label per participant: ANSWER taken from the first row in which each SUBJ appears.
first_row_per_subject = data.drop_duplicates(subset=['SUBJ'])
target = first_row_per_subject['ANSWER'].reset_index(drop=True)

# The pipeline is fitted and then asked to predict on the same data,
# so these are in-sample (training) predictions.
fitted_pipeline = pipeline.fit(data, target)
predictions = fitted_pipeline.predict(data)

100%|██████████| 15/15 [00:00<00:00, 1671.26it/s]
100%|██████████| 15/15 [00:05<00:00,  2.98it/s]
100%|██████████| 15/15 [00:04<00:00,  3.00it/s]
100%|██████████| 15/15 [00:00<00:00, 1675.44it/s]
100%|██████████| 15/15 [00:05<00:00,  2.62it/s]
100%|██████████| 15/15 [00:05<00:00,  2.68it/s]


In [32]:
# Accuracy on the data the pipeline was fitted on (training accuracy), not a held-out estimate.
accuracy_score(target, predictions)

1.0