In [1]:
import os
from os.path import join

import numpy as np
import pandas as pd

In [2]:
DATA_PATH = join("..", "data", "Dyslexia_1_ready_data_fixations.csv")

In [3]:
def read_dataset(path: str = DATA_PATH):
    df = pd.read_csv(path, index_col="Unnamed: 0")
    
    df["SentenceID"] = df["Sentence_ID"]
    df["x"] = df["FIX_X"].apply(lambda x: float(x.replace(",", ".")))
    df["y"] = df["FIX_Y"].apply(lambda x: float(x.replace(",", ".")))
    df.x /= df.x.max()
    df.y /= df.y.max()
    df["duration"] = df["FIX_DURATION"].apply(lambda x: float(x) / 1000)
    df["timestamp"] = df.duration.cumsum()
    df["AOI_1"] = np.random.choice([0, 1], size=len(df))
    df["AOI_2"] = np.random.choice([11, 22, 33], size=len(df))
    df["AOI_3"] = np.random.choice(['A', 'B', 'C'], size=len(df))
    
    df.drop(["Word_Number", "IQ", "FIX_X", "FIX_Y", "FIX_DURATION", "Age", "Sentence_ID"], axis=1, inplace=True)
    return df

In [4]:
df = read_dataset()
df.head()

Unnamed: 0,SubjectID,Group,Sex,SentenceID,x,y,duration,timestamp,AOI_1,AOI_2,AOI_3
0,nnr8,3,fem,27,0.101158,0.726088,0.345,0.345,1,33,B
1,nnr8,3,fem,27,0.102406,0.754083,0.221,0.566,1,11,C
2,nnr8,3,fem,27,0.126049,0.755455,0.204,0.77,1,33,B
3,nnr8,3,fem,27,0.142134,0.761356,0.289,1.059,1,11,B
4,nnr8,3,fem,27,0.109963,0.766433,0.6,1.659,1,33,A


### 0. `Extractor` над нормальным датасетом.

In [5]:
import eyefeatures.features.stats as eye_stats
from eyefeatures.features.extractor import Extractor

sac_feats_stats = {
    'length': ['min', 'max'],
    'acceleration': ['mean']
}

sac_feats_stats_shift = {
    'acceleration': ['mean']
}

sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
    shift_pk=['SentenceID']
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    pk=['SubjectID', 'SentenceID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00,  2.35it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean,sac_acceleration_mean_shift_SentenceID
nnr26_4,0.007354,0.129626,0.66291,-0.147094
nnr26_5,0.018238,0.114133,0.631904,-0.006173
nnr26_6,0.020591,0.157276,0.785294,-0.080868
nnr26_7,0.021866,0.164827,0.522326,-0.232872
nnr26_8,0.031671,0.127567,0.598728,-0.096059


### 1. Обработка `NaN`. Ожидается: читать changes.md.

In [6]:
dfn = df.copy()
dfn.loc[0, "x"] = None
dfn.loc[0, "y"] = np.nan
extractor.fit_transform(dfn).head()

  extractor.fit_transform(dfn).head()
100%|██████████| 1/1 [00:00<00:00,  2.36it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean,sac_acceleration_mean_shift_SentenceID
nnr26_4,0.007354,0.129626,0.66291,-0.147094
nnr26_5,0.018238,0.114133,0.631904,-0.006173
nnr26_6,0.020591,0.157276,0.785294,-0.080868
nnr26_7,0.021866,0.164827,0.522326,-0.232872
nnr26_8,0.031671,0.127567,0.598728,-0.096059


In [7]:
dfn.loc[0, "SubjectID"] = None
try:
    extractor.fit_transform(dfn).head()
except ValueError as e:
    print(e)
    print("OK")

Found missing values in pk.
OK


### 2. Несколько `shift_pk` и проверка, что можно не подмножество `pk`.

#### 2.0. Дефолтные случаи.

##### 2.0.0. Дефолтный случай `pk=None`, `shift_features=None`. Ожидается: одна строка и 3 фичи.

In [8]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    # shift_features=sac_feats_stats_shift,
    # shift_pk=['SentenceID']
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    # pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00, 737.52it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean
0,0.0,1.255172,14.24703


##### 2.0.1. Дефолтный случай `pk=...`, `shift_features=None`. Ожидается: 3 фичи.

In [9]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    # shift_features=sac_feats_stats_shift,
    # shift_pk=['SentenceID']
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    pk=['SubjectID', 'SentenceID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00,  2.73it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean
nnr26_4,0.007354,0.129626,0.66291
nnr26_5,0.018238,0.114133,0.631904
nnr26_6,0.020591,0.157276,0.785294
nnr26_7,0.021866,0.164827,0.522326
nnr26_8,0.031671,0.127567,0.598728


#### 2.1. Можно `shift_pk` не подмножество `shift_features`.

##### 2.1.1. Можно `shift_pk` не подмножество `shift_features`. Ожидается: 3 фичи + 1 shift фича.

In [10]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
    shift_pk=['SentenceID']
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00, 54.21it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean,sac_acceleration_mean_shift_SentenceID
nnr26,0.000981,1.242229,1.006598,0.12767
nnr27,9.8e-05,1.223697,0.661369,-0.568315
nnr33,0.001667,1.218795,8.554904,7.688316
nnr36,0.0,1.108976,0.522238,-0.287766
nnr41,9.8e-05,1.202714,211.992586,211.006405


#### 2.2. Комбинации `shift_pk` и `shift_features`.

##### 2.2.1. Можно несколько `shift_pk` на один `shift_features`. Ожидается: 3 фичи + 3 shift фичи.

In [11]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
    shift_pk=(['SentenceID', 'SubjectID'], ['SentenceID'], ['SubjectID'])
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00, 46.33it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean,sac_acceleration_mean_shift_SentenceID_SubjectID,sac_acceleration_mean_shift_SentenceID,sac_acceleration_mean_shift_SubjectID
nnr26,0.000981,1.242229,1.006598,0.321214,0.12767,0.0
nnr27,9.8e-05,1.223697,0.661369,-0.405512,-0.568315,-1.110223e-16
nnr33,0.001667,1.218795,8.554904,8.078148,7.688316,0.0
nnr36,0.0,1.108976,0.522238,0.319145,-0.287766,0.0
nnr41,9.8e-05,1.202714,211.992586,210.697448,211.006405,0.0


##### 2.2.2. Можно несколько `shift_pk` и несколько `shift_features`. Ожидается: 3 фичи + 3 shift фичи.

In [12]:
sf = eye_stats.SaccadeFeatures(
    features_stats={'length': ['min', 'max', 'median']},
    shift_features=({'length': ['min']}, {'length': ['max', 'median']}),
    shift_pk=(['SentenceID', 'SubjectID'], ['SentenceID'])
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00, 62.43it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_length_median,sac_length_min_shift_SentenceID_SubjectID,sac_length_max_shift_SentenceID,sac_length_median_shift_SentenceID
nnr26,0.000981,1.242229,0.058783,-0.008923,0.193262,0.011178
nnr27,9.8e-05,1.223697,0.051674,-0.014806,0.329359,-0.000784
nnr33,0.001667,1.218795,0.050595,-0.013335,0.263173,-0.003726
nnr36,0.0,1.108976,0.031622,-0.005197,0.222481,-0.024219
nnr41,9.8e-05,1.202714,0.065695,-0.000588,0.154335,0.010884


### 3. Можно несколько разных `AOI` колонок.

#### 3.0. Дефолтные случаи.

##### 3.0.1. `aoi=None`. Ожидается: 3 фичи + 2 shift фичи.

In [13]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
    shift_pk=(['SentenceID'], ['SubjectID'])
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    aoi=None,
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00, 46.49it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean,sac_acceleration_mean_shift_SentenceID,sac_acceleration_mean_shift_SubjectID
nnr26,0.000981,1.242229,1.006598,0.12767,0.0
nnr27,9.8e-05,1.223697,0.661369,-0.568315,-1.110223e-16
nnr33,0.001667,1.218795,8.554904,7.688316,0.0
nnr36,0.0,1.108976,0.522238,-0.287766,0.0
nnr41,9.8e-05,1.202714,211.992586,211.006405,0.0


#### 3.1. Одно и более значений `AOI` + `calc_without_aoi`.

##### 3.1.1. Одно значение `AOI`. Ожидается: (3 фичи + 2 shift фичи) x 2 для `AOI_1=0` и `AOI_1=1`.

In [14]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
    shift_pk=(['SentenceID'], ['SubjectID'])
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    aoi='AOI_1',
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00, 20.56it/s]


Unnamed: 0,sac_length_AOI_1[1]_min,sac_length_AOI_1[1]_max,sac_acceleration_AOI_1[1]_mean,sac_acceleration_AOI_1[1]_mean_shift_SentenceID,sac_acceleration_AOI_1[1]_mean_shift_SubjectID,sac_length_AOI_1[0]_min,sac_length_AOI_1[0]_max,sac_acceleration_AOI_1[0]_mean,sac_acceleration_AOI_1[0]_mean_shift_SentenceID,sac_acceleration_AOI_1[0]_mean_shift_SubjectID
nnr26,0.009021,1.084462,0.795194,-0.371862,0.0,0.003824,1.020826,1.117584,0.683955,0.0
nnr27,0.000882,1.013178,0.626557,-0.492811,0.0,9.8e-05,1.223697,0.753947,-0.164501,0.0
nnr33,0.007746,1.108093,30.87944,29.917985,0.0,0.003334,1.218795,1.314676,0.47418,0.0
nnr36,0.0,0.936991,0.756374,0.003742,0.0,0.001471,1.013276,0.315395,-0.566926,0.0
nnr41,9.8e-05,0.986998,742.585476,741.709904,0.0,0.000294,1.115643,3.352276,2.287328,0.0


##### 3.1.2. Одно значение `AOI`. Ожидается: (3 фичи + 2 shift фичи) x (1 + 2) для `calc_with_aoi=True`, `AOI_1=0` и `AOI_1=1`.

In [15]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
    shift_pk=(['SentenceID'], ['SubjectID']),
    calc_without_aoi=True
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    aoi='AOI_1',
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00, 16.58it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean,sac_acceleration_mean_shift_SentenceID,sac_acceleration_mean_shift_SubjectID,sac_length_AOI_1[1]_min,sac_length_AOI_1[1]_max,sac_acceleration_AOI_1[1]_mean,sac_acceleration_AOI_1[1]_mean_shift_SentenceID,sac_acceleration_AOI_1[1]_mean_shift_SubjectID,sac_length_AOI_1[0]_min,sac_length_AOI_1[0]_max,sac_acceleration_AOI_1[0]_mean,sac_acceleration_AOI_1[0]_mean_shift_SentenceID,sac_acceleration_AOI_1[0]_mean_shift_SubjectID
nnr26,0.000981,1.242229,1.006598,0.12767,0.0,0.009021,1.084462,0.795194,-0.371862,0.0,0.003824,1.020826,1.117584,0.683955,0.0
nnr27,9.8e-05,1.223697,0.661369,-0.568315,-1.110223e-16,0.000882,1.013178,0.626557,-0.492811,0.0,9.8e-05,1.223697,0.753947,-0.164501,0.0
nnr33,0.001667,1.218795,8.554904,7.688316,0.0,0.007746,1.108093,30.87944,29.917985,0.0,0.003334,1.218795,1.314676,0.47418,0.0
nnr36,0.0,1.108976,0.522238,-0.287766,0.0,0.0,0.936991,0.756374,0.003742,0.0,0.001471,1.013276,0.315395,-0.566926,0.0
nnr41,9.8e-05,1.202714,211.992586,211.006405,0.0,9.8e-05,0.986998,742.585476,741.709904,0.0,0.000294,1.115643,3.352276,2.287328,0.0


##### 3.1.3. Три значения `AOI`. Ожидается: (1 фича + 2 shift фичи) x (2 + 3 + 3) = 24 для `AOI_1` $\in [0, 1]$, `AOI_2` $\in [11, 22, 33]$, `AOI_3` $\in [A, B, C]$.


In [16]:
sf = eye_stats.SaccadeFeatures(
    features_stats={'length': ['min']},
    shift_features={'length': ['min']},
    shift_pk=(['SentenceID'], ['SubjectID'])
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    aoi=['AOI_1', 'AOI_2', 'AOI_3'],
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00,  8.00it/s]


Unnamed: 0,sac_length_AOI_1[1]_min,sac_length_AOI_1[1]_min_shift_SentenceID,sac_length_AOI_1[1]_min_shift_SubjectID,sac_length_AOI_1[0]_min,sac_length_AOI_1[0]_min_shift_SentenceID,sac_length_AOI_1[0]_min_shift_SubjectID,sac_length_AOI_2[33]_min,sac_length_AOI_2[33]_min_shift_SentenceID,sac_length_AOI_2[33]_min_shift_SubjectID,sac_length_AOI_2[11]_min,...,sac_length_AOI_2[22]_min_shift_SubjectID,sac_length_AOI_3[B]_min,sac_length_AOI_3[B]_min_shift_SentenceID,sac_length_AOI_3[B]_min_shift_SubjectID,sac_length_AOI_3[C]_min,sac_length_AOI_3[C]_min_shift_SentenceID,sac_length_AOI_3[C]_min_shift_SubjectID,sac_length_AOI_3[A]_min,sac_length_AOI_3[A]_min_shift_SentenceID,sac_length_AOI_3[A]_min_shift_SubjectID
nnr26,0.009021,0.003138,0.0,0.003824,0.000686,0.0,0.000981,-0.00706,0.0,0.0151,...,0.0,0.009903,0.000686,0.0,0.01559,0.009707,0.0,0.008825,0.005687,0.0
nnr27,0.000882,-0.000882,0.0,9.8e-05,-0.004314,0.0,9.8e-05,-0.002255,0.0,0.004608,...,0.0,0.010296,-0.004608,0.0,0.006471,-0.001275,0.0,0.014218,0.008433,0.0
nnr33,0.007746,0.007648,0.0,0.003334,0.000686,0.0,0.003334,0.000686,0.0,0.003138,...,0.0,0.015002,0.000294,0.0,0.003628,-0.001569,0.0,0.009903,0.008825,0.0
nnr36,0.0,-0.001863,0.0,0.001471,0.000686,0.0,0.002255,-0.005393,0.0,0.002157,...,0.0,0.000588,-0.004608,0.0,0.006177,0.005393,0.0,0.000392,-0.016081,0.0
nnr41,9.8e-05,-0.001275,0.0,0.000294,-0.000588,0.0,0.000882,-9.8e-05,0.0,0.000882,...,0.0,0.000882,-0.00657,0.0,0.000294,-0.010099,0.0,0.000882,-0.011178,0.0


##### 3.1.4. Три значения `AOI`. Ожидается: (1 фича + 2 shift фичи) x (2 + 3 + 3 + 1) = 27 для `AOI_1` $\in [0, 1]$, `AOI_2` $\in [11, 22, 33]$, `AOI_3` $\in [A, B, C]$, `calc_without_aoi=True`.


In [17]:
sf = eye_stats.SaccadeFeatures(
    features_stats={'length': ['min']},
    shift_features={'length': ['min']},
    shift_pk=(['SentenceID'], ['SubjectID']),
    calc_without_aoi=True
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    aoi=['AOI_1', 'AOI_2', 'AOI_3'],
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00,  7.11it/s]


Unnamed: 0,sac_length_min,sac_length_min_shift_SentenceID,sac_length_min_shift_SubjectID,sac_length_AOI_1[1]_min,sac_length_AOI_1[1]_min_shift_SentenceID,sac_length_AOI_1[1]_min_shift_SubjectID,sac_length_AOI_1[0]_min,sac_length_AOI_1[0]_min_shift_SentenceID,sac_length_AOI_1[0]_min_shift_SubjectID,sac_length_AOI_2[33]_min,...,sac_length_AOI_2[22]_min_shift_SubjectID,sac_length_AOI_3[B]_min,sac_length_AOI_3[B]_min_shift_SentenceID,sac_length_AOI_3[B]_min_shift_SubjectID,sac_length_AOI_3[C]_min,sac_length_AOI_3[C]_min_shift_SentenceID,sac_length_AOI_3[C]_min_shift_SubjectID,sac_length_AOI_3[A]_min,sac_length_AOI_3[A]_min_shift_SentenceID,sac_length_AOI_3[A]_min_shift_SubjectID
nnr26,0.000981,-0.00049,0.0,0.009021,0.003138,0.0,0.003824,0.000686,0.0,0.000981,...,0.0,0.009903,0.000686,0.0,0.01559,0.009707,0.0,0.008825,0.005687,0.0
nnr27,9.8e-05,-0.000686,0.0,0.000882,-0.000882,0.0,9.8e-05,-0.004314,0.0,9.8e-05,...,0.0,0.010296,-0.004608,0.0,0.006471,-0.001275,0.0,0.014218,0.008433,0.0
nnr33,0.001667,0.001569,0.0,0.007746,0.007648,0.0,0.003334,0.000686,0.0,0.003334,...,0.0,0.015002,0.000294,0.0,0.003628,-0.001569,0.0,0.009903,0.008825,0.0
nnr36,0.0,-0.000784,0.0,0.0,-0.001863,0.0,0.001471,0.000686,0.0,0.002255,...,0.0,0.000588,-0.004608,0.0,0.006177,0.005393,0.0,0.000392,-0.016081,0.0
nnr41,9.8e-05,-0.000588,0.0,9.8e-05,-0.001275,0.0,0.000294,-0.000588,0.0,0.000882,...,0.0,0.000882,-0.00657,0.0,0.000294,-0.010099,0.0,0.000882,-0.011178,0.0


#### Проверка названий фичей

In [18]:
extractor.features[0].feature_names_in_

['sac_length_min',
 'sac_length_min_shift_SentenceID',
 'sac_length_min_shift_SubjectID',
 'sac_length_AOI_1[1]_min',
 'sac_length_AOI_1[1]_min_shift_SentenceID',
 'sac_length_AOI_1[1]_min_shift_SubjectID',
 'sac_length_AOI_1[0]_min',
 'sac_length_AOI_1[0]_min_shift_SentenceID',
 'sac_length_AOI_1[0]_min_shift_SubjectID',
 'sac_length_AOI_2[33]_min',
 'sac_length_AOI_2[33]_min_shift_SentenceID',
 'sac_length_AOI_2[33]_min_shift_SubjectID',
 'sac_length_AOI_2[11]_min',
 'sac_length_AOI_2[11]_min_shift_SentenceID',
 'sac_length_AOI_2[11]_min_shift_SubjectID',
 'sac_length_AOI_2[22]_min',
 'sac_length_AOI_2[22]_min_shift_SentenceID',
 'sac_length_AOI_2[22]_min_shift_SubjectID',
 'sac_length_AOI_3[B]_min',
 'sac_length_AOI_3[B]_min_shift_SentenceID',
 'sac_length_AOI_3[B]_min_shift_SubjectID',
 'sac_length_AOI_3[C]_min',
 'sac_length_AOI_3[C]_min_shift_SentenceID',
 'sac_length_AOI_3[C]_min_shift_SubjectID',
 'sac_length_AOI_3[A]_min',
 'sac_length_AOI_3[A]_min_shift_SentenceID',
 'sac_len