In [1]:
import os
from os.path import join

import numpy as np
import pandas as pd

In [2]:
DATA_PATH = join("..", "data", "Dyslexia_1_ready_data_fixations.csv")

In [3]:
def read_dataset(path: str = DATA_PATH):
    df = pd.read_csv(path, index_col="Unnamed: 0")
    
    df["SentenceID"] = df["Sentence_ID"]
    df["x"] = df["FIX_X"].apply(lambda x: float(x.replace(",", ".")))
    df["y"] = df["FIX_Y"].apply(lambda x: float(x.replace(",", ".")))
    df.x /= df.x.max()
    df.y /= df.y.max()
    df["duration"] = df["FIX_DURATION"].apply(lambda x: float(x) / 1000)
    df["timestamp"] = df.duration.cumsum()
    df["AOI_1"] = np.random.choice([0, 1], size=len(df))
    df["AOI_2"] = np.random.choice([11, 22, 33], size=len(df))
    df["AOI_3"] = np.random.choice(['A', 'B', 'C'], size=len(df))
    
    df.drop(["Word_Number", "IQ", "FIX_X", "FIX_Y", "FIX_DURATION", "Age", "Sentence_ID"], axis=1, inplace=True)
    return df

In [4]:
df = read_dataset()
df.head()

Unnamed: 0,SubjectID,Group,Sex,SentenceID,x,y,duration,timestamp,AOI_1,AOI_2,AOI_3
0,nnr8,3,fem,27,0.101158,0.726088,0.345,0.345,1,11,B
1,nnr8,3,fem,27,0.102406,0.754083,0.221,0.566,1,11,C
2,nnr8,3,fem,27,0.126049,0.755455,0.204,0.77,0,11,C
3,nnr8,3,fem,27,0.142134,0.761356,0.289,1.059,0,22,A
4,nnr8,3,fem,27,0.109963,0.766433,0.6,1.659,1,33,B


### 0. `Extractor` над нормальным датасетом.

In [5]:
import eyefeatures.features.stats as eye_stats
from eyefeatures.features.extractor import Extractor

sac_feats_stats = {
    'length': ['min', 'max'],
    'acceleration': ['mean']
}

sac_feats_stats_shift = {
    'acceleration': ['mean']
}

sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
    shift_pk=['SentenceID']
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    pk=['SubjectID', 'SentenceID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00,  2.44it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean,sac_acceleration_mean_shift_SentenceID
nnr26_4,0.007354,0.129626,0.66291,-0.147094
nnr26_5,0.018238,0.114133,0.631904,-0.006173
nnr26_6,0.020591,0.157276,0.785294,-0.080868
nnr26_7,0.021866,0.164827,0.522326,-0.232872
nnr26_8,0.031671,0.127567,0.598728,-0.096059


### 1. Обработка `NaN`. Ожидается: читать changes.md.

In [6]:
dfn = df.copy()
dfn.loc[0, "x"] = None
dfn.loc[0, "y"] = np.nan
extractor.fit_transform(dfn).head()

  extractor.fit_transform(dfn).head()
100%|██████████| 1/1 [00:00<00:00,  2.25it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean,sac_acceleration_mean_shift_SentenceID
nnr26_4,0.007354,0.129626,0.66291,-0.147094
nnr26_5,0.018238,0.114133,0.631904,-0.006173
nnr26_6,0.020591,0.157276,0.785294,-0.080868
nnr26_7,0.021866,0.164827,0.522326,-0.232872
nnr26_8,0.031671,0.127567,0.598728,-0.096059


In [7]:
dfn.loc[0, "SubjectID"] = None
try:
    extractor.fit_transform(dfn).head()
except ValueError as e:
    print(e)
    print("OK")

Found missing values in pk.
OK


### 2. Несколько `shift_pk` и проверка, что можно не подмножество `pk`.

#### 2.0. Дефолтные случаи.

##### 2.0.0. Дефолтный случай `pk=None`, `shift_features=None`. Ожидается: одна строка и 3 фичи.

In [8]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    # shift_features=sac_feats_stats_shift,
    # shift_pk=['SentenceID']
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    # pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00, 528.25it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean
0,0.0,1.255172,14.24703


##### 2.0.1. Дефолтный случай `pk=...`, `shift_features=None`. Ожидается: 3 фичи.

In [9]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    # shift_features=sac_feats_stats_shift,
    # shift_pk=['SentenceID']
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    pk=['SubjectID', 'SentenceID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00,  2.75it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean
nnr26_4,0.007354,0.129626,0.66291
nnr26_5,0.018238,0.114133,0.631904
nnr26_6,0.020591,0.157276,0.785294
nnr26_7,0.021866,0.164827,0.522326
nnr26_8,0.031671,0.127567,0.598728


#### 2.1. Можно `shift_pk` не подмножество `shift_features`.

##### 2.1.1. Можно `shift_pk` не подмножество `shift_features`. Ожидается: 3 фичи + 1 shift фича.

In [10]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
    shift_pk=['SentenceID']
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00, 28.37it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean,sac_acceleration_mean_shift_SentenceID
nnr26,0.000981,1.242229,1.006598,-2.224055
nnr27,9.8e-05,1.223697,0.661369,-1.484686
nnr33,0.001667,1.218795,8.554904,6.793463
nnr36,0.0,1.108976,0.522238,-2.474636
nnr41,9.8e-05,1.202714,211.992586,208.564133


#### 2.2. Комбинации `shift_pk` и `shift_features`.

##### 2.2.1. Можно несколько `shift_pk` на один `shift_features`. Ожидается: 3 фичи + 3 shift фичи.

In [11]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
    shift_pk=(['SentenceID', 'SubjectID'], ['SentenceID'], ['SubjectID'])
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00, 12.20it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean,sac_acceleration_mean_shift_SentenceID_SubjectID,sac_acceleration_mean_shift_SentenceID,sac_acceleration_mean_shift_SubjectID
nnr26,0.000981,1.242229,1.006598,0.246305,-2.224055,0.0
nnr27,9.8e-05,1.223697,0.661369,0.225492,-1.484686,-1.110223e-16
nnr33,0.001667,1.218795,8.554904,7.992622,6.793463,0.0
nnr36,0.0,1.108976,0.522238,0.142837,-2.474636,0.0
nnr41,9.8e-05,1.202714,211.992586,210.547696,208.564133,0.0


##### 2.2.2. Можно несколько `shift_pk` и несколько `shift_features`. Ожидается: 3 фичи + 3 shift фичи.

In [12]:
sf = eye_stats.SaccadeFeatures(
    features_stats={'length': ['min', 'max', 'median']},
    shift_features=({'length': ['min']}, {'length': ['max', 'median']}),
    shift_pk=(['SentenceID', 'SubjectID'], ['SentenceID'])
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00, 20.12it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_length_median,sac_length_min_shift_SentenceID_SubjectID,sac_length_max_shift_SentenceID,sac_length_median_shift_SentenceID
nnr26,0.000981,1.242229,0.058783,-0.015779,0.259375,0.008595
nnr27,9.8e-05,1.223697,0.051674,-0.012229,0.234035,0.001828
nnr33,0.001667,1.218795,0.050595,-0.012024,0.227801,0.000759
nnr36,0.0,1.108976,0.031622,-0.005636,0.125408,-0.01775
nnr41,9.8e-05,1.202714,0.065695,-0.01035,0.224268,0.015378


### 3. Можно несколько разных `AOI` колонок.

#### 3.0. Дефолтные случаи.

##### 3.0.1. `aoi=None`. Ожидается: 3 фичи + 2 shift фичи.

In [13]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
    shift_pk=(['SentenceID'], ['SubjectID'])
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    aoi=None,
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00, 18.80it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean,sac_acceleration_mean_shift_SentenceID,sac_acceleration_mean_shift_SubjectID
nnr26,0.000981,1.242229,1.006598,-2.224055,0.0
nnr27,9.8e-05,1.223697,0.661369,-1.484686,-1.110223e-16
nnr33,0.001667,1.218795,8.554904,6.793463,0.0
nnr36,0.0,1.108976,0.522238,-2.474636,0.0
nnr41,9.8e-05,1.202714,211.992586,208.564133,0.0


#### 3.1. Одно и более значений `AOI` + `calc_without_aoi`.

##### 3.1.1. Одно значение `AOI`. Ожидается: (3 фичи + 2 shift фичи) x 2 для `AOI_1=0` и `AOI_1=1`.

In [14]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
    shift_pk=(['SentenceID'], ['SubjectID'])
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    aoi='AOI_1',
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00,  7.93it/s]


Unnamed: 0,sac_length_AOI_1[1]_min,sac_length_AOI_1[1]_max,sac_acceleration_AOI_1[1]_mean,sac_acceleration_AOI_1[1]_mean_shift_SentenceID,sac_acceleration_AOI_1[1]_mean_shift_SubjectID,sac_length_AOI_1[0]_min,sac_length_AOI_1[0]_max,sac_acceleration_AOI_1[0]_mean,sac_acceleration_AOI_1[0]_mean_shift_SentenceID,sac_acceleration_AOI_1[0]_mean_shift_SubjectID
nnr26,0.007354,0.97582,0.755757,-0.077476,0.0,0.003824,1.084462,0.966335,0.065272,0.0
nnr27,0.006668,1.223697,0.664412,-0.145907,0.0,0.003726,0.99494,0.661719,-0.217515,0.0
nnr33,0.003138,0.89022,29.68103,28.878591,0.0,0.003334,0.93101,1.56436,0.641526,0.0
nnr36,0.000392,0.996313,0.338206,-0.462028,0.0,0.0,1.013276,0.503107,-0.355475,0.0
nnr41,0.000294,1.115643,2.166084,1.321617,0.0,9.8e-05,0.976506,17.153674,16.254921,0.0


##### 3.1.2. Одно значение `AOI`. Ожидается: (3 фичи + 2 shift фичи) x (1 + 2) для `calc_with_aoi=True`, `AOI_1=0` и `AOI_1=1`.

In [15]:
sf = eye_stats.SaccadeFeatures(
    features_stats=sac_feats_stats,
    shift_features=sac_feats_stats_shift,
    shift_pk=(['SentenceID'], ['SubjectID']),
    calc_without_aoi=True
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    aoi='AOI_1',
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00,  6.16it/s]


Unnamed: 0,sac_length_min,sac_length_max,sac_acceleration_mean,sac_acceleration_mean_shift_SentenceID,sac_acceleration_mean_shift_SubjectID,sac_length_AOI_1[1]_min,sac_length_AOI_1[1]_max,sac_acceleration_AOI_1[1]_mean,sac_acceleration_AOI_1[1]_mean_shift_SentenceID,sac_acceleration_AOI_1[1]_mean_shift_SubjectID,sac_length_AOI_1[0]_min,sac_length_AOI_1[0]_max,sac_acceleration_AOI_1[0]_mean,sac_acceleration_AOI_1[0]_mean_shift_SentenceID,sac_acceleration_AOI_1[0]_mean_shift_SubjectID
nnr26,0.000981,1.242229,1.006598,-2.224055,0.0,0.007354,0.97582,0.755757,-0.077476,0.0,0.003824,1.084462,0.966335,0.065272,0.0
nnr27,9.8e-05,1.223697,0.661369,-1.484686,-1.110223e-16,0.006668,1.223697,0.664412,-0.145907,0.0,0.003726,0.99494,0.661719,-0.217515,0.0
nnr33,0.001667,1.218795,8.554904,6.793463,0.0,0.003138,0.89022,29.68103,28.878591,0.0,0.003334,0.93101,1.56436,0.641526,0.0
nnr36,0.0,1.108976,0.522238,-2.474636,0.0,0.000392,0.996313,0.338206,-0.462028,0.0,0.0,1.013276,0.503107,-0.355475,0.0
nnr41,9.8e-05,1.202714,211.992586,208.564133,0.0,0.000294,1.115643,2.166084,1.321617,0.0,9.8e-05,0.976506,17.153674,16.254921,0.0


##### 3.1.3. Три значения `AOI`. Ожидается: (1 фича + 2 shift фичи) x (2 + 3 + 3) = 24 для `AOI_1` $\in [0, 1]$, `AOI_2` $\in [11, 22, 33]$, `AOI_3` $\in [A, B, C]$.


In [16]:
sf = eye_stats.SaccadeFeatures(
    features_stats={'length': ['min']},
    shift_features={'length': ['min']},
    shift_pk=(['SentenceID'], ['SubjectID'])
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    aoi=['AOI_1', 'AOI_2', 'AOI_3'],
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00,  2.51it/s]


Unnamed: 0,sac_length_AOI_1[1]_min,sac_length_AOI_1[1]_min_shift_SentenceID,sac_length_AOI_1[1]_min_shift_SubjectID,sac_length_AOI_1[0]_min,sac_length_AOI_1[0]_min_shift_SentenceID,sac_length_AOI_1[0]_min_shift_SubjectID,sac_length_AOI_2[11]_min,sac_length_AOI_2[11]_min_shift_SentenceID,sac_length_AOI_2[11]_min_shift_SubjectID,sac_length_AOI_2[22]_min,...,sac_length_AOI_2[33]_min_shift_SubjectID,sac_length_AOI_3[B]_min,sac_length_AOI_3[B]_min_shift_SentenceID,sac_length_AOI_3[B]_min_shift_SubjectID,sac_length_AOI_3[C]_min,sac_length_AOI_3[C]_min_shift_SentenceID,sac_length_AOI_3[C]_min_shift_SubjectID,sac_length_AOI_3[A]_min,sac_length_AOI_3[A]_min_shift_SentenceID,sac_length_AOI_3[A]_min_shift_SubjectID
nnr26,0.007354,0.003708,0.0,0.003824,0.000684,0.0,0.007354,-0.001044,0.0,0.009021,...,0.0,0.009707,0.001911,0.0,0.020199,0.013485,0.0,0.008825,0.00383,0.0
nnr27,0.006668,0.002867,0.0,0.003726,0.000309,0.0,0.012649,0.004437,0.0,0.012747,...,0.0,0.006864,-0.000724,0.0,0.003726,-0.002898,0.0,0.004608,-0.000427,0.0
nnr33,0.003138,-0.000435,0.0,0.003334,5e-06,0.0,0.007746,0.000102,0.0,0.001667,...,0.0,0.005589,-0.002267,0.0,0.003334,-0.003114,0.0,0.002745,-0.002134,0.0
nnr36,0.000392,-0.003506,0.0,0.0,-0.003162,0.0,0.001275,-0.007186,0.0,0.004216,...,0.0,0.000784,-0.006466,0.0,0.000392,-0.005543,0.0,0.000294,-0.004878,0.0
nnr41,0.000294,-0.00347,0.0,9.8e-05,-0.003204,0.0,0.00304,-0.005132,0.0,0.007158,...,0.0,0.000686,-0.006878,0.0,0.009707,0.00299,0.0,0.000882,-0.004309,0.0


##### 3.1.4. Три значения `AOI`. Ожидается: (1 фича + 2 shift фичи) x (2 + 3 + 3 + 1) = 27 для `AOI_1` $\in [0, 1]$, `AOI_2` $\in [11, 22, 33]$, `AOI_3` $\in [A, B, C]$, `calc_without_aoi=True`.


In [17]:
sf = eye_stats.SaccadeFeatures(
    features_stats={'length': ['min']},
    shift_features={'length': ['min']},
    shift_pk=(['SentenceID'], ['SubjectID']),
    calc_without_aoi=True
)

extractor = Extractor(
    features=[sf],
    x='x',
    y='x',
    t='timestamp',
    duration='duration',
    # dispersion='dispersion',
    # path_pk=['SubjectID'],
    aoi=['AOI_1', 'AOI_2', 'AOI_3'],
    pk=['SubjectID'],
    return_df=True
)

extractor.fit_transform(df).head()

100%|██████████| 1/1 [00:00<00:00,  2.16it/s]


Unnamed: 0,sac_length_min,sac_length_min_shift_SentenceID,sac_length_min_shift_SubjectID,sac_length_AOI_1[1]_min,sac_length_AOI_1[1]_min_shift_SentenceID,sac_length_AOI_1[1]_min_shift_SubjectID,sac_length_AOI_1[0]_min,sac_length_AOI_1[0]_min_shift_SentenceID,sac_length_AOI_1[0]_min_shift_SubjectID,sac_length_AOI_2[11]_min,...,sac_length_AOI_2[33]_min_shift_SubjectID,sac_length_AOI_3[B]_min,sac_length_AOI_3[B]_min_shift_SentenceID,sac_length_AOI_3[B]_min_shift_SubjectID,sac_length_AOI_3[C]_min,sac_length_AOI_3[C]_min_shift_SentenceID,sac_length_AOI_3[C]_min_shift_SubjectID,sac_length_AOI_3[A]_min,sac_length_AOI_3[A]_min_shift_SentenceID,sac_length_AOI_3[A]_min_shift_SubjectID
nnr26,0.000981,-8.3e-05,0.0,0.007354,0.003708,0.0,0.003824,0.000684,0.0,0.007354,...,0.0,0.009707,0.001911,0.0,0.020199,0.013485,0.0,0.008825,0.00383,0.0
nnr27,9.8e-05,-0.001024,0.0,0.006668,0.002867,0.0,0.003726,0.000309,0.0,0.012649,...,0.0,0.006864,-0.000724,0.0,0.003726,-0.002898,0.0,0.004608,-0.000427,0.0
nnr33,0.001667,0.000617,0.0,0.003138,-0.000435,0.0,0.003334,5e-06,0.0,0.007746,...,0.0,0.005589,-0.002267,0.0,0.003334,-0.003114,0.0,0.002745,-0.002134,0.0
nnr36,0.0,-0.00102,0.0,0.000392,-0.003506,0.0,0.0,-0.003162,0.0,0.001275,...,0.0,0.000784,-0.006466,0.0,0.000392,-0.005543,0.0,0.000294,-0.004878,0.0
nnr41,9.8e-05,-0.001018,0.0,0.000294,-0.00347,0.0,9.8e-05,-0.003204,0.0,0.00304,...,0.0,0.000686,-0.006878,0.0,0.009707,0.00299,0.0,0.000882,-0.004309,0.0


#### Проверка названий фичей

In [18]:
extractor.features[0].feature_names_in_

['sac_length_min',
 'sac_length_min_shift_SentenceID',
 'sac_length_min_shift_SubjectID',
 'sac_length_AOI_1[1]_min',
 'sac_length_AOI_1[1]_min_shift_SentenceID',
 'sac_length_AOI_1[1]_min_shift_SubjectID',
 'sac_length_AOI_1[0]_min',
 'sac_length_AOI_1[0]_min_shift_SentenceID',
 'sac_length_AOI_1[0]_min_shift_SubjectID',
 'sac_length_AOI_2[11]_min',
 'sac_length_AOI_2[11]_min_shift_SentenceID',
 'sac_length_AOI_2[11]_min_shift_SubjectID',
 'sac_length_AOI_2[22]_min',
 'sac_length_AOI_2[22]_min_shift_SentenceID',
 'sac_length_AOI_2[22]_min_shift_SubjectID',
 'sac_length_AOI_2[33]_min',
 'sac_length_AOI_2[33]_min_shift_SentenceID',
 'sac_length_AOI_2[33]_min_shift_SubjectID',
 'sac_length_AOI_3[B]_min',
 'sac_length_AOI_3[B]_min_shift_SentenceID',
 'sac_length_AOI_3[B]_min_shift_SubjectID',
 'sac_length_AOI_3[C]_min',
 'sac_length_AOI_3[C]_min_shift_SentenceID',
 'sac_length_AOI_3[C]_min_shift_SubjectID',
 'sac_length_AOI_3[A]_min',
 'sac_length_AOI_3[A]_min_shift_SentenceID',
 'sac_len