In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
from functools import reduce

In [3]:
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(''))))
from config import *
from features.feature_utils import *

In [4]:
# Experiment name; it is interpolated into the input parquet file names below
# and into the output file name when the features are saved.
experiment = "ANTI_SACCADE"

# Event-level table. The index is replaced by a fresh 0..n-1 range so that
# row labels are unique.
# NOTE(review): PREPROCESSED_DIR presumably comes from `from config import *` - confirm.
df_event = pd.read_parquet(PREPROCESSED_DIR / f"{experiment}_events.pq").reset_index(drop=True)
# Sample-level table, ordered by experiment, participant, trial and time.
df_sample = (pd.read_parquet(PREPROCESSED_DIR / f'{experiment}_samples.pq')
 .sort_values(["experiment", "participant_id", "trial_id","time"])
)

In [None]:
def rename_columns(df):
    """Flatten two-part column labels of ``df`` into plain strings, in place.

    Every column label is read as a pair ``(first, second)``. When ``second``
    is the empty string the new name is just ``first``; otherwise it is
    ``"{first}_{second}"``.

    Args:
        df: DataFrame whose column labels can be indexed with ``[0]`` and
            ``[1]`` (e.g. the tuples of a two-level column index).

    Returns:
        The same DataFrame object, with ``df.columns`` overwritten.
    """
    flat_names = []
    for label in df.columns.values:
        head, tail = label[0], label[1]
        flat_names.append(f"{head}" if tail == '' else f"{head}_{tail}")
    df.columns = flat_names
    return df

# Event features

In [23]:

def get_trial_correctness_df(df:pd.DataFrame) -> pd.DataFrame:
    """Label every stimulus-active event row with saccade and trial correctness.

    Only rows with ``stimulus_active == True`` are kept, ordered by
    participant, trial and time. The following columns are added:

    * ``stimulus_time``: ``time`` of the most recent ``FIXPOINT`` row at or
      before the current row (forward-filled over the sorted rows, not per
      trial).
    * ``saccade_direction``: for ``ESACC`` rows, ``'no_direction'`` when
      ``|end_x - start_x| < 50``, otherwise ``'right'`` / ``'left'`` by the
      sign of ``end_x - start_x``; null for every other row.
    * ``saccade_end_area``: for ``ESACC`` rows, ``'middle'`` when
      ``840 < end_x < 1080``, ``'right'`` when ``end_x >= 1080``, ``'left'``
      when ``end_x <= 840``; null for every other row.
    * ``is_saccade_correct``: True when direction and end area agree and both
      differ from ``side``; False when they agree and both equal ``side``;
      null otherwise (including 'no_direction' and 'middle' saccades).
    * ``is_trial_correct``: the first non-null ``is_saccade_correct`` (in time
      order) of each (participant_id, trial_id), repeated on every row of that
      trial; null when the trial has no such saccade.

    The 50 / 840 / 1080 thresholds are in the units of ``start_x`` / ``end_x``
    (presumably screen pixels - TODO confirm).

    Args:
        df: Event-level dataframe with columns ``stimulus_active``,
            ``participant_id``, ``trial_id``, ``time``, ``event``,
            ``start_x``, ``end_x`` and ``side``.

    Returns:
        pd.DataFrame: The filtered, sorted rows with the columns above added.
    """
    trial_keys = ["participant_id", "trial_id"]
    time_order = trial_keys + ["time"]

    events = df.query('stimulus_active == True').sort_values(by=time_order)

    # Stimulus onset: FIXPOINT rows carry their own time, all later rows
    # inherit it until the next FIXPOINT row.
    onset = np.select(
        [events["event"] == "FIXPOINT", events["event"] != "FIXPOINT"],
        [events["time"], None],
    )
    events = events.assign(stimulus_time=onset)
    events = events.assign(stimulus_time=events["stimulus_time"].ffill())

    # Horizontal direction and landing area of each completed saccade.
    is_saccade = events["event"] == 'ESACC'
    start_x = events["start_x"]
    end_x = events["end_x"]
    direction = np.select(
        [
            is_saccade & (np.abs(end_x - start_x) < 50),
            is_saccade & (end_x > start_x),
            is_saccade & (end_x < start_x),
        ],
        ['no_direction', "right", "left"],
        default=None,
    )
    events = events.assign(saccade_direction=direction)
    end_area = np.select(
        [
            is_saccade & (840 < end_x) & (end_x < 1080),
            is_saccade & (1080 <= end_x),
            is_saccade & (end_x <= 840),
        ],
        ['middle', "right", "left"],
        default=None,
    )
    events = events.assign(saccade_end_area=end_area)

    # A saccade is scored only when direction and landing area agree:
    # away from the stimulus side is correct, towards it is an error.
    moved = events["saccade_direction"]
    landed = events["saccade_end_area"]
    side = events["side"]
    consistent = moved == landed
    saccade_outcome = np.select(
        [
            moved == 'no_direction',
            landed == 'middle',
            consistent & (moved != side) & (landed != side),
            consistent & (moved == side) & (landed == side),
        ],
        [None, None, True, False],
        default=None,
    )
    events = events.assign(is_saccade_correct=saccade_outcome)

    def _first_scored_outcome(outcomes):
        # Outcome of the first saccade of the trial that received a score.
        scored = outcomes.dropna()
        if scored.empty:
            return None
        first = scored.iloc[0]
        if first == True:
            return True
        if first == False:
            return False
        return None

    trial_outcome = (
        events
        .sort_values(by=time_order)
        .groupby(trial_keys)["is_saccade_correct"]
        .transform(_first_scored_outcome)
    )
    # Assignment aligns on the row index, so unique index labels are assumed.
    df_trials = events.assign(is_trial_correct=trial_outcome)
    return df_trials


def get_n_correct_trials_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Counts the correct trials of every participant.

    The event rows are first labelled by ``get_trial_correctness_df``; the
    row-level ``is_trial_correct`` flag is then reduced to one value per
    (experiment, participant_id, trial_id) with ``min`` and summed per
    participant.

    Returns pd.Dataframe with columns ['experiment', 'participant_id', 'n_correct_trials']
    """
    participant_keys = ["experiment", "participant_id"]

    # One row per trial carrying that trial's correctness flag.
    per_trial = (
        get_trial_correctness_df(df)
        .groupby(participant_keys + ["trial_id"])
        .agg(is_trial_correct=('is_trial_correct', 'min'))
        .reset_index()
    )

    # One row per participant: number of trials flagged correct.
    per_participant = (
        per_trial
        .groupby(participant_keys)
        .agg(n_correct_trials=('is_trial_correct', 'sum'))
        .reset_index()
    )

    feature_df = per_participant[participant_keys + ["n_correct_trials"]]
    return feature_df


def get_prop_trials_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Computes the proportion of correct trials of every participant.

    The event rows are first labelled by ``get_trial_correctness_df`` and
    reduced to one ``is_trial_correct`` value per trial. The proportion is the
    sum of that flag divided by its ``count`` per participant; ``count`` only
    counts non-null values, so trials without a correctness value are left out
    of the denominator.

    Returns pd.Dataframe with columns ['experiment', 'participant_id', 'prop_correct_trials']
    """
    participant_keys = ["experiment", "participant_id"]

    # One row per trial carrying that trial's correctness flag.
    per_trial = (
        get_trial_correctness_df(df)
        .groupby(participant_keys + ["trial_id"])
        .agg(is_trial_correct=('is_trial_correct', 'min'))
        .reset_index()
    )

    # Per participant: correct trials and trials that have a flag at all.
    per_participant = (
        per_trial
        .groupby(participant_keys)
        .agg(n_correct_trials=('is_trial_correct', 'sum'),
             n_trials=('is_trial_correct', 'count'))
        .reset_index()
    )
    per_participant = per_participant.assign(
        prop_correct_trials=per_participant["n_correct_trials"] / per_participant["n_trials"]
    )

    feature_df = per_participant[participant_keys + ["prop_correct_trials"]]
    return feature_df

def get_reaction_time_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Computes mean and standard deviation of the reaction time per participant.

    For every trial the rows whose ``is_saccade_correct`` is True are kept and
    collapsed with ``GroupBy.first`` (first non-null value of each column in
    time order). The reaction time of the trial is that row's ``start_time``
    minus its ``stimulus_time``. Trials without a correct saccade do not
    contribute.

    Returns pd.Dataframe with columns ['experiment', 'participant_id',
    'reaction_time_avg', 'reaction_time_std']
    """
    participant_keys = ["experiment", "participant_id"]
    trial_keys = participant_keys + ["trial_id"]

    scored_events = (
        get_trial_correctness_df(df.query('stimulus_active == True'))
        .sort_values(by=["participant_id", "trial_id", "time"])
    )

    # Keep only saccades explicitly scored as correct (False and null drop out).
    correct_saccades = scored_events[scored_events["is_saccade_correct"] == True]

    first_correct = correct_saccades.groupby(trial_keys).first().reset_index()
    first_correct = first_correct.assign(
        reaction_time=first_correct["start_time"] - first_correct["stimulus_time"]
    )

    return (
        first_correct
        .groupby(participant_keys)
        .agg(reaction_time_avg=('reaction_time', 'mean'),
             reaction_time_std=('reaction_time', 'std'))
        .reset_index()
    )
    


         

In [24]:

def get_pre_calculated_metrics_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Averages the pre-calculated event metrics per participant.

    Saccade metrics are averaged over rows with ``event == 'ESACC'`` and
    fixation metrics over rows with ``event == 'EFIX'``. A participant without
    any row of the required event type gets NaN for that feature.

    Args:
        df: Event-level dataframe with columns ``experiment``,
            ``participant_id``, ``event``, ``peak_velocity``, ``amplitude``,
            ``duration`` and ``avg_pupil_size``.

    Returns pd.Dataframe with columns ['experiment', 'participant_id',
    'mean_peak_velocity_sacc', 'mean_amplitude_sacc', 'mean_duration_sacc',
    'mean_duration_fix', 'mean_pupil_size_fix']
    """
    group_keys = ["experiment", "participant_id"]

    # Positional masks: rows of the wrong event type are blanked to NaN before
    # grouping, so the grouped mean skips them. Working positionally keeps the
    # result independent of the row index (duplicated labels are fine).
    saccade_rows = (df["event"] == "ESACC").to_numpy()
    fixation_rows = (df["event"] == "EFIX").to_numpy()

    masked = df[group_keys].assign(
        mean_peak_velocity_sacc=df["peak_velocity"].where(saccade_rows).to_numpy(),
        mean_amplitude_sacc=df["amplitude"].where(saccade_rows).to_numpy(),
        mean_duration_sacc=df["duration"].where(saccade_rows).to_numpy(),
        mean_duration_fix=df["duration"].where(fixation_rows).to_numpy(),
        mean_pupil_size_fix=df["avg_pupil_size"].where(fixation_rows).to_numpy(),
    )

    features_df = masked.groupby(group_keys).mean().reset_index()
    return features_df


# Sample features

In [None]:
def get_acceleration_feature(df: pd.DataFrame, sampling_rate_hz: float = 2000.0) -> pd.DataFrame:
    """Finds acceleration features for anti saccade experiment

    Acceleration is the change of each velocity component between consecutive
    samples of the same (experiment, participant_id, trial_id), divided by the
    sample period ``1 / sampling_rate_hz``. The x and y components are combined
    into one magnitude per eye, which is then summarised per participant. The
    first sample of every trial has no predecessor and yields NaN.

    Args:
        df (pd.DataFrame): Dataframe with raw samples. Needs the columns
            ``experiment``, ``participant_id``, ``trial_id`` and the four
            ``{x,y}_velocity_{left,right}`` columns; rows are expected to be
            in time order within each trial.
        sampling_rate_hz (float): Number of samples per second. Defaults to
            2000, the value previously hard-coded.

    Returns:
        pd.DataFrame: Dataframe with columns ['experiment','participant_id', X_FEATURES]
        where X_FEATURES is a collection of features found by the following cartesian product:
        {'total_acceleration_magnitude_left', 'total_acceleration_magnitude_right'} x {'mean', 'min', 'max', 'median', 'std'},
        named '<magnitude>_<statistic>'.
    """
    logging.info("Extracting acceleration")
    velocity_cols = ['x_velocity_left', 'y_velocity_left', 'x_velocity_right', 'y_velocity_right']
    # Statistics are given by name: passing NumPy callables to ``agg`` is
    # deprecated in pandas, and the names keep the sample standard deviation
    # (ddof=1) that pandas has been computing for ``np.std`` here.
    summary_stats = ["mean", "min", "max", "median", "std"]
    sample_period = 1 / sampling_rate_hz

    # Per-trial difference to the previous sample; same values as subtracting
    # a one-row shift, without joining lagged copies back on the index.
    velocity_change = df.groupby(["experiment", "participant_id", "trial_id"])[velocity_cols].diff()
    per_axis = velocity_change / sample_period

    magnitudes = df[["experiment", "participant_id"]].assign(
        total_acceleration_magnitude_left=np.sqrt(
            np.power(per_axis["x_velocity_left"], 2) + np.power(per_axis["y_velocity_left"], 2)
        ).to_numpy(),
        total_acceleration_magnitude_right=np.sqrt(
            np.power(per_axis["x_velocity_right"], 2) + np.power(per_axis["y_velocity_right"], 2)
        ).to_numpy(),
    )

    acceleration = (magnitudes
    .groupby(["experiment", "participant_id"])
    .agg({'total_acceleration_magnitude_left': summary_stats,
        'total_acceleration_magnitude_right': summary_stats
        })
    .reset_index()
    .pipe(rename_columns)
    )
    return acceleration


# Eye disconjugacy
# Paper: https://www.liebertpub.com/doi/full/10.1089/neu.2014.3687

def get_disconjugacy_feature(df:pd.DataFrame) -> pd.DataFrame:
    """Computes an eye-disconjugacy feature per participant.

    Samples with a missing coordinate in either eye are dropped. Within each
    (experiment, participant_id), in trial/time order, every gaze coordinate
    is smoothed with a rolling mean over up to 5 samples. ``Var_total`` is the
    mean squared left-minus-right difference of the smoothed x coordinates
    plus the same quantity for the y coordinates.

    The smoothing window runs over all samples of a participant, i.e. it is
    not restarted at trial boundaries.

    Args:
        df (pd.DataFrame): Dataframe with raw samples. Needs the columns
            ``experiment``, ``participant_id``, ``trial_id``, ``time``,
            ``x_left``, ``x_right``, ``y_left`` and ``y_right``.

    Returns:
        pd.DataFrame: Dataframe with columns ['experiment', 'participant_id', 'Var_total']
    """
    logging.info("Extracting disconjugacy")
    group_keys = ["experiment", "participant_id"]
    gaze_cols = ["x_left", "x_right", "y_left", "y_right"]

    valid_samples = (df
        .sort_values(["experiment", "participant_id", "trial_id", "time"])
        .dropna(subset=gaze_cols)
    )

    # Grouped rolling mean instead of ``groupby.apply``: ``apply`` operating
    # on the grouping columns is deprecated in pandas, and the later grouping
    # would lose its key columns once they are excluded. The result is
    # indexed by the group keys plus the original row label.
    smoothed = (valid_samples
        .groupby(group_keys)[gaze_cols]
        .rolling(window=5, min_periods=1)
        .mean()
    )

    squared_offsets = smoothed.assign(
        Var_X=(smoothed["x_left"] - smoothed["x_right"]) ** 2,
        Var_Y=(smoothed["y_left"] - smoothed["y_right"]) ** 2,
    )[["Var_X", "Var_Y"]]

    # Mean of the squared offsets per participant (the sum of
    # squared offset / group size computed previously).
    disconjugacy = (squared_offsets
        .groupby(level=group_keys)
        .mean()
        .assign(
            Var_total = lambda x: x["Var_X"] + x["Var_Y"]
        )
        .reset_index()
        [["experiment", "participant_id", "Var_total"]]
    )
    return disconjugacy



# Get all features

In [None]:
def get_anti_saccade_features(df_event: pd.DataFrame, df_sample:pd.DataFrame) -> pd.DataFrame:
    """Runs all anti saccade feature extractions and combines their results

    The event-based extractors (number of correct trials, proportion of
    correct trials, reaction time) are run on ``df_event`` and the
    sample-based extractors (acceleration, disconjugacy) on ``df_sample``.
    The resulting tables are merged one after another on
    ["experiment", "participant_id"] with ``pd.merge``'s default inner join,
    so a participant missing from any single table is absent from the result.

    Args:
        df_event (pd.DataFrame): The preprocessed event-level dataframe
        df_sample (pd.DataFrame): The preprocessed sample-level dataframe

    Returns:
        pd.DataFrame: Dataframe with columns ["experiment", "participant_id", X_FEATURES], where X_FEATURES is a collection of features
    """
    merge_keys = ["experiment", "participant_id"]
    event_extractors = (get_n_correct_trials_feature, get_prop_trials_feature, get_reaction_time_feature)
    sample_extractors = (get_acceleration_feature, get_disconjugacy_feature)

    feature_tables = []
    for extract in event_extractors:
        feature_tables.append(extract(df=df_event))
    for extract in sample_extractors:
        feature_tables.append(extract(df=df_sample))

    # Fold the tables together left to right.
    df_features = feature_tables[0]
    for table in feature_tables[1:]:
        df_features = pd.merge(df_features, table, on = merge_keys)

    return df_features


# Build the per-participant feature table from the loaded event and sample data.
features = get_anti_saccade_features(df_event=df_event, df_sample=df_sample)
    
    

  .apply(lambda group: group.assign(
  .apply(lambda group: group.assign(


# Save

In [None]:
# Write the feature table to "<experiment>_features.pq" inside FEATURES_DIR.
# NOTE(review): FEATURES_DIR presumably comes from `from config import *` - confirm.
features.to_parquet(FEATURES_DIR / f"{experiment}_features.pq")


In [7]:
# Display the loaded sample-level table for inspection.
df_sample

Unnamed: 0,experiment,participant_id,trial_id,time,x_left,y_left,pupil_size_left,x_velocity_left,y_velocity_left,x_right,y_right,pupil_size_right,x_velocity_right,y_velocity_right,x_resolution,y_resolution,error_message
0,ANTI_SACCADE,106,0,0,1014.2,585.5,2000.0,,,1004.1,541.1,1938.0,,,54.7,55.9,.....
1,ANTI_SACCADE,106,0,0,1014.8,585.5,1998.0,,,1004.8,539.6,1935.0,,,54.7,55.9,.....
2,ANTI_SACCADE,106,0,1,1014.9,585.6,1997.0,,,1005.3,538.2,1935.0,,,54.7,55.9,.....
3,ANTI_SACCADE,106,0,1,1015.0,585.7,2006.0,,,1004.9,538.2,1942.0,,,54.7,55.9,.....
4,ANTI_SACCADE,106,0,2,1015.4,584.5,2016.0,,,1005.6,538.4,1948.0,,,54.7,55.9,.....
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17109089,ANTI_SACCADE,404,15,2610,1970.3,20.5,1040.0,,,,,0.0,,,59.6,57.0,...C.
17109090,ANTI_SACCADE,404,15,2610,1970.4,18.8,1041.0,,,,,0.0,,,59.6,57.0,...C.
17109091,ANTI_SACCADE,404,15,2611,1970.6,17.5,1039.0,,,,,0.0,,,59.6,57.0,...C.
17109092,ANTI_SACCADE,404,15,2611,1970.2,18.7,1037.0,,,,,0.0,,,59.6,57.0,...C.


In [8]:
# Spot-check the acceleration features on a single participant (id 106).
# NOTE(review): the recorded log line names feature_utils.get_acceleration_feature and the
# recorded columns have no _left/_right suffix, so this output appears to come from the
# imported implementation rather than the definition in this notebook - confirm and re-run.
get_acceleration_feature(df_sample[df_sample["participant_id"]==106])

2025-04-28 17:54:27,703 - INFO - feature_utils.get_acceleration_feature:277 - Extracting acceleration


Unnamed: 0,experiment,participant_id,total_acceleration_magnitude_mean,total_acceleration_magnitude_min,total_acceleration_magnitude_max,total_acceleration_magnitude_median,total_acceleration_magnitude_std,x_acceleration_mean,x_acceleration_min,x_acceleration_max,x_acceleration_median,x_acceleration_std,y_acceleration_mean,y_acceleration_min,y_acceleration_max,y_acceleration_median,y_acceleration_std
0,ANTI_SACCADE,106,0.363455,0.363455,0.363455,0.363455,0.0,-0.312808,-0.312808,-0.312808,-0.312808,0.0,-0.18507,-0.18507,-0.18507,-0.18507,0.0
