# Evil Bastard Features

In [1]:
import pandas as pd
import numpy as np

from config import *
from features.feature_utils import *

In [16]:
# Name of the experiment whose preprocessed parquet files this notebook reads.
experiment = "EVIL_BASTARD"

# Only the id column is loaded from the samples file; it is then reduced to
# the array of distinct participant ids.
participant_ids = pd.read_parquet(
    f"{PREPROCESSED_DIR}/{experiment}_samples.pq",
    columns=["participant_id"],
)["participant_id"].unique()

In [15]:
def get_samples_df(participant_id):
    """Load the preprocessed sample rows of a single participant.

    The file read is ``{PREPROCESSED_DIR}/{experiment}_samples.pq``, where
    ``experiment`` is the notebook-level variable; it is looked up when the
    function is called, not when it is defined.

    Args:
        participant_id: Value compared for equality against the
            ``participant_id`` column via a parquet read filter.

    Returns:
        pd.DataFrame: The rows of the samples file that pass the filter.
    """
    samples_path = f"{PREPROCESSED_DIR}/{experiment}_samples.pq"
    participant_filter = [('participant_id', '=', participant_id)]
    return pd.read_parquet(samples_path, filters=participant_filter)
    
def get_events_df(participant_id):
    """Load the preprocessed event rows of a single participant.

    The file read is ``{PREPROCESSED_DIR}/{experiment}_events.pq``, where
    ``experiment`` is the notebook-level variable; it is looked up when the
    function is called, not when it is defined.

    Args:
        participant_id: Value compared for equality against the
            ``participant_id`` column via a parquet read filter.

    Returns:
        pd.DataFrame: The rows of the events file that pass the filter.
    """
    events_path = f"{PREPROCESSED_DIR}/{experiment}_events.pq"
    participant_filter = [('participant_id', '=', participant_id)]
    return pd.read_parquet(events_path, filters=participant_filter)

In [17]:
# Pick one participant and load both of their tables for interactive inspection.
participant_id = 147

# Samples are read first, then events (tuple elements evaluate left to right).
df_sample, df_event = get_samples_df(participant_id), get_events_df(participant_id)

In [None]:
def combine_samples_events(
    df_sample: pd.DataFrame,
    df_event: pd.DataFrame,
    tolerance: int = 10,
    direction: str = "nearest",
) -> pd.DataFrame:
    """Attach fixpoint (stimulus) information from the events to the samples.

    Rows of ``df_event`` whose ``event`` equals ``"FIXPOINT"`` are matched to
    sample rows with ``pd.merge_asof`` on ``time``, separately for every
    ``(participant_id, trial_id)`` pair. With the defaults, a sample receives
    the fixpoint that is *nearest* in time (before or after the sample), and
    only if that fixpoint lies within ``tolerance`` units of ``time``; every
    other sample gets missing values in the added columns.

    Neither input frame is modified.

    Args:
        df_sample (pd.DataFrame): Preprocessed sample data. Must contain
            ``participant_id``, ``trial_id`` and ``time``.
        df_event (pd.DataFrame): Preprocessed event data. Must contain
            ``participant_id``, ``trial_id``, ``time``, ``event``, ``colour``,
            ``stimulus_x`` and ``stimulus_y``.
        tolerance (int): Largest allowed gap between a sample and the
            fixpoint matched to it, in the units of the ``time`` column
            (units are not visible here -- TODO confirm). Defaults to 10.
        direction (str): ``"nearest"``, ``"backward"`` or ``"forward"``,
            forwarded to ``pd.merge_asof``. Defaults to ``"nearest"``.

    Returns:
        pd.DataFrame: The samples ordered by ``time`` with the added columns
        ``event``, ``fixpoint``, ``stimulus_x`` and ``stimulus_y``.
        ``fixpoint`` holds ``"red"``, ``"green"``, ``"blue"`` or ``"white"``
        (translated from the ``RED``/``GREEN``/``BLUE``/``WHITE`` constants)
        and is missing for unmatched samples or unknown colour values.
    """
    fixpoint_columns = ["participant_id", "trial_id", "time", "event", "colour", "stimulus_x", "stimulus_y"]
    # merge_asof requires both frames to be sorted by the ``on`` key.
    sort_keys = ["time", "trial_id", "participant_id"]

    # Keep only the fixpoint events; ``colour`` is renamed so that it arrives
    # in the merged frame under its final name ``fixpoint``.
    # ``.loc``/``rename``/``sort_values`` each return a new frame, so the
    # caller's data is left untouched without explicit copies.
    df_fixpoints = (
        df_event.loc[df_event["event"] == "FIXPOINT", fixpoint_columns]
        .rename(columns={"colour": "fixpoint"})
        .sort_values(sort_keys)
    )
    df_sorted_samples = df_sample.sort_values(sort_keys)

    # As-of join: one fixpoint per sample, chosen according to ``direction``
    # and discarded when it is further away than ``tolerance``.
    df_combined = pd.merge_asof(
        df_sorted_samples,
        df_fixpoints,
        on="time",
        by=["participant_id", "trial_id"],
        direction=direction,
        tolerance=tolerance,
    )

    # Translate the raw colour constants into readable labels.
    df_combined["fixpoint"] = df_combined["fixpoint"].map(
        {RED: "red", GREEN: "green", BLUE: "blue", WHITE: "white"}
    )

    return df_combined

In [22]:
def evil_bastard_get_distance_to_stimulus_features(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise how far the gaze is from the stimulus, per participant.

    For every row the Euclidean distance between each eye's gaze position and
    the stimulus position is computed. The per-row ``distance_to_fixpoint`` is
    the average of the two eyes when both are available, the available eye
    when only one is, and missing when neither is. The per-row values are then
    aggregated for each ``(experiment, participant_id)`` group.

    Args:
        df (pd.DataFrame): Samples combined with fixpoint information. Must
            contain ``experiment``, ``participant_id``, ``x_left``,
            ``y_left``, ``x_right``, ``y_right``, ``stimulus_x`` and
            ``stimulus_y``.

    Returns:
        pd.DataFrame: One row per ``(experiment, participant_id)`` with the
        mean, min, max, median and std of ``distance_to_fixpoint``, passed
        through ``rename_columns`` (presumably to flatten the two-level
        column names -- verify against that helper).
    """
    eye_distance_columns = ["distance_to_fixpoint_left", "distance_to_fixpoint_right"]

    features = (df
        .assign(
            # np.hypot gives sqrt(dx**2 + dy**2), i.e. an actual distance in
            # the coordinate units rather than a squared distance.
            distance_to_fixpoint_left = lambda x: np.hypot(x["x_left"]-x["stimulus_x"], x["y_left"]-x["stimulus_y"]),
            distance_to_fixpoint_right = lambda x: np.hypot(x["x_right"]-x["stimulus_x"], x["y_right"]-x["stimulus_y"])
        )
        .assign(
            # Row-wise mean skips missing values: both eyes -> their average,
            # one eye -> that eye, no eye -> NaN.
            distance_to_fixpoint = lambda x: x[eye_distance_columns].mean(axis=1)
        )
        .groupby(["experiment", "participant_id"])
        .agg({
            'distance_to_fixpoint': ["mean", "min", "max", "median", "std"],
        })
        .reset_index()
        .pipe(rename_columns)
    )

    return features

In [28]:
def get_distance_between_fixations(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the distance between consecutive fixations, per participant.

    Only rows whose ``event`` equals ``'EFIX'`` are used. Within every
    ``(experiment, participant_id, trial_id, eye)`` group each fixation is
    paired with the previous row of that group, and the Euclidean distance
    between their ``(x, y)`` positions is computed; the first fixation of a
    group has no predecessor and contributes a missing value.

    "Previous" follows the row order of ``df``; the frame is not re-sorted
    here (assumes the events arrive in chronological order -- TODO confirm).

    Args:
        df (pd.DataFrame): Preprocessed events. Must contain ``experiment``,
            ``participant_id``, ``trial_id``, ``eye``, ``event``, ``x`` and
            ``y``.

    Returns:
        pd.DataFrame: One row per ``(experiment, participant_id)`` with the
        mean and standard deviation of ``fixation_distance``, passed through
        ``rename_columns`` (presumably to flatten the two-level column
        names -- verify against that helper).
    """
    # Filter once and reuse the result for both the rows and their lagged copy.
    df_fixations = df.query("event == 'EFIX'")

    # Position of the preceding fixation in the same trial and eye.
    df_previous = (df_fixations
        .groupby(["experiment", "participant_id", "trial_id", "eye"])[['x','y']].shift(1)
        .rename(columns={"x": "x_lagged",
            "y": "y_lagged"}))

    df_features = (df_fixations
        .join(df_previous)
        .assign(
            x_fixation_dist = lambda x: x["x"] - x["x_lagged"],
            y_fixation_dist = lambda x: x["y"] - x["y_lagged"])
        .assign(
            fixation_distance = lambda x: np.hypot(x["x_fixation_dist"], x["y_fixation_dist"])
        )
        .groupby(["experiment", "participant_id"])
        # String names select pandas' own mean/std (sample std, ddof=1).
        # Passing np.mean/np.std relies on a deprecated alias whose removal
        # would switch the std to numpy's population formula (ddof=0).
        .agg({'fixation_distance': ["mean", "std"],
        })
        .reset_index()
        .pipe(rename_columns)
    )
    return df_features

In [77]:
def get_evil_bastard_features() -> pd.DataFrame:
    """Run every Evil Bastard feature extraction for every participant.

    Participants are processed one at a time: their events and samples are
    read from ``PREPROCESSED_DIR`` with a parquet filter, the event-, sample-
    and combined-data feature functions are applied, and the resulting frames
    are inner-merged on ``["experiment", "participant_id"]``. The
    per-participant frames are concatenated at the end.

    Returns:
        pd.DataFrame: Dataframe with columns ["experiment", "participant_id", X_FEATURES],
        where X_FEATURES is the collection of features produced by the
        feature functions listed in the body.

    Raises:
        ValueError: Raised by ``pd.concat`` when the events file contains no
            participants, because there is nothing to concatenate.
    """
    experiment = "EVIL_BASTARD"

    logging.info("Extracting evil bastard features")

    events_path = PREPROCESSED_DIR / f"{experiment}_events.pq"
    samples_path = PREPROCESSED_DIR / f"{experiment}_samples.pq"

    # Read only the participant id column to identify the unique participants
    df_index = pd.read_parquet(events_path, columns=["participant_id"])
    participant_groups = df_index["participant_id"].unique()

    # The feature functions are the same for every participant, so the lists
    # are built once; each function is called as f(df=...).
    event_feature_functions = [get_pre_calculated_metrics_feature, get_distance_between_fixations]
    sample_feature_functions = [get_acceleration_feature, get_disconjugacy_feature]
    combined_feature_functions = [evil_bastard_get_distance_to_stimulus_features]

    df_features_all_participants = []
    for participant_id in tqdm(participant_groups, total=len(participant_groups)):
        logging.info(f"Processing participant {participant_id}")

        # Load one participant at a time instead of the whole files.
        filters = [('participant_id', '=', participant_id)]
        df_event = pd.read_parquet(events_path, filters=filters)
        df_sample = (pd.read_parquet(samples_path, filters=filters)
            .sort_values(["experiment", "participant_id", "trial_id", "time"])
        )
        df_combined = combine_samples_events(df_sample, df_event)

        logging.info("Starting event feature extraction")
        df_event_features_list = [f(df=df_event) for f in event_feature_functions]

        logging.info("Starting sample feature extraction")
        df_sample_features_list = [f(df=df_sample) for f in sample_feature_functions]

        logging.info("Starting combined feature extraction")
        df_combined_features_list = [f(df=df_combined) for f in combined_feature_functions]

        df_features_par_list = df_event_features_list + df_sample_features_list + df_combined_features_list

        # Inner merge: the participant's row survives only if every feature
        # frame contains it.
        df_features_par = reduce(lambda x, y: pd.merge(x, y, on = ["experiment", "participant_id"]), df_features_par_list)

        df_features_all_participants.append(df_features_par)

    df_features = pd.concat(df_features_all_participants, ignore_index=True)

    logging.info("Finished extracting evil bastard features")

    return df_features

In [None]:
# Run the full extraction; the loop logs its progress for every participant.
features = get_evil_bastard_features()

2025-04-21 17:16:01,778 - INFO - 2255764734.get_evil_bastard_features:8 - Extracting anti saccade features
  0%|          | 0/160 [00:00<?, ?it/s]2025-04-21 17:16:02,069 - INFO - 2255764734.get_evil_bastard_features:21 - Processing participant 106
2025-04-21 17:16:05,228 - INFO - 2255764734.get_evil_bastard_features:30 - Starting event feature extraction
2025-04-21 17:16:05,347 - INFO - 2255764734.get_evil_bastard_features:34 - Starting sample feature extraction
2025-04-21 17:16:05,348 - INFO - feature_utils.get_acceleration_feature:52 - Extracting acceleration
2025-04-21 17:16:05,897 - INFO - feature_utils.get_disconjugacy_feature:78 - Extracting disconjugacy
2025-04-21 17:16:07,816 - INFO - 2255764734.get_evil_bastard_features:38 - Starting combined feature extraction
  1%|          | 1/160 [00:06<16:06,  6.08s/it]2025-04-21 17:16:08,157 - INFO - 2255764734.get_evil_bastard_features:21 - Processing participant 111
2025-04-21 17:16:09,825 - INFO - 2255764734.get_evil_bastard_features: