In [131]:
import pandas as pd
import numpy as np
from config import *
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
df_event = pd.read_parquet(PREPROCESSED_DIR / "ANTI_SACCADE_events.pq")
df_sample = (pd.read_parquet(RAW_DIR / 'ANTI_SACCADE_SAMPLES.pq')
 .sort_values(["experiment", "participant_id", "trial_id","time"])
)

def rename_columns(df):
    """Renames columns by joining multi-level column names with different delimiters."""
    # Iterate over all column names
    df.columns = [f"{col[0]}" if col[1] == '' else f"{col[0]}_{col[1]}" for col in df.columns.values]
    return df

# Event features

In [None]:
def get_n_correct_trials_feature(df: pd.DataFrame) -> pd.DataFrame:
     """
     Returns pd.Dataframe with columns ['experiment', 'participant_id', 'n_correct_trials']
     """
     
     feature_df = (df
     .query("stimulus_active == True")
     .sort_values(by=["participant_id", "trial_id", "stand_time"])
     .assign(stimulus_time = lambda x: np.select([x.event == "FIXPOINT", x.event != "FIXPOINT"], [x.stand_time, None]))
     .ffill()
     .assign(saccade_direction = lambda x: np.where(x["sacc_end_x"] > x["sacc_start_x"], "right", "left"))
     .assign(is_trial_correct = lambda x: np.where(x["saccade_direction"] != x["stimulus_side"], True, False))
     .query("event == 'ESACC'")
     .groupby(["experiment","participant_id", "trial_id"])
     .first()
     .reset_index()
     .groupby(["experiment","participant_id"])
     .agg(n_correct_trials = ('is_trial_correct', 'sum'))
     .reset_index()
     [["experiment", "participant_id", "n_correct_trials"]]
     )
     
     return feature_df

def get_n_prop_trials_feature(df: pd.DataFrame) -> pd.DataFrame:
     """
     Returns pd.Dataframe with columns ['experiment','participant_id', 'prop_correct_trials']
     """
     
     feature_df = (df
     .query("stimulus_active == True")
     .sort_values(by=["participant_id", "trial_id", "stand_time"])
     .assign(stimulus_time = lambda x: np.select([x.event == "FIXPOINT", x.event != "FIXPOINT"], [x.stand_time, None]))
     .ffill()
     .assign(saccade_direction = lambda x: np.where(x["sacc_end_x"] > x["sacc_start_x"], "right", "left"))
     .assign(is_trial_correct = lambda x: np.where(x["saccade_direction"] != x["stimulus_side"], True, False))
     .query("event == 'ESACC'")
     .groupby(["experiment","participant_id", "trial_id"])
     .first()
     .reset_index()
     .groupby(["experiment","participant_id"])
     .agg(n_correct_trials = ('is_trial_correct', 'sum'),
          n_trials = ('is_trial_correct', 'count'))
     .reset_index()
     .assign(prop_correct_trials = lambda x: x["n_correct_trials"] / x["n_trials"])
     [["experiment", "participant_id", "prop_correct_trials"]]
     )
     return feature_df

def get_reaction_time_feature(df: pd.DataFrame) -> pd.DataFrame:
     """
     Returns pd.Dataframe with columns ['experiment','participant_id', 'correct_mean_reaction_time', 'incorrect_mean_reaction_time']
     """
     feature_df = (df
     .query("stimulus_active == True")
     .sort_values(by=["participant_id", "trial_id", "stand_time"])
     .assign(stimulus_time = lambda x: np.select([x.event == "FIXPOINT", x.event != "FIXPOINT"], [x.stand_time, None]))
     .ffill()
     .assign(saccade_direction = lambda x: np.where(x["sacc_end_x"] > x["sacc_start_x"], "right", "left"))
     .assign(is_trial_correct = lambda x: np.where(x["saccade_direction"] != x["stimulus_side"], True, False))
     .query("event == 'ESACC'")
     .groupby(["experiment","participant_id", "trial_id", "is_trial_correct"])
     .first()
     .reset_index()
     .assign(reaction_time = lambda x: x["stand_start_time"] - x["stimulus_time"])
     .groupby(["experiment","participant_id","is_trial_correct"])
     .agg(mean_reaction_time = ('reaction_time', 'mean'))
     .reset_index()
     .pivot(index=["experiment", "participant_id"], columns="is_trial_correct",values="mean_reaction_time")
     .reset_index()
     .rename({True: 'correct_reaction_time',
               False: 'incorrect_reaction_time'}, axis=1)
     )

     return feature_df

def get_pre_calculated_metrics_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns pd.Dataframe with columns ['experiment','participant_id', X_FEATURES],
    where X_FEATURES is a collection of features found by the following cartesian product:
    {'peak_velocity', 'amplitude', 'duration', 'avg_pupil_size'} x {np.mean, np.min, np.max, np.median, np.std}
    """
    features_df = (df.groupby(["experiment", "participant_id"])
    .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
        'amplitude': [np.mean, np.min, np.max, np.median, np.std],
        'duration': [np.mean, np.min, np.max, np.median, np.std],
        'avg_pupil_size': [np.mean, np.min, np.max, np.median, np.std]
        })
    .reset_index()
    .pipe(rename_columns))
    
    return features_df




# Sample features

In [None]:
def get_acceleration_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Finds acceleration features for anti saccade experiment

    Args:
        df (pd.DataFrame): Dataframe with raw samples

    Returns:
        pd.DataFrame: Dataframe with columns ['experiment','participant_id', X_FEATURES]
        where X_FEATURES is a collection of features found by the following cartesian product:
        {'total_acceleration_magnitude_left', 'total_acceleration_magnitude_right'} x {np.mean, np.min, np.max, np.median, np.std}
    """

    acceleration = (df.join((df
    .groupby(["experiment", "participant_id", "trial_id"])[['x_velocity_left', 'y_velocity_left', 'x_velocity_right', 'y_velocity_right']].shift(1)
    .rename(columns={'x_velocity_left': 'x_velocity_left_lagged'
            , 'y_velocity_left': 'y_velocity_left_lagged'
            , 'x_velocity_right': 'x_velocity_right_lagged'
            , 'y_velocity_right': 'y_velocity_right_lagged'}))
    ).assign(x_acceleration_left = lambda x: (x["x_velocity_left"] - x["x_velocity_left_lagged"]) / (1/2000),
            y_acceleration_left = lambda x: (x["y_velocity_left"] - x["y_velocity_left_lagged"]) / (1/2000),
            x_acceleration_right = lambda x: (x["x_velocity_right"] - x["x_velocity_right_lagged"]) / (1/2000),
            y_acceleration_right = lambda x: (x["y_velocity_right"] - x["y_velocity_right_lagged"]) / (1/2000))
    .assign(total_acceleration_magnitude_left = lambda x: np.sqrt( np.power(x["x_acceleration_left"], 2) + np.power(x["y_acceleration_left"], 2)),
            total_acceleration_magnitude_right = lambda x: np.sqrt( np.power(x["x_acceleration_right"], 2) + np.power(x["y_acceleration_right"], 2)))
    .groupby(["experiment", "participant_id", "trial_id"])
    .agg({'total_acceleration_magnitude_left': [np.mean, np.min, np.max, np.median, np.std],
        'total_acceleration_magnitude_right': [np.mean, np.min, np.max, np.median, np.std]
        })
    .reset_index()
    .pipe(rename_columns)
    )


# Get all features

In [None]:
def get_anti_saccade_features(df_event: pd.DataFrame, df_sample:pd.DataFrame) -> pd.DataFrame:
    """Runs all anti saccade features extractions

    Args:
        df (pd.DataFrame): The preprocessed dataframe

    Returns:
        pd.DataFrame: Dataframe with columns ["experiment", "participant_id", X_FEATURES], where X_FEATURES is a collection of features
    """
    
    event_feature_functions = [get_pre_calculated_metrics_feature, get_n_correct_trials_feature, get_n_prop_trials_feature, get_reaction_time_feature]
    df_event_features_list = [f(df=df_event) for f in event_feature_functions]
    
    sample_feature_functions = [get_acceleration_feature]
    df_sample_features_list = [f(df=df_sample) for f in sample_feature_functions]
    
    df_features_list = df_event_features_list + df_sample_features_list
    
    df_features = reduce(lambda x, y: pd.merge(x, y, on = ["experiment", "participant_id"]), df_features_list)
    
    return df_features


features = get_anti_saccade_features(df_event=df_event, df_sample=df_sample)
    
    

  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],


UndefinedVariableError: name 'stimulus_active' is not defined

# Join demographic info on features

In [None]:
def load_demographic_info() -> pd.DataFrame:
    demographics = pd.read_excel(DATA_DIR / "demographic_info.xlsx")[["ID", "Group"]]

    demographics["y"] = (demographics["Group"] == "PATIENT").astype(int)
    demographics["participant_id"] = demographics["ID"].astype(str)
    demographics = demographics[["participant_id", "y"]]
    return demographics


def join_demographic_info_on_features(feature_df: pd.DataFrame) -> pd.DataFrame:
    demographics = load_demographic_info()
    return pd.merge(feature_df, demographics, how='left', on='participant_id')
    
    
join_demographic_info_on_features(feature_df=features)