In [514]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from config import *
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
df_event = pd.read_parquet(PREPROCESSED_DIR / "ANTI_SACCADE_events.pq")
df_sample = (pd.read_parquet(PREPROCESSED_DIR / 'ANTI_SACCADE_samples.pq')
 .sort_values(["experiment", "participant_id", "trial_id","time"])
)
df_sample["participant_id"] = df_sample["participant_id"].astype(int)

def rename_columns(df):
    """Renames columns by joining multi-level column names with different delimiters."""
    # Iterate over all column names
    df.columns = [f"{col[0]}" if col[1] == '' else f"{col[0]}_{col[1]}" for col in df.columns.values]
    return df

# Event features

In [515]:
def get_trial_correctness_df(df:pd.DataFrame) -> pd.DataFrame:
    df_trials = (df
        .query('stimulus_active == True')
        .sort_values(by=["participant_id", "trial_id", "eye","time"])
        .assign(stimulus_time = lambda x: np.select([x.event == "FIXPOINT", x.event != "FIXPOINT"], [x.time, None]))
        .assign(stimulus_time = lambda x: x["stimulus_time"].ffill())
        .assign(saccade_direction = lambda x: np.select([(x["event"] == 'ESACC') & (np.abs(x["end_x"] - x["start_x"]) < 50),
                                                        (x["event"] == 'ESACC') & (x["end_x"] > x["start_x"]),
                                                        (x["event"] == 'ESACC') & (x["end_x"] < x["start_x"])],
                                                        ['no_direction',"right", "left"], default=None))
        .assign(saccade_end_area = lambda x: np.select([(x["event"] == 'ESACC') & ( 840 < x["end_x"]) & (x["end_x"] < 1080),
                                                        (x["event"] == 'ESACC') & (1080 <= x["end_x"]),
                                                        (x["event"] == 'ESACC') & (x["end_x"] <= 840)],
                                                    ['middle',"right", "left"], default=None))
        .assign(is_saccade_correct = lambda x: np.select([(x["saccade_direction"] == 'no_direction')
                                                        , (x["saccade_end_area"] == 'middle')
                                                        , (x["saccade_direction"] == x["saccade_end_area"]) & (x["saccade_direction"] != x["side"]) & (x["saccade_end_area"] != x["side"])
                                                        , (x["saccade_direction"] == x["saccade_end_area"]) & (x["saccade_direction"] == x["side"]) & (x["saccade_end_area"] == x["side"])
                                                        ],
                                                            [None, None, True, False], default=None)) 
    )
    trial_correctness = (
        df_trials
        .sort_values(by=["participant_id", "trial_id", "time"])
        .groupby(["participant_id", "trial_id"])["is_saccade_correct"]
        .apply(lambda group: (
            True if not group.dropna().empty and group.dropna().iloc[0] == True else
            False if not group.dropna().empty and group.dropna().iloc[0] == False else
            None
        ))
        .reset_index()
        .rename(columns={"is_saccade_correct": "is_trial_correct"})
    )

    df_trials = df_trials.merge(trial_correctness, on=["participant_id", "trial_id"], how="left")

    return df_trials

get_trial_correctness_df(df_event)

Unnamed: 0,experiment,participant_id,trial_id,time,event,eye,colour,stimulus_x,stimulus_y,start_time,...,amplitude,peak_velocity,side,time_elapsed,stimulus_active,stimulus_time,saccade_direction,saccade_end_area,is_saccade_correct,is_trial_correct
0,ANTI_SACCADE,106,0.0,3068.000000,EFIX,L,255 0 0,1290.0,540.0,428.0,...,,,right,,True,,,,,True
1,ANTI_SACCADE,106,0.0,3120.000000,ESACC,L,255 0 0,1290.0,540.0,3069.0,...,7.34,291.0,right,,True,,left,left,True,True
2,ANTI_SACCADE,106,0.0,3275.000000,EFIX,L,255 0 0,1290.0,540.0,3121.0,...,,,right,,True,,,,,True
3,ANTI_SACCADE,106,0.0,3309.000000,ESACC,L,255 0 0,1290.0,540.0,3276.0,...,3.83,233.0,right,,True,,left,left,True,True
4,ANTI_SACCADE,106,0.0,3862.000000,EFIX,L,255 0 0,1290.0,540.0,3310.0,...,,,right,,True,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40980,ANTI_SACCADE,404,15.0,2476.000000,SBLINK,R,255 0 0,629.0,540.0,,...,,,left,,True,1997.516395,,,,True
40981,ANTI_SACCADE,404,15.0,2631.000000,EBLINK,R,255 0 0,629.0,540.0,2477.0,...,,,left,,True,1997.516395,,,,True
40982,ANTI_SACCADE,404,15.0,1523.709179,FIXPOINT,,255 0 0,629.0,540.0,,...,,,left,,True,1523.709179,,,,True
40983,ANTI_SACCADE,404,15.0,2529.000000,TRIAL_VAR_DATA,,255 0 0,629.0,540.0,,...,,,left,1.462709,True,1523.709179,,,,True


In [516]:


def get_n_correct_trials_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns pd.Dataframe with columns ['experiment', 'participant_id', 'n_correct_trials']
    """
    
    feature_df = (df
     .pipe(get_trial_correctness_df)
     .groupby(["experiment","participant_id", "trial_id"])
     .agg(is_trial_correct = ('is_trial_correct', 'min')) 
     .reset_index()
     .groupby(["experiment", "participant_id"])
     .agg(n_correct_trials = ('is_trial_correct', 'sum'))
     .reset_index()
    [["experiment", "participant_id", "n_correct_trials"]]
    )
    
    return feature_df


def get_prop_trials_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns pd.Dataframe with columns ['experiment', 'participant_id', 'prop_correct_trials']
    """
    
    feature_df = (df
     .pipe(get_trial_correctness_df)
     .groupby(["experiment","participant_id", "trial_id"])
     .agg(is_trial_correct = ('is_trial_correct', 'min')) 
     .reset_index()
     .groupby(["experiment", "participant_id"])
     .agg(n_correct_trials = ('is_trial_correct', 'sum'),
          n_trials = ('is_trial_correct', 'count'))
     .reset_index()
     .assign(prop_correct_trials = lambda x: x["n_correct_trials"] / x["n_trials"])
     [["experiment", "participant_id", "prop_correct_trials"]]
    )
    
    return feature_df

def get_reaction_time_feature(df: pd.DataFrame) -> pd.DataFrame:
    return (df
        .query('stimulus_active == True')
        .pipe(get_trial_correctness_df)
        .sort_values(by=["participant_id", "trial_id", "time"])
        .assign(is_saccade_correct = lambda x: np.select([ (x["is_saccade_correct"] == True) ], [True], default=None))
        .query("is_saccade_correct == True")
        .groupby(["experiment","participant_id", "trial_id"])
        .first()
        .reset_index()
        .assign(reaction_time = lambda group: group["start_time"] - group["stimulus_time"])
        .groupby(["experiment", "participant_id"])
        .agg(reaction_time_avg = ('reaction_time', 'mean'),
             reaction_time_std = ('reaction_time', 'std'))
        .reset_index()
    )
    


         

In [517]:

def get_pre_calculated_metrics_feature(df: pd.DataFrame) -> pd.DataFrame:

    df_sacc = df[df['event'] == 'ESACC']
    df_fix = df[df['event'] == 'EFIX']
    
    sacc_metrics = df_sacc.groupby(['experiment', 'participant_id']).agg(
        mean_peak_velocity_sacc=('peak_velocity', 'mean'),
        mean_amplitude_sacc=('amplitude', 'mean'),
        mean_duration_sacc=('duration', 'mean')
    )
    
    fix_metrics = df_fix.groupby(['experiment', 'participant_id']).agg(
        mean_duration_fix=('duration', 'mean'),
        mean_pupil_size_fix=('avg_pupil_size', 'mean')
    )
    
    features_df = sacc_metrics.join(fix_metrics, how='outer').reset_index()
    
    return features_df


# Sample features

In [518]:
def get_acceleration_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Finds acceleration features for anti saccade experiment

    Args:
        df (pd.DataFrame): Dataframe with raw samples

    Returns:
        pd.DataFrame: Dataframe with columns ['experiment','participant_id', X_FEATURES]
        where X_FEATURES is a collection of features found by the following cartesian product:
        {'total_acceleration_magnitude_left', 'total_acceleration_magnitude_right'} x {np.mean, np.min, np.max, np.median, np.std}
    """

    acceleration = (df.join((df
    .groupby(["experiment", "participant_id", "trial_id"])[['x_velocity_left', 'y_velocity_left', 'x_velocity_right', 'y_velocity_right']].shift(1)
    .rename(columns={'x_velocity_left': 'x_velocity_left_lagged'
            , 'y_velocity_left': 'y_velocity_left_lagged'
            , 'x_velocity_right': 'x_velocity_right_lagged'
            , 'y_velocity_right': 'y_velocity_right_lagged'}))
    ).assign(x_acceleration_left = lambda x: (x["x_velocity_left"] - x["x_velocity_left_lagged"]) / (1/2000),
            y_acceleration_left = lambda x: (x["y_velocity_left"] - x["y_velocity_left_lagged"]) / (1/2000),
            x_acceleration_right = lambda x: (x["x_velocity_right"] - x["x_velocity_right_lagged"]) / (1/2000),
            y_acceleration_right = lambda x: (x["y_velocity_right"] - x["y_velocity_right_lagged"]) / (1/2000))
    .assign(total_acceleration_magnitude_left = lambda x: np.sqrt( np.power(x["x_acceleration_left"], 2) + np.power(x["y_acceleration_left"], 2)),
            total_acceleration_magnitude_right = lambda x: np.sqrt( np.power(x["x_acceleration_right"], 2) + np.power(x["y_acceleration_right"], 2)))
    .groupby(["experiment", "participant_id"])
    .agg({'total_acceleration_magnitude_left': [np.mean, np.min, np.max, np.median, np.std],
        'total_acceleration_magnitude_right': [np.mean, np.min, np.max, np.median, np.std]
        })
    .reset_index()
    .pipe(rename_columns)
    )
    return acceleration


# Eye disconjugacy
# Paper: https://www.liebertpub.com/doi/full/10.1089/neu.2014.3687

def get_disconjugacy_feature(df:pd.DataFrame) -> pd.DataFrame:
    disconjugacy = (df_sample
        .sort_values(["experiment", "participant_id", "trial_id", "time"])
        .query("x_left == x_left & x_right == x_right & y_left == y_left & y_right == y_right") # same as not null
        .groupby(["experiment", "participant_id"])
        .apply(lambda group: group.assign(
            x_left_rolling=group["x_left"].rolling(window=5, min_periods=1).mean(),
            x_right_rolling=group["x_right"].rolling(window=5, min_periods=1).mean(),
            y_left_rolling=group["y_left"].rolling(window=5, min_periods=1).mean(),
            y_right_rolling=group["y_right"].rolling(window=5, min_periods=1).mean()
        ))
        .reset_index(drop=True)
        .assign(
            X_diffs = lambda x: ((x["x_left_rolling"] - x["x_right_rolling"]) - 0)**2,
            Y_diffs = lambda x: ((x["y_left_rolling"] - x["y_right_rolling"]) - 0)**2
        )
        .groupby(["experiment", "participant_id"])
        .apply(lambda group: group.assign(
            X_squared_scaled = group["X_diffs"] / group.shape[0],
            Y_squared_scaled = group["Y_diffs"] / group.shape[0]
        ))
        .reset_index(drop=True)
        .groupby(["experiment", "participant_id"])
        .agg(
            Var_X = ("X_squared_scaled", "sum"),
            Var_Y = ("Y_squared_scaled", "sum")
        )
        .assign(
            Var_total = lambda x: x["Var_X"] + x["Var_Y"]
        )
        .reset_index()
        [["experiment", "participant_id", "Var_total"]]
    )
    return disconjugacy



# Get all features

In [519]:
def get_anti_saccade_features(df_event: pd.DataFrame, df_sample:pd.DataFrame) -> pd.DataFrame:
    """Runs all anti saccade features extractions

    Args:
        df (pd.DataFrame): The preprocessed dataframe

    Returns:
        pd.DataFrame: Dataframe with columns ["experiment", "participant_id", X_FEATURES], where X_FEATURES is a collection of features
    """
    
    event_feature_functions = [get_pre_calculated_metrics_feature, get_n_correct_trials_feature, get_prop_trials_feature, get_reaction_time_feature]
    df_event_features_list = [f(df=df_event) for f in event_feature_functions]
    
    sample_feature_functions = [get_acceleration_feature, get_disconjugacy_feature]
    df_sample_features_list = [f(df=df_sample) for f in sample_feature_functions]
    
    df_features_list = df_event_features_list + df_sample_features_list
    
    df_features = reduce(lambda x, y: pd.merge(x, y, on = ["experiment", "participant_id"]), df_features_list)
    
    return df_features


features = get_anti_saccade_features(df_event=df_event, df_sample=df_sample)
    
    

  .apply(lambda group: group.assign(
  .apply(lambda group: group.assign(


# Join demographic info on features

In [521]:
def load_demographic_info() -> pd.DataFrame:
    demographics = pd.read_excel(DATA_DIR / "demographic_info.xlsx")[["ID", "Group"]]

    demographics["y"] = (demographics["Group"] == "PATIENT").astype(int)
    demographics["participant_id"] = demographics["ID"].astype(int)
    demographics = demographics[["participant_id", "y"]]
    return demographics


def join_demographic_info_on_features(feature_df: pd.DataFrame) -> pd.DataFrame:
    demographics = load_demographic_info()
    return pd.merge(feature_df, demographics, how='left', on='participant_id')
    
data = join_demographic_info_on_features(feature_df=features)


# Train model

In [522]:
y_data = data["y"]
X_data = data.drop(["experiment", "participant_id", "y"], axis=1)
X_data = X_data[["reaction_time_avg", "prop_correct_trials"]]

In [523]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=.2)

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(max_depth=2))
])


print(pipe.fit(X_train, y_train).score(X_test, y_test))

results=pd.DataFrame()
results['columns']=X_train.columns
results['importances'] = pipe["clf"].feature_importances_
results.sort_values(by='importances',ascending=False,inplace=True)

results


0.6470588235294118


Unnamed: 0,columns,importances
0,reaction_time_avg,0.659924
1,prop_correct_trials,0.340076
