In [150]:
import pandas as pd
import numpy as np
from config import *
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import seaborn as sns

In [122]:
df = pd.read_parquet(DATA_DIR / "anti_saccade_processed.pq")

In [203]:
reaction_time_df = (df
 .query("stimulus_active == True")
 .sort_values(by=["participant_id", "trial_id", "stand_time"])
 .assign(stimulus_time = lambda x: np.select([x.event == "FIXPOINT", x.event != "FIXPOINT"], [x.stand_time, None]))
 .ffill()
 .assign(saccade_direction = lambda x: np.where(x["sacc_end_x"] > x["sacc_start_x"], "right", "left"))
 .assign(is_trial_correct = lambda x: np.where(x["saccade_direction"] == x["stimulus_side"], True, False))
 .query("event == 'ESACC'")
 .groupby(["experiment","participant_id", "trial_id", "is_trial_correct"])
 .first()
 .reset_index()
 .assign(reaction_time = lambda x: x["stand_start_time"] - x["stimulus_time"])
 .groupby(["experiment","participant_id","is_trial_correct"])
 .agg(mean_reaction_time = ('reaction_time', 'mean'))
 .reset_index()
 .pivot(index=["experiment", "participant_id"], columns="is_trial_correct",values="mean_reaction_time")
 .reset_index()
 .rename({True: 'correct_reaction_time',
          False: 'correct_reaction_time'}, axis=1)
)

reaction_time_df

  .ffill()


is_trial_correct,experiment,participant_id,correct_reaction_time,correct_reaction_time.1
0,ANTI_SACCADE,106,284.381760,254.851530
1,ANTI_SACCADE,111,307.381760,210.245131
2,ANTI_SACCADE,113,160.612346,288.445679
3,ANTI_SACCADE,121,717.879329,418.045996
4,ANTI_SACCADE,122,252.131656,851.298322
...,...,...,...,...
165,ANTI_SACCADE,399,285.631760,394.128375
166,ANTI_SACCADE,401,269.819260,523.551108
167,ANTI_SACCADE,402,303.881760,341.257276
168,ANTI_SACCADE,403,313.756760,351.841997


In [204]:
features = (df.groupby(["experiment", "participant_id"])
 .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
       'amplitude': [np.mean, np.min, np.max, np.median, np.std],
       'duration': [np.mean, np.min, np.max, np.median, np.std],
       'avg_pupil_size': [np.mean, np.min, np.max, np.median, np.std]
       })
 .reset_index()
)
    
features.columns = [''.join(col).strip() for col in features.columns.values]


  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],


In [205]:
features = pd.merge(features, reaction_time_df, left_on=["experiment","participant_id"], right_on=["experiment", "participant_id"], how='left')

# Load outcome

In [206]:
demographics = pd.read_excel(DATA_DIR / "demographic_info.xlsx")[["ID", "Group"]]

demographics["y"] = (demographics["Group"] == "PATIENT").astype(int)
demographics["participant_id"] = demographics["ID"].astype(str)
demographics = demographics[["participant_id", "y"]]



# Model training

In [207]:
data = pd.merge(features, demographics, how='left', on='participant_id')
y_data = data["y"]
X_data = data.drop(["experiment", "participant_id", "y"], axis=1)

In [211]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=.2)

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", XGBClassifier(n_estimators=10, max_depth=6, learning_rate=1, objective='binary:logistic'))
])

print(pipe.fit(X_train, y_train).score(X_test, y_test))

results=pd.DataFrame()
results['columns']=X_train.columns
results['importances'] = pipe["clf"].feature_importances_
results.sort_values(by='importances',ascending=False,inplace=True)

results

0.5882352941176471


Unnamed: 0,columns,importances
18,avg_pupil_sizemedian,0.08949
8,amplitudemedian,0.083699
20,correct_reaction_time,0.075454
6,amplitudemin,0.071931
17,avg_pupil_sizemax,0.065201
3,peak_velocitymedian,0.06337
5,amplitudemean,0.05567
19,avg_pupil_sizestd,0.05471
10,durationmean,0.052693
4,peak_velocitystd,0.047738
