In [150]:
import pandas as pd
import numpy as np
from config import *
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import seaborn as sns

In [122]:
df = pd.read_parquet(DATA_DIR / "anti_saccade_processed.pq")

In [123]:
reaction_time_df = (df
 .query("stimulus_active == True")
 .sort_values(by=["participant_id", "trial_id", "stand_time"])
 .assign(stimulus_time = lambda x: np.select([x.event == "FIXPOINT", x.event != "FIXPOINT"], [x.stand_time, None]))
 .ffill()
 .query("event == 'ESACC'")
 .groupby(["experiment","participant_id", "trial_id"])
 .first()
 .reset_index()
 .assign(reaction_time = lambda x: x.stand_start_time - x.stimulus_time)
 .groupby(["experiment","participant_id"])
 .agg(mean_reaction_time = ('reaction_time', 'mean'))
 .reset_index()
)

  .ffill()


In [124]:
features = (df.groupby(["experiment", "participant_id"])
 .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
       'amplitude': [np.mean, np.min, np.max, np.median, np.std],
       'duration': [np.mean, np.min, np.max, np.median, np.std],
       'avg_pupil_size': [np.mean, np.min, np.max, np.median, np.std]
       })
 .reset_index()
)
    
features.columns = [''.join(col).strip() for col in features.columns.values]


  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],
  .agg({'peak_velocity': [np.mean, np.min, np.max, np.median, np.std],


In [125]:
features = pd.merge(features, reaction_time_df, left_on=["experiment","participant_id"], right_on=["experiment", "participant_id"], how='left')

# Load outcome

In [131]:
demographics = pd.read_excel(DATA_DIR / "demographic_info.xlsx")[["ID", "Group"]]

demographics["y"] = (demographics["Group"] == "PATIENT").astype(int)
demographics["participant_id"] = demographics["ID"].astype(str)
demographics = demographics[["participant_id", "y"]]



# Model training

In [146]:
data = pd.merge(features, demographics, how='left', on='participant_id')
y_data = data["y"]
X_data = data.drop(["experiment", "participant_id", "y"], axis=1)

In [148]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=.2)

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", XGBClassifier(n_estimators=10, max_depth=6, learning_rate=1, objective='binary:logistic'))
])

print(pipe.fit(X_train, y_train).score(X_test, y_test))

results=pd.DataFrame()
results['columns']=X_train.columns
results['importances'] = pipe["clf"].feature_importances_
results.sort_values(by='importances',ascending=False,inplace=True)

results

0.5882352941176471


Unnamed: 0,columns,importances
13,durationmedian,0.089109
3,peak_velocitymedian,0.082238
16,avg_pupil_sizemin,0.075968
9,amplitudestd,0.068574
10,durationmean,0.066571
12,durationmax,0.064284
8,amplitudemedian,0.060498
20,mean_reaction_time,0.059416
2,peak_velocitymax,0.057138
6,amplitudemin,0.056531
