In [None]:
import pandas as pd
import numpy as np
from config import *
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# Read the pre-processed anti-saccade event table from the project data directory.
processed_events_path = DATA_DIR / "anti_saccade_processed.pq"
df = pd.read_parquet(processed_events_path)

# Preview the first rows as a quick sanity check of the loaded columns.
df.head()

experiment,participant_id,trial_id,stand_time,eye,event,delay,stimulus_active,stimulus_colour,stimulus_x,stimulus_y,fix_x,fix_y,sacc_start_x,sacc_start_y,sacc_end_x,sacc_end_y,stand_start_time,stand_end_time,avg_pupil_size,peak_velocity,amplitude,duration
str,str,f64,f64,str,str,str,bool,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""ANTI_SACCADE""","""103""",0.0,0.0,"""L""","""TRIALID""","""0.6313317779859062""",,,,,,,,,,,,,,,,
"""ANTI_SACCADE""","""103""",0.0,32.0,"""L""","""START""","""0.6313317779859062""",,,,,,,,,,,,,,,,
"""ANTI_SACCADE""","""103""",0.0,67.0,"""L""","""FIXPOINT""","""0.6313317779859062""",False,"""255 255 255""",960.0,540.0,,,,,,,,,,,,
"""ANTI_SACCADE""","""103""",0.0,366.0,"""L""","""EFIX""","""0.6313317779859062""",False,"""255 255 255""",960.0,540.0,917.3,682.8,,,,,212.0,366.0,2313.0,,,154.0
"""ANTI_SACCADE""","""103""",0.0,377.0,"""L""","""ESACC""","""0.6313317779859062""",False,"""255 255 255""",960.0,540.0,,,910.4,584.1,905.7,560.3,367.0,377.0,,50.0,0.43,10.0


In [None]:
# Summarise the event measurements per (experiment, participant).
# The result keeps two-level columns: (measurement, statistic).
FEATURE_COLUMNS = ["peak_velocity", "amplitude", "duration", "avg_pupil_size"]
SUMMARY_STATS = ["min", "max", "mean", "median", "std"]

df_grouped = df.groupby(["experiment", "participant_id"])

# Select the columns with a *list*: the tuple form `grouped["a", "b"]` was
# removed in pandas 2.0. Statistics are given by name rather than as numpy
# callables, which avoids the pandas >= 2.1 FutureWarning and yields stable
# column labels ("min", "max", ...) regardless of the installed numpy version.
features = df_grouped[FEATURE_COLUMNS].agg(SUMMARY_STATS)

In [None]:
# Load the participant roster and derive the binary classification target `y`.
demographics = pd.read_excel(DATA_DIR / "demographic_info.xlsx")[["ID", "Group"]]

# Encode the group as an integer class label. The lookup is done on
# `demographics` itself: the event table `df` has no "Group" column and a
# different number of rows. Any group other than CONTROL / PATIENT becomes
# <NA> instead of a third "Unknown" string class, so the column stays numeric.
GROUP_TO_LABEL = {"CONTROL": 0, "PATIENT": 1}
demographics["y"] = demographics["Group"].map(GROUP_TO_LABEL).astype("Int64")

participant_id,y
i64,i32
105,1
138,1
103,1
148,1
139,0


In [None]:
# Build the modelling table: one row of summary features per
# (experiment, participant) plus that participant's class label `y`.

feature_table = features.copy()
if isinstance(feature_table.columns, pd.MultiIndex):
    # Collapse (measurement, statistic) pairs into flat names such as
    # "max_amplitude" so the columns are plain strings for the model.
    feature_table.columns = [
        "_".join(str(level) for level in reversed(column))
        for column in feature_table.columns
    ]

# Labels keyed by participant id. Ids are compared as strings because the
# event table stores them as text while the roster stores them as numbers.
# NOTE(review): assumes roster ids are integers (no "103.0"-style floats) - confirm.
labels = pd.Series(
    pd.to_numeric(demographics["y"], errors="coerce").astype("float64").to_numpy(),
    index=pd.Index(demographics["ID"].astype(str), name="participant_id"),
    name="y",
)
assert labels.index.is_unique, "demographics lists a participant id more than once"

# Look up each feature row's label by its participant id (a left join on a
# unique key), then drop participants without a usable 0/1 label, since a
# classifier cannot be trained or scored on a missing target.
participant_ids = feature_table.index.get_level_values("participant_id").astype(str)
data = (
    feature_table
    .assign(y=labels.reindex(participant_ids).to_numpy())
    .dropna(subset=["y"])
    .astype({"y": int})
)

# `columns=` is required: DataFrame.drop("y") would look for a *row* labelled "y".
X_data = data.drop(columns="y")
y_data = data["y"]

In [None]:
# Fit a scaled XGBoost classifier on a hold-out split and rank the features
# by the importance the fitted model assigns to them.

# Fixed seed so the split, the score and the importances are reproducible
# under Restart Kernel -> Run All.
RANDOM_STATE = 42

# Stratify so both classes keep the same proportion in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=.2,
    random_state=RANDOM_STATE,
    stratify=y_data,
)

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", XGBClassifier(
        n_estimators=10,
        max_depth=6,
        learning_rate=1,
        objective='binary:logistic',
        random_state=RANDOM_STATE,
    )),
])

# Keep and report the hold-out accuracy; as a bare mid-cell expression it
# was computed but never displayed.
test_accuracy = pipe.fit(X_train, y_train).score(X_test, y_test)
print(f"Held-out accuracy: {test_accuracy:.3f}")

# One row per input feature, most important first.
results = pd.DataFrame({
    "columns": X_train.columns,
    "importances": pipe["clf"].feature_importances_,
}).sort_values(by="importances", ascending=False)

results

Unnamed: 0,columns,importances
4,med_peak_velocity,0.159242
5,med_avg_pupil_size,0.108929
13,min_avg_pupil_size,0.079247
11,max_amplitude,0.070206
7,med_amplitude,0.067673
9,max_avg_pupil_size,0.056809
12,min_peak_velocity,0.054328
16,std_peak_velocity,0.049478
8,max_peak_velocity,0.042532
18,std_duration,0.041552
