# Preprocess event data

In [1]:
import pandas as pd
from config import *

In [None]:
experiment = "ANTI_SACCADE_SAMPLES"
df_raw = pd.read_parquet(f"{PROCESSED_DIR}/{experiment}.pq")

In [76]:
df_raw.head()

Unnamed: 0,experiment,participant_id,trial_id,time,event,colour,stimulus_x,stimulus_y,eye,start_time,...,avg_pupil_size,start_x,start_y,end_x,end_y,amplitude,peak_velocity,side,time_elapsed,delay
0,ANTI_SACCADE,103,0.0,2354775.0,TRIALID,,,,,,...,,,,,,,,,,
1,ANTI_SACCADE,103,0.0,2354807.0,START,,,,,,...,,,,,,,,,,
2,ANTI_SACCADE,103,0.0,2354842.0,FIXPOINT,255 255 255,960.0,540.0,,,...,,,,,,,,,,
3,ANTI_SACCADE,103,0.0,2354987.0,SFIX,,,,L,,...,,,,,,,,,,
4,ANTI_SACCADE,103,0.0,2355008.0,SFIX,,,,R,,...,,,,,,,,,,


In [4]:
df_raw.columns

Index(['experiment', 'participant_id', 'trial_id', 'time', 'event', 'colour',
       'stimulus_x', 'stimulus_y', 'eye', 'start_time', 'end_time', 'duration',
       'x', 'y', 'avg_pupil_size', 'start_x', 'start_y', 'end_x', 'end_y',
       'amplitude', 'peak_velocity', 'side', 'time_elapsed', 'delay'],
      dtype='object')

## General preprocessing

In [77]:
def transform_numeric_columns(df):
    nummeric_columns = ['participant_id', 'trial_id', 'time', 'stimulus_x', 'stimulus_y', 'start_time', 'end_time', 
                        'duration', 'x', 'y', 'avg_pupil_size', 'start_x', 'start_y', 'end_x', 'end_y', 'amplitude', 
                        'peak_velocity', 'time_elapsed', 'delay']
    for col in nummeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        
    return df

def coalesce_time(df):
    df.loc[:,"time"] = df[["time", "end_time"]].bfill(axis=1)
    
    return df
    
def remove_start_events(df):
    mask = (df["event"] == "SFIX") | (df["event"] == "SSACC")
    df_masked = df.loc[~mask,:]
    
    return df_masked

def group_df(df):
    grouped_df = df.sort_values(["participant_id", "trial_id", "time"]).groupby(["participant_id", "trial_id"])#[df.columns]
    
    return grouped_df

def standardise_time(df):
    
    grouped_df = group_df(df)
    
    df.loc[:,"time"] = df["time"] - grouped_df.time.transform('min')
    df.loc[:,"start_time"] = df["start_time"] - grouped_df.time.transform('min')
    df.loc[:,"end_time"] = df["end_time"] - grouped_df.time.transform('min')
    
    return df

def fill_values(df):
    
    grouped_df = group_df(df)
    df.loc[:,"colour"] = grouped_df["colour"].ffill()
    df.loc[:,"stimulus_x"] = grouped_df["stimulus_x"].ffill()
    df.loc[:,"stimulus_y"] = grouped_df["stimulus_y"].ffill()
    df.loc[:,"side"] = grouped_df["side"].ffill().bfill()
    
    return df

In [6]:
df_raw.head()

Unnamed: 0,experiment,participant_id,trial_id,time,event,colour,stimulus_x,stimulus_y,eye,start_time,...,avg_pupil_size,start_x,start_y,end_x,end_y,amplitude,peak_velocity,side,time_elapsed,delay
0,ANTI_SACCADE,103,0.0,2354775.0,TRIALID,,,,,,...,,,,,,,,,,
1,ANTI_SACCADE,103,0.0,2354807.0,START,,,,,,...,,,,,,,,,,
2,ANTI_SACCADE,103,0.0,2354842.0,FIXPOINT,255 255 255,960.0,540.0,,,...,,,,,,,,,,
3,ANTI_SACCADE,103,0.0,2354987.0,SFIX,,,,L,,...,,,,,,,,,,
4,ANTI_SACCADE,103,0.0,2355008.0,SFIX,,,,R,,...,,,,,,,,,,


In [86]:
df_transformed = (
    df_raw.pipe(transform_numeric_columns)
    .pipe(remove_start_events)
    .pipe(coalesce_time)
    .pipe(standardise_time)
    .pipe(fill_values)
)

In [87]:
df_transformed.head()

Unnamed: 0,experiment,participant_id,trial_id,time,event,colour,stimulus_x,stimulus_y,eye,start_time,...,avg_pupil_size,start_x,start_y,end_x,end_y,amplitude,peak_velocity,side,time_elapsed,delay
0,ANTI_SACCADE,103,0.0,0.0,TRIALID,,,,,,...,,,,,,,,left,,
1,ANTI_SACCADE,103,0.0,32.0,START,,,,,,...,,,,,,,,left,,
2,ANTI_SACCADE,103,0.0,67.0,FIXPOINT,255 255 255,960.0,540.0,,,...,,,,,,,,left,,
23,ANTI_SACCADE,103,0.0,698.331778,FIXPOINT,255 0 0,624.0,540.0,,,...,,,,,,,,left,,
30,ANTI_SACCADE,103,0.0,6592.0,TRIAL_VAR_DATA,255 0 0,624.0,540.0,,,...,,,,,,,,left,0.631332,


## Experiment specific preprocessing

### Anti-saccade

In [95]:
def coalesce_time_elapsed(df):
    df.loc[:,"time_elapsed"] = df[["time_elapsed", "delay"]].bfill(axis=1)
    df = df.drop("delay", axis=1)
    
    return df

def stimulus_onset_time(df):
    for participant_id in df["participant_id"].unique():
        
        participant_m = df["participant_id"] == participant_id
        participant_df = df[participant_m]
        
        for trial_id in participant_df["trial_id"].unique():
            print(participant_id, trial_id)
            # Masks
            trial_m = participant_df["trial_id"] == trial_id
            white_fixpoint_m = participant_df["colour"] == "255 255 255"
            red_fixpoint_m = participant_df["colour"] == "255 0 0"
            trial_var_data_m = participant_df["event"] == "TRIAL_VAR_DATA"
            
            # Extraxt time for white fixpoint and time elapsed
            white_fix_point_time = participant_df[trial_m & white_fixpoint_m]["time"].values[0]
            time_elasped = participant_df[trial_var_data_m]["time_elapsed"].values[0]
            
            # Set time for red fixpoint
            df.loc[participant_m & trial_m & red_fixpoint_m, "time"] = white_fix_point_time + 1000*time_elasped

In [96]:
df_anti_saccade = (
    df_transformed.pipe(coalesce_time_elapsed)
    .pipe(stimulus_onset_time)
)

103 0.0
103 1.0
103 2.0
103 3.0
103 4.0
103 5.0
105 0.0
105 1.0
105 2.0
105 3.0
105 4.0
105 5.0
106 0.0
106 1.0
106 2.0
106 3.0
106 4.0
106 5.0
106 6.0
106 7.0
106 8.0
106 9.0
106 10.0
106 11.0
106 12.0
106 13.0
106 14.0
106 15.0
109 0.0
109 1.0
109 2.0
109 3.0
109 4.0
109 5.0
111 0.0
111 1.0
111 2.0
111 3.0
111 4.0
111 5.0
111 6.0
111 7.0
111 8.0
111 9.0
111 10.0
111 11.0
111 12.0
111 13.0
111 14.0
111 15.0
113 0.0
113 1.0
113 2.0
113 3.0
113 4.0
113 5.0
117 0.0
117 1.0
117 2.0
117 3.0
117 4.0
117 5.0
118 0.0
118 1.0
118 2.0
118 3.0
118 4.0
118 5.0
119 0.0
119 1.0
119 2.0
119 3.0
119 4.0
119 5.0
120 0.0
120 1.0
120 2.0
120 3.0
120 4.0
120 5.0
121 0.0
121 1.0
121 2.0
121 3.0
121 4.0
121 5.0
122 0.0
122 1.0
122 2.0
122 3.0
122 4.0
122 5.0
125 0.0
125 1.0
125 2.0
125 3.0
125 4.0
125 5.0
125 6.0
125 7.0
125 8.0
125 9.0
125 10.0
125 11.0
125 12.0
125 13.0
125 14.0
125 15.0
127 0.0
127 1.0
127 2.0
127 3.0
127 4.0
127 5.0
128 0.0
128 1.0
128 2.0
128 3.0
128 4.0
128 5.0
133 0.0
133 1.0
133 2.

IndexError: index 0 is out of bounds for axis 0 with size 0

In [10]:
df_transformed.head()

Unnamed: 0,experiment,participant_id,trial_id,time,event,colour,stimulus_x,stimulus_y,eye,start_time,...,avg_pupil_size,start_x,start_y,end_x,end_y,amplitude,peak_velocity,side,time_elapsed,delay
0,ANTI_SACCADE,103,0.0,0.0,TRIALID,,,,,,...,,,,,,,,left,,
1,ANTI_SACCADE,103,0.0,32.0,START,,,,,,...,,,,,,,,left,,
2,ANTI_SACCADE,103,0.0,67.0,FIXPOINT,255 255 255,960.0,540.0,,,...,,,,,,,,left,,
23,ANTI_SACCADE,103,0.0,5960.0,FIXPOINT,255 0 0,624.0,540.0,,,...,,,,,,,,left,,
30,ANTI_SACCADE,103,0.0,6592.0,TRIAL_VAR_DATA,255 0 0,624.0,540.0,,,...,,,,,,,,left,0.631332,


In [280]:
df["time"] - df_transformed.time.transform('min')

0          0.0
1         32.0
2         67.0
3          NaN
4          NaN
          ... 
127091     NaN
127092     NaN
127093     NaN
127094     NaN
127095     NaN
Name: time, Length: 127096, dtype: float64

In [221]:
df_transformed["time"] = df_transformed["time"].transform(lambda x : x - min(df_transformed["time"]))

KeyboardInterrupt: 