# Preprocess event data

In [34]:
import pandas as pd
from config import *

In [35]:
# Name of the experiment whose event file is loaded below.
experiment = "ANTI_SACCADE"
# Load the raw event table for that experiment.
# NOTE(review): RAW_DIR is presumably provided by `from config import *` — confirm.
df_raw = pd.read_parquet(f"{RAW_DIR}/{experiment}_events.pq")

In [36]:
# Display the raw event table as loaded.
df_raw

Unnamed: 0,experiment,participant_id,trial_id,time,event,eye,colour,stimulus_x,stimulus_y,start_time,...,avg_pupil_size,start_x,start_y,end_x,end_y,amplitude,peak_velocity,side,time_elapsed,delay
0,ANTI_SACCADE,103,0.0,2354775.0,TRIALID,,,,,,...,,,,,,,,,,
1,ANTI_SACCADE,103,0.0,2354807.0,START,,,,,,...,,,,,,,,,,
2,ANTI_SACCADE,103,0.0,2354819.0,SBLINK,R,,,,,...,,,,,,,,,,
3,ANTI_SACCADE,103,0.0,2354823.0,SBLINK,L,,,,,...,,,,,,,,,,
4,ANTI_SACCADE,103,0.0,2354842.0,FIXPOINT,,255 255 255,960.0,540.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135567,ANTI_SACCADE,96,5.0,,ESACC,L,,,,2456380.0,...,,1594.6,499.8,1005.1,699.8,11.12,1671.0,,,
135568,ANTI_SACCADE,96,5.0,,EBLINK,R,,,,2456367.0,...,,,,,,,,,,
135569,ANTI_SACCADE,96,5.0,,ESACC,R,,,,2456354.0,...,,1640.2,568.1,1634.3,562.2,0.15,45.0,,,
135570,ANTI_SACCADE,96,5.0,,EFIX,L,,,,2456597.0,...,1443.0,,,,,,,,,


## Experiment specific preprocessing

### Anti-saccade

In [38]:
def transform_numeric_columns(df):
    """Return a copy of ``df`` with the known numeric columns cast to numbers.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).
    The frame passed in is left untouched, so the function is safe to use in
    a ``.pipe`` chain without rewriting the caller's data.

    Args:
        df: Event table containing every column listed in ``numeric_columns``.

    Returns:
        A new DataFrame with the same index and column order as ``df``.

    Raises:
        KeyError: If one of the expected columns is missing from ``df``.
    """
    numeric_columns = ['participant_id', 'trial_id', 'time', 'stimulus_x', 'stimulus_y', 'start_time', 'end_time',
                       'duration', 'x', 'y', 'avg_pupil_size', 'start_x', 'start_y', 'end_x', 'end_y', 'amplitude',
                       'peak_velocity', 'time_elapsed', 'delay']

    # ``assign`` works on a copy, so the caller's frame is never modified.
    return df.assign(
        **{col: pd.to_numeric(df[col], errors='coerce') for col in numeric_columns}
    )

def coalesce_time_elapsed(df):
    """Fill missing ``time_elapsed`` values from ``delay`` and drop ``delay``.

    Returns a new DataFrame; the input frame keeps its ``delay`` column.
    """
    result = df.copy()
    # Back-filling across the two columns copies ``delay`` into
    # ``time_elapsed`` wherever ``time_elapsed`` is missing.
    pair = result[['time_elapsed', 'delay']]
    result['time_elapsed'] = pair.bfill(axis=1)['time_elapsed']
    return result.drop(columns=['delay'])

def fill_values_side(df):
    """Propagate the ``side`` value to every row of its trial.

    Rows are ordered by participant, trial and time; within each
    (participant_id, trial_id) group ``side`` is forward-filled and then
    back-filled, so rows before and after the row carrying the value get it.
    """
    trial_keys = ['participant_id', 'trial_id']

    def _spread_side(trial):
        return trial.assign(side=trial['side'].ffill().bfill())

    chronological = df.sort_values(trial_keys + ['time'])
    # Selecting all columns explicitly keeps the grouping columns in the
    # frames handed to ``apply``.
    per_trial = chronological.groupby(trial_keys, group_keys=False)[df.columns]
    return per_trial.apply(_spread_side)

# def stimulus_onset_time(df):
#     return (
#         df.sort_values(['participant_id', 'trial_id', 'time'])
#         .groupby(['participant_id', 'trial_id'], group_keys=False)[df.columns]
#         .apply(lambda g: g.assign(
#                 time = g.apply(lambda row: 
#                     row['time'] if row['colour'] != '255 0 0' 
#                         else (g.loc[(g['colour'] == '255 255 255'), 'time'].iloc[0] + 1000 * g.loc[(g['event'] == 'TRIAL_VAR_DATA'), 'time_elapsed'].iloc[0]), 
#                     axis=1))
#                )


In [39]:
def stimulus_onset_time(df):
    """Overwrite the time of red-stimulus rows with a computed onset time.

    For every (participant_id, trial_id) group that contains rows whose
    ``colour`` is ``'255 0 0'``, the ``time`` of those rows is replaced by

        time of the first ``'255 255 255'`` row
        + 1000 * ``time_elapsed`` of the first ``TRIAL_VAR_DATA`` row

    Groups lacking either reference row are passed through unchanged.
    NOTE(review): the factor 1000 presumably converts seconds to
    milliseconds — confirm against the recording format.

    Returns:
        The per-trial frames concatenated in group order, or a copy of ``df``
        when there are no groups at all.
    """
    trial_keys = ['participant_id', 'trial_id']

    def _shift_red_rows(trial):
        is_red = trial['colour'] == '255 0 0'
        if not is_red.any():
            return trial

        white_times = trial.loc[trial['colour'] == '255 255 255', 'time']
        if white_times.empty:
            # No white reference row: leave the trial as it is.
            return trial

        elapsed = trial.loc[trial['event'] == 'TRIAL_VAR_DATA', 'time_elapsed']
        if elapsed.empty:
            # No TRIAL_VAR_DATA row: leave the trial as it is.
            return trial

        onset = white_times.iloc[0] + 1000 * elapsed.iloc[0]
        shifted = trial['time'].copy()
        shifted.loc[is_red] = onset
        return trial.assign(time=shifted)

    chronological = df.sort_values(trial_keys + ['time'])
    adjusted = [_shift_red_rows(trial) for _, trial in chronological.groupby(trial_keys)]

    if not adjusted:
        return df.copy()
    return pd.concat(adjusted)

In [52]:
# Experiment-specific stage for the anti-saccade data: cast numeric columns,
# merge `delay` into `time_elapsed`, and spread `side` over each trial.
# The stimulus-onset correction step is currently disabled (commented out).
df_exp_trans = (
    df_raw.pipe(transform_numeric_columns)
    .pipe(coalesce_time_elapsed)
    .pipe(fill_values_side)
    # .pipe(stimulus_onset_time)
)

In [53]:
# Sanity check: list any rows whose trial_id is missing.
df_exp_trans[df_exp_trans["trial_id"].isna()]

Unnamed: 0,experiment,participant_id,trial_id,time,event,eye,colour,stimulus_x,stimulus_y,start_time,...,y,avg_pupil_size,start_x,start_y,end_x,end_y,amplitude,peak_velocity,side,time_elapsed


In [41]:
# Display the table produced by the experiment-specific stage.
df_exp_trans

Unnamed: 0,experiment,participant_id,trial_id,time,event,eye,colour,stimulus_x,stimulus_y,start_time,...,y,avg_pupil_size,start_x,start_y,end_x,end_y,amplitude,peak_velocity,side,time_elapsed
132725,ANTI_SACCADE,87,0.0,4707367.0,TRIALID,,,,,,...,,,,,,,,,left,
132726,ANTI_SACCADE,87,0.0,4707396.0,START,,,,,,...,,,,,,,,,left,
132727,ANTI_SACCADE,87,0.0,4707406.0,SFIX,R,,,,,...,,,,,,,,,left,
132728,ANTI_SACCADE,87,0.0,4707414.0,SFIX,R,,,,,...,,,,,,,,,left,
132729,ANTI_SACCADE,87,0.0,4707415.0,SFIX,L,,,,,...,,,,,,,,,left,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132720,ANTI_SACCADE,404,15.0,,ESACC,L,,,,3775564.0,...,,,1877.9,28.1,1945.0,,1.30,179.0,left,
132721,ANTI_SACCADE,404,15.0,,EFIX,R,,,,3775597.0,...,,871.0,,,,,,,left,
132722,ANTI_SACCADE,404,15.0,,EFIX,L,,,,3775599.0,...,12.2,1080.0,,,,,,,left,
132723,ANTI_SACCADE,404,15.0,,EBLINK,R,,,,3775803.0,...,,,,,,,,,left,


## General preprocessing

In [42]:
def coalesce_time(df):
    """Fill missing ``time`` values with the row's ``end_time``.

    Rows that already have a ``time`` keep it; rows without one take
    ``end_time`` instead. The input frame is not modified.

    Args:
        df: Event table with ``time`` and ``end_time`` columns.

    Returns:
        A new DataFrame with the same columns, where ``time`` is coalesced.
    """
    # Select the single target column explicitly rather than assigning a
    # two-column frame to one column and relying on implicit label alignment.
    return df.assign(time=df["time"].fillna(df["end_time"]))
    
def remove_start_events(df):
    """Return the rows of ``df`` whose event is neither SFIX nor SSACC."""
    is_start_event = df["event"].isin(["SFIX", "SSACC"])
    return df.loc[~is_start_event, :]

def group_df(df):
    """Group events per (participant_id, trial_id), time-ordered within each.

    The frame is first sorted by participant, trial and time, so
    order-dependent group operations see each trial's rows chronologically.

    Returns:
        A pandas ``DataFrameGroupBy`` over the sorted copy of ``df``.
    """
    trial_keys = ["participant_id", "trial_id"]
    chronological = df.sort_values(trial_keys + ["time"])
    return chronological.groupby(trial_keys)

def standardise_time(df):
    """Express ``time``, ``start_time`` and ``end_time`` relative to trial start.

    The reference point of each (participant_id, trial_id) group is the
    smallest ``time`` value in that group; it is subtracted from all three
    time columns. The input frame is not modified.

    Args:
        df: Event table with ``participant_id``, ``trial_id``, ``time``,
            ``start_time`` and ``end_time`` columns.

    Returns:
        A new DataFrame with the same columns and row order as ``df``.
    """
    # A group-wise minimum does not depend on row order, so no sorting is
    # needed. Computing it once on ``df`` keeps the result on ``df``'s own
    # index, which also works when index labels are not unique.
    trial_start = df.groupby(["participant_id", "trial_id"])["time"].transform("min")

    return df.assign(
        time=df["time"] - trial_start,
        start_time=df["start_time"] - trial_start,
        end_time=df["end_time"] - trial_start,
    )

def fill_values(df):
    """Carry stimulus attributes forward in time within each trial.

    ``colour``, ``stimulus_x`` and ``stimulus_y`` are forward-filled inside
    every (participant_id, trial_id) group, following the time order set up
    by ``group_df``. The input frame is not modified.

    Args:
        df: Event table with the grouping columns, ``time`` and the three
            stimulus columns.

    Returns:
        A new DataFrame with the same columns and row order as ``df``.
    """
    carried = ["colour", "stimulus_x", "stimulus_y"]

    # One grouped forward-fill covers all three columns; the result is
    # indexed like the sorted frame and is aligned back to ``df`` by label.
    filled = group_df(df)[carried].ffill()

    return df.assign(**{name: filled[name] for name in carried})

In [43]:
def _saccade_keep_flags(events):
    """Return one keep/drop flag per event for a single, time-ordered trial."""
    flags = []
    in_saccade = False
    saw_blink_start = False
    saw_blink_end = False

    for event in events:
        keep = True
        if event == 'SSACC':
            # A new saccade starts: forget blinks seen during the previous one.
            in_saccade = True
            saw_blink_start = False
            saw_blink_end = False
        elif in_saccade:
            if event == 'SBLINK':
                saw_blink_start = True
            elif event == 'EBLINK':
                saw_blink_end = True
            elif event == 'ESACC':
                # Drop the saccade end only when a complete blink
                # (start and end) happened since the saccade started.
                keep = not (saw_blink_start and saw_blink_end)
                in_saccade = False
        flags.append(keep)

    return flags


def remove_invalid_saccades(df):
    """
    Remove ESACC events that have a blink occurring during the saccade.

    A saccade is considered invalid if there is a SBLINK and EBLINK event
    between the corresponding SSACC and ESACC events. Only the ESACC row is
    removed; the SSACC, SBLINK and EBLINK rows are kept.

    Events are processed per (participant_id, trial_id) in time order as a
    single stream; the ``eye`` column is not consulted. Rows are selected
    with a boolean mask, so column dtypes are preserved.

    Args:
        df: Event table with ``participant_id``, ``trial_id``, ``time`` and
            ``event`` columns.

    Returns:
        A new DataFrame with the surviving rows, trial by trial. The index
        restarts at 0 within every trial, so labels repeat across trials.
        If there are no trials, an empty frame with ``df``'s columns.
    """
    trial_keys = ["participant_id", "trial_id"]
    chronological = df.sort_values(trial_keys + ["time"])

    kept_trials = []
    for _, trial in chronological.groupby(trial_keys):
        # Positional index per trial, as in earlier versions of this step.
        trial = trial.reset_index(drop=True)
        kept_trials.append(trial.loc[_saccade_keep_flags(trial['event'])])

    if not kept_trials:
        return pd.DataFrame(columns=df.columns)
    return pd.concat(kept_trials)

In [44]:
# General preprocessing stage, applied in order: fill missing event times from
# end_time, make times relative to each trial's first timestamp, carry the
# stimulus attributes forward, and drop saccade ends interrupted by a blink.
# The SFIX/SSACC start markers are removed last because the saccade check
# needs the SSACC rows to know where a saccade begins.
df_trans = (
    df_exp_trans
    .pipe(coalesce_time)
    .pipe(standardise_time)
    .pipe(fill_values)
    .pipe(remove_invalid_saccades)
    .pipe(remove_start_events)
)

In [None]:
# Preview the first rows of the fully preprocessed table.
df_trans.head()

Unnamed: 0,experiment,participant_id,trial_id,time,event,eye,colour,stimulus_x,stimulus_y,start_time,...,y,avg_pupil_size,start_x,start_y,end_x,end_y,amplitude,peak_velocity,side,time_elapsed
0,ANTI_SACCADE,87,0.0,0.0,TRIALID,,,,,,...,,,,,,,,,left,
1,ANTI_SACCADE,87,0.0,29.0,START,,,,,,...,,,,,,,,,left,
5,ANTI_SACCADE,87,0.0,71.0,FIXPOINT,,255 255 255,960.0,540.0,,...,,,,,,,,,left,
6,ANTI_SACCADE,87,0.0,248.0,EFIX,L,255 255 255,960.0,540.0,48.0,...,509.6,2888.0,,,,,,,left,
8,ANTI_SACCADE,87,0.0,249.0,EFIX,R,255 255 255,960.0,540.0,47.0,...,553.6,3079.0,,,,,,,left,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,ANTI_SACCADE,404,15.0,2476.0,SBLINK,R,255 0 0,629.0,540.0,,...,,,,,,,,,left,
49,ANTI_SACCADE,404,15.0,2529.0,TRIAL_VAR_DATA,,255 0 0,629.0,540.0,,...,,,,,,,,,left,1.462709
50,ANTI_SACCADE,404,15.0,2630.0,EFIX,L,255 0 0,629.0,540.0,2273.0,...,12.2,1080.0,,,,,,,left,
51,ANTI_SACCADE,404,15.0,2631.0,EBLINK,R,255 0 0,629.0,540.0,2477.0,...,,,,,,,,,left,
