# Preprocess event data

In [2]:
import pandas as pd
from config import *

In [12]:
# Name of the experiment to preprocess. It selects the input file here, the
# experiment-specific branch further down, and the output file name.
experiment = "EVIL_BASTARD"
# Load the raw event table for that experiment.
# RAW_DIR is presumably provided by `from config import *` — TODO confirm.
df_raw = pd.read_parquet(f"{RAW_DIR}/{experiment}_events.pq")

In [4]:
# Preview the first rows of the raw event table.
df_raw.head()

Unnamed: 0,experiment,participant_id,trial_id,time,event,eye,start_time,end_time,duration,x,...,end_y,amplitude,peak_velocity,angle,speed,target_x,target_y,colour,stimulus_x,stimulus_y
0,EVIL_BASTARD,105,0.0,17145614.0,TRIALID,,,,,,...,,,,,,,,,,
1,EVIL_BASTARD,105,0.0,17145624.0,START,,,,,,...,,,,,,,,,,
2,EVIL_BASTARD,105,0.0,17145631.0,SFIX,L,,,,,...,,,,,,,,,,
3,EVIL_BASTARD,105,0.0,17145631.0,SFIX,R,,,,,...,,,,,,,,,,
4,EVIL_BASTARD,105,0.0,17145681.0,SSACC,L,,,,,...,,,,,,,,,,


In [14]:
# Restrict the run to a single participant; participant_id is compared as a
# string here.
# NOTE(review): the participant is hard-coded, so everything derived from
# df_raw below only covers participant "105" — confirm this is intended.
# .copy() detaches the subset from the loaded frame so that later steps work
# on an independent frame rather than on a slice of the original data.
df_raw = df_raw.loc[df_raw["participant_id"] == "105", :].copy()

## Experiment specific preprocessing

### Anti-saccade

In [16]:
def transform_numeric_columns(df):
    """Return a copy of ``df`` with the known numeric columns cast to numbers.

    Every column listed below is passed through ``pd.to_numeric`` with
    ``errors='coerce'``, so values that cannot be parsed become NaN.  All
    other columns, and the column order, are left as they are.  The input
    frame is not modified.

    Args:
        df: Event table containing every column listed below.

    Returns:
        A new DataFrame with the listed columns converted.

    Raises:
        KeyError: If one of the listed columns is missing from ``df``.
    """
    numeric_columns = ['participant_id', 'trial_id', 'time', 'stimulus_x', 'stimulus_y', 'start_time', 'end_time',
                       'duration', 'x', 'y', 'avg_pupil_size', 'start_x', 'start_y', 'end_x', 'end_y', 'amplitude',
                       'peak_velocity', 'time_elapsed', 'delay']

    converted = {
        col: pd.to_numeric(df[col], errors='coerce')
        for col in numeric_columns
    }

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    return df.assign(**converted)

def coalesce_time_elapsed(df):
    """Fill gaps in ``time_elapsed`` from ``delay`` and drop ``delay``.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    # Row-wise backfill over the ordered pair (time_elapsed, delay): a missing
    # time_elapsed takes the value of delay from the same row.
    pair = df[['time_elapsed', 'delay']].bfill(axis=1)
    merged = df.assign(time_elapsed=pair['time_elapsed'])
    return merged.drop(columns=['delay'])

def fill_values_side(df):
    """Spread the known ``side`` value over every row of its trial.

    Rows are ordered by participant, trial and time; within each
    (participant_id, trial_id) group, missing ``side`` values are filled
    forward and then backward.
    """
    def _spread_side(trial):
        # Forward fill first, then backward fill for rows before the first
        # known value.
        return trial.assign(side=trial['side'].ffill().bfill())

    ordered = df.sort_values(['participant_id', 'trial_id', 'time'])
    per_trial = ordered.groupby(['participant_id', 'trial_id'], group_keys=False)[df.columns]
    return per_trial.apply(_spread_side)

# def stimulus_onset_time(df):
#     return (
#         df.sort_values(['participant_id', 'trial_id', 'time'])
#         .groupby(['participant_id', 'trial_id'], group_keys=False)[df.columns]
#         .apply(lambda g: g.assign(
#                 time = g.apply(lambda row: 
#                     row['time'] if row['colour'] != '255 0 0' 
#                         else (g.loc[(g['colour'] == '255 255 255'), 'time'].iloc[0] + 1000 * g.loc[(g['event'] == 'TRIAL_VAR_DATA'), 'time_elapsed'].iloc[0]), 
#                     axis=1))
#                )


In [17]:
def stimulus_onset_time(df):
    """Recompute the ``time`` of red-stimulus rows, trial by trial.

    For every (participant_id, trial_id) group, rows whose ``colour`` is
    '255 0 0' get their ``time`` replaced by

        time of the first '255 255 255' row
        + 1000 * time_elapsed of the first 'TRIAL_VAR_DATA' row

    A trial is left unchanged when it has no red rows, or when either of the
    two reference rows is missing.  The result is the concatenation of the
    trials in sorted order; if there are no trials a copy of ``df`` is
    returned.

    NOTE(review): the factor 1000 presumably converts seconds to
    milliseconds — confirm against the recording format.
    """
    trial_keys = ['participant_id', 'trial_id']

    def _retime(trial):
        is_red = trial['colour'] == '255 0 0'
        if not is_red.any():
            return trial

        # Reference 1: first row of the trial with the white colour.
        white_times = trial.loc[trial['colour'] == '255 255 255', 'time']
        if white_times.empty:
            return trial

        # Reference 2: elapsed time reported by the first TRIAL_VAR_DATA row.
        elapsed = trial.loc[trial['event'] == 'TRIAL_VAR_DATA', 'time_elapsed']
        if elapsed.empty:
            return trial

        onset = white_times.iloc[0] + 1000 * elapsed.iloc[0]

        # Overwrite only the red rows, on a copy of the time column.
        shifted = trial['time'].copy()
        shifted.loc[is_red] = onset
        return trial.assign(time=shifted)

    ordered = df.sort_values(trial_keys + ['time'])
    adjusted = [_retime(trial) for _, trial in ordered.groupby(trial_keys)]

    if not adjusted:
        return df.copy()
    return pd.concat(adjusted)

In [18]:
# Experiment-specific preprocessing. Only ANTI_SACCADE has dedicated steps;
# EVIL_BASTARD and every other experiment use the raw events unchanged.
if experiment != "ANTI_SACCADE":
    df_exp_trans = df_raw
else:
    df_exp_trans = transform_numeric_columns(df_raw)
    df_exp_trans = coalesce_time_elapsed(df_exp_trans)
    df_exp_trans = fill_values_side(df_exp_trans)
    df_exp_trans = stimulus_onset_time(df_exp_trans)

In [19]:
# Preview the table after the experiment-specific step.
df_exp_trans.head()

Unnamed: 0,experiment,participant_id,trial_id,time,event,eye,start_time,end_time,duration,x,...,end_y,amplitude,peak_velocity,angle,speed,target_x,target_y,colour,stimulus_x,stimulus_y
0,EVIL_BASTARD,105,0.0,17145614.0,TRIALID,,,,,,...,,,,,,,,,,
1,EVIL_BASTARD,105,0.0,17145624.0,START,,,,,,...,,,,,,,,,,
2,EVIL_BASTARD,105,0.0,17145631.0,SFIX,L,,,,,...,,,,,,,,,,
3,EVIL_BASTARD,105,0.0,17145631.0,SFIX,R,,,,,...,,,,,,,,,,
4,EVIL_BASTARD,105,0.0,17145681.0,SSACC,L,,,,,...,,,,,,,,,,


## General preprocessing

In [20]:
def coalesce_time(df):
    """Fill missing ``time`` values with ``end_time`` from the same row.

    Rows that already have a ``time`` keep it; rows where both columns are
    missing stay missing.

    Args:
        df: Event table with ``time`` and ``end_time`` columns.

    Returns:
        A new DataFrame with the filled ``time`` column.  The input frame is
        not modified.
    """
    # Build the filled column as a single Series.  Assigning the two-column
    # result of ``df[["time", "end_time"]].bfill(axis=1)`` to one column only
    # works when pandas aligns it by column label, and can raise on frames
    # with a single dtype.
    filled_time = df["time"].fillna(df["end_time"])

    return df.assign(time=filled_time)
    
def remove_start_events(df):
    """Drop the rows whose ``event`` is ``SFIX`` or ``SSACC``.

    All other rows are returned with their original index labels.
    """
    is_start = df["event"].isin(["SFIX", "SSACC"])
    return df.loc[~is_start, :]

def group_df(df):
    """Group the events by trial after ordering them in time.

    The frame is sorted by participant_id, trial_id and time, then grouped
    by (participant_id, trial_id).  Returns the pandas groupby object.
    """
    trial_keys = ["participant_id", "trial_id"]
    ordered = df.sort_values([*trial_keys, "time"])
    return ordered.groupby(trial_keys)

def standardise_time(df):
    """Express ``time``, ``start_time`` and ``end_time`` relative to trial start.

    The smallest ``time`` of each (participant_id, trial_id) group is
    subtracted from all three columns, so every trial starts at 0.

    Args:
        df: Event table with participant_id, trial_id, time, start_time and
            end_time columns.

    Returns:
        A new DataFrame with the three shifted columns.  The input frame is
        not modified.
    """
    # Earliest ``time`` of each trial, broadcast to every row of that trial.
    # Computed once, from the unshifted values, so all three columns are
    # moved by the same offset.
    trial_start = group_df(df).time.transform('min')

    # The subtraction and assign() both align on the index, which maps the
    # time-sorted offsets back onto the rows of ``df``.
    return df.assign(
        time=df["time"] - trial_start,
        start_time=df["start_time"] - trial_start,
        end_time=df["end_time"] - trial_start,
    )

def fill_values(df):
    """Forward-fill the stimulus columns within each trial.

    Within every (participant_id, trial_id) group, in time order, missing
    values of ``colour``, ``stimulus_x`` and ``stimulus_y`` take the most
    recent earlier value.  Rows before the first known value stay missing.

    Args:
        df: Event table with participant_id, trial_id, time and the three
            stimulus columns.

    Returns:
        A new DataFrame with the filled columns.  The input frame is not
        modified.
    """
    stimulus_columns = ["colour", "stimulus_x", "stimulus_y"]

    # One grouped forward fill for all three columns; it never carries a
    # value across a trial boundary.
    filled = group_df(df)[stimulus_columns].ffill()

    # assign() aligns on the index, which maps the time-sorted result back
    # onto the rows of ``df``.
    return df.assign(**{col: filled[col] for col in stimulus_columns})

In [21]:
def _saccade_keep_flags(events):
    """Return one boolean per event: False for ESACC rows to drop."""
    flags = []
    in_saccade = False
    saw_sblink = False
    saw_eblink = False

    for event in events:
        keep = True

        if event == 'SSACC':
            # A new saccade starts: forget blinks seen in the previous one.
            in_saccade = True
            saw_sblink = False
            saw_eblink = False
        elif in_saccade:
            if event == 'SBLINK':
                saw_sblink = True
            elif event == 'EBLINK':
                saw_eblink = True
            elif event == 'ESACC':
                # Drop the saccade end only when a complete blink
                # (start and end) happened since the saccade start.
                keep = not (saw_sblink and saw_eblink)
                in_saccade = False

        flags.append(keep)

    return flags


def remove_invalid_saccades(df):
    """
    Remove ESACC events that have a blink occurring during the saccade.

    A saccade is considered invalid if there is a SBLINK and EBLINK event
    between the corresponding SSACC and ESACC events.  Only the ESACC row of
    an invalid saccade is removed; the SSACC, SBLINK and EBLINK rows and all
    other events are kept.

    Events are processed per (participant_id, trial_id) in time order.  The
    rows are selected with a boolean mask instead of being rebuilt row by
    row, so column dtypes are preserved.

    The index of the result is the row position within each sorted trial
    (not the index of ``df``), so labels repeat across trials.

    NOTE(review): the saccade state is shared by all rows of a trial and is
    not tracked per eye — confirm this is intended for binocular recordings.

    Returns a new DataFrame; for input without any trial, an empty
    DataFrame with the columns of ``df``.
    """
    df_sorted = df.sort_values(["participant_id", "trial_id", "time"])

    results = []
    for _, group in df_sorted.groupby(["participant_id", "trial_id"]):
        # Positional index within the trial.
        group = group.reset_index(drop=True)

        keep = _saccade_keep_flags(group['event'].tolist())
        if any(keep):
            results.append(group.loc[keep])

    return pd.concat(results) if results else pd.DataFrame(columns=df.columns)

In [22]:
# General preprocessing: each step takes the output of the previous one.
df_trans = coalesce_time(df_exp_trans)
df_trans = standardise_time(df_trans)
df_trans = fill_values(df_trans)
df_trans = remove_invalid_saccades(df_trans)
df_trans = remove_start_events(df_trans)

In [23]:
# Preview the fully preprocessed events.
df_trans.head()

Unnamed: 0,experiment,participant_id,trial_id,time,event,eye,start_time,end_time,duration,x,...,end_y,amplitude,peak_velocity,angle,speed,target_x,target_y,colour,stimulus_x,stimulus_y
0,EVIL_BASTARD,105,0.0,0.0,TRIALID,,,,,,...,,,,,,,,,,
1,EVIL_BASTARD,105,0.0,10.0,START,,,,,,...,,,,,,,,,,
4,EVIL_BASTARD,105,0.0,66.0,EFIX,L,17.0,66.0,49.0,998.5,...,,,,,,,,,,
5,EVIL_BASTARD,105,0.0,66.0,EFIX,R,17.0,66.0,49.0,971.7,...,,,,,,,,,,
8,EVIL_BASTARD,105,0.0,83.0,ESACC,R,67.0,83.0,16.0,,...,533.9,0.19,50.0,,,,,,,


In [24]:
# Write the preprocessed events to "<experiment>_events.pq" under
# PREPROCESSED_DIR (presumably a pathlib.Path from config — TODO confirm).
df_trans.to_parquet(PREPROCESSED_DIR / f"{experiment}_events.pq")