In [1]:
import pandas as pd
from config import *
# Participant ids, stored as strings, that the cleaning pipeline below drops
# via exclude_special_participants.
# NOTE(review): the reason these participants are "special" is not recorded here - please document.
SPECIAL_PARTICIPANTS = ["87", "89", "93", "96", "103", "105", "109", "117", "118", "119", "120", "127", "128", "141"]

# Load the raw event table. RAW_DIR is presumably provided by the star import of config - confirm.
anti_saccade = pd.read_parquet(RAW_DIR / "ANTI_SACCADE.pq")
# Bare expression so the notebook renders the frame as the cell output.
anti_saccade

Unnamed: 0,experiment,participant_id,trial_id,time,event,colour,stimulus_x,stimulus_y,eye,start_time,...,avg_pupil_size,start_x,start_y,end_x,end_y,amplitude,peak_velocity,side,time_elapsed,delay
0,ANTI_SACCADE,103,0.0,2354775.0,TRIALID,,,,,,...,,,,,,,,,,
1,ANTI_SACCADE,103,0.0,2354807.0,START,,,,,,...,,,,,,,,,,
2,ANTI_SACCADE,103,0.0,2354842.0,FIXPOINT,255 255 255,960.0,540.0,,,...,,,,,,,,,,
3,ANTI_SACCADE,103,0.0,2354987.0,SFIX,,,,L,,...,,,,,,,,,,
4,ANTI_SACCADE,103,0.0,2355008.0,SFIX,,,,R,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127091,ANTI_SACCADE,96,5.0,,EFIX,,,,L,2454912.0,...,1458.0,,,,,,,,,
127092,ANTI_SACCADE,96,5.0,,ESACC,,,,L,2456380.0,...,,1594.6,499.8,1005.1,699.8,11.12,1671.0,,,
127093,ANTI_SACCADE,96,5.0,,ESACC,,,,R,2456354.0,...,,1640.2,568.1,1634.3,562.2,0.15,45.0,,,
127094,ANTI_SACCADE,96,5.0,,EFIX,,,,L,2456597.0,...,1443.0,,,,,,,,,


In [None]:
def exclude_nan_participants(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every row whose ``participant_id`` is missing.

    Args:
        df: Event table with a ``participant_id`` column.

    Returns:
        The rows of ``df`` that have a non-null ``participant_id``; the
        original index labels are kept.
    """
    print("Removing na participants")
    print()

    participant_missing = df["participant_id"].isna()
    return df.loc[~participant_missing]


def exclude_special_participants(df: pd.DataFrame, special_participants: list[str]) -> pd.DataFrame:
    """Drop every row that belongs to one of the listed participants.

    Args:
        df: Event table with a ``participant_id`` column.
        special_participants: Participant ids to remove; they are compared
            against the column values with ``Series.isin``.

    Returns:
        The rows of ``df`` whose ``participant_id`` is not listed; the
        original index labels are kept.
    """
    print(f"Removing special participants: {special_participants}")
    print()

    is_special = df["participant_id"].isin(special_participants)
    return df.loc[~is_special]

def check_trialid_event(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the trials that contain exactly one ``TRIALID`` event.

    A trial is identified by the pair ``(participant_id, trial_id)``. Rows
    whose pair has zero or several ``TRIALID`` events, or whose key contains
    NaN, are dropped.

    Args:
        df: Event table with ``participant_id``, ``trial_id`` and ``event``
            columns.

    Returns:
        The rows of ``df`` that belong to a valid trial; the original index
        labels are kept.
    """
    print("Checking if there is a trial_id")
    keys = ["participant_id", "trial_id"]

    # Events per trial. groupby drops NaN keys, so rows without a trial_id
    # can never be counted as valid.
    n_events = df.loc[df["event"] == "TRIALID"].groupby(keys).size()
    valid_trials = n_events[n_events == 1].index

    # Match on the (participant, trial) PAIR. Testing the two columns
    # independently would keep a broken trial whenever its participant has
    # another valid trial and its trial number is valid for someone else.
    keep = pd.MultiIndex.from_frame(df[keys]).isin(valid_trials)
    filtered_df = df.loc[keep]

    rows_removed = len(df) - len(filtered_df)

    print("Removed", rows_removed, "rows")
    if rows_removed > 0:
        # One [p_id, t_id] entry per removed trial, taken straight from the mask.
        removed_trials = df.loc[~keep, keys].drop_duplicates()
        print("Removed rows with [p_id, t_id]:\n", removed_trials.values.tolist())
    print()
    return filtered_df

def check_fixpoint_amount(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the trials with the expected number of ``FIXPOINT`` events.

    The rule is defined for the ``ANTI_SACCADE`` experiment only, where a
    trial, identified by ``(participant_id, trial_id)``, must contain exactly
    two ``FIXPOINT`` events. The experiment is read from the first distinct
    value of the ``experiment`` column.

    Args:
        df: Event table with ``experiment``, ``participant_id``, ``trial_id``
            and ``event`` columns.

    Returns:
        The rows of ``df`` that belong to a valid trial (original index
        labels kept). For an empty frame or any other experiment ``df`` is
        returned unchanged, so the function stays usable inside ``.pipe``.
    """
    print("Checking if there are the correct amount of fixpoints for the given experiment")
    experiments = df["experiment"].unique()
    if len(experiments) == 0 or experiments[0] != "ANTI_SACCADE":
        # No rule to apply: hand the data back instead of None, which would
        # crash the next step of a pipeline.
        print("No fixpoint rule defined for this experiment, data left unchanged")
        print()
        return df

    keys = ["participant_id", "trial_id"]

    # Events per trial. groupby drops NaN keys, so rows without a trial_id
    # can never be counted as valid.
    n_fixpoints = df.loc[df["event"] == "FIXPOINT"].groupby(keys).size()
    valid_trials = n_fixpoints[n_fixpoints == 2].index

    # Match on the (participant, trial) PAIR. Testing the two columns
    # independently would keep a broken trial whenever its participant has
    # another valid trial and its trial number is valid for someone else.
    keep = pd.MultiIndex.from_frame(df[keys]).isin(valid_trials)
    filtered_df = df.loc[keep]

    rows_removed = len(df) - len(filtered_df)

    print("Removed", rows_removed, "rows")
    if rows_removed > 0:
        # One [p_id, t_id] entry per removed trial, taken straight from the mask.
        removed_trials = df.loc[~keep, keys].drop_duplicates()
        print("Removed rows with [p_id, t_id]:\n", removed_trials.values.tolist())
    print()
    return filtered_df

def check_red_fixpoint(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the trials that contain exactly one red ``FIXPOINT`` event.

    A red fixpoint is a ``FIXPOINT`` event whose ``colour`` is ``'255 0 0'``.
    The rule is defined for the ``ANTI_SACCADE`` experiment only; the
    experiment is read from the first distinct value of the ``experiment``
    column. A trial is identified by ``(participant_id, trial_id)``.

    Args:
        df: Event table with ``experiment``, ``participant_id``, ``trial_id``,
            ``event`` and ``colour`` columns.

    Returns:
        The rows of ``df`` that belong to a valid trial (original index
        labels kept). For an empty frame or any other experiment ``df`` is
        returned unchanged, so the function stays usable inside ``.pipe``.
    """
    print("Checking if there are the correct amount of red fixpoints for the given experiment")
    experiments = df["experiment"].unique()
    if len(experiments) == 0 or experiments[0] != "ANTI_SACCADE":
        # No rule to apply: hand the data back instead of None, which would
        # crash the next step of a pipeline.
        print("No red fixpoint rule defined for this experiment, data left unchanged")
        print()
        return df

    keys = ["participant_id", "trial_id"]

    # Red fixpoints per trial. groupby drops NaN keys, so rows without a
    # trial_id can never be counted as valid.
    is_red_fixpoint = (df["event"] == "FIXPOINT") & (df["colour"] == "255 0 0")
    n_red_fixpoints = df.loc[is_red_fixpoint].groupby(keys).size()
    valid_trials = n_red_fixpoints[n_red_fixpoints == 1].index

    # Match on the (participant, trial) PAIR. Testing the two columns
    # independently would keep a broken trial whenever its participant has
    # another valid trial and its trial number is valid for someone else.
    keep = pd.MultiIndex.from_frame(df[keys]).isin(valid_trials)
    filtered_df = df.loc[keep]

    rows_removed = len(df) - len(filtered_df)

    print("Removed", rows_removed, "rows")
    if rows_removed > 0:
        # One [p_id, t_id] entry per removed trial, taken straight from the mask.
        removed_trials = df.loc[~keep, keys].drop_duplicates()
        print("Removed rows with [p_id, t_id]:\n", removed_trials.values.tolist())
    print()
    return filtered_df

def check_white_fixpoint(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the trials that contain exactly one white ``FIXPOINT`` event.

    A white fixpoint is a ``FIXPOINT`` event whose ``colour`` is
    ``'255 255 255'``. The rule is defined for the ``ANTI_SACCADE`` experiment
    only; the experiment is read from the first distinct value of the
    ``experiment`` column. A trial is identified by
    ``(participant_id, trial_id)``.

    Args:
        df: Event table with ``experiment``, ``participant_id``, ``trial_id``,
            ``event`` and ``colour`` columns.

    Returns:
        The rows of ``df`` that belong to a valid trial (original index
        labels kept). For an empty frame or any other experiment ``df`` is
        returned unchanged, so the function stays usable inside ``.pipe``.
    """
    print("Checking if there are the correct amount of white fixpoints for the given experiment")
    experiments = df["experiment"].unique()
    if len(experiments) == 0 or experiments[0] != "ANTI_SACCADE":
        # No rule to apply: hand the data back instead of None, which would
        # crash the next step of a pipeline.
        print("No white fixpoint rule defined for this experiment, data left unchanged")
        print()
        return df

    keys = ["participant_id", "trial_id"]

    # White fixpoints per trial. groupby drops NaN keys, so rows without a
    # trial_id can never be counted as valid.
    is_white_fixpoint = (df["event"] == "FIXPOINT") & (df["colour"] == "255 255 255")
    n_white_fixpoints = df.loc[is_white_fixpoint].groupby(keys).size()
    valid_trials = n_white_fixpoints[n_white_fixpoints == 1].index

    # Match on the (participant, trial) PAIR. Testing the two columns
    # independently would keep a broken trial whenever its participant has
    # another valid trial and its trial number is valid for someone else.
    keep = pd.MultiIndex.from_frame(df[keys]).isin(valid_trials)
    filtered_df = df.loc[keep]

    rows_removed = len(df) - len(filtered_df)

    print("Removed", rows_removed, "rows")
    if rows_removed > 0:
        # One [p_id, t_id] entry per removed trial, taken straight from the mask.
        removed_trials = df.loc[~keep, keys].drop_duplicates()
        print("Removed rows with [p_id, t_id]:\n", removed_trials.values.tolist())
    print()
    return filtered_df

def check_trial_var_data(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the trials that contain exactly one ``TRIAL_VAR_DATA`` event.

    A trial is identified by the pair ``(participant_id, trial_id)``. Rows
    whose pair has zero or several ``TRIAL_VAR_DATA`` events, or whose key
    contains NaN, are dropped.

    Args:
        df: Event table with ``participant_id``, ``trial_id`` and ``event``
            columns.

    Returns:
        The rows of ``df`` that belong to a valid trial; the original index
        labels are kept.
    """
    print("Checking if there are the correct amount of trial_var_data events for the given experiment")
    keys = ["participant_id", "trial_id"]

    # Events per trial. groupby drops NaN keys, so rows without a trial_id
    # can never be counted as valid.
    n_events = df.loc[df["event"] == "TRIAL_VAR_DATA"].groupby(keys).size()
    valid_trials = n_events[n_events == 1].index

    # Match on the (participant, trial) PAIR. Testing the two columns
    # independently would keep a broken trial whenever its participant has
    # another valid trial and its trial number is valid for someone else.
    keep = pd.MultiIndex.from_frame(df[keys]).isin(valid_trials)
    filtered_df = df.loc[keep]

    rows_removed = len(df) - len(filtered_df)

    print("Removed", rows_removed, "rows")
    if rows_removed > 0:
        # One [p_id, t_id] entry per removed trial, taken straight from the mask.
        removed_trials = df.loc[~keep, keys].drop_duplicates()
        print("Removed rows with [p_id, t_id]:\n", removed_trials.values.tolist())
    print()
    return filtered_df

def check_start_event(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the trials that contain exactly one ``START`` event.

    A trial is identified by the pair ``(participant_id, trial_id)``. Rows
    whose pair has zero or several ``START`` events, or whose key contains
    NaN, are dropped.

    Args:
        df: Event table with ``participant_id``, ``trial_id`` and ``event``
            columns.

    Returns:
        The rows of ``df`` that belong to a valid trial; the original index
        labels are kept.
    """
    print("Checking if there are the correct amount of start events for the given experiment")
    keys = ["participant_id", "trial_id"]

    # Events per trial. groupby drops NaN keys, so rows without a trial_id
    # can never be counted as valid.
    n_start = df.loc[df["event"] == "START"].groupby(keys).size()
    valid_trials = n_start[n_start == 1].index

    # Match on the (participant, trial) PAIR. Testing the two columns
    # independently would keep a broken trial whenever its participant has
    # another valid trial and its trial number is valid for someone else.
    keep = pd.MultiIndex.from_frame(df[keys]).isin(valid_trials)
    filtered_df = df.loc[keep]

    rows_removed = len(df) - len(filtered_df)

    print("Removed", rows_removed, "rows")
    if rows_removed > 0:
        # One [p_id, t_id] entry per removed trial, taken straight from the mask.
        removed_trials = df.loc[~keep, keys].drop_duplicates()
        print("Removed rows with [p_id, t_id]:\n", removed_trials.values.tolist())
    print()
    return filtered_df

def check_end_event(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the trials that contain exactly one ``END`` event.

    A trial is identified by the pair ``(participant_id, trial_id)``. Rows
    whose pair has zero or several ``END`` events, or whose key contains NaN,
    are dropped.

    Args:
        df: Event table with ``participant_id``, ``trial_id`` and ``event``
            columns.

    Returns:
        The rows of ``df`` that belong to a valid trial; the original index
        labels are kept.
    """
    print("Checking if there are the correct amount of end events for the given experiment")
    keys = ["participant_id", "trial_id"]

    # Events per trial. groupby drops NaN keys, so rows without a trial_id
    # can never be counted as valid.
    n_end = df.loc[df["event"] == "END"].groupby(keys).size()
    valid_trials = n_end[n_end == 1].index

    # Match on the (participant, trial) PAIR. Testing the two columns
    # independently would keep a broken trial whenever its participant has
    # another valid trial and its trial number is valid for someone else.
    keep = pd.MultiIndex.from_frame(df[keys]).isin(valid_trials)
    filtered_df = df.loc[keep]

    rows_removed = len(df) - len(filtered_df)

    print("Removed", rows_removed, "rows")
    if rows_removed > 0:
        # One [p_id, t_id] entry per removed trial, taken straight from the mask.
        removed_trials = df.loc[~keep, keys].drop_duplicates()
        print("Removed rows with [p_id, t_id]:\n", removed_trials.values.tolist())
    print()
    return filtered_df


In [6]:
# Run the raw table through every cleaning step in order: participant-level
# exclusions first, then the per-trial event checks.
df_new = (
    anti_saccade
    .pipe(exclude_nan_participants)
    .pipe(exclude_special_participants, special_participants=SPECIAL_PARTICIPANTS)
    .pipe(check_trialid_event)
    .pipe(check_fixpoint_amount)
    .pipe(check_red_fixpoint)
    .pipe(check_white_fixpoint)
    .pipe(check_trial_var_data)
    .pipe(check_start_event)
    .pipe(check_end_event)
)

Removing na participants

Removing special participants: ['87', '89', '93', '96', '103', '105', '109', '117', '118', '119', '120', '127', '128', '141']

Checking if there is a trial_id
Removed 19 rows
Removed rows with [p_id, t_id]:
 ['237' nan]

Checking if there are the correct amount of fixpoints for the given experiment
Removed 0 rows

Checking if there are the correct amount of red fixpoints for the given experiment
Removed 0 rows

Checking if there are the correct amount of white fixpoints for the given experiment
Removed 0 rows

Checking if there are the correct amount of trial_var_data events for the given experiment
Removed 0 rows

Checking if there are the correct amount of start events for the given experiment
Removed 1213 rows
Removed rows with [p_id, t_id]:
 ['237' 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 12.0 13.0 14.0
 15.0]

Checking if there are the correct amount of end events for the given experiment
Removed 0 rows



In [4]:
# Persist the cleaned table as parquet; index=False leaves the row index out of the file.
# CLEANED_DIR is presumably provided by the star import of config - confirm.
path_save = CLEANED_DIR / "ANTI_SACCADE.pq"
df_new.to_parquet(path_save, index=False)