Read in stroop data, starting from "before" run, compile all data into one for each subject (each sheet). Remove unnecessary outputs such as labels other than the column names. Count and remove missing inputs and errors (double entries). Calculate overall time mean and median. Calculate standard deviation and 4 standard deviations to find outliers (4 SD from mean). Calculate each of the stroop conditions means, medians, SDs. 

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

#read in excel spreadsheet
path = '/Users/kjung6/Eva/Stroop/hivpd23/copy/HIVPD23_SMtS_behav.xlsx'
sheets = pd.read_excel(path, sheet_name=None, header=None)
output_path = '/Users/kjung6/Eva/Stroop/filtered_sheets.xlsx'

In [2]:
#define the values we want in column E (index 4)
valid_values = ['conM', 'conMNM', 'conNM', 'incM', 'incMNM', 'incNM']#, 'EventTag']

filtered_sheets = {}

#loop over each sheet and filter data based on valid values in column E
for sheet_name, sheet_data in sheets.items():
    #filter the sheet based on the values in column E (index 4)
    filtered_data = sheet_data[sheet_data.iloc[:, 4].isin(valid_values)]
    filtered_data = filtered_data.iloc[:, :8]
    filtered_data.columns = ['Trial', 'RunLabel', 'Condition', 'TrialStart', 'EventTag', 'Time', 'keys', 'match_status']
    #, 'sequence', 'mouse_down']
    filtered_sheets[sheet_name] = filtered_data

for sheet_name, sheet_data in filtered_sheets.items():
    #drop duplicates to get rid of extra colnames
    sheet_data = sheet_data.drop_duplicates()
    #update the filtered_sheets with the cleaned data
    filtered_sheets[sheet_name] = sheet_data

# #add new column to each sheet (pandas df) in dictionary (filtered_sheets) for match or nonmatch status called match_status
# for sheet_name, sheet_data in filtered_sheets.items():
#    sheet_data['match_status'] = ''

##count # of each unique val in 'keys' column of each df
# for sheet_name, sheet_data in filtered_sheets.items():
#     counts = sheet_data['keys'].value_counts()
#     print(counts)



In [16]:
#print(filtered_sheets['HP23-00057'].head())
subj_57_filtered = filtered_sheets['HP23-00057']
subj_57_sorted = subj_57_filtered.sort_values(by = 'EventTag', ascending = True)
subj_57_sorted_trial = subj_57_filtered.sort_values(by = 'Trial', ascending = True)
#print(subj_57_sorted)
subj_57_condition_stats = pd.DataFrame()
subj_57_condition_stats['metrics'] = ['mean', 'median', 'stdev']

#OVERALL

match = '[1]'
nonmatch = '[2]'
def map_condition(value):
    if 'conM' in value or 'incM' in value:
        return '[1]'
    elif 'conNM' in value or 'incNM' in value:
        return '[2]'
    else:
        return value  # keep the original value if no match

#apply the function to the 'Condition' column
subj_57_sorted['Condition_bin'] = subj_57_sorted['Condition'].apply(map_condition)
#print(subj_57_sorted)

#count the number of differences
num_differences = (subj_57_sorted['Condition_bin']  != subj_57_sorted['keys']).sum()
#print(num_differences)

#if different, delete row (by subsetting only rows that match)
subj_57_sorted = subj_57_sorted[subj_57_sorted['Condition_bin']  == subj_57_sorted['keys']]

#count num missing
missing_count = subj_57_sorted['keys'].isna().sum()
subj_57_sorted = subj_57_sorted[subj_57_sorted['keys'].isna() != True]

#mean overall rt
overall_mean = subj_57_sorted['Time'].mean()

#median overall rt
overall_median = subj_57_sorted['Time'].median()

#overall sd and 4x
overall_sd = subj_57_sorted['Time'].std()
sd4x = overall_sd * 4

#outlier cutoff
outlier_cut = overall_mean + sd4x
#print(outlier_cut)
#subset outliers and count
outlier_df = subj_57_sorted[subj_57_sorted['Time'] >= outlier_cut]
outlier_count = len(outlier_df)
subj_57_sorted = subj_57_sorted[subj_57_sorted['Time'] <= outlier_cut]

subj_57_overall_stats = pd.DataFrame()
subj_57_overall_stats['metrics'] = ['misses', 'errors', 'overallRTmean', 'overallRTmedian', 'stdev', '4STDEV', 'outlier_cutoff', 'num_prolonged']
#subj_57_overall_stats['values'] = [''] * len(subj_57_overall) 
subj_57_overall_stats['values'] = [missing_count, num_differences, overall_mean, overall_median, overall_sd, sd4x, outlier_cut, outlier_count]




#conditional statistics

#conM-RR 
#conM_df = subj_57_sorted[subj_57_sorted['EventTag'].str.contains('conM', case=False, na=False)]
conM_df = subj_57_sorted[subj_57_sorted['EventTag'] == 'conM']
subj_57_condition_stats['conM-RR'] = [conM_df['Time'].mean(), conM_df['Time'].median(), conM_df['Time'].std()]

#incM-RR 
#incM_df = subj_57_sorted[subj_57_sorted['EventTag'].str.contains('incM', case=False, na=False)]
incM_df = subj_57_sorted[subj_57_sorted['EventTag'] == 'incM']
subj_57_condition_stats['incM-RR'] = [incM_df['Time'].mean(), incM_df['Time'].median(), incM_df['Time'].std()]

#conM-RS 
#df that is all rows other than the conM rr ones
#non_conM_df = subj_57_sorted[~subj_57_sorted['EventTag'].str.contains('conM', case=False, na=False)]
non_conM_df = subj_57_sorted[subj_57_sorted['EventTag'] != 'conM']
#df that is all conM Conditions that are in non_conM_df aka all the rs conM's
conM_rs_df = non_conM_df[non_conM_df['Condition'].str.contains('conM', case=False, na=False)]
subj_57_condition_stats['conM-RS'] = [conM_rs_df['Time'].mean(), conM_rs_df['Time'].median(), conM_rs_df['Time'].std()]

#incM-RS
#df that is all rows other than incM rr ones
non_incM_df = subj_57_sorted[subj_57_sorted['EventTag'] != 'incM']
#df that is all incM Conditions that are in non_conM_df aka all the rs incM's 
incM_rs_df = non_incM_df[non_incM_df['Condition'].str.contains('incM', case=False, na=False)]
subj_57_condition_stats['incM-RS'] = [incM_rs_df['Time'].mean(), incM_rs_df['Time'].median(), incM_rs_df['Time'].std()]

#conNM-RR
conNM_df = subj_57_sorted[subj_57_sorted['EventTag'] == 'conNM']
subj_57_condition_stats['conNM-RR'] = [conNM_df['Time'].mean(), conNM_df['Time'].median(), conNM_df['Time'].std()]

#incNM-RR
incNM_df = subj_57_sorted[subj_57_sorted['EventTag'] == 'incNM']
subj_57_condition_stats['incNM-RR'] = [incNM_df['Time'].mean(), incNM_df['Time'].median(), incNM_df['Time'].std()]

#conNM-RS
#df that is all rows other than the conNM rr ones
non_conNM_df = subj_57_sorted[subj_57_sorted['EventTag'] != 'conNM']
#df that is all conNM Conditions that are in non_conNM_df aka all the rs conNM's
conNM_rs_df = non_conNM_df[non_conNM_df['Condition'].str.contains('conNM', case=False, na=False)]
subj_57_condition_stats['conNM-RS'] = [conNM_rs_df['Time'].mean(), conNM_rs_df['Time'].median(), conNM_rs_df['Time'].std()]

#incNM-RS
#df that is all rows other than the conNM rr ones
non_incNM_df = subj_57_sorted[subj_57_sorted['EventTag'] != 'incNM']
#df that is all incNM Conditions that are in non_incNM_df aka all the rs incNM's
incNM_rs_df = non_incNM_df[non_incNM_df['Condition'].str.contains('incNM', case=False, na=False)]
subj_57_condition_stats['incNM-RS'] = [incNM_rs_df['Time'].mean(), incNM_rs_df['Time'].median(), incNM_rs_df['Time'].std()]

#con-RR
#con_df = subj_57_sorted[subj_57_sorted['EventTag'].str.contains('conM', case=False, na=False)]
RR_df = subj_57_sorted[~subj_57_sorted['EventTag'].str.contains('MNM', case=False, na=False)]
con_RR_df = RR_df[RR_df['Condition'].str.contains("con", case=False, na=False)]
subj_57_condition_stats['con-RR'] = [con_RR_df['Time'].mean(), con_RR_df['Time'].median(), con_RR_df['Time'].std()]

#inc-RR
inc_RR_df = RR_df[RR_df['Condition'].str.contains("inc", case=False, na=False)]
subj_57_condition_stats['inc-RR'] = [inc_RR_df['Time'].mean(), inc_RR_df['Time'].median(), inc_RR_df['Time'].std()]

#con-RS
RS_df = subj_57_sorted[subj_57_sorted['EventTag'].str.contains('MNM', case=False, na=False)]
con_RS_df = RS_df[RS_df['Condition'].str.contains("con", case=False, na=False)]
subj_57_condition_stats['con-RS'] = [con_RS_df['Time'].mean(), con_RS_df['Time'].median(), con_RS_df['Time'].std()]

#inc-RS
inc_RS_df = RS_df[RS_df['Condition'].str.contains("inc", case=False, na=False)]
subj_57_condition_stats['inc-RS'] = [inc_RS_df['Time'].mean(), inc_RS_df['Time'].median(), inc_RS_df['Time'].std()]



In [None]:
#separate each run into its own df to find num missing

#output path for blocked data
output_path = '/Users/kjung6/Eva/Stroop/blocked_data.xlsx'

#define the values we want in column E
valid_values = ['conM', 'conMNM', 'conNM', 'incM', 'incMNM', 'incNM', 'EventTag']

#define the colnames
valid_row_values = ['Trial', 'RunLabel', 'Condition', 'trial start', 'EventTag', 'Time', 'keys', 'sequence', 'mouse_down']

filtered_sheets_2 = {}

#filter each sheet based on valid values in column E (index 4)
for sheet_name, sheet_data in sheets.items():
    filtered_data = sheet_data[sheet_data.iloc[:, 4].isin(valid_values)]
    filtered_data = filtered_data.reset_index(drop=True)
    filtered_sheets_2[sheet_name] = filtered_data

#set first row as colnames and drop the first row that's now redundant
for sheet_name, sheet_data in filtered_sheets_2.items():
    sheet_data.columns = sheet_data.iloc[0]
    sheet_data = sheet_data.drop(index=0)
    filtered_sheets_2[sheet_name] = sheet_data

#remove duplicates in first 15 rows to get rid of extra header rows
for sheet_name, sheet_data in filtered_sheets_2.items():
    first_15_rows = sheet_data.head(15)
    header_rows_in_first_15 = first_15_rows[first_15_rows.apply(lambda row: row.tolist() == valid_row_values, axis=1)]

    #if the header rows in first 15 row not empty, drop them
    if not header_rows_in_first_15.empty:
        sheet_data = sheet_data.drop(header_rows_in_first_15.index[1:])
    sheet_data = sheet_data.reset_index(drop=True)
    filtered_sheets_2[sheet_name] = sheet_data

#function that splits the data into separate blocks (individual dfs) for each run (based on each occurrence of header row)
def split_dataframe_by_header(sheet_data):
    blocks = []
    header_indices = sheet_data[sheet_data.iloc[:, 0] == valid_row_values[0]].index.tolist()
    header_indices.append(len(sheet_data))

    #split the df into blocks
    for i in range(len(header_indices) - 1):
        start = header_indices[i]
        end = header_indices[i + 1]
        block = sheet_data.iloc[start:end].reset_index(drop=True)
        
        #if first row of the block is a header, drop it (already used as colname)
        if block.iloc[0].tolist() == valid_row_values:
            block = block.drop(index=0).reset_index(drop=True)

        blocks.append(block)

    return blocks


#function to remove any extra characters in 'keys' column (such as _5UP)
def clean_keys_column(sheet_data):
    sheet_data['keys'] = sheet_data['keys'].str.replace(r'\[([0-9]+)\].*', r'[\1]', regex=True)
    return sheet_data

#function to find the most common value in 'keys' for specific EventTag section
def most_common_key_in_section(block, event_tag):
    section_keys = block[block['EventTag'] == event_tag]['keys']
    
    #if the section is empty, fall back to 'incM' or 'incNM'
    if section_keys.empty:
        fallback_event_tag = None
        if event_tag == 'conM':
            fallback_event_tag = 'incM'
        elif event_tag == 'conNM':
            fallback_event_tag = 'incNM'
        
        if fallback_event_tag:
            section_keys = block[block['EventTag'] == fallback_event_tag]['keys']
    
    #if the section is still empty after fallback, return None
    if section_keys.empty:
        return None
    
    return section_keys.mode()[0] if not section_keys.mode().empty else None

#make function to map conditions
def map_condition(condition_value):
    if pd.isna(condition_value):
        return condition_value

    #check if 'Condition' contains 'conM' or 'incM' and return '[1]' if does
    if 'conM' in condition_value or 'incM' in condition_value:
        return '[1]'
    #check if 'Condition' contains 'conNM' or 'incNM' and return '[2]' if does
    elif 'conNM' in condition_value or 'incNM' in condition_value:
        return '[2]'
    else:
        return condition_value  

subject_trial_counts = {}
#for each sheet in filtered_sheets_2...
for sheet_name, sheet_data in filtered_sheets_2.items():
    real_missed_trials_total = 0
    fake_missed_trials_total = 0
    duplicated_trials_total = 0
    total_errors = 0
    #drop col if more than 9 cols
    if sheet_data.shape[1] > 9:
        sheet_data = sheet_data.drop(columns=sheet_data.columns[9])
        print(f"Extra column dropped from sheet '{sheet_name}'.")

    #if sheet_data has 9 cols...
    if sheet_data.shape[1] == 9:
        sheet_data = clean_keys_column(sheet_data)
        blocks = split_dataframe_by_header(sheet_data)

        non_empty_blocks = []
        for block in blocks:
            if not block.empty:
                block = block.copy()

                #drop the last two columns
                block = block.iloc[:, :-2]
                
                #remove the last row if it matches the header
                if block.iloc[-1].tolist() == valid_row_values:
                    block = block.drop(block.index[-1])

                #find the most common 'keys' value for conM and conNM sections
                conM_key = most_common_key_in_section(block, 'conM')
                conNM_key = most_common_key_in_section(block, 'conNM')

                # Print the most common keys for debugging
                # print(f"\nFor block '{sheet_name}':")
                # print(f"  Most common 'conM' key: {conM_key}")
                # print(f"  Most common 'conNM' key: {conNM_key}")

                #replace all occurrences of the most common key for conM with [1] in whole block
                if conM_key:
                    block.loc[:, 'keys'] = block['keys'].replace(conM_key, '[1]')
                
                #replace all occurrences of the most common key for conNM with [2] in whole block
                if conNM_key:
                    block.loc[:, 'keys'] = block['keys'].replace(conNM_key, '[2]')

                #apply the function to create 'Condition_bin' 
                block.loc[:, 'Condition_bin'] = block['Condition'].apply(map_condition)
                
                #initialize metrics
                overall_mean = 0
                overall_median = 0
                overall_sd = 0
                total_time = 0
                total_rows = 0
                #total_blocks = 0

                #count errors by comparing 'Condition_bin' to 'keys'
                errors_in_block = (block['Condition_bin'] != block['keys']).sum()

                #remove the rows where Condition_bin != to keys by subsetting those that do
                #block = block[block['Condition_bin'] == block['keys']]

                #add the number of errors for this block to the total count for the subject
                total_errors += errors_in_block

                #calculate total mean time for each block and add it to the subject's tally
                #total_time += block['Time'].mean()  
                #total_blocks += 1
                total_time += block['Time'].sum()  
                total_rows += len(block)

                trial_numbers = block['Trial'].astype(int)
                event_tags = block['EventTag']

                real_missed_trials = 0
                fake_missed_trials = 0
                duplicated_trials = 0

                #loop through trials to detect real missed trials, fake missed trials
                for i in range(1, len(trial_numbers)):
                    prev_trial = trial_numbers.iloc[i - 1]
                    curr_trial = trial_numbers.iloc[i]
                    prev_event = event_tags.iloc[i - 1]
                    curr_event = event_tags.iloc[i]
                    
                    if curr_trial > prev_trial + 1 and prev_event == curr_event:
                        real_missed_trials += 1
                    elif curr_trial > prev_trial + 1 and prev_event != curr_event:
                        fake_missed_trials += 1

                #count duplicates (trials that are repeated)
                duplicated_trials = trial_numbers[trial_numbers.duplicated()].nunique()

                #add current block's counts to the total
                real_missed_trials_total += real_missed_trials
                fake_missed_trials_total += fake_missed_trials
                duplicated_trials_total += duplicated_trials
                
                #remove duplicates from the block (keep first instance of each trial number)
                block = block.drop_duplicates(subset='Trial', keep='first')

                #remove the rows where Condition_bin != to keys (errors) by subsetting those that do
                block = block[block['Condition_bin'] == block['keys']]

                #add cleaned block to the non_empty_blocks list
                non_empty_blocks.append(block)

                
        #calculate mean time for the subject by dividing total average time in all blocks by the number of blocks
        average_time = total_time / total_rows

        subject_trial_counts[sheet_name] = {
            'Real missed trials': real_missed_trials_total,
            'Fake missed trials': fake_missed_trials_total,
            'Duplicate trials': duplicated_trials_total,
            'Total errors': total_errors,
            'Average Time': average_time
        }

#print tallied results for each subject
for subject, counts in subject_trial_counts.items():
    print(f"\nTally for {subject}:")
    print(f"  - Real missed trials: {counts['Real missed trials']}")
    print(f"  - Fake missed trials (block switches): {counts['Fake missed trials']}")
    print(f"  - Duplicate trials: {counts['Duplicate trials']}")
    print(f"  - Errors: {counts['Total errors']}")
    print(f"  - Mean Time: {counts['Average Time']:.2f}")

# for subject, counts in subject_trial_counts.items():
#     print(non_empty_blocks)



In [None]:
#print n check

# Define the output path
output_path = '/Users/kjung6/Eva/Stroop/blocked_data.xlsx'

subject_trial_counts = {}
# Create a Pandas Excel writer
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    # Loop through each sheet in filtered_sheets_2 (for each subject)
    for sheet_name, sheet_data in filtered_sheets_2.items():
        real_missed_trials_total = 0
        fake_missed_trials_total = 0
        duplicated_trials_total = 0
        total_errors = 0

        # Drop column if there are more than 9 columns
        if sheet_data.shape[1] > 9:
            sheet_data = sheet_data.drop(columns=sheet_data.columns[9])
            print(f"Extra column dropped from sheet '{sheet_name}'.")

        # If sheet_data has 9 columns...
        if sheet_data.shape[1] == 9:
            sheet_data = clean_keys_column(sheet_data)
            blocks = split_dataframe_by_header(sheet_data)

            non_empty_blocks = []
            for block_idx, block in enumerate(blocks):
                if not block.empty:
                    block = block.copy()

                    # Drop the last two columns
                    block = block.iloc[:, :-2]
                    
                    # Remove the last row if it matches the header
                    if block.iloc[-1].tolist() == valid_row_values:
                        block = block.drop(block.index[-1])

                    # Find the most common 'keys' value for conM and conNM sections
                    conM_key = most_common_key_in_section(block, 'conM')
                    conNM_key = most_common_key_in_section(block, 'conNM')

                    # Replace occurrences of the most common key for conM and conNM with [1] and [2]
                    if conM_key:
                        block.loc[:, 'keys'] = block['keys'].replace(conM_key, '[1]')
                    if conNM_key:
                        block.loc[:, 'keys'] = block['keys'].replace(conNM_key, '[2]')

                    # Apply the function to create 'Condition_bin'
                    block.loc[:, 'Condition_bin'] = block['Condition'].apply(map_condition)

                    # Count errors by comparing 'Condition_bin' to keys
                    errors_in_block = (block['Condition_bin'] != block['keys']).sum()
                    total_errors += errors_in_block

                    trial_numbers = block['Trial'].astype(int)
                    event_tags = block['EventTag']

                    real_missed_trials = 0
                    fake_missed_trials = 0
                    duplicated_trials = 0

                    # Loop through trials to detect real missed trials, fake missed trials, and duplicates
                    for i in range(1, len(trial_numbers)):
                        prev_trial = trial_numbers.iloc[i - 1]
                        curr_trial = trial_numbers.iloc[i]
                        prev_event = event_tags.iloc[i - 1]
                        curr_event = event_tags.iloc[i]

                        if curr_trial > prev_trial + 1 and prev_event == curr_event:
                            real_missed_trials += 1
                        elif curr_trial > prev_trial + 1 and prev_event != curr_event:
                            fake_missed_trials += 1

                    # Count duplicates (trials that are repeated)
                    duplicated_trials = trial_numbers[trial_numbers.duplicated()].nunique()

                    # Add current block's counts to the total
                    real_missed_trials_total += real_missed_trials
                    fake_missed_trials_total += fake_missed_trials
                    duplicated_trials_total += duplicated_trials

                    # Remove duplicates from the block (keep first instance of each trial number)
                    block = block.drop_duplicates(subset='Trial', keep='first')

                    # Add cleaned block to the non_empty_blocks list
                    non_empty_blocks.append(block)

            # Save the subject's blocks as separate sheets in the Excel file
            for block_idx, block in enumerate(non_empty_blocks):
                block_name = f"{sheet_name}_Block_{block_idx + 1}"  # Naming the sheets based on subject and block number
                block.to_excel(writer, sheet_name=block_name)

            # Store the trial counts in a dictionary (optional)
            subject_trial_counts[sheet_name] = {
                'Real missed trials': real_missed_trials_total,
                'Fake missed trials': fake_missed_trials_total,
                'Duplicate trials': duplicated_trials_total,
                'Total errors': total_errors
            }

# Output confirmation message
print(f"Data saved to '{output_path}'.")


In [12]:

#write out to excel
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    for sheet_name, filtered_data in filtered_sheets.items():
        filtered_data.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
for sheet_name, sheet_data in filtered_sheets.items():
    print(sheet_name)

In [15]:
#separate each run into its own df to find num missing

#output path for blocked data
output_path = '/Users/kjung6/Eva/Stroop/blocked_data.xlsx'

#define the values we want in column E
valid_values = ['conM', 'conMNM', 'conNM', 'incM', 'incMNM', 'incNM', 'EventTag']

#define the colnames
valid_row_values = ['Trial', 'RunLabel', 'Condition', 'trial start', 'EventTag', 'Time', 'keys', 'sequence', 'mouse_down']

filtered_sheets_2 = {}

#filter each sheet based on valid values in column E (index 4)
for sheet_name, sheet_data in sheets.items():
    filtered_data = sheet_data[sheet_data.iloc[:, 4].isin(valid_values)]
    filtered_data = filtered_data.reset_index(drop=True)
    filtered_sheets_2[sheet_name] = filtered_data

#set first row as colnames and drop the first row that's now redundant
for sheet_name, sheet_data in filtered_sheets_2.items():
    sheet_data.columns = sheet_data.iloc[0]
    sheet_data = sheet_data.drop(index=0)
    filtered_sheets_2[sheet_name] = sheet_data

#remove duplicates in first 15 rows to get rid of extra header rows
for sheet_name, sheet_data in filtered_sheets_2.items():
    first_15_rows = sheet_data.head(15)
    header_rows_in_first_15 = first_15_rows[first_15_rows.apply(lambda row: row.tolist() == valid_row_values, axis=1)]

    #if the header rows in first 15 row not empty, drop them
    if not header_rows_in_first_15.empty:
        sheet_data = sheet_data.drop(header_rows_in_first_15.index[1:])
    sheet_data = sheet_data.reset_index(drop=True)
    filtered_sheets_2[sheet_name] = sheet_data

#function that splits the data into separate blocks (individual dfs) for each run (based on each occurrence of header row)
def split_dataframe_by_header(sheet_data):
    blocks = []
    header_indices = sheet_data[sheet_data.iloc[:, 0] == valid_row_values[0]].index.tolist()
    header_indices.append(len(sheet_data))

    #split the df into blocks
    for i in range(len(header_indices) - 1):
        start = header_indices[i]
        end = header_indices[i + 1]
        block = sheet_data.iloc[start:end].reset_index(drop=True)
        
        #if first row of the block is a header, drop it (already used as colname)
        if block.iloc[0].tolist() == valid_row_values:
            block = block.drop(index=0).reset_index(drop=True)

        blocks.append(block)

    return blocks


#function to remove any extra characters in 'keys' column (such as _5UP)
def clean_keys_column(sheet_data):
    sheet_data['keys'] = sheet_data['keys'].str.replace(r'\[([0-9]+)\].*', r'[\1]', regex=True)
    return sheet_data

#function to find the most common value in 'keys' for specific EventTag section
def most_common_key_in_section(block, event_tag):
    section_keys = block[block['EventTag'] == event_tag]['keys']
    
    #if the section is empty, fall back to 'incM' or 'incNM'
    if section_keys.empty:
        fallback_event_tag = None
        if event_tag == 'conM':
            fallback_event_tag = 'incM'
        elif event_tag == 'conNM':
            fallback_event_tag = 'incNM'
        
        if fallback_event_tag:
            section_keys = block[block['EventTag'] == fallback_event_tag]['keys']
    
    #if the section is still empty after fallback, return None
    if section_keys.empty:
        return None
    
    return section_keys.mode()[0] if not section_keys.mode().empty else None

#make function to map conditions
def map_condition(condition_value):
    if pd.isna(condition_value):
        return condition_value

    #check if 'Condition' contains 'conM' or 'incM' and return '[1]' if does
    if 'conM' in condition_value or 'incM' in condition_value:
        return '[1]'
    #check if 'Condition' contains 'conNM' or 'incNM' and return '[2]' if does
    elif 'conNM' in condition_value or 'incNM' in condition_value:
        return '[2]'
    else:
        return condition_value  
    
#function to initialize the subject's metrics
def initialize_subject_metrics():
    return {
        'real_missed_trials': 0,
        'fake_missed_trials': 0,
        'duplicated_trials': 0,
        'total_errors': 0,
        'total_time': 0,
        'total_rows': 0,
        'average_time': 0,
        'std': 0,
        '4std': 0,
        'outlier_cutoff': 0
    }

#function to drop extra columns if more than 9
def drop_extra_columns(sheet_data):
    if sheet_data.shape[1] > 9:
        sheet_data = sheet_data.drop(columns=sheet_data.columns[9])
        print(f"Extra column dropped.")
    return sheet_data


#function to clean and process each block of data
def process_block_data(block):
    block = clean_block_data(block)
    conM_key = most_common_key_in_section(block, 'conM')
    conNM_key = most_common_key_in_section(block, 'conNM')

    block = replace_most_common_keys(block, conM_key, conNM_key)
    block.loc[:, 'Condition_bin'] = block['Condition'].apply(map_condition)

    return block


#function to clean the block data (drop last two columns, drop rows that match header row, etc.)
def clean_block_data(block):
    block = block.iloc[:, :-2]  
    if block.iloc[-1].tolist() == valid_row_values:
        block = block.drop(block.index[-1])  
    return block


#function to update the subject's metrics based on the block data
def update_subject_metrics(block, subject_metrics):
    errors_in_block = count_errors(block)
    subject_metrics['total_errors'] += errors_in_block

    real_missed_trials, fake_missed_trials, duplicated_trials = count_trials(block)
    subject_metrics['real_missed_trials'] += real_missed_trials
    subject_metrics['fake_missed_trials'] += fake_missed_trials
    subject_metrics['duplicated_trials'] += duplicated_trials

    subject_metrics['total_time'] += block['Time'].sum()
    subject_metrics['total_rows'] += len(block)


#function to count errors in the block
def count_errors(block):
    return (block['Condition_bin'] != block['keys']).sum()


#function to count real missed trials, fake missed trials, and duplicated trials
def count_trials(block):
    trial_numbers = block['Trial'].astype(int)
    event_tags = block['EventTag']

    real_missed_trials = 0
    fake_missed_trials = 0
    duplicated_trials = 0

    for i in range(1, len(trial_numbers)):
        prev_trial = trial_numbers.iloc[i - 1]
        curr_trial = trial_numbers.iloc[i]
        prev_event = event_tags.iloc[i - 1]
        curr_event = event_tags.iloc[i]

        if curr_trial > prev_trial + 1 and prev_event == curr_event:
            real_missed_trials += 1
        elif curr_trial > prev_trial + 1 and prev_event != curr_event:
            fake_missed_trials += 1

    duplicated_trials = trial_numbers[trial_numbers.duplicated()].nunique()
    return real_missed_trials, fake_missed_trials, duplicated_trials


#function to calculate the average time for the subject
def calculate_average_time(total_time, total_rows):
    return total_time / total_rows if total_rows else 0


#function to replace most common keys in the block
def replace_most_common_keys(block, conM_key, conNM_key):
    if conM_key:
        block.loc[:, 'keys'] = block['keys'].replace(conM_key, '[1]')
    if conNM_key:
        block.loc[:, 'keys'] = block['keys'].replace(conNM_key, '[2]')
    return block

#function to process each subject's sheet
def process_subject_data(filtered_sheets_2):
    subject_trial_counts = {}

    #loop over each sheet in the filtered_sheets_2 dictionary
    for sheet_name, sheet_data in filtered_sheets_2.items():
        
        subject_metrics = initialize_subject_metrics()

        #drop extra columns if needed
        sheet_data = drop_extra_columns(sheet_data)

        #make sure sheet has exactly 9 columns
        if sheet_data.shape[1] == 9:
            sheet_data = clean_keys_column(sheet_data)
            blocks = split_dataframe_by_header(sheet_data)

            #process each block
            for block in blocks:
                if not block.empty:
                    block = process_block_data(block)
                    update_subject_metrics(block, subject_metrics)

            #calculate average time for the subject
            average_time = calculate_average_time(subject_metrics['total_time'], subject_metrics['total_rows'])
            subject_metrics['average_time'] = average_time

            #find std by finding std across all rows
            #subject_metrics['std'] = np.std(subject_metrics['total_time'])
            #subject_metrics['total_time'].std()
            #4*std
            #subject_metrics['4std'] = 4*subject_metrics['total_time'].std()
            #outlier cutoff
            subject_metrics['outlier_cutoff'] = subject_metrics['4std'] + subject_metrics['average_time']
            
            #calculate std for each subject
            #subject_metrics['std'] =

            #calculate cutoffs for each subject

            #calculate mean times after removing prolonged responses (>= cutoff) and anticipations (<= 150ms)
            

            #store the subject's results tally
            subject_trial_counts[sheet_name] = subject_metrics

    return subject_trial_counts

#call the main function and process the data
subject_trial_counts = process_subject_data(filtered_sheets_2)

#print tallied results for each subject
for subject, counts in subject_trial_counts.items():
    print(f"\nTally for {subject}:")
    print(f"  - Real missed trials: {counts['real_missed_trials']}")
    print(f"  - Fake missed trials (block switches): {counts['fake_missed_trials']}")
    print(f"  - Duplicate trials: {counts['duplicated_trials']}")
    print(f"  - Errors: {counts['total_errors']}")
    print(f"  - Mean Time w/o Removal: {counts['average_time']:.2f}")
    print(f"  - Standard Deviation: {counts['std']:.2f}")
    print(f"  - Outlier Cutoff: {counts['std']:.2f}")

# #mean overall rt
# overall_mean = subj_57_sorted['Time'].mean()

# #median overall rt
# overall_median = subj_57_sorted['Time'].median()

# #overall sd and 4x
# overall_sd = subj_57_sorted['Time'].std()
# sd4x = overall_sd * 4

# #outlier cutoff
# outlier_cut = overall_mean + sd4x
# #print(outlier_cut)
# #subset outliers and count
# outlier_df = subj_57_sorted[subj_57_sorted['Time'] >= outlier_cut]
# outlier_count = len(outlier_df)
# subj_57_sorted = subj_57_sorted[subj_57_sorted['Time'] <= outlier_cut]

Extra column dropped.

Tally for HP23-01696:
  - Real missed trials: 7
  - Fake missed trials (block switches): 30
  - Duplicate trials: 1
  - Errors: 5
  - Mean Time w/o Removal: 1607.74
  - Standard Deviation: 0.00
  - Outlier Cutoff: 0.00

Tally for HP23-00057:
  - Real missed trials: 2
  - Fake missed trials (block switches): 30
  - Duplicate trials: 0
  - Errors: 1
  - Mean Time w/o Removal: 708.68
  - Standard Deviation: 0.00
  - Outlier Cutoff: 0.00

Tally for HP23-01708:
  - Real missed trials: 2
  - Fake missed trials (block switches): 30
  - Duplicate trials: 0
  - Errors: 2
  - Mean Time w/o Removal: 958.14
  - Standard Deviation: 0.00
  - Outlier Cutoff: 0.00

Tally for HP23-01656:
  - Real missed trials: 6
  - Fake missed trials (block switches): 30
  - Duplicate trials: 0
  - Errors: 25
  - Mean Time w/o Removal: 874.61
  - Standard Deviation: 0.00
  - Outlier Cutoff: 0.00

Tally for HP23-01205:
  - Real missed trials: 8
  - Fake missed trials (block switches): 32
  - Dup

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block.loc[:, 'Condition_bin'] = block['Condition'].apply(map_condition)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block.loc[:, 'Condition_bin'] = block['Condition'].apply(map_condition)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  block.loc[:, 'Condition_bin'] = block['Condition'].apply(map_

In [None]:
for sheet_name, sheet_data in filtered_sheets.items():
    #find sd, outlier cutoff
    