In [2]:
import pandas as pd
import os
import numpy as np
import re
import matplotlib.pyplot as plt
from statsmodels.stats.anova import AnovaRM
from scipy import stats

Define two helper functions: one that extracts the subject ID and run number from a file name, and one that walks through every folder inside our data folder and collects the behavioral data CSV files.

In [4]:
# this function gets the subject and run from the file name of the behavioral data file 
def extract_subject_and_run(input_str):
    """Pull the subject ID and run label out of a file name or path.

    Parameters
    ----------
    input_str : str
        Text to search (typically the path of a behavioral data file).

    Returns
    -------
    tuple
        ``(subject, run)``: the first ``SF<digits>`` token and the first
        ``run<digits>`` token found in ``input_str``. An entry is ``None``
        when its pattern does not occur.
    """
    found = []
    # Subject pattern first, run pattern second, so the tuple order is fixed.
    for pattern in (r'SF\d+', r'run\d+'):
        hit = re.search(pattern, input_str)
        found.append(None if hit is None else hit.group(0))
    return tuple(found)

#this function makes a list of our behavioral data filepaths
def find_csv_files(root_folder):
    """Recursively collect the paths of all non-hidden ``.csv`` files.

    Parameters
    ----------
    root_folder : str
        Directory to search, including all of its subdirectories.

    Returns
    -------
    list of str
        Paths (each joined onto the directory it was found in) in the order
        ``os.walk`` yields them. If ``root_folder`` does not exist, an error
        message is printed and an empty list is returned.
    """
    if os.path.exists(root_folder):
        return [
            os.path.join(current_dir, name)
            for current_dir, _subdirs, names in os.walk(root_folder)
            for name in names
            # Skip dot-files (hidden files) that merely end in .csv
            if name.endswith('.csv') and not name.startswith('.')
        ]

    print(f"Error: {root_folder} does not exist!")
    return []

Use the functions we just defined to build a single raw data DataFrame.

In [5]:
# Specify the main folder.
# NOTE: this is an absolute path on one particular machine; change it to
# wherever the StabFlex data folder lives on yours.
main_folder = '/Users/jinjiang-macair/Library/CloudStorage/Box-Box/Pro00101414/StabFlex/data'

# Find .csv files recursively
file_paths = find_csv_files(main_folder)

# Read each file into its own dataframe and tag every row with the subject and
# run parsed from the file path. The frames are collected in a list so they
# can be concatenated in a single pass (calling pd.concat inside the loop
# re-copies all previously loaded rows on every iteration).
frames = []
for filePath in file_paths:
    df = pd.read_csv(filePath)

    # Extract subject and run information
    subject, run = extract_subject_and_run(filePath)
    df['subject'] = subject
    # This replaces the 'run' column if the CSV already contains one.
    df['run'] = run

    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty dataframe when
# no files were found. Each file's original row index is kept, so index
# values repeat across files.
raw_data = pd.concat(frames) if frames else pd.DataFrame()

raw_data

Unnamed: 0,stim,ITI,cuedTask,taskSequence,congruency,corrResp,blockType,trials.thisRepN,trials.thisTrialN,trials.thisN,...,date,expName,psychopyVersion,frameRate,frameDur,Unnamed: 39,subject,thisRow.t,notes,Unnamed: 41
0,,,,,,,,,,,...,2023-06-09_15h26.28.663,stability_flexibility_tradeoff,2022.2.5,59.962859,0.016667,,SF05,,,
1,,,,,,,,,,,...,2023-06-09_15h26.28.663,stability_flexibility_tradeoff,2022.2.5,59.962859,0.016667,,SF05,,,
2,,,,,,,,,,,...,2023-06-09_15h26.28.663,stability_flexibility_tradeoff,2022.2.5,59.962859,0.016667,,SF05,,,
3,1.0,1.5,p,n,c,1.0,C,0.0,0.0,0.0,...,2023-06-09_15h26.28.663,stability_flexibility_tradeoff,2022.2.5,59.962859,0.016667,,SF05,,,
4,8.0,1.0,p,r,c,2.0,C,0.0,1.0,1.0,...,2023-06-09_15h26.28.663,stability_flexibility_tradeoff,2022.2.5,59.962859,0.016667,,SF05,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,3.0,2.0,m,s,c,1.0,A,0.0,124.0,124.0,...,2024-02-06_13h27.38.199,stability_flexibility_tradeoff,2022.2.5,60.014031,0.016667,,SF39,,,
128,7.0,2.0,m,r,i,2.0,A,0.0,125.0,125.0,...,2024-02-06_13h27.38.199,stability_flexibility_tradeoff,2022.2.5,60.014031,0.016667,,SF39,,,
129,3.0,1.0,m,r,c,1.0,A,0.0,126.0,126.0,...,2024-02-06_13h27.38.199,stability_flexibility_tradeoff,2022.2.5,60.014031,0.016667,,SF39,,,
130,2.0,1.0,m,r,i,1.0,A,0.0,127.0,127.0,...,2024-02-06_13h27.38.199,stability_flexibility_tradeoff,2022.2.5,60.014031,0.016667,,SF39,,,


this is just to print out the column names

In [6]:
raw_data.columns.values.tolist()

['stim',
 'ITI',
 'cuedTask',
 'taskSequence',
 'congruency',
 'corrResp',
 'blockType',
 'trials.thisRepN',
 'trials.thisTrialN',
 'trials.thisN',
 'trials.thisIndex',
 'main2_start',
 'key_pressed',
 'key_RT',
 'routine_end',
 'scanner_start',
 'run',
 'fixation_start',
 'fixation_start_rel',
 'fixation_end_rel',
 'stimulus_start',
 'stimulus_start_rel',
 'stimulus_end_rel',
 'acc',
 'corrResp.1',
 'feedback_start',
 'feedback_start_rel',
 'feedback_end_rel',
 'blockType.1',
 'blockBreak_start',
 'blockBreak_start_rel',
 'blockBreak_end_rel',
 'Participant',
 'Run (1-4)',
 'date',
 'expName',
 'psychopyVersion',
 'frameRate',
 'frameDur',
 'Unnamed: 39',
 'subject',
 'thisRow.t',
 'notes',
 'Unnamed: 41']

now you get the average accuracy and reaction time for each subject :)  
here's the pandas documentation: https://pandas.pydata.org/pandas-docs/stable/index.html  
but also ask ChatGPT and Stack Overflow for help  
HINT: try the groupby function from pandas