In [45]:
import sys 
sys.path.append('/pl/active/banich/studies/Relevantstudies/abcd/env/lib/python3.7/site-packages')
sys.path.append('/pl/active/banich/studies/Clearvale/jake_scripts/Amy_flywheel_scripts/')

import numpy as np
import pandas as pd
import os

In [46]:
import pandas as pd
import os

# Function to extract subject and run from a file path
def extract_subject_run(file_path):
    """Return ``(subject_number, run_number)`` parsed from a file path.

    The subject is the path component that directly follows a component
    named ``subjects``. The run is the text between the last ``run`` in the
    file name and the next ``.``; when the file name contains no ``run`` the
    string ``"Unknown"`` is returned in its place.

    Raises:
        ValueError: if no path component equals ``subjects``.
    """
    components = file_path.split(os.sep)
    subject_number = components[components.index('subjects') + 1]

    base_name = os.path.basename(file_path)
    if 'run' not in base_name:
        return subject_number, "Unknown"

    # Everything after the last 'run', cut at the first '.'
    run_number = base_name.rpartition('run')[2].partition('.')[0]
    return subject_number, run_number

# Function to process a single file into a DataFrame
def process_single_file(file_path):
    """Parse one whitespace-delimited EV text file into a long-format frame.

    Every line of the file is split on whitespace into six tokens, named
    ``run, op, cat, x, y, z``; the last three are cast to ``int``. The
    ``x``/``y``/``z`` values are then melted into one row per value, only
    values greater than 1 are kept, and the survivors are ordered by source
    line and then by value.

    Returns:
        DataFrame with columns ``index`` (source line position), ``sub``,
        ``run`` (``'run<N>'``) and ``op``; subject and run come from
        ``extract_subject_run(file_path)``.
    """
    subject, run = extract_subject_run(file_path)

    # One list of tokens per line of the file
    with open(file_path, 'r') as handle:
        rows = [raw_line.strip().split() for raw_line in handle]

    df = pd.DataFrame(rows)
    df.iloc[:, 3:] = df.iloc[:, 3:].astype(int)
    df.columns = ['run', 'op', 'cat', 'x', 'y', 'z']

    # Combined label; it is not part of the columns returned below
    df['trial_type'] = df['op'].astype(str) + '_' + df['cat'].astype(str)

    long_form = df.reset_index()[['index', 'op', 'x', 'y', 'z']].melt(id_vars=['index', 'op'])
    kept = long_form.query('value > 1').sort_values(['index', 'value'])
    labelled = kept.assign(sub=subject, run=f'run{run}').reset_index()

    return labelled[['index', 'sub', 'run', 'op']]

# Function to find and process all relevant files
def process_all_subjects(base_path):
    """Run ``process_single_file`` on every ``simple_run*.txt`` under ``base_path``.

    Each entry of ``base_path`` (visited in sorted order) that is a directory
    containing an ``EV`` sub-directory is scanned for files whose names start
    with ``simple_run`` and end with ``.txt``.

    Returns:
        The concatenation (fresh integer index) of all per-file frames, or an
        empty DataFrame, after printing a notice, when no file matched.
    """
    frames = []

    for subject_dir in sorted(os.listdir(base_path)):
        subject_path = os.path.join(base_path, subject_dir)
        if not os.path.isdir(subject_path):
            continue

        ev_dir = os.path.join(subject_path, 'EV')
        if not os.path.isdir(ev_dir):
            continue

        frames.extend(
            process_single_file(os.path.join(ev_dir, name))
            for name in os.listdir(ev_dir)
            if name.startswith('simple_run') and name.endswith('.txt')
        )

    if not frames:
        print("No valid files found.")
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Root directory whose entries are scanned for '<entry>/EV/simple_run*.txt' files
base_path = '/pl/active/banich/studies/wmem/fmri/subjects/'

# Parse every matching file, then order the rows by subject, run label and
# source-line position ('index') and renumber them from 0.
# 'run' holds strings such as 'run1', so it sorts lexicographically.
code_df = (process_all_subjects(base_path)
           .sort_values(['sub','run', 'index'])
           .reset_index(drop=True)
          )

In [47]:
import pandas as pd
import os

def get_subject_directories(base_path):
    """Return the joined paths of every directory directly inside ``base_path``.

    Entries appear in the order ``os.listdir`` yields them; non-directories
    are left out.
    """
    candidates = (os.path.join(base_path, name) for name in os.listdir(base_path))
    return [path for path in candidates if os.path.isdir(path)]

def process_subjects(base_path, tr_seconds=0.46):
    """Convert every ``EV/TR_run*.txt`` file under ``base_path`` into event rows.

    Each TR file is read as a single column of labels, one per line. Row ``i``
    (0-based) is treated as spanning ``i * tr_seconds`` to
    ``(i + 1) * tr_seconds``. Consecutive rows carrying the same label are
    merged into one event that starts where the first row starts and ends
    where the last row ends.

    Sub-directories without an ``EV`` folder, or whose ``EV`` folder holds no
    ``TR_run*.txt`` file, are reported with a printed message and skipped.

    Args:
        base_path: Directory whose sub-directories are treated as subjects;
            the sub-directory name is used as the subject identifier.
        tr_seconds: Seconds covered by one row of a TR file. Defaults to 0.46
            (presumably the scan repetition time; confirm).

    Returns:
        DataFrame with columns ``sub``, ``run``, ``onset``, ``duration`` and
        ``trial_type``, sorted by ``sub`` then ``run``. When no TR file is
        found anywhere, an empty DataFrame with those columns is returned
        instead of letting ``pd.concat`` raise on an empty list.
    """
    output_columns = ['sub', 'run', 'onset', 'duration', 'trial_type']
    all_data = []

    for subject_dir in sorted(get_subject_directories(base_path)):
        # The subject identifier is the directory name itself
        subject_number = os.path.basename(subject_dir)
        ev_dir = os.path.join(subject_dir, 'EV')

        if not os.path.isdir(ev_dir):
            print(f"EV directory not found for subject {subject_number}. Skipping...")
            continue

        # Sorted so that files are processed in a reproducible order
        tr_files = sorted(
            file for file in os.listdir(ev_dir)
            if file.startswith('TR_run') and file.endswith('.txt')
        )
        if not tr_files:
            print(f"No TR files found in the EV directory for subject {subject_number}. Skipping...")
            continue

        for tr_file in tr_files:
            # Text after the last '_' and before the first '.', e.g. 'TR_run1.txt' -> 'run1'
            run_number = tr_file.split('_')[-1].split('.')[0]
            tr_path = os.path.join(ev_dir, tr_file)

            # reset_index() turns the row position into a column; after the
            # rename, 'TR' is that position and 'cat' is the label from the file.
            df = pd.read_csv(tr_path, header=None, names=['TR'], encoding='utf-8').reset_index()
            df.columns = ['TR', 'cat']

            df['TR'] = df['TR'] + 1                  # 1-based row number
            df['onset'] = df.index * tr_seconds      # start time of the row
            df['end'] = df['TR'] * tr_seconds        # end time of the row

            # Run-length grouping: the id increases whenever the label changes
            df['group'] = (df['cat'] != df['cat'].shift()).cumsum()
            df['start'] = df.groupby('group')['onset'].transform('first')
            df['finish'] = df.groupby('group')['end'].transform('last')

            events = (
                df.drop(columns=['group'])
                # 'finish' is constant within a run of labels, so this keeps
                # the first row of each run
                .drop_duplicates('finish')
                .assign(
                    duration=lambda d: d['finish'] - d['start'],
                    run=run_number,
                    sub=subject_number,
                )
                [['sub', 'run', 'start', 'duration', 'cat']]
                .rename(columns={'start': 'onset', 'cat': 'trial_type'})
                .reset_index(drop=True)
            )
            all_data.append(events)

    if not all_data:
        print("No TR files found.")
        return pd.DataFrame(columns=output_columns)

    result_df = (pd.concat(all_data, ignore_index=True)
                 .sort_values(['sub', 'run'])
                 .reset_index(drop=True)
                )

    return result_df

# Root directory whose sub-directories are treated as subjects
base_path = '/pl/active/banich/studies/wmem/fmri/subjects/'
# Build the onset/duration table from every 'EV/TR_run*.txt' file found below it;
# directories without an EV folder are reported and skipped.
result_df = process_subjects(base_path)

EV directory not found for subject .ipynb_checkpoints. Skipping...
EV directory not found for subject rest_results. Skipping...
EV directory not found for subject wmem_results. Skipping...


In [48]:
# code_df and result_df are paired row by row (by position in their 0..n-1
# indexes), so they must have the same number of rows.
if len(code_df) != len(result_df):
    raise ValueError(
        f"code_df has {len(code_df)} rows but result_df has {len(result_df)}; "
        "they cannot be paired row by row."
    )

# Positional column pick from the side-by-side frame:
#   result_df 'sub' (4), result_df 'run' (5), code_df 'op' (3),
#   result_df 'trial_type' (8), result_df 'onset' (6), result_df 'duration' (7)
final_df = pd.concat([code_df, result_df], axis=1).iloc[:, [4,5,3,8, 6,7]].copy()

# Mapping old operation names to new ones
op_mapping = {
    'FIX':"fix",
    'repCat': 'replace',
    'target': 'suppress',
    'repItem': 'replace_remove',
    'global': 'clear',
    'maintain': 'maintain'
}

final_df['op'] = final_df['op'].map(op_mapping)
# .copy() so the column assignment below acts on an independent frame
final_df = final_df.query('op != "replace_remove" & op != "fix"').copy()

# The remaining rows are consumed as consecutive triplets and the first row of
# every triplet is discarded. The cycle length is derived from the data rather
# than from a hard-coded row count, but an incomplete triplet is still an error.
n_rows = len(final_df)
if n_rows % 3 != 0:
    raise ValueError(f"Expected a multiple of 3 rows after filtering, got {n_rows}.")
final_df['selector'] = np.arange(n_rows) % 3 + 1
final_df = final_df.query('selector != 1').drop('selector', axis=1)
#final_df['onset'] = final_df['onset'] + 4.6
final_df = (final_df.rename({'op':'condition'}, axis=1)
            .drop('trial_type', axis=1)
            .sort_values(['sub', 'run', 'onset'])
            .reset_index(drop=True)
           )

# Consecutive pairs of rows (in sub/run/onset order) share one 1-based group id
final_df['group'] = (final_df.index // 2) + 1

In [49]:
# Write the row-level table (rows still carry their pairwise 'group' id) as CSV, without the index
final_df.to_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/conn/design_conditions_not_combined.csv', index=False)

In [50]:
# Define the function to aggregate based on a group ID
def aggregate_group(df, group_id):
    """Collapse all rows of one group into a single-row DataFrame.

    The result copies every field of the group's first row (keeping that
    row's index label), except ``duration``, which is replaced by the sum of
    ``duration`` over the whole group.
    """
    members = df.query(f'group == {group_id}')

    # First member as a one-row frame: Series -> single column -> transpose
    first_row = members.iloc[0].to_frame().T

    return first_row.assign(duration=members['duration'].sum())

# Build one aggregated row per distinct group id and stack them with a fresh index.
# NOTE(review): aggregate_group filters the whole of final_df on every call, so this
# rescans the frame once per group; a single groupby may be cheaper — confirm before changing.
unique_groups = final_df['group'].unique()
aggregated_results = pd.concat([aggregate_group(final_df, gid) for gid in unique_groups], ignore_index=True)

# Order by subject, run and onset, and drop the helper 'group' column
aggregated_df = aggregated_results.sort_values(['sub', 'run', 'onset']).drop('group', axis=1)

In [51]:
# Store subject identifiers as Python strings
aggregated_df['sub'] = aggregated_df['sub'].astype(str)

In [52]:
# Write the aggregated table (one row per group) as CSV, without the index
aggregated_df.to_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/conn/design_conditions.csv', index=False)

In [53]:
# Reference sets: every run label and every condition that occurs anywhere in aggregated_df.
# NOTE(review): these are derived from the same frame that is checked, so the check
# can only detect subjects that differ from the union, not values missing for everyone.
expected_runs = set(aggregated_df['run'].unique())
expected_conditions = set(aggregated_df['condition'].unique())

# Function to verify if each subject has all runs and conditions
def all_subjects_have_complete_data(df, expected_runs, expected_conditions):
    """Tell whether every subject has exactly the expected runs and conditions.

    For each distinct value of ``df['sub']`` the set of its ``run`` values and
    the set of its ``condition`` values are compared with ``expected_runs``
    and ``expected_conditions``. Returns ``False`` at the first subject whose
    sets differ, ``True`` otherwise (including when ``df`` has no rows).
    """
    def _is_complete(sub):
        rows = df[df['sub'] == sub]
        return (
            set(rows['run'].unique()) == expected_runs
            and set(rows['condition'].unique()) == expected_conditions
        )

    return all(_is_complete(sub) for sub in df['sub'].unique())

# True only if every subject's set of runs and set of conditions equal the reference sets
result = all_subjects_have_complete_data(aggregated_df, expected_runs, expected_conditions)
# The check compares sets of values, not counts of rows
print(f'Subjects have same number of runs and conditions: {result}')

Subjects have same number of runs and conditions: True


In [54]:
# Show the aggregated table as the cell's output
aggregated_df

Unnamed: 0,sub,run,condition,onset,duration
0,001,run1,replace,16.56,6.90
1,001,run1,suppress,26.22,5.52
2,001,run1,replace,34.5,5.98
3,001,run1,maintain,43.24,5.52
4,001,run1,replace,75.44,5.06
...,...,...,...,...,...
17275,084,run6,suppress,459.54,6.44
17276,084,run6,clear,476.56,6.44
17277,084,run6,maintain,485.76,5.52
17278,084,run6,suppress,494.04,5.98


In [55]:
# Reload the aggregated table from the CSV written earlier, replacing the in-memory frame.
# NOTE(review): read_csv infers dtypes, so 'sub' presumably comes back as integers
# (1 rather than '001'), which would explain the re-formatting that follows — confirm.
aggregated_df = pd.read_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/conn/design_conditions.csv')

# Define a function to format the 'sub' column
def format_sub_column(sub):
    """Render a numeric subject id as text, zero-padded to at least three characters.

    Values below 100 are formatted with the ``03`` spec (``1 -> '001'``,
    ``42 -> '042'``); any other value is formatted without padding.
    """
    if sub < 100:
        return f'{sub:03}'
    return f'{sub}'

# Replace each subject id with its zero-padded string form
aggregated_df['sub'] = aggregated_df['sub'].apply(format_sub_column)

# Show the re-formatted table as the cell's output
aggregated_df

Unnamed: 0,sub,run,condition,onset,duration
0,001,run1,replace,16.56,6.90
1,001,run1,suppress,26.22,5.52
2,001,run1,replace,34.50,5.98
3,001,run1,maintain,43.24,5.52
4,001,run1,replace,75.44,5.06
...,...,...,...,...,...
17275,084,run6,suppress,459.54,6.44
17276,084,run6,clear,476.56,6.44
17277,084,run6,maintain,485.76,5.52
17278,084,run6,suppress,494.04,5.98


In [56]:
#aggregated_df['comb_condition'] = aggregated_df.run + '_' + aggregated_df.condition

In [57]:
#aggregated_df = aggregated_df[['sub', 'run', 'comb_condition', 'onset', 'duration']]
# Assign the five column names positionally.
# NOTE(review): the frame displayed just before already shows exactly these names, so this
# looks like a no-op left over from the commented-out 'comb_condition' variant — confirm.
aggregated_df.columns = ['sub', 'run', 'condition', 'onset', 'duration']

In [58]:
# Show the table again as the cell's output
aggregated_df

Unnamed: 0,sub,run,condition,onset,duration
0,001,run1,replace,16.56,6.90
1,001,run1,suppress,26.22,5.52
2,001,run1,replace,34.50,5.98
3,001,run1,maintain,43.24,5.52
4,001,run1,replace,75.44,5.06
...,...,...,...,...,...
17275,084,run6,suppress,459.54,6.44
17276,084,run6,clear,476.56,6.44
17277,084,run6,maintain,485.76,5.52
17278,084,run6,suppress,494.04,5.98


In [59]:
#session1 = aggregated_df.query('sub == "001" & run == "run1"')
#session2 = aggregated_df.query('sub == "001" & run == "run2"')
#session3 = aggregated_df.query('sub == "001" & run == "run3"')
#session4 = aggregated_df.query('sub == "001" & run == "run4"')
#session5 = aggregated_df.query('sub == "001" & run == "run5"')
#session6 = aggregated_df.query('sub == "001" & run == "run6"')

#session1.to_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/conn/sub-001_ses-001_run1_events.csv')
#session2.to_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/conn/sub-001_ses-001_run2_events.csv')
#session3.to_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/conn/sub-001_ses-001_run3_events.csv')
#session4.to_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/conn/sub-001_ses-001_run4_events.csv')
#session5.to_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/conn/sub-001_ses-001_run5_events.csv')
#session6.to_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/conn/sub-001_ses-001_run6_events.csv')

In [60]:
import pandas as pd

# Rename the five columns positionally; with the names set above this maps
# sub -> SUBJECT, run -> SESSION, condition -> trial_type (onset/duration unchanged).
aggregated_df.columns = ['SUBJECT', 'SESSION', 'trial_type', 'onset', 'duration']

# Mapping from run label to task name.
# Not used by any active code in this cell; its only use is commented out below.
run_to_task = {
    'run1': 'task1',
    'run2': 'task2',
    'run3': 'task3',
    'run4': 'task4',
    'run5': 'task5',
    'run6': 'task6'
}

# Mapping from run label to integer session number: run1..run6 -> 3..8
run_to_session = {
    'run1': 3,
    'run2': 4,
    'run3': 5,
    'run4': 6,
    'run5': 7,
    'run6': 8
}


# Replace the run labels in SESSION with their session numbers.
# NOTE(review): this overwrites SESSION in place; labels missing from the mapping become
# NaN, so re-running the cell on already-mapped values would blank the column.
#aggregated_df['task'] = aggregated_df['SESSION'].map(run_to_task)
aggregated_df['SESSION'] = aggregated_df['SESSION'].map(run_to_session)

# Sort the DataFrame by SESSION and onset for chronological order
aggregated_df_sorted = aggregated_df.sort_values(by=['SESSION', 'onset'])

# Subject identifier used for the output file name; nothing is exported in this cell
subject_id = '001'  # Replace with your desired subject ID
# (no further code in this cell)

In [61]:
# Build the path for a per-subject events file; nothing is written in this cell
subject_id = '001'
file_name = f"/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/conn/sub-{subject_id}_ses-001_all_tasks_events.tsv"

In [62]:
# Rows of the sorted table belonging to subject "001"
aggregated_df_001 = aggregated_df_sorted.query('SUBJECT == "001"')

# Subset of those rows with trial_type "replace"
# (not referenced again in the cells shown here)
aggregated_df_001_replace = aggregated_df_001.query('trial_type == "replace"')


def conn_design_setup(df):
    """Collapse an events table to one row per (SUBJECT, SESSION, trial_type).

    ``onset`` and ``duration`` are rounded to two decimals, collected per
    group in row order, and written as single space-separated strings. The
    ``SUBJECT`` column of the result is blanked to ``''``.

    The input frame is not modified: rounding happens on a derived frame, so
    passing a filtered slice no longer writes into the caller's data (which
    also triggered a SettingWithCopyWarning).

    Args:
        df: DataFrame with columns ``SUBJECT``, ``SESSION``, ``trial_type``,
            ``onset`` and ``duration``.

    Returns:
        DataFrame with columns ``SUBJECT``, ``SESSION``, ``trial_type``,
        ``onset`` and ``duration``; the last two hold strings.
    """
    # Round onset and duration to the second decimal without touching `df`
    rounded = df.assign(
        onset=df['onset'].round(2),
        duration=df['duration'].round(2),
    )

    # Collect the onsets and durations of each group into lists
    new_df = rounded.groupby(['SUBJECT', 'SESSION', 'trial_type']).agg({
        'onset': lambda x: list(x),
        'duration': lambda x: list(x)
    }).reset_index()

    # Serialise each list as one space-separated string
    for column in ('onset', 'duration'):
        new_df[column] = new_df[column].apply(
            lambda x: ' '.join(map(str, x)) if isinstance(x, list) else str(x)
        )

    # Set subject_number to blank
    new_df['SUBJECT'] = ''

    return new_df


# One row per (session, trial_type) for subject "001", ordered by trial_type then SESSION
wm_conn = conn_design_setup(aggregated_df_001).sort_values(['trial_type', 'SESSION'])

# Eight 'rest' rows, one for each of sessions 1-8, with onset '0' and duration 'Inf'
# stored as strings and a blank SUBJECT, matching the columns of wm_conn
rest_conn = pd.DataFrame({
    'SUBJECT': ['']*8,
    'SESSION': [1, 2, 3, 4, 5, 6, 7, 8],
    'trial_type': ['rest']*8,
    'onset': ['0']*8,
    'duration': ['Inf']*8
})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == "__main__":


In [63]:
# Stack the rest rows on top of the task rows and renumber the index from 0
sub_conn_design_matrix = pd.concat([rest_conn, wm_conn]).reset_index(drop=True)

# Store session numbers as strings
sub_conn_design_matrix['SESSION'] = sub_conn_design_matrix['SESSION'].astype(str)
#sub_conn_design_matrix['onset'] = sub_conn_design_matrix['onset'].astype(int)
#sub_conn_design_matrix['duration'] = sub_conn_design_matrix['duration'].astype(int)

In [64]:
# Show the combined rest + task table as the cell's output
sub_conn_design_matrix

Unnamed: 0,SUBJECT,SESSION,trial_type,onset,duration
0,,1,rest,0,Inf
1,,2,rest,0,Inf
2,,3,rest,0,Inf
3,,4,rest,0,Inf
4,,5,rest,0,Inf
5,,6,rest,0,Inf
6,,7,rest,0,Inf
7,,8,rest,0,Inf
8,,3,clear,83.26 125.58 133.4 159.62 176.64 193.2 217.58 ...,6.44 5.06 5.52 5.52 5.98 5.06 5.06 5.06 5.98 5...
9,,4,clear,32.66 58.42 91.08 108.1 141.68 157.32 190.44 3...,5.52 5.98 5.98 5.06 5.06 5.52 5.98 5.06 5.06 5...


In [65]:
# Target column order: [conditions, nsubs, nsess, onsets, durations]

# Reorder the columns so the condition name comes first
sub_conn_design_matrix = sub_conn_design_matrix[['trial_type', 'SUBJECT', 'SESSION', 'onset', 'duration']]

# Rename positionally: trial_type -> condition_name, SUBJECT -> subject_number,
# SESSION -> session_number, onset -> onsets, duration -> durations
sub_conn_design_matrix.columns = ['condition_name','subject_number','session_number','onsets','durations']

In [66]:
# Output path; this replaces any earlier value of file_name
file_name = '/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/conn/sub-001_ses-001_all_tasks_events.csv'
# Write as comma-separated CSV (to_csv default separator; not tab-separated), without the index
sub_conn_design_matrix.to_csv(file_name, index=False)

In [71]:
# Seven 'rest' rows, one for each of sessions 1-7, with onset '0' and duration 'Inf'
# stored as strings and a blank SUBJECT
rest_conn_rest1 = pd.DataFrame({
    'SUBJECT': ['']*7,
    'SESSION': [1, 2, 3, 4, 5, 6, 7],
    'trial_type': ['rest']*7,
    'onset': ['0']*7,
    'duration': ['Inf']*7
})


In [69]:
# Task rows with every session number lowered by one.
# Built as a new frame so that wm_conn itself keeps its session numbers and
# re-running this cell does not subtract a second time.
wm_conn_rest1 = wm_conn.assign(SESSION=wm_conn['SESSION'] - 1)
wm_conn_rest1

Unnamed: 0,SUBJECT,SESSION,trial_type,onset,duration
0,,2,clear,83.26 125.58 133.4 159.62 176.64 193.2 217.58 ...,6.44 5.06 5.52 5.52 5.98 5.06 5.06 5.06 5.98 5...
4,,3,clear,32.66 58.42 91.08 108.1 141.68 157.32 190.44 3...,5.52 5.98 5.98 5.06 5.06 5.52 5.98 5.06 5.06 5...
8,,4,clear,60.72 93.84 111.32 194.12 254.84 320.62 421.36...,5.52 5.52 5.52 5.52 5.52 6.44 5.06 5.98 6.44 5...
12,,5,clear,16.56 42.32 166.52 174.34 182.62 208.84 234.14...,6.9 5.52 5.06 5.52 5.52 5.98 5.06 5.06 5.06 5....
16,,6,clear,16.56 74.98 91.08 174.8 217.58 251.62 316.02 3...,5.06 5.06 6.44 5.98 5.98 5.52 5.98 5.06 6.44 6...
20,,7,clear,86.48 102.12 159.16 167.44 291.64 334.42 358.8...,5.06 5.52 5.52 6.44 5.52 5.06 5.06 5.52 5.52 6...
1,,2,maintain,43.24 108.56 185.38 209.3 250.24 258.52 334.88...,5.52 5.98 5.06 5.52 5.52 5.52 5.06 6.44 5.98 6...
5,,3,maintain,16.56 49.22 75.44 83.26 258.06 293.02 324.76 3...,5.06 6.44 5.06 5.06 5.98 5.52 5.98 5.98 5.98 5...
9,,4,maintain,51.52 119.6 136.62 144.9 178.48 219.88 246.1 2...,6.44 5.98 5.52 5.52 5.06 5.98 5.98 5.52 5.52 5...
13,,5,maintain,107.64 125.12 158.7 297.62 322.92 348.22 365.7...,6.44 5.06 5.06 5.98 5.06 5.52 5.52 6.44 5.06 5...


In [73]:
# Stack the seven rest rows on top of the shifted task rows and renumber the index from 0
sub_conn_design_matrix_rest1 = pd.concat([rest_conn_rest1, wm_conn_rest1]).reset_index(drop=True)

# Store session numbers as strings
sub_conn_design_matrix_rest1['SESSION'] = sub_conn_design_matrix_rest1['SESSION'].astype(str)
#sub_conn_design_matrix['onset'] = sub_conn_design_matrix['onset'].astype(int)
#sub_conn_design_matrix['duration'] = sub_conn_design_matrix['duration'].astype(int)


In [75]:
# Target column order: [conditions, nsubs, nsess, onsets, durations]

# Reorder the columns so the condition name comes first
sub_conn_design_matrix_rest1 = sub_conn_design_matrix_rest1[['trial_type', 'SUBJECT', 'SESSION', 'onset', 'duration']]

# Rename positionally: trial_type -> condition_name, SUBJECT -> subject_number,
# SESSION -> session_number, onset -> onsets, duration -> durations
sub_conn_design_matrix_rest1.columns = ['condition_name','subject_number','session_number','onsets','durations']

In [76]:
# Show the seven-session rest + task table as the cell's output
sub_conn_design_matrix_rest1

Unnamed: 0,condition_name,subject_number,session_number,onsets,durations
0,rest,,1,0,Inf
1,rest,,2,0,Inf
2,rest,,3,0,Inf
3,rest,,4,0,Inf
4,rest,,5,0,Inf
5,rest,,6,0,Inf
6,rest,,7,0,Inf
7,clear,,2,83.26 125.58 133.4 159.62 176.64 193.2 217.58 ...,6.44 5.06 5.52 5.52 5.98 5.06 5.06 5.06 5.98 5...
8,clear,,3,32.66 58.42 91.08 108.1 141.68 157.32 190.44 3...,5.52 5.98 5.98 5.06 5.06 5.52 5.98 5.06 5.06 5...
9,clear,,4,60.72 93.84 111.32 194.12 254.84 320.62 421.36...,5.52 5.52 5.52 5.52 5.52 6.44 5.06 5.98 6.44 5...


In [77]:
# Output path; this replaces any earlier value of file_name
file_name = '/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/conn/sub-001_ses-001_all_tasks_rest1_events.csv'
# Write as comma-separated CSV (to_csv default separator; not tab-separated), without the index
sub_conn_design_matrix_rest1.to_csv(file_name, index=False)