## Combine columns of Sternberg BIDS and EEG events for processing

This script starts with the Sternberg data.
A copy of the EEG.event structure is dumped to the dataset as `_events_temp.tsv` files.
The `sternberg_01_initial_summary.ipynb` has already been run and
indicates that the corresponding versions of the event files have
the same number of events.

This notebook creates a `_events_temp1.tsv` file for each BIDS events file using the following steps:
1. Check that the respective event files have the same number of events.
2. Remove the `trial_type`, `response_time`, `stim_file` columns from the bids events file.
3. Save as `_events_temp1.tsv`.

In [1]:
from hed.tools import BidsTsvDictionary, HedLogger
from hed.util import get_file_list, get_new_dataframe

# Set the specific variables for the Sternberg dataset.
# A raw string is used for the Windows path because '\S' is not a valid escape sequence.
bids_root_path = r'G:\Sternberg\SternbergWorking'
exclude_dirs = ['sourcedata', 'stimuli', 'code']
entities = ('sub', 'ses', 'run')
bids_cols_remove = ['trial_type', 'response_time', 'stim_file']
final_columns = ['onset', 'duration', 'sample', 'value']

# Set up the logger and create the file dictionaries.
# Both searches pass the exclude_dirs list defined above.
status = HedLogger()
bids_files = get_file_list(bids_root_path, extensions=[".tsv"], exclude_dirs=exclude_dirs,
                           name_suffix='_events')
bids_dict = BidsTsvDictionary(bids_files, entities=entities)
eeg_files = get_file_list(bids_root_path, extensions=[".tsv"], exclude_dirs=exclude_dirs,
                          name_suffix="_events_temp")
eeg_dict = BidsTsvDictionary(eeg_files, entities=entities)

# Perform the operations to combine the two versions of the event files.
# Keys that fail a check are collected so that every file is examined before stopping.
fatal_error_keys = []
for key, file, rowcount, column_count in bids_dict.iter_tsv_info():
    df_bids = get_new_dataframe(file.file_path)
    status.add(key, f"Created a dataframe for {file.file_path}")

    # The EEG dataframe is only loaded and logged here; its contents are not used below.
    eeg_path = eeg_dict.get_file(key).file_path
    df_eeg = get_new_dataframe(eeg_path)
    status.add(key, f"Created a dataframe for {eeg_path}")

    # Verify that the two versions of the events file have the same number of rows
    eeg_rowcount = eeg_dict.rowcount_dict[key]
    if rowcount != eeg_rowcount:
        status.add(key, f"---ERROR--- the BIDs file has {rowcount} rows and the EEG file has "
                        f"{eeg_rowcount} rows", also_print=True)
        fatal_error_keys.append(key)
        continue
    status.add(key, "Bids event file and EEG.set event structure have the same number of events")

    # Drop the extra columns
    drop_columns = bids_cols_remove
    df_bids.drop(columns=drop_columns, inplace=True)
    status.add(key, f"Dropped {str(drop_columns)} columns")

    # Make sure the dataframe has the correct final columns.
    # The symmetric difference reports unexpected columns as well as missing ones.
    extra_cols = set(df_bids.columns).symmetric_difference(set(final_columns))
    if extra_cols:
        status.add(key, f"---ERROR--- Columns {str(extra_cols)} do not match expected.", also_print=True)
        fatal_error_keys.append(key)
        continue

    # Do a final reordering for uniformity
    df_bids = df_bids[final_columns]
    status.add(key, f"Reordered the final columns as {str(final_columns)}")

    # Replace the trailing ".tsv" of the BIDS events path with "_temp1.tsv"
    filename = file.file_path[:-4] + "_temp1.tsv"
    df_bids.to_csv(filename, sep='\t', index=False)
    status.add(key, f"Saved as {filename}")

if fatal_error_keys:
    print(f"There were fatal event file errors for {str(fatal_error_keys)}. These should be fixed before continuing")
    # NOTE(review): presumably raised to halt a run-all of the notebook at this cell -- confirm
    raise KeyboardInterrupt

In [2]:
# Rows to remove from each `_events_temp1.tsv` file.
# Each key is looked up in bids_dict; each value lists the row labels passed to DataFrame.drop.
key_dict = {
    'sub-001_ses-01_run-2': [0],
    'sub-001_ses-01_run-3': [0],
    'sub-001_ses-01_run-4': [0],
    'sub-002_ses-01_run-2': [0, 1],
    'sub-002_ses-01_run-3': [0],
    'sub-002_ses-01_run-4': [0],
    'sub-003_ses-01_run-2': [0],
    'sub-003_ses-01_run-3': [0],
    'sub-003_ses-01_run-4': [0],
    'sub-004_ses-01_run-2': [0],
    'sub-004_ses-01_run-3': [0],
    'sub-004_ses-01_run-4': [0],
    'sub-005_ses-01_run-2': [0],
    'sub-005_ses-01_run-3': [0],
    'sub-005_ses-01_run-4': [0],
    'sub-006_ses-01_run-2': [0],
    'sub-006_ses-01_run-3': [0],
    'sub-006_ses-01_run-4': [0],
    'sub-007_ses-01_run-2': [0],
    'sub-007_ses-01_run-3': [0],
    'sub-007_ses-01_run-4': [0],
    'sub-008_ses-01_run-2': [0],
    'sub-008_ses-01_run-3': [0],
    'sub-008_ses-01_run-4': [0],
    'sub-009_ses-01_run-2': [0],
    'sub-009_ses-01_run-3': [0],
    'sub-009_ses-01_run-4': [0],
    'sub-010_ses-01_run-2': [0, 1],
    'sub-010_ses-01_run-3': [0],
    'sub-010_ses-01_run-4': [0],
    'sub-011_ses-01_run-2': [0, 1],
    'sub-011_ses-01_run-3': [0],
    'sub-011_ses-01_run-4': [0],
    'sub-012_ses-01_run-2': [0],
    'sub-012_ses-01_run-3': [0],
    'sub-014_ses-01_run-2': [0],
    'sub-014_ses-01_run-3': [0],
    'sub-015_ses-01_run-2': [0],
    'sub-015_ses-01_run-3': [0],
    'sub-016_ses-01_run-2': [0],
    'sub-016_ses-01_run-3': [0],
    'sub-017_ses-01_run-2': [0],
    'sub-017_ses-01_run-3': [0],
    'sub-018_ses-01_run-2': [0],
    'sub-018_ses-01_run-3': [0],
    'sub-019_ses-01_run-2': [0],
    'sub-019_ses-01_run-3': [0],
    'sub-020_ses-01_run-2': [0],
    'sub-020_ses-01_run-3': [0],
    'sub-021_ses-01_run-2': [0],
    'sub-021_ses-01_run-3': [0],
    'sub-022_ses-01_run-1': [307],
    'sub-022_ses-01_run-2': [0, 196],
    # NOTE(review): this literal previously also held 'sub-022_ses-01_run-4': [0, 1] directly
    # before the entry below. A repeated key keeps only the last value, so that entry never took
    # effect and has been removed. It may have been intended for run-3 -- verify against the data.
    'sub-022_ses-01_run-4': [0],
    'sub-022_ses-01_run-5': [0, 1],
    'sub-022_ses-01_run-6': [0, 1],
    'sub-023_ses-01_run-2': [0],
    'sub-023_ses-01_run-3': [0],
    'sub-023_ses-01_run-4': [0, 1],
    'sub-023_ses-01_run-5': [0, 1],
    'sub-024_ses-01_run-2': [0],
    'sub-024_ses-01_run-3': [0],
}
# Fix boundary and empty events for several runs.
# The values in key_dict are row labels of the dataframe as loaded, before any row is removed,
# so all rows for a file are dropped in a single call after each has been logged.
# The file is overwritten in place, so running this cell a second time would remove the rows
# that now occupy those positions.
for key, values in key_dict.items():
    file = bids_dict.get_file(key)
    filename = file.file_path[:-4] + "_temp1.tsv"
    df = get_new_dataframe(filename)
    status.add(key, f"Number of rows {len(df.index)}", also_print=True)
    for val in values:
        status.add(key, f"Dropping row {str(val)}:\n{df.loc[[val]]}", also_print=True)
    # No reset_index here: without drop=True it would add an 'index' column to the saved file,
    # and renumbering between drops would make later labels point at the wrong rows.
    df = df.drop(index=values)
    status.add(key, f"Saving {filename}")
    df.to_csv(filename, sep='\t', index=False)

sub-001_ses-01_run-2: Number of rows 351
sub-001_ses-01_run-2: Dropping row 0:
   onset duration  sample     value
0 -0.002      n/a    -0.5  boundary
sub-001_ses-01_run-3: Number of rows 351
sub-001_ses-01_run-3: Dropping row 0:
   onset duration  sample     value
0 -0.002      n/a    -0.5  boundary
sub-001_ses-01_run-4: Number of rows 351
sub-001_ses-01_run-4: Dropping row 0:
   onset duration  sample     value
0 -0.002      n/a    -0.5  boundary
sub-002_ses-01_run-2: Number of rows 352
sub-002_ses-01_run-2: Dropping row 0:
   onset           duration  sample     value
0 -0.002  181119.0000000000    -0.5  boundary
sub-002_ses-01_run-2: Dropping row 1:
   onset duration  sample     value
1  0.002      n/a     0.5  boundary
sub-002_ses-01_run-3: Number of rows 351
sub-002_ses-01_run-3: Dropping row 0:
   onset duration  sample     value
0 -0.002      n/a    -0.5  boundary
sub-002_ses-01_run-4: Number of rows 351
sub-002_ses-01_run-4: Dropping row 0:
   onset duration  sample     value


In [3]:
# Print the accumulated log, then save it using the dataset root and the 'code' sub-path.
# NOTE(review): the 'as_' prefix of the log name looks carried over from another dataset's
# notebook -- confirm the intended file name.
log_name = 'as_hed_02_initial_combination_log.json'
status.print_log()
status.save_log(bids_root_path, sub_path='code', log_name=log_name)

sub-001_ses-01_run-1
	Created a dataframe for G:\Sternberg\SternbergWorking\sub-001\ses-01\eeg\sub-001_ses-01_task-Experiment_run-1_events.tsv
	Created a dataframe for G:\Sternberg\SternbergWorking\sub-001\ses-01\eeg\sub-001_ses-01_task-Experiment_run-1_events_temp.tsv
	Bids event file and EEG.set event structure have the same number of events
	Dropped ['trial_type', 'response_time', 'stim_file'] columns
	Reordered the final columns as ['onset', 'duration', 'sample', 'value']
	Saved as G:\Sternberg\SternbergWorking\sub-001\ses-01\eeg\sub-001_ses-01_task-Experiment_run-1_events_temp1.tsv
sub-001_ses-01_run-2
	Created a dataframe for G:\Sternberg\SternbergWorking\sub-001\ses-01\eeg\sub-001_ses-01_task-Experiment_run-2_events.tsv
	Created a dataframe for G:\Sternberg\SternbergWorking\sub-001\ses-01\eeg\sub-001_ses-01_task-Experiment_run-2_events_temp.tsv
	Bids event file and EEG.set event structure have the same number of events
	Dropped ['trial_type', 'response_time', 'stim_file'] column