## Combine columns of Attention Shift BIDS and EEG events for processing

This script starts with the data that was uploaded to OpenNeuro as ds002893.
A copy of the EEG.event structure is dumped to the dataset as `_events_temp.tsv` files.
The `attention_shift_01_initial_summary.ipynb` has already been run and
indicates that the corresponding versions of the event files have
the same number of events.

This notebook creates a `_events_temp1.tsv` file for each events file by performing the following steps:
1. Check that the respective event files have the expected columns.
2. Combine the dataframes representing the two versions of the file along the columns.
3. Make sure that the `sample` column has the same values as the corresponding values in the
`sample_offset` and `latency` columns.
4. Remove the columns `trial_type`, `value`, `response_time`, `stim_file`, `HED`,
`sample_offset`, `latency`, `urevent`, and `usertags` from the combined data frame
(the `type` column is retained in the output).
5. Save as `_events_temp1.tsv`.

In [1]:
import pandas as pd
from hed.tools import BidsTsvDictionary, HedLogger
from hed.util import get_file_list, get_new_dataframe

# Set the specific variables for the Attention Shift dataset.
# The root path is a raw string so the Windows backslashes are never read as escape sequences.
bids_root_path = r'G:\AttentionShift\AttentionShiftWorking'
exclude_dirs = ['sourcedata']
entities = ('sub', 'run')
bids_cols_expected = ['onset', 'duration', 'sample', 'trial_type', 'response_time', 'stim_file', 'value', 'HED']
bids_cols_remove = ['trial_type', 'value', 'response_time', 'stim_file', 'HED']
eeg_cols_expected = ['sample_offset', 'event_code', 'cond_code', 'type', 'latency', 'urevent', 'usertags']
eeg_cols_remove = ['sample_offset', 'latency', 'urevent', 'usertags']
final_cols = ['onset', 'duration', 'sample', 'event_code', 'cond_code', 'type']

# Set up the logger and create the file dictionaries (one for the BIDS event
# files and one for the dumped EEG.event files), keyed by the same entities.
status = HedLogger()
bids_files = get_file_list(bids_root_path, extensions=[".tsv"], exclude_dirs=exclude_dirs,
                           name_suffix='_events')
bids_dict = BidsTsvDictionary(bids_files, entities=entities)
eeg_files = get_file_list(bids_root_path, extensions=[".tsv"], exclude_dirs=exclude_dirs,
                          name_suffix="_events_temp")
eeg_dict = BidsTsvDictionary(eeg_files, entities=entities)

# Keys of the files that could not be processed; reported after the loop.
fatal_error_keys = []

for key, bids_file, rowcount, column_count in bids_dict.iter_tsv_info():
    df_bids = get_new_dataframe(bids_file.file_path)
    status.add(key, f"Created a dataframe for {bids_file.file_path}")

    eeg_file = eeg_dict.get_file(key)
    df_eeg = get_new_dataframe(eeg_file.file_path)
    status.add(key, f"Created a dataframe for {eeg_file.file_path}")

    # Check that both files have the expected columns. Without this check a
    # missing column surfaces later as an unlogged KeyError in the comparison
    # or drop steps instead of being recorded as a fatal error for this key.
    missing_bids = [col for col in bids_cols_expected if col not in df_bids.columns]
    missing_eeg = [col for col in eeg_cols_expected if col not in df_eeg.columns]
    if missing_bids or missing_eeg:
        status.add(key, f"ERROR {key}: the BIDS file is missing columns {missing_bids} and "
                        f"the EEG file is missing columns {missing_eeg}", also_print=True)
        fatal_error_keys.append(key)
        continue

    # Combine the two versions of the events file after verifying they have same number of rows
    eeg_rowcount = eeg_dict.rowcount_dict[key]
    if rowcount != eeg_rowcount:
        status.add(key, f"ERROR {key}: the BIDS file has {rowcount} rows and the EEG file has "
                        f"{eeg_rowcount} rows", also_print=True)
        fatal_error_keys.append(key)
        continue
    df_out = pd.concat([df_bids, df_eeg], axis=1)
    status.add(key, "Concatenated the BIDS and EEG event files for processing")

    # Make sure that the sample, sample_offset, and latency columns are equal.
    # Values are compared as strings, so the count is the number of rows whose
    # textual representations differ.
    sample_text = df_out['sample'].map(str)
    offset_diff = int((sample_text != df_out['sample_offset'].map(str)).sum())
    latency_diff = int((sample_text != df_out['latency'].map(str)).sum())
    if offset_diff + latency_diff:
        status.add(key, f"ERROR {key}: sample col has {offset_diff} differences with sample_offset and "
                        f"{latency_diff} differences with latency", also_print=True)
        fatal_error_keys.append(key)
        continue
    status.add(key, "Verified sample column, sample_offset, and latency columns have same values")

    # Drop the extra columns
    drop_columns = bids_cols_remove + eeg_cols_remove
    df_out.drop(columns=drop_columns, inplace=True)
    status.add(key, f"Dropped {str(drop_columns)} drop_columns")

    # Make sure the dataframe has the correct final columns. Ordered lists are
    # used rather than sets: pandas 2.x rejects a set as a column indexer, and
    # lists keep the log messages deterministic.
    extra_cols = [col for col in df_out.columns if col not in final_cols]
    if extra_cols:
        df_out.drop(columns=extra_cols, inplace=True)
        status.add(key, f"Dropped extra columns {str(extra_cols)}")
    missing_cols = [col for col in final_cols if col not in df_out.columns]
    if missing_cols:
        for col in missing_cols:
            df_out[col] = 'n/a'
        status.add(key, f"Added missing columns {str(missing_cols)}")

    # Do a final reordering for uniformity
    df_out = df_out[final_cols]
    status.add(key, f"Reordered the final columns as {str(final_cols)}")

    # Replace the trailing ".tsv" with "_temp1.tsv" to form the output name.
    filename = bids_file.file_path[:-4] + "_temp1.tsv"
    df_out.to_csv(filename, sep='\t', index=False)
    status.add(key, f"Saved as {filename}")

if fatal_error_keys:
    print(f"There were fatal event file errors for {str(fatal_error_keys)}. These should be fixed before continuing")
    # NOTE(review): KeyboardInterrupt is presumably raised to halt notebook execution here -- confirm.
    raise KeyboardInterrupt


In [2]:
# Print the status messages that were recorded in the HedLogger for each file key.
status.print_log()

sub_001_run_01
	Created a dataframe for G:\AttentionShift\AttentionShiftWorking\sub-001\eeg\sub-001_task-AuditoryVisualShift_run-01_events.tsv
	Created a dataframe for G:\AttentionShift\AttentionShiftWorking\sub-001\eeg\sub-001_task-AuditoryVisualShift_run-01_events_temp.tsv
	Concatenated the BIDS and EEG event files for processing
	Verified sample column, sample_offset, and latency columns have same values
	Dropped ['trial_type', 'value', 'response_time', 'stim_file', 'HED', 'sample_offset', 'latency', 'urevent', 'usertags'] drop_columns
	Reordered the final columns as ['onset', 'duration', 'sample', 'event_code', 'cond_code', 'type']
	Saved as G:\AttentionShift\AttentionShiftWorking\sub-001\eeg\sub-001_task-AuditoryVisualShift_run-01_events_temp1.tsv
sub_002_run_01
	Created a dataframe for G:\AttentionShift\AttentionShiftWorking\sub-002\eeg\sub-002_task-AuditoryVisualShift_run-01_events.tsv
	Created a dataframe for G:\AttentionShift\AttentionShiftWorking\sub-002\eeg\sub-002_task-Audi