### Convert raw data to BIDS format
Adapted from John Veillette, see the original script [here](https://github.com/john-veillette/eeg-training-materials/blob/main/convert-to-bids.ipynb). I removed all the comments and explanations that were in the original notebook because only keeping a portion of them seems more confusing and irresponsible than redirecting anyone interested in how this script actually works to the original file with Johns exhaustive annotations.

In [1]:
from mne_bids import BIDSPath, write_raw_bids, get_anonymization_daysback
import pandas as pd
import numpy as np
import itertools
import mne
import os
import re

In [2]:
# Constants
DATA_DIR = '../data/raw/' # where our data currently lives
BIDS_DIR = '../data/bids/' # where we want it to live
MAPS_DIR = '../data/captrak/' # where the mapping and electrode location files love

#### Parse filenames

In [3]:
# Get filenames and digest them
fnames = os.listdir(DATA_DIR)
fnames = [f for f in fnames if '.vhdr' in f] # filter for .vhdr files

# Get subject list from file order
filter_subs = re.compile('letty_subj_(\w?).*') # create regex filter
subs = list(map(filter_subs.findall, fnames)) # extract subject numbers with filter
subs = list(itertools.chain(*subs)) # flatten then nested list

# Get a task list
tasks = ['pitch']*len(subs) # broadcast the only task name

# Get a run list
filter_runs = re.compile('\w+[0-9]_([0-9]).*')
runs = list(map(filter_runs.findall, fnames))
runs = ['1' if x == [] else x for x in runs]
runs = list(itertools.chain(*runs))

#### Retrieve mappings between channel numbers and channel names

In [4]:
# For subj 2, 3, 5, 6
mapping_table = pd.read_csv(MAPS_DIR + 'pitch_tracking_64_at_IZ.csv')
mapping_64_at_IZ = {mapping_table.number[i]: mapping_table.name[i] for i in range(len(mapping_table))}

# For subj 4 IZ is excluded but channel 64 is not moved to FCZ
mapping_table = pd.read_csv(MAPS_DIR + 'pitch_tracking_no_IZ.csv')
mapping_no_IZ = {mapping_table.number[i]: mapping_table.name[i] for i in range(len(mapping_table))}

# For subj 7, and onwards
mapping_table = pd.read_csv(MAPS_DIR + 'pitch_tracking_64_at_FCZ.csv')
mapping_64_at_FCZ = {mapping_table.number[i]: mapping_table.name[i] for i in range(len(mapping_table))}

# Create dict for subjects and their mappings
special_mappings = {'2': mapping_64_at_IZ,
           '3': mapping_64_at_IZ, 
           '4': mapping_no_IZ,
           '5': mapping_64_at_IZ,
           '6': mapping_64_at_IZ,
           '7': mapping_64_at_FCZ}

# Create function to fetch correct mapping
def get_mapping(sub, special_mappings):
    if sub in special_mappings.keys():
        mapping = special_mappings[sub]
    else:
        mapping = mapping_64_at_FCZ
    return mapping

#### Run conversion on all files

In [5]:
for i in range(len(fnames)):
    sub = subs[i]
    task = tasks[i]
    run = runs[i]
    fpath = os.path.join(DATA_DIR, fnames[i])
    print(fpath)

    # load data with MNE function for your file format
    raw = mne.io.read_raw_brainvision(fpath)
    raw.load_data()
    raw.set_channel_types({'Aux1': 'stim'})

    # add some info BIDS will want
    raw.info['line_freq'] = 60 # the power line frequency in the building we collected in

    # map channel numbers to channel names
    mapping = get_mapping(sub, special_mappings)
    raw.rename_channels(mapping)
    raw.add_reference_channels(ref_channels = ['Cz'])

    # map channels to their coordinates
    dig = mne.channels.read_dig_captrak(MAPS_DIR + 'subj_' + sub + '.bvct')
    raw.set_montage(dig, on_missing = 'warn')

    # # drop meaningless event name
    events, event_ids = mne.events_from_annotations(raw)
    events = events[events[:,2] != event_ids['New Segment/'], :]

    # # rename events to their stimulus pitch
    event_codes = events[:,2]
    baseline_code = np.argmax(np.bincount(event_codes)) # the one with more trials
    event_names = {1: '50', 2: '100', 3: '150', 4: '200', 5: '250'}
    annot = mne.annotations_from_events(events, sfreq = raw.info['sfreq'], event_desc = event_names)
    raw.set_annotations(annot)

    # build appropriate BIDS directory structure 
    bids_path = BIDSPath(
        run = run,
        subject = sub, 
        task = task, 
        datatype = 'eeg', 
        root = BIDS_DIR
    )

    # get range of dates the BIDS specification will accept
    daysback_min, daysback_max = get_anonymization_daysback(raw)

    # write data into BIDS directory, while anonymizing
    write_raw_bids(
        raw, 
        bids_path = bids_path, 
        allow_preload = True, # whether to load full dataset into memory when copying
        format = 'BrainVision', # format to save to
        anonymize = dict(daysback = daysback_min), # shift dates by daysback
        overwrite = True,
    )

../data/raw/letty_subj_3_2.vhdr
Extracting parameters from ../data/raw/letty_subj_3_2.vhdr...
Setting channel info structure...
Reading 0 ... 4713499  =      0.000 ...   942.700 secs...


  raw.set_channel_types({'Aux1': 'stim'})


Location for this channel is unknown; consider calling set_montage() again if needed.
Used Annotations descriptions: ['New Segment/', 'Stimulus/S  1', 'Stimulus/S  2', 'Stimulus/S  3', 'Stimulus/S  4', 'Stimulus/S  5']



['leog', 'reog'].

Consider using inst.set_channel_types if these are not EEG channels, or use the on_missing parameter if the channel positions are allowed to be unknown in your analyses.
  raw.set_montage(dig, on_missing = 'warn')


Writing '../data/bids/participants.tsv'...
Writing '../data/bids/participants.json'...
Writing electrodes file to... ../data/bids/sub-3/eeg/sub-3_electrodes.tsv
Writing coordsytem file to... ../data/bids/sub-3/eeg/sub-3_coordsystem.json
Writing '../data/bids/sub-3/eeg/sub-3_electrodes.tsv'...
Writing '../data/bids/sub-3/eeg/sub-3_coordsystem.json'...
Used Annotations descriptions: ['100', '150', '200', '250', '50']
Writing '../data/bids/sub-3/eeg/sub-3_task-pitch_run-2_events.tsv'...
Writing '../data/bids/dataset_description.json'...
Writing '../data/bids/sub-3/eeg/sub-3_task-pitch_run-2_eeg.json'...
Writing '../data/bids/sub-3/eeg/sub-3_task-pitch_run-2_channels.tsv'...
Copying data files to sub-3_task-pitch_run-2_eeg.vhdr


  write_raw_bids(
  LooseVersion(library.__version__) < LooseVersion(min_version):
  LooseVersion(library.__version__) < LooseVersion(min_version):
Note that the BrainVision format specification supports only µV.
  warn(msg)


Writing '../data/bids/sub-3/sub-3_scans.tsv'...
Wrote ../data/bids/sub-3/sub-3_scans.tsv entry with eeg/sub-3_task-pitch_run-2_eeg.vhdr.
../data/raw/letty_subj_6.vhdr
Extracting parameters from ../data/raw/letty_subj_6.vhdr...
Setting channel info structure...
Reading 0 ... 9212749  =      0.000 ...  1842.550 secs...


  raw.set_channel_types({'Aux1': 'stim'})


Location for this channel is unknown; consider calling set_montage() again if needed.
Used Annotations descriptions: ['New Segment/', 'Stimulus/S  1', 'Stimulus/S  2', 'Stimulus/S  3', 'Stimulus/S  4', 'Stimulus/S  5']



['leog', 'reog'].

Consider using inst.set_channel_types if these are not EEG channels, or use the on_missing parameter if the channel positions are allowed to be unknown in your analyses.
  raw.set_montage(dig, on_missing = 'warn')


Writing '../data/bids/participants.tsv'...
Writing '../data/bids/participants.json'...
Writing electrodes file to... ../data/bids/sub-6/eeg/sub-6_electrodes.tsv
Writing coordsytem file to... ../data/bids/sub-6/eeg/sub-6_coordsystem.json
Writing '../data/bids/sub-6/eeg/sub-6_electrodes.tsv'...
Writing '../data/bids/sub-6/eeg/sub-6_coordsystem.json'...
Used Annotations descriptions: ['100', '150', '200', '250', '50']
Writing '../data/bids/sub-6/eeg/sub-6_task-pitch_run-1_events.tsv'...
Writing '../data/bids/dataset_description.json'...
Writing '../data/bids/sub-6/eeg/sub-6_task-pitch_run-1_eeg.json'...
Writing '../data/bids/sub-6/eeg/sub-6_task-pitch_run-1_channels.tsv'...
Copying data files to sub-6_task-pitch_run-1_eeg.vhdr


  write_raw_bids(
  LooseVersion(library.__version__) < LooseVersion(min_version):
  LooseVersion(library.__version__) < LooseVersion(min_version):


Writing '../data/bids/sub-6/sub-6_scans.tsv'...
Wrote ../data/bids/sub-6/sub-6_scans.tsv entry with eeg/sub-6_task-pitch_run-1_eeg.vhdr.
../data/raw/letty_subj_4_2.vhdr
Extracting parameters from ../data/raw/letty_subj_4_2.vhdr...
Setting channel info structure...
Reading 0 ... 4596749  =      0.000 ...   919.350 secs...


  raw.set_channel_types({'Aux1': 'stim'})


Location for this channel is unknown; consider calling set_montage() again if needed.
Used Annotations descriptions: ['New Segment/', 'Stimulus/S  1', 'Stimulus/S  2', 'Stimulus/S  3', 'Stimulus/S  4', 'Stimulus/S  5']



['leog', 'reog', 'Ch64'].

Consider using inst.set_channel_types if these are not EEG channels, or use the on_missing parameter if the channel positions are allowed to be unknown in your analyses.
  raw.set_montage(dig, on_missing = 'warn')


Writing '../data/bids/participants.tsv'...
Writing '../data/bids/participants.json'...
Writing electrodes file to... ../data/bids/sub-4/eeg/sub-4_electrodes.tsv
Writing coordsytem file to... ../data/bids/sub-4/eeg/sub-4_coordsystem.json
Writing '../data/bids/sub-4/eeg/sub-4_electrodes.tsv'...
Writing '../data/bids/sub-4/eeg/sub-4_coordsystem.json'...
Used Annotations descriptions: ['100', '150', '200', '250', '50']
Writing '../data/bids/sub-4/eeg/sub-4_task-pitch_run-2_events.tsv'...
Writing '../data/bids/dataset_description.json'...
Writing '../data/bids/sub-4/eeg/sub-4_task-pitch_run-2_eeg.json'...
Writing '../data/bids/sub-4/eeg/sub-4_task-pitch_run-2_channels.tsv'...
Copying data files to sub-4_task-pitch_run-2_eeg.vhdr


  write_raw_bids(
  LooseVersion(library.__version__) < LooseVersion(min_version):
  LooseVersion(library.__version__) < LooseVersion(min_version):


Writing '../data/bids/sub-4/sub-4_scans.tsv'...
Wrote ../data/bids/sub-4/sub-4_scans.tsv entry with eeg/sub-4_task-pitch_run-2_eeg.vhdr.
../data/raw/letty_subj_4.vhdr
Extracting parameters from ../data/raw/letty_subj_4.vhdr...
Setting channel info structure...
Reading 0 ... 5046249  =      0.000 ...  1009.250 secs...


  raw.set_channel_types({'Aux1': 'stim'})


Location for this channel is unknown; consider calling set_montage() again if needed.
Used Annotations descriptions: ['New Segment/', 'Stimulus/S  1', 'Stimulus/S  2', 'Stimulus/S  3', 'Stimulus/S  4', 'Stimulus/S  5']



['leog', 'reog', 'Ch64'].

Consider using inst.set_channel_types if these are not EEG channels, or use the on_missing parameter if the channel positions are allowed to be unknown in your analyses.
  raw.set_montage(dig, on_missing = 'warn')


Writing '../data/bids/participants.tsv'...
Writing '../data/bids/participants.json'...
Writing electrodes file to... ../data/bids/sub-4/eeg/sub-4_electrodes.tsv
Writing coordsytem file to... ../data/bids/sub-4/eeg/sub-4_coordsystem.json
Writing '../data/bids/sub-4/eeg/sub-4_electrodes.tsv'...
Writing '../data/bids/sub-4/eeg/sub-4_coordsystem.json'...
Used Annotations descriptions: ['100', '150', '200', '250', '50']
Writing '../data/bids/sub-4/eeg/sub-4_task-pitch_run-1_events.tsv'...
Writing '../data/bids/dataset_description.json'...
Writing '../data/bids/sub-4/eeg/sub-4_task-pitch_run-1_eeg.json'...
Writing '../data/bids/sub-4/eeg/sub-4_task-pitch_run-1_channels.tsv'...
Copying data files to sub-4_task-pitch_run-1_eeg.vhdr


  write_raw_bids(
  LooseVersion(library.__version__) < LooseVersion(min_version):
  LooseVersion(library.__version__) < LooseVersion(min_version):


Writing '../data/bids/sub-4/sub-4_scans.tsv'...
Wrote ../data/bids/sub-4/sub-4_scans.tsv entry with eeg/sub-4_task-pitch_run-1_eeg.vhdr.
../data/raw/letty_subj_5.vhdr
Extracting parameters from ../data/raw/letty_subj_5.vhdr...
Setting channel info structure...
Reading 0 ... 9320749  =      0.000 ...  1864.150 secs...


  raw.set_channel_types({'Aux1': 'stim'})


Location for this channel is unknown; consider calling set_montage() again if needed.
Used Annotations descriptions: ['New Segment/', 'Stimulus/S  1', 'Stimulus/S  2', 'Stimulus/S  3', 'Stimulus/S  4', 'Stimulus/S  5']



['leog', 'reog'].

Consider using inst.set_channel_types if these are not EEG channels, or use the on_missing parameter if the channel positions are allowed to be unknown in your analyses.
  raw.set_montage(dig, on_missing = 'warn')


Writing '../data/bids/participants.tsv'...
Writing '../data/bids/participants.json'...
Writing electrodes file to... ../data/bids/sub-5/eeg/sub-5_electrodes.tsv
Writing coordsytem file to... ../data/bids/sub-5/eeg/sub-5_coordsystem.json
Writing '../data/bids/sub-5/eeg/sub-5_electrodes.tsv'...
Writing '../data/bids/sub-5/eeg/sub-5_coordsystem.json'...
Used Annotations descriptions: ['100', '150', '200', '250', '50']
Writing '../data/bids/sub-5/eeg/sub-5_task-pitch_run-1_events.tsv'...
Writing '../data/bids/dataset_description.json'...
Writing '../data/bids/sub-5/eeg/sub-5_task-pitch_run-1_eeg.json'...
Writing '../data/bids/sub-5/eeg/sub-5_task-pitch_run-1_channels.tsv'...
Copying data files to sub-5_task-pitch_run-1_eeg.vhdr


  write_raw_bids(
  LooseVersion(library.__version__) < LooseVersion(min_version):
  LooseVersion(library.__version__) < LooseVersion(min_version):


Writing '../data/bids/sub-5/sub-5_scans.tsv'...
Wrote ../data/bids/sub-5/sub-5_scans.tsv entry with eeg/sub-5_task-pitch_run-1_eeg.vhdr.
../data/raw/letty_subj_2.vhdr
Extracting parameters from ../data/raw/letty_subj_2.vhdr...
Setting channel info structure...
Reading 0 ... 21218499  =      0.000 ...  2121.850 secs...


  raw.set_channel_types({'Aux1': 'stim'})


Location for this channel is unknown; consider calling set_montage() again if needed.
Used Annotations descriptions: ['New Segment/', 'Stimulus/S  1', 'Stimulus/S  2', 'Stimulus/S  3', 'Stimulus/S  4', 'Stimulus/S  5']



['leog', 'reog'].

Consider using inst.set_channel_types if these are not EEG channels, or use the on_missing parameter if the channel positions are allowed to be unknown in your analyses.
  raw.set_montage(dig, on_missing = 'warn')


Writing '../data/bids/participants.tsv'...
Writing '../data/bids/participants.json'...
Writing electrodes file to... ../data/bids/sub-2/eeg/sub-2_electrodes.tsv
Writing coordsytem file to... ../data/bids/sub-2/eeg/sub-2_coordsystem.json
Writing '../data/bids/sub-2/eeg/sub-2_electrodes.tsv'...
Writing '../data/bids/sub-2/eeg/sub-2_coordsystem.json'...
Used Annotations descriptions: ['100', '150', '200', '250', '50']
Writing '../data/bids/sub-2/eeg/sub-2_task-pitch_run-1_events.tsv'...
Writing '../data/bids/dataset_description.json'...
Writing '../data/bids/sub-2/eeg/sub-2_task-pitch_run-1_eeg.json'...
Writing '../data/bids/sub-2/eeg/sub-2_task-pitch_run-1_channels.tsv'...
Copying data files to sub-2_task-pitch_run-1_eeg.vhdr


  write_raw_bids(
  LooseVersion(library.__version__) < LooseVersion(min_version):
  LooseVersion(library.__version__) < LooseVersion(min_version):


Writing '../data/bids/sub-2/sub-2_scans.tsv'...
Wrote ../data/bids/sub-2/sub-2_scans.tsv entry with eeg/sub-2_task-pitch_run-1_eeg.vhdr.
../data/raw/letty_subj_3.vhdr
Extracting parameters from ../data/raw/letty_subj_3.vhdr...
Setting channel info structure...
Reading 0 ... 5026249  =      0.000 ...  1005.250 secs...


  raw.set_channel_types({'Aux1': 'stim'})


Location for this channel is unknown; consider calling set_montage() again if needed.
Used Annotations descriptions: ['New Segment/', 'Stimulus/S  1', 'Stimulus/S  2', 'Stimulus/S  3', 'Stimulus/S  4', 'Stimulus/S  5']



['leog', 'reog'].

Consider using inst.set_channel_types if these are not EEG channels, or use the on_missing parameter if the channel positions are allowed to be unknown in your analyses.
  raw.set_montage(dig, on_missing = 'warn')


Writing '../data/bids/participants.tsv'...
Writing '../data/bids/participants.json'...
Writing electrodes file to... ../data/bids/sub-3/eeg/sub-3_electrodes.tsv
Writing coordsytem file to... ../data/bids/sub-3/eeg/sub-3_coordsystem.json
Writing '../data/bids/sub-3/eeg/sub-3_electrodes.tsv'...
Writing '../data/bids/sub-3/eeg/sub-3_coordsystem.json'...
Used Annotations descriptions: ['100', '150', '200', '250', '50']
Writing '../data/bids/sub-3/eeg/sub-3_task-pitch_run-1_events.tsv'...
Writing '../data/bids/dataset_description.json'...
Writing '../data/bids/sub-3/eeg/sub-3_task-pitch_run-1_eeg.json'...
Writing '../data/bids/sub-3/eeg/sub-3_task-pitch_run-1_channels.tsv'...
Copying data files to sub-3_task-pitch_run-1_eeg.vhdr


  write_raw_bids(
  LooseVersion(library.__version__) < LooseVersion(min_version):
  LooseVersion(library.__version__) < LooseVersion(min_version):


Writing '../data/bids/sub-3/sub-3_scans.tsv'...
Wrote ../data/bids/sub-3/sub-3_scans.tsv entry with eeg/sub-3_task-pitch_run-1_eeg.vhdr.


#### Check output files

In [6]:
from mne_bids import print_dir_tree
print_dir_tree(BIDS_DIR)

|/
|--- .DS_Store
|--- README
|--- dataset_description.json
|--- participants.json
|--- participants.tsv
|derivatives/
|--- .DS_Store
|--- preprocessing/
|------ .DS_Store
|------ dataset_description.json
|------ sub-1.html
|------ sub-1/
|--------- sub-1_task-pitches_desc-clean_epo.fif.gz
|sub-1/
|--- eeg/
|sub-2/
|--- .DS_Store
|--- sub-2_scans.tsv
|--- eeg/
|------ sub-2_coordsystem.json
|------ sub-2_electrodes.tsv
|------ sub-2_task-pitch_run-1_channels.tsv
|------ sub-2_task-pitch_run-1_eeg.eeg
|------ sub-2_task-pitch_run-1_eeg.json
|------ sub-2_task-pitch_run-1_eeg.vhdr
|------ sub-2_task-pitch_run-1_eeg.vmrk
|------ sub-2_task-pitch_run-1_events.tsv
|sub-3/
|--- .DS_Store
|--- sub-3_scans.tsv
|--- eeg/
|------ .DS_Store
|------ sub-3_coordsystem.json
|------ sub-3_electrodes.tsv
|------ sub-3_task-pitch_run-1_channels.tsv
|------ sub-3_task-pitch_run-1_eeg.eeg
|------ sub-3_task-pitch_run-1_eeg.json
|------ sub-3_task-pitch_run-1_eeg.vhdr
|------ sub-3_task-pitch_run-1_eeg.vm