Define Paths

In [None]:
# Input locations
raw_data_folder = '/kaggle/input/bci-competition-iv-2a'  # folder listed for the *.gdf recordings
mat_folder = '/kaggle/input/2a-true-labels/'  # .mat label files for the evaluation recordings

# Output locations for the cleaned recordings, one folder per session
_cleaned_root = '/kaggle/working/cleaned_data'
cleaned_data_folder_1 = _cleaned_root + '/first_session/'
cleaned_data_folder_2 = _cleaned_root + '/second_session/'

Define Preprocessing Filters

In [None]:
def apply_filters(raw, l_freq=8, h_freq=30, notch_freq=50):
    """Run the high-pass, notch and low-pass FIR filters on ``raw``.

    Three passes are issued in order, all with ``method='fir'`` and
    ``fir_design='firwin'``: a high-pass at ``l_freq``, a notch at
    ``notch_freq`` and every harmonic of it that lies strictly below the
    Nyquist frequency, and a low-pass at ``h_freq``.

    Args:
        raw: Recording object exposing ``filter``, ``notch_filter`` and
            ``info['sfreq']`` (the callers in this file pass objects returned
            by ``mne.io.read_raw_gdf``).
        l_freq: High-pass cutoff in Hz.
        h_freq: Low-pass cutoff in Hz.
        notch_freq: Fundamental frequency of the interference to notch, in Hz.

    Returns:
        The same ``raw`` object that was passed in.
    """
    raw.filter(l_freq=l_freq, h_freq=None, method='fir', fir_design='firwin')

    # Derive the harmonics from the actual sampling rate instead of assuming a
    # fixed 100 Hz second harmonic. For sfreq=250 Hz (Nyquist 125 Hz) and
    # notch_freq=50 this still yields [50, 100].
    nyquist = raw.info['sfreq'] / 2.0
    max_harmonic = int(nyquist // notch_freq)
    notch_freqs = [
        notch_freq * k
        for k in range(1, max_harmonic + 1)
        if notch_freq * k < nyquist
    ]
    # Nothing to notch when even the fundamental is at or above Nyquist.
    if notch_freqs:
        raw.notch_filter(freqs=notch_freqs, method='fir', fir_design='firwin')

    raw.filter(l_freq=None, h_freq=h_freq, method='fir', fir_design='firwin')

    return raw


def remove_artifacts(raw, n_components=20):
    """Fit an Infomax ICA on a high-passed copy of ``raw`` and apply it to ``raw``.

    Args:
        raw: Recording object providing ``copy()`` and ``filter()`` (the
            callers in this file pass objects returned by
            ``mne.io.read_raw_gdf``).
        n_components: Number of ICA components to estimate.

    Returns:
        The ``raw`` object that was passed in, after ``ICA.apply`` has been
        called on it.
    """
    from mne.preprocessing import ICA

    # The decomposition is estimated on a separate copy that receives an
    # additional 1 Hz high-pass; ``raw`` itself is not filtered here.
    fitting_data = raw.copy()
    fitting_data.filter(l_freq=1, h_freq=None)

    decomposition = ICA(
        n_components=n_components,
        method='infomax',
        random_state=42,  # fixed seed for the decomposition
        max_iter=800,
    )
    decomposition.fit(fitting_data)

    # NOTE(review): no component is selected for exclusion before apply(), so
    # this step may not actually remove anything - confirm against the MNE
    # ICA.apply documentation and decide which components should be dropped.
    decomposition.apply(raw)

    return raw

def create_overlapping_epochs(raw, events, event_ids, tmin=0.5, tmax=4.5, overlap=0.25):
    """Epoch ``raw`` around ``events`` and add time-shifted copies as augmentation.

    A base set of epochs is cut at the original event onsets. Further sets are
    cut with every onset delayed by ``shift`` seconds, where ``shift`` runs over
    ``np.arange(0.1, 1.0, step)`` and ``step = (tmax - tmin) * (1 - overlap)``.
    With the default arguments ``step`` is 3.0 s, so exactly one shifted set
    (0.1 s) is produced. Each set is baseline-corrected over
    ``(tmin, tmin + 0.2)``.

    Args:
        raw: Continuous recording passed to ``mne.Epochs``; ``raw.info['sfreq']``
            converts the shift from seconds to samples.
        events: Integer events array whose first column holds sample indices
            (it must support ``.copy()`` and in-place column addition).
        event_ids: Event id(s) forwarded to ``mne.Epochs`` as ``event_id``.
        tmin: Epoch start relative to the event, in seconds.
        tmax: Epoch end relative to the event, in seconds.
        overlap: Fraction used to derive the shift step; must be below 1.

    Returns:
        The concatenation of the base and shifted epochs, or the base epochs
        alone when no shifted set could be added.

    Raises:
        ValueError: If ``overlap`` is 1 or larger, which would give a
            non-positive shift step.
    """
    if overlap >= 1:
        raise ValueError(
            f"overlap must be smaller than 1 to give a positive step size, got {overlap!r}"
        )

    window_length = tmax - tmin
    step_size = window_length * (1 - overlap)
    baseline = (tmin, tmin + 0.2)

    # Base epochs at the unshifted event onsets.
    all_epochs = [
        mne.Epochs(raw, events, event_id=event_ids, tmin=tmin, tmax=tmax,
                   reject=None, baseline=baseline, preload=True)
    ]

    for shift in np.arange(0.1, 1.0, step_size):
        shift_samples = int(shift * raw.info['sfreq'])
        shifted_events = events.copy()
        shifted_events[:, 0] += shift_samples

        # Augmentation is best-effort: a failing shift is reported and skipped.
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
        # propagate.
        try:
            epochs = mne.Epochs(raw, shifted_events, event_id=event_ids,
                                tmin=tmin, tmax=tmax, reject=None,
                                baseline=baseline, preload=True)
        except Exception as err:
            print(f"⚠️ Warning: skipping epochs shifted by {shift:.2f} s: {err}")
            continue

        if len(epochs) > 0:
            all_epochs.append(epochs)

    if len(all_epochs) > 1:
        return mne.concatenate_epochs(all_epochs)
    return all_epochs[0]

Data preprocessing

In [None]:
# Data preprocessing - First Session
# Create output directory
os.makedirs(cleaned_data_folder_1, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort them so the recordings
# are always processed and concatenated in the same order.
files = sorted(os.listdir(raw_data_folder))

# Keep only the training GDF files ('T.gdf')
filtered_files = [file for file in files if file.endswith('T.gdf')]
print(f"Found {len(filtered_files)} training files")
if not filtered_files:
    # Fail early with a clear message instead of inside concatenate_raws().
    raise FileNotFoundError(f"No 'T.gdf' training files found in {raw_data_folder}")

raw_list = []

for file in filtered_files:
    print(f"Processing {file}...")
    file_path = os.path.join(raw_data_folder, file)

    # Load raw EEG data, declaring the three EOG channels so they can be dropped
    raw = mne.io.read_raw_gdf(file_path, eog=['EOG-left', 'EOG-central', 'EOG-right'], preload=True)
    raw.drop_channels(['EOG-left', 'EOG-central', 'EOG-right'])

    # Filtering followed by the ICA step
    raw = apply_filters(raw, l_freq=8, h_freq=30, notch_freq=50)
    raw = remove_artifacts(raw, n_components=20)

    # Save the cleaned recording next to the others, swapping the extension
    new_file_path = os.path.join(cleaned_data_folder_1, os.path.splitext(file)[0] + '.fif')
    raw.save(new_file_path, overwrite=True)

    raw_list.append(raw)

# Concatenate all raw objects into one and save the combined recording
final_raw = mne.concatenate_raws(raw_list)
new_file_path = os.path.join(cleaned_data_folder_1, 'First_Session_Subjects.fif')
final_raw.save(new_file_path, overwrite=True)

In [None]:
# Extract events and epochs for first session
# events[0] is the events array, events[1] the description -> event id mapping
events = mne.events_from_annotations(final_raw)
print("First session events:", events[1])

# Look the cue ids up by annotation description instead of hard-coding 7-10:
# the integer ids are assigned automatically and change with the set of
# annotation descriptions present in the recording.
# NOTE(review): assumes the GDF annotations are described by their event-type
# codes, with '769'-'772' being the four cue classes - confirm against the
# mapping printed above.
cue_descriptions = ['769', '770', '771', '772']
missing_cues = [desc for desc in cue_descriptions if desc not in events[1]]
if missing_cues:
    raise KeyError(f"Cue annotations {missing_cues} not found in {sorted(events[1])}")
cue_event_ids = [events[1][desc] for desc in cue_descriptions]

# Create epochs with overlap for data augmentation
epochs = create_overlapping_epochs(final_raw, events[0], event_ids=cue_event_ids,
                                   tmin=0.5, tmax=4.5, overlap=0.25)

first_session_data = epochs.get_data(copy=True)
first_session_labels = epochs.events[:, -1]

print("First_session_dataset shape:", first_session_data.shape)
# Shift the labels so the smallest id lands in bin 0 before counting
print("First session labels distribution:", np.bincount(first_session_labels - np.min(first_session_labels)))

In [None]:
# Data preprocessing - Second Session (Evaluation data)
# Ensure output folder exists
os.makedirs(cleaned_data_folder_2, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort so recordings (and the
# labels collected alongside them) are always processed in the same order.
gdf_files = sorted(f for f in os.listdir(raw_data_folder) if f.endswith('E.gdf'))
mat_files = sorted(f for f in os.listdir(mat_folder) if f.endswith('E.mat'))
if not gdf_files:
    # Fail early with a clear message instead of inside concatenate_raws().
    raise FileNotFoundError(f"No 'E.gdf' evaluation files found in {raw_data_folder}")

raw_list = []
label_chunks = []  # one integer array per recording, in processing order

for file in gdf_files:
    print(f"Processing evaluation file {file}...")
    file_path = os.path.join(raw_data_folder, file)

    # Load GDF EEG data, declaring the three EOG channels so they can be dropped
    raw = mne.io.read_raw_gdf(file_path, eog=['EOG-left', 'EOG-central', 'EOG-right'], preload=True)
    raw.drop_channels(['EOG-left', 'EOG-central', 'EOG-right'])

    # Filtering followed by the ICA step
    raw = apply_filters(raw, l_freq=8, h_freq=30, notch_freq=50)
    raw = remove_artifacts(raw, n_components=20)

    # Save cleaned data
    new_file_path = os.path.join(cleaned_data_folder_2, os.path.splitext(file)[0] + '.fif')
    raw.save(new_file_path, overwrite=True)

    raw_list.append(raw)

    # Load the .mat label file that shares the recording's base name
    mat_file_name = os.path.splitext(file)[0] + '.mat'
    mat_file_path = os.path.join(mat_folder, mat_file_name)
    print(f"data: {file}, label: {mat_file_name}")

    if not os.path.exists(mat_file_path):
        print(f"⚠️ Warning: {mat_file_name} not found in {mat_folder}")
        continue

    mat_data = scipy.io.loadmat(mat_file_path)
    # Use None as the "missing" marker: a list default has no ``.size`` and
    # would raise AttributeError before the warning below could be printed.
    class_labels = mat_data.get('classlabel')

    if class_labels is not None and np.size(class_labels) > 0:
        label_chunks.append(np.array(class_labels, dtype=int).flatten())
    else:
        print(f"⚠️ Warning: 'classlabel' missing or empty in {mat_file_name}")

# Join the per-recording labels once; stay an (empty) integer array otherwise
if label_chunks:
    second_session_labels = np.concatenate(label_chunks)
else:
    second_session_labels = np.array([], dtype=int)

# Concatenate all raw sessions
final_raw = mne.concatenate_raws(raw_list)
new_file_path = os.path.join(cleaned_data_folder_2, 'Second_Session_Subjects.fif')
final_raw.save(new_file_path, overwrite=True)

In [None]:
# Extract events and epochs for second session
# events[0] is the events array, events[1] the description -> event id mapping
events = mne.events_from_annotations(final_raw)
print("Second session events:", events[1])

# Look the cue id up by annotation description instead of hard-coding 7: the
# integer ids are assigned automatically and change with the set of annotation
# descriptions present in the recording.
# NOTE(review): assumes the evaluation cues are annotated with the event-type
# code '783' - confirm against the mapping printed above.
if '783' not in events[1]:
    raise KeyError(f"Cue annotation '783' not found in {sorted(events[1])}")
cue_event_id = events[1]['783']

epochs = mne.Epochs(final_raw, events[0], event_id=cue_event_id, tmin=0.5, tmax=4.5,
                    reject=None, baseline=(0.5, 0.7), preload=True)
second_session_data = epochs.get_data(copy=True)

print("Second Session Dataset shape:", second_session_data.shape)

# Labels are collected per file independently of the epochs; a skipped label
# file or a dropped epoch would silently misalign them, so fail loudly instead.
if len(second_session_labels) != len(second_session_data):
    raise ValueError(
        f"Label/epoch count mismatch: {len(second_session_labels)} labels "
        f"for {len(second_session_data)} epochs"
    )

# Labels are shifted by -1 so the smallest expected class lands in bin 0
print("Second session labels distribution:", np.bincount(second_session_labels.astype(int) - 1))