In [1]:
import mne
import glob
import os
import logging
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import GridSearchCV
from scipy.signal import find_peaks
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix

## Helper Functions

In [2]:
def split_train_test_path_list(data_path, file_name_template, train_ratio):
    file_list = sorted(glob.glob(os.path.join(data_path, file_name_template)))
    np.random.shuffle(file_list)
    split_id = int(len(file_list) * train_ratio)

    train_list = file_list[:split_id]
    test_list = file_list[split_id:]

    return train_list, test_list

def read_eeg_epochs(train_list, test_list):
    epochs_train_list = []
    epochs_test_list = []

    for file_path in train_list:
        with mne.utils.use_log_level("ERROR"):
            epoch_train = mne.read_epochs(file_path, preload=True)
            epochs_train_list.append(epoch_train)

    for file_path in test_list:
        with mne.utils.use_log_level("ERROR"):
            epoch_test = mne.read_epochs(file_path, preload=True)
            epochs_test_list.append(epoch_test)

    epochs_train = mne.concatenate_epochs(epochs_train_list)
    epochs_test = mne.concatenate_epochs(epochs_test_list)

    return epochs_train, epochs_test


def get_window_idx(center, width, times_arr):
    return np.where((times_arr >= center - width/2) & (times_arr <= center + width/2))[0]

def zero_crossings(sig):
    return np.where(np.diff(np.sign(sig)))[0].size

def extract_peak_features(X, times, window_size=0.02):

    n_epochs, n_channels, n_times = X.shape
    features = []

    peak_window_N170 =(0.12, 0.22)
    peak_window_P100=(0.05, 0.15)
    peak_window_P300 = (0.2, 0.3)

    win_mask_N170 = (times >= peak_window_N170[0]) & (times <= peak_window_N170[1])
    win_mask_P100 = (times >= peak_window_P100[0]) & (times <= peak_window_P100[1])
    win_mask_P300 = (times >= peak_window_P300[0]) & (times <= peak_window_P300[1])

    for ep in range(n_epochs):
        feats_ep = []
        for ch in range(n_channels):
            signal = X[ep, ch, :]


            sig_win_P100 = signal[win_mask_P100]
            times_win_P100 = times[win_mask_P100]

            peaks, _ = find_peaks(sig_win_P100, distance=100)
            if len(peaks) == 0:
                idx1 = np.argmax(sig_win_P100)
            else:
                idx1 = peaks[0]
            time1 = times_win_P100[idx1]
            amp1 = float(sig_win_P100[idx1])

            sig_win_N170 = signal[win_mask_N170]
            times_win_N170 = times[win_mask_N170]

            peaks_min, _ = find_peaks(-sig_win_N170, distance=100)
            if len(peaks_min) == 0:
                idx2 = np.argmin(sig_win_N170)
            else:
                idx2 = peaks_min[0]
            time2 = times_win_N170[idx2]
            amp2 = float(sig_win_N170[idx2])


            sig_win_P300 = signal[win_mask_P300]
            times_win_P300 = times[win_mask_P300]

            peaks, _ = find_peaks(sig_win_P300, distance=100)
            if len(peaks) == 0:
                idx3 = np.argmax(sig_win_P300)
            else:
                idx3 = peaks[0]
            time3 = times_win_P300[idx3]
            amp3 = float(sig_win_P300[idx3])

            win1 = get_window_idx(time1, window_size, times)
            win2 = get_window_idx(time2, window_size, times)
            win3 = get_window_idx(time3, window_size, times)

            # Adds latency and amplitude for all peaks
            feats_ep.extend([time1, time2, time3, amp1, amp2, amp3])

            for win in [win1, win2, win3]:
                if len(win) > 0:
                    sig_win = signal[win]
                    t_win = times[win]
                    n = len(sig_win)

                    # 1) Zero-crossing rate (number of zero crossings normalized by number of samples)
                    feat_zc = np.where(np.diff(np.sign(sig_win)))[0].size / n

                    # 2) Peak-to-peak amplitude (max - min) in µV
                    feat_ptp = np.ptp(sig_win)

                    # 3) RMS (Root Mean Square): overall energy of the signal in µV
                    feat_rms = np.sqrt(np.mean(sig_win ** 2))

                    # 4) Standard deviation: variability of the signal in µV
                    feat_std = np.std(sig_win)

                    # 5) Absolute Area Under the Curve (µV·s): integral of |signal| over time
                    feat_auc_abs = np.trapezoid(np.abs(sig_win), x=t_win) if n > 1 else 0.0

                    # 6) Slope (µV/s): linear trend of the signal
                    slope = np.polyfit(t_win, sig_win, 1)[0] if n > 1 else np.nan

                    # 7) Line length (µV): sum of absolute differences between consecutive samples
                    line_length = np.sum(np.abs(np.diff(sig_win))) if n > 1 else 0.0

                    # 8) Median amplitude (µV): robust central tendency of the signal
                    feat_median = np.median(sig_win)

                    # 9. Mean  amplitude (µV)
                    feat_mean = np.mean(sig_win)

                    feats_ep.extend([
                        feat_zc,       # Zero-crossing rate
                        feat_ptp,      # Peak-to-peak amplitude
                        feat_rms,      # RMS
                        feat_std,      # Standard deviation
                        feat_auc_abs,  # Absolute AUC
                        slope,         # Slope
                        line_length,   # Line length
                        feat_median,   # Median amplitude
                        feat_mean      # Mean amplitude
                    ])
                else:
                    # No samples in this window → fill with NaNs for consistency
                    feats_ep.extend([np.nan] * 9)

        features.append(feats_ep)
    return np.array(features)  # shape: (n_epochs, n_channels*3*14)


def get_X_and_Y_from_epochs(train_list, test_list, events, picks=None, t_min = -0.2, t_max = 0.5):

    epochs_train, epochs_test = read_eeg_epochs(train_list, test_list)

    #####---------------------------------------------------------------------------------------------------------

    epochs_train_list_event1 = epochs_train[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_train_list_event2 = epochs_train[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_train = np.concatenate((epochs_train_list_event1, epochs_train_list_event2), axis=0)

    labels_train_event1 = [0] * len(epochs_train_list_event1)
    labels_train_event2 = [1] * len(epochs_train_list_event2)
    y_train = np.concatenate((labels_train_event1, labels_train_event2), axis=0)

    ######--------------------------------------------------------------------------------------------------------

    epochs_test_list_event1 = epochs_test[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_test_list_event2 = epochs_test[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_test = np.concatenate((epochs_test_list_event1, epochs_test_list_event2), axis=0)

    labels_test_event1 = [0] * len(epochs_test_list_event1)
    labels_test_event2 = [1] * len(epochs_test_list_event2)
    y_test = np.concatenate((labels_test_event1, labels_test_event2), axis=0)

    return X_train, X_test, y_train, y_test


def get_X_and_Y_from_epochs_with_feature_extraction(train_list, test_list, events, picks=None, window_size = 0.02):

    t_min = -0.2
    t_max = 0.5

    X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs(train_list, test_list, events, picks, t_min, t_max)

    times = np.linspace(t_min, t_max, X_train.shape[2])

    # Feature exctraction from peaks
    X_train_feats = extract_peak_features(X_train, times,window_size)
    X_test_feats = extract_peak_features(X_test, times, window_size)

    logging.info(f"shape: {X_train_feats.shape}")

    return X_train_feats, X_test_feats, y_train, y_test

def eval_split(name, X, y, model, condition_in_out):
    y_pred = model.predict(X)
    cm = confusion_matrix(y, y_pred)

    logging.info(f"\n== {name.upper()} ==")
    logging.info(f"AUC      : {roc_auc_score(y, model.predict_proba(X)[:, 1]):.4f}")
    logging.info(f"Accuracy : {accuracy_score(y, y_pred):.4f}")
    logging.info(f"F1       : {f1_score(y, y_pred, pos_label=1):.4f}")
    logging.info(f"Precision: {precision_score(y, y_pred, pos_label=1):.4f}")
    logging.info(f"Recall   : {recall_score(y, y_pred, pos_label=1):.4f}")
    if condition_in_out:
        cm_df = pd.DataFrame(cm, index=["Actual in-group", "Actual out-group"], columns=["Predicted in-group", "Predicted out-group"])
    else:
        cm_df = pd.DataFrame(cm, index=["Actual inv", "Actual up"], columns=["Predicted inv", "Predicted up"])
    logging.info(f"\nConfusion matrix ({name}):")
    logging.info(cm_df)


def train_and_test_model(X_train, X_test, y_train, y_test, model, name, condition_in_out):

    logging.info(f"\n== {name.upper()} ==")
    model.fit(X_train, y_train)

    eval_split("train", X_train, y_train, model, condition_in_out)
    eval_split("test",  X_test,  y_test, model, condition_in_out)

    logging.info("\n== GridSearchCV ==")
    logging.info(f"Best params: {model.best_params_}")

    return model

In [3]:
dir_path = 'D:\studia\magisterka\dane EEG\BADANIE_POLITYCZNE_2022_eeg_bdfy\EEG_preprocessed'
file_name_template = "s*.bdf-epo.fif"
train_ratio = 0.8
selected_channels = ['P5', 'P6', 'P7', 'P8','PO7', 'PO8']

flatten_transformer = FunctionTransformer(lambda X: X.reshape(X.shape[0], -1))

## LOGGER

In [4]:
log_file = "training_lox_history.txt"

for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(log_file, mode="a"),
        logging.StreamHandler()
    ]
)

# Wyłączenie logów z zewnętrznych bibliotek
logging.getLogger("mne").setLevel(logging.ERROR)
logging.getLogger("sklearn").setLevel(logging.ERROR)
logging.getLogger("matplotlib").setLevel(logging.ERROR)

## GBC pipeline & param grid

In [5]:
model_gbc = Pipeline(steps=[
    ('reshape', flatten_transformer),
    ('scaler', StandardScaler()),
    ('gbc', GradientBoostingClassifier())
])

param_grid_gbc = {
    'gbc__n_estimators': [100, 200]
}

## LDA pipeline & param grid

In [28]:
model_lda = Pipeline(steps=[
    ('reshape', flatten_transformer),
    ('scaler', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis())
])

param_grid_lda = {
    "lda__solver": ["lsqr"],
    "lda__shrinkage": [0.2, 0.5, 0.7, 'auto']
}

## MODEL 1: GBC IN/OUT Exploration

In [8]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs(train_list, test_list, ["in", "out"],  selected_channels, t_min=0.1, t_max=0.25)
model_1 = GridSearchCV(model_gbc, param_grid_gbc, scoring="roc_auc", cv=5)

train_and_test_model(X_train, X_test, y_train, y_test, model_1, "GBC IN/OUT EXPLORATION", condition_in_out = True)

2025-08-31 13:11:48,046 - INFO - 
== GBC IN/OUT EXPLORATION ==
2025-08-31 13:39:47,508 - INFO - 
== TRAIN ==
2025-08-31 13:39:47,621 - INFO - AUC      : 0.7841
2025-08-31 13:39:47,623 - INFO - Accuracy : 0.6976
2025-08-31 13:39:47,629 - INFO - F1       : 0.7006
2025-08-31 13:39:47,634 - INFO - Precision: 0.6932
2025-08-31 13:39:47,639 - INFO - Recall   : 0.7081
2025-08-31 13:39:47,647 - INFO - 
Confusion matrix (train):
2025-08-31 13:39:47,647 - INFO -                   Predicted in-group  Predicted out-group
Actual in-group                 6400                 2913
Actual out-group                2714                 6583
2025-08-31 13:39:47,680 - INFO - 
== TEST ==
2025-08-31 13:39:47,703 - INFO - AUC      : 0.5118
2025-08-31 13:39:47,705 - INFO - Accuracy : 0.5083
2025-08-31 13:39:47,710 - INFO - F1       : 0.5124
2025-08-31 13:39:47,714 - INFO - Precision: 0.5071
2025-08-31 13:39:47,718 - INFO - Recall   : 0.5178
2025-08-31 13:39:47,720 - INFO - 
Confusion matrix (test):
2025-08-31

## MODEL 2: GBC IN/OUT Feature Extraction

In [9]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs_with_feature_extraction(train_list, test_list, ["in", "out"],  selected_channels)
model_2 = GridSearchCV(model_gbc, param_grid_gbc,  scoring="roc_auc", cv=5)

train_and_test_model(X_train, X_test, y_train, y_test, model_2, "GBC IN/OUT Feature Extraction", condition_in_out = True)

2025-08-31 13:42:00,488 - INFO - shape: (18636, 198)
2025-08-31 13:42:00,499 - INFO - 
== GBC IN/OUT FEATURE EXTRACTION ==
2025-08-31 14:21:25,661 - INFO - 
== TRAIN ==
2025-08-31 14:21:25,815 - INFO - AUC      : 0.7927
2025-08-31 14:21:25,818 - INFO - Accuracy : 0.7104
2025-08-31 14:21:25,822 - INFO - F1       : 0.7102
2025-08-31 14:21:25,827 - INFO - Precision: 0.7099
2025-08-31 14:21:25,832 - INFO - Recall   : 0.7105
2025-08-31 14:21:25,833 - INFO - 
Confusion matrix (train):
2025-08-31 14:21:25,834 - INFO -                   Predicted in-group  Predicted out-group
Actual in-group                 6626                 2702
Actual out-group                2695                 6613
2025-08-31 14:21:25,871 - INFO - 
== TEST ==
2025-08-31 14:21:25,901 - INFO - AUC      : 0.4995
2025-08-31 14:21:25,903 - INFO - Accuracy : 0.4947
2025-08-31 14:21:25,907 - INFO - F1       : 0.4887
2025-08-31 14:21:25,911 - INFO - Precision: 0.4939
2025-08-31 14:21:25,915 - INFO - Recall   : 0.4836
2025-08-3

## MODEL 3: LDA IN/OUT Exploration

In [29]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs(train_list, test_list, ["in", "out"], selected_channels, t_min=0.1, t_max=0.25)
model_3 = GridSearchCV(model_lda, param_grid_lda,  scoring="roc_auc", cv=5)

train_and_test_model(X_train, X_test, y_train, y_test, model_3, "LDA IN/OUT Exploration", condition_in_out = True)

2025-08-31 20:53:57,925 - INFO - 
== LDA IN/OUT EXPLORATION ==
2025-08-31 20:54:01,023 - INFO - 
== TRAIN ==
2025-08-31 20:54:01,052 - INFO - AUC      : 0.5252
2025-08-31 20:54:01,054 - INFO - Accuracy : 0.5203
2025-08-31 20:54:01,059 - INFO - F1       : 0.5242
2025-08-31 20:54:01,063 - INFO - Precision: 0.5200
2025-08-31 20:54:01,068 - INFO - Recall   : 0.5285
2025-08-31 20:54:01,069 - INFO - 
Confusion matrix (train):
2025-08-31 20:54:01,070 - INFO -                   Predicted in-group  Predicted out-group
Actual in-group                 4791                 4565
Actual out-group                4411                 4945
2025-08-31 20:54:01,080 - INFO - 
== TEST ==
2025-08-31 20:54:01,091 - INFO - AUC      : 0.4949
2025-08-31 20:54:01,092 - INFO - Accuracy : 0.4935
2025-08-31 20:54:01,097 - INFO - F1       : 0.5155
2025-08-31 20:54:01,102 - INFO - Precision: 0.4915
2025-08-31 20:54:01,107 - INFO - Recall   : 0.5420
2025-08-31 20:54:01,108 - INFO - 
Confusion matrix (test):
2025-08-31

## MODEL 4: LDA IN/OUT Feature Extraction

In [30]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs_with_feature_extraction(train_list, test_list, ["in", "out"],  selected_channels)
model_4 = GridSearchCV(model_lda, param_grid_lda,  scoring="roc_auc", cv=5)

train_and_test_model(X_train, X_test, y_train, y_test, model_4, "LDA IN/OUT Feature Extraction", condition_in_out = True)

2025-08-31 20:56:14,504 - INFO - shape: (18622, 198)
2025-08-31 20:56:14,517 - INFO - 
== LDA IN/OUT FEATURE EXTRACTION ==
2025-08-31 20:56:18,209 - INFO - 
== TRAIN ==
2025-08-31 20:56:18,242 - INFO - AUC      : 0.5396
2025-08-31 20:56:18,244 - INFO - Accuracy : 0.5249
2025-08-31 20:56:18,249 - INFO - F1       : 0.5225
2025-08-31 20:56:18,254 - INFO - Precision: 0.5245
2025-08-31 20:56:18,259 - INFO - Recall   : 0.5205
2025-08-31 20:56:18,260 - INFO - 
Confusion matrix (train):
2025-08-31 20:56:18,261 - INFO -                   Predicted in-group  Predicted out-group
Actual in-group                 4933                 4389
Actual out-group                4459                 4841
2025-08-31 20:56:18,273 - INFO - 
== TEST ==
2025-08-31 20:56:18,285 - INFO - AUC      : 0.4910
2025-08-31 20:56:18,287 - INFO - Accuracy : 0.4864
2025-08-31 20:56:18,291 - INFO - F1       : 0.4999
2025-08-31 20:56:18,295 - INFO - Precision: 0.4867
2025-08-31 20:56:18,299 - INFO - Recall   : 0.5138
2025-08-3

## MODEL 5: GBC INV/UP Exploration


In [13]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs(train_list, test_list, ["inv", "up"],  selected_channels, t_min=0.1, t_max=0.25)
model_5 = GridSearchCV(model_gbc, param_grid_gbc,  scoring="roc_auc", cv=5)

train_and_test_model(X_train, X_test, y_train, y_test, model_5, "GBC INV/UP EXPLORATION", condition_in_out = False)

2025-08-31 14:24:20,054 - INFO - 
== GBC INV/UP EXPLORATION ==
2025-08-31 14:51:58,176 - INFO - 
== TRAIN ==
2025-08-31 14:51:58,299 - INFO - AUC      : 0.8390
2025-08-31 14:51:58,301 - INFO - Accuracy : 0.7569
2025-08-31 14:51:58,307 - INFO - F1       : 0.7643
2025-08-31 14:51:58,311 - INFO - Precision: 0.7448
2025-08-31 14:51:58,316 - INFO - Recall   : 0.7849
2025-08-31 14:51:58,317 - INFO - 
Confusion matrix (train):
2025-08-31 14:51:58,318 - INFO -             Predicted inv  Predicted up
Actual inv           6753          2514
Actual up            2011          7338
2025-08-31 14:51:58,344 - INFO - 
== TEST ==
2025-08-31 14:51:58,368 - INFO - AUC      : 0.7935
2025-08-31 14:51:58,371 - INFO - Accuracy : 0.7142
2025-08-31 14:51:58,375 - INFO - F1       : 0.7156
2025-08-31 14:51:58,379 - INFO - Precision: 0.7145
2025-08-31 14:51:58,383 - INFO - Recall   : 0.7166
2025-08-31 14:51:58,384 - INFO - 
Confusion matrix (test):
2025-08-31 14:51:58,385 - INFO -             Predicted inv  Pred

## MODEL 6: GBC INV/UP Feature Extraction

In [14]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs_with_feature_extraction(train_list, test_list, ["inv", "up"],  selected_channels)
model_6 = GridSearchCV(model_gbc, param_grid_gbc,  scoring="roc_auc", cv=5)

train_and_test_model(X_train, X_test, y_train, y_test, model_6, "GBC INV/UP Feature Extraction", condition_in_out = False)

2025-08-31 14:54:12,201 - INFO - shape: (18620, 198)
2025-08-31 14:54:12,214 - INFO - 
== GBC INV/UP FEATURE EXTRACTION ==
2025-08-31 15:33:07,049 - INFO - 
== TRAIN ==
2025-08-31 15:33:07,242 - INFO - AUC      : 0.8367
2025-08-31 15:33:07,244 - INFO - Accuracy : 0.7570
2025-08-31 15:33:07,249 - INFO - F1       : 0.7628
2025-08-31 15:33:07,253 - INFO - Precision: 0.7479
2025-08-31 15:33:07,258 - INFO - Recall   : 0.7783
2025-08-31 15:33:07,259 - INFO - 
Confusion matrix (train):
2025-08-31 15:33:07,260 - INFO -             Predicted inv  Predicted up
Actual inv           6822          2452
Actual up            2072          7274
2025-08-31 15:33:07,288 - INFO - 
== TEST ==
2025-08-31 15:33:07,320 - INFO - AUC      : 0.7471
2025-08-31 15:33:07,322 - INFO - Accuracy : 0.6792
2025-08-31 15:33:07,326 - INFO - F1       : 0.6862
2025-08-31 15:33:07,330 - INFO - Precision: 0.6751
2025-08-31 15:33:07,334 - INFO - Recall   : 0.6976
2025-08-31 15:33:07,335 - INFO - 
Confusion matrix (test):
2025

## MODEL 7: LDA INV/UP Exploration

In [31]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs(train_list, test_list, ["inv", "up"], selected_channels, t_min=0.1, t_max=0.25)
model_7 = GridSearchCV(model_lda, param_grid_lda,  scoring="roc_auc", cv=5)

train_and_test_model(X_train, X_test, y_train, y_test, model_7, "LDA INV/UP Exploration", condition_in_out = False)

2025-08-31 20:56:30,482 - INFO - 
== LDA INV/UP EXPLORATION ==
2025-08-31 20:56:33,169 - INFO - 
== TRAIN ==
2025-08-31 20:56:33,199 - INFO - AUC      : 0.7452
2025-08-31 20:56:33,201 - INFO - Accuracy : 0.6860
2025-08-31 20:56:33,206 - INFO - F1       : 0.6971
2025-08-31 20:56:33,211 - INFO - Precision: 0.6760
2025-08-31 20:56:33,215 - INFO - Recall   : 0.7195
2025-08-31 20:56:33,216 - INFO - 
Confusion matrix (train):
2025-08-31 20:56:33,216 - INFO -             Predicted inv  Predicted up
Actual inv           6065          3233
Actual up            2629          6744
2025-08-31 20:56:33,228 - INFO - 
== TEST ==
2025-08-31 20:56:33,238 - INFO - AUC      : 0.7544
2025-08-31 20:56:33,240 - INFO - Accuracy : 0.6928
2025-08-31 20:56:33,243 - INFO - F1       : 0.6903
2025-08-31 20:56:33,247 - INFO - Precision: 0.6995
2025-08-31 20:56:33,251 - INFO - Recall   : 0.6813
2025-08-31 20:56:33,252 - INFO - 
Confusion matrix (test):
2025-08-31 20:56:33,252 - INFO -             Predicted inv  Pred

## MODEL 8: LDA INV/UP Feature Extraction

In [32]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs_with_feature_extraction(train_list, test_list, ["inv", "up"],
                                                                                   selected_channels)
model_8 = GridSearchCV(model_lda, param_grid_lda, scoring="roc_auc", cv=5)

train_and_test_model(X_train, X_test, y_train, y_test, model_8, "LDA INV/UP Feature Extraction", condition_in_out=False)

2025-08-31 20:58:46,592 - INFO - shape: (18714, 198)
2025-08-31 20:58:46,603 - INFO - 
== LDA INV/UP FEATURE EXTRACTION ==
2025-08-31 20:58:50,289 - INFO - 
== TRAIN ==
2025-08-31 20:58:50,323 - INFO - AUC      : 0.7516
2025-08-31 20:58:50,325 - INFO - Accuracy : 0.6841
2025-08-31 20:58:50,330 - INFO - F1       : 0.6969
2025-08-31 20:58:50,334 - INFO - Precision: 0.6721
2025-08-31 20:58:50,339 - INFO - Recall   : 0.7235
2025-08-31 20:58:50,341 - INFO - 
Confusion matrix (train):
2025-08-31 20:58:50,342 - INFO -             Predicted inv  Predicted up
Actual inv           6009          3315
Actual up            2596          6794
2025-08-31 20:58:50,355 - INFO - 
== TEST ==
2025-08-31 20:58:50,366 - INFO - AUC      : 0.7379
2025-08-31 20:58:50,368 - INFO - Accuracy : 0.6820
2025-08-31 20:58:50,372 - INFO - F1       : 0.6937
2025-08-31 20:58:50,376 - INFO - Precision: 0.6733
2025-08-31 20:58:50,379 - INFO - Recall   : 0.7155
2025-08-31 20:58:50,381 - INFO - 
Confusion matrix (test):
2025

## Subsample average

In [17]:
def subsample_average(data, subsample_size = 5):
    data_copy = data.copy()
    averaged_data = []

    while len(data_copy) >= subsample_size:
        indices = np.random.choice(len(data_copy), subsample_size, replace=False)

        selected = data_copy[indices]
        averaged = np.mean(selected, axis=0)
        averaged_data.append(averaged)

        mask = np.ones(len(data_copy), dtype=bool)
        mask[indices] = False
        data_copy = data_copy[mask]

    return np.array(averaged_data)


def get_X_and_Y_from_epochs_subsample_averaging(train_list, test_list, events, picks=None, t_min = -0.2, t_max = 0.5, subsample_size = 5):

    epochs_train, epochs_test = read_eeg_epochs(train_list, test_list)

    #####---------------------------------------------------------------------------------------------------------

    epochs_train_list_event1 = epochs_train[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    subsample_average_train_list_event1 = subsample_average(epochs_train_list_event1, subsample_size=subsample_size)
    epochs_train_list_event2 = epochs_train[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    subsample_average_train_list_event2 = subsample_average(epochs_train_list_event2, subsample_size=subsample_size)
    X_train = np.concatenate((subsample_average_train_list_event1, subsample_average_train_list_event2), axis=0)

    labels_up_train = [0] * len(subsample_average_train_list_event1)
    labels_inv_train = [1] * len(subsample_average_train_list_event2)
    y_train = np.concatenate((labels_up_train, labels_inv_train), axis=0)

    ######--------------------------------------------------------------------------------------------------------

    epochs_test_list_event1 = epochs_test[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    subsample_average_test_list_event1 = subsample_average(epochs_test_list_event1, subsample_size=subsample_size)
    epochs_test_list_event2 = epochs_test[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    subsample_average_test_list_event2 = subsample_average(epochs_test_list_event2, subsample_size=subsample_size)
    X_test = np.concatenate((subsample_average_test_list_event1, subsample_average_test_list_event2), axis=0)

    labels_up_test = [0] * len(subsample_average_test_list_event1)
    labels_inv_test = [1] * len(subsample_average_test_list_event2)
    y_test = np.concatenate((labels_up_test, labels_inv_test), axis=0)

    logging.info(f"shape: {X_train.shape}")


    return X_train, X_test, y_train, y_test

## GBC IN OUT EXPLORATION SUBSAMPLE AVERAGE

In [22]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs_subsample_averaging(train_list, test_list, ["in", "out"],  selected_channels, t_min=0.1, t_max=0.25, subsample_size=5)
model_subsample = GridSearchCV(model_gbc, param_grid_gbc,  scoring="roc_auc", cv=5)

train_and_test_model(X_train, X_test, y_train, y_test, model_subsample, "GBC IN/OUT EXPLORATION SUBSAMPLE AVERAGE = 5", condition_in_out = True)

2025-08-31 20:13:47,412 - INFO - shape: (3720, 6, 20)
2025-08-31 20:13:47,535 - INFO - 
== GBC IN/OUT EXPLORATION SUBSAMPLE AVERAGE = 5 ==
2025-08-31 20:18:27,582 - INFO - 
== TRAIN ==
2025-08-31 20:18:27,595 - INFO - AUC      : 0.9118
2025-08-31 20:18:27,597 - INFO - Accuracy : 0.8228
2025-08-31 20:18:27,606 - INFO - F1       : 0.8190
2025-08-31 20:18:27,611 - INFO - Precision: 0.8358
2025-08-31 20:18:27,615 - INFO - Recall   : 0.8029
2025-08-31 20:18:27,617 - INFO - 
Confusion matrix (train):
2025-08-31 20:18:27,619 - INFO -                   Predicted in-group  Predicted out-group
Actual in-group                 1570                  293
Actual out-group                 366                 1491
2025-08-31 20:18:27,629 - INFO - 
== TEST ==
2025-08-31 20:18:27,636 - INFO - AUC      : 0.4715
2025-08-31 20:18:27,638 - INFO - Accuracy : 0.4736
2025-08-31 20:18:27,642 - INFO - F1       : 0.4430
2025-08-31 20:18:27,646 - INFO - Precision: 0.4714
2025-08-31 20:18:27,650 - INFO - Recall   : 

## GBC INV/UP EXPLORATION SUBSAMPLE AVERAGE

In [34]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs_subsample_averaging(train_list, test_list, ["inv", "up"],  selected_channels, t_min=0.1, t_max=0.25, subsample_size=5)
model_subsample2 = GridSearchCV(model_gbc, param_grid_gbc, scoring="roc_auc", cv=5)

train_and_test_model(X_train, X_test, y_train, y_test, model_subsample2, "GBC INV/UP EXPLORATION SUBSAMPLE AVERAGE = 5", condition_in_out = False)

2025-08-31 21:04:25,540 - INFO - shape: (3749, 6, 20)
2025-08-31 21:04:25,640 - INFO - 
== GBC INV/UP EXPLORATION SUBSAMPLE AVERAGE = 5 ==
2025-08-31 21:08:59,532 - INFO - 
== TRAIN ==
2025-08-31 21:08:59,550 - INFO - AUC      : 0.9698
2025-08-31 21:08:59,552 - INFO - Accuracy : 0.9098
2025-08-31 21:08:59,556 - INFO - F1       : 0.9111
2025-08-31 21:08:59,560 - INFO - Precision: 0.9025
2025-08-31 21:08:59,564 - INFO - Recall   : 0.9198
2025-08-31 21:08:59,566 - INFO - 
Confusion matrix (train):
2025-08-31 21:08:59,567 - INFO -             Predicted inv  Predicted up
Actual inv           1680           187
Actual up             151          1731
2025-08-31 21:08:59,575 - INFO - 
== TEST ==
2025-08-31 21:08:59,581 - INFO - AUC      : 0.9181
2025-08-31 21:08:59,583 - INFO - Accuracy : 0.8341
2025-08-31 21:08:59,587 - INFO - F1       : 0.8386
2025-08-31 21:08:59,592 - INFO - Precision: 0.8195
2025-08-31 21:08:59,595 - INFO - Recall   : 0.8587
2025-08-31 21:08:59,596 - INFO - 
Confusion mat

## LDA IN/OUT EXPLORATION SUBSAMPLE AVERAGE

In [35]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs_subsample_averaging(train_list, test_list, ["in", "out"],
                                                                               selected_channels, t_min=0.1, t_max=0.25,
                                                                               subsample_size=5)
model_subsample3 = GridSearchCV(model_lda, param_grid_lda, scoring="roc_auc", cv=5)

train_and_test_model(X_train, X_test, y_train, y_test, model_subsample3, "LDA IN/OUT EXPLORATION SUBSAMPLE AVERAGE = 5",
                     condition_in_out=True)

2025-08-31 21:09:18,442 - INFO - shape: (3738, 6, 20)
2025-08-31 21:09:18,537 - INFO - 
== LDA IN/OUT EXPLORATION SUBSAMPLE AVERAGE = 5 ==
2025-08-31 21:09:19,350 - INFO - 
== TRAIN ==
2025-08-31 21:09:19,357 - INFO - AUC      : 0.5480
2025-08-31 21:09:19,359 - INFO - Accuracy : 0.5417
2025-08-31 21:09:19,362 - INFO - F1       : 0.5443
2025-08-31 21:09:19,366 - INFO - Precision: 0.5410
2025-08-31 21:09:19,370 - INFO - Recall   : 0.5476
2025-08-31 21:09:19,371 - INFO - 
Confusion matrix (train):
2025-08-31 21:09:19,372 - INFO -                   Predicted in-group  Predicted out-group
Actual in-group                 1002                  868
Actual out-group                 845                 1023
2025-08-31 21:09:19,377 - INFO - 
== TEST ==
2025-08-31 21:09:19,382 - INFO - AUC      : 0.4805
2025-08-31 21:09:19,383 - INFO - Accuracy : 0.4731
2025-08-31 21:09:19,387 - INFO - F1       : 0.4933
2025-08-31 21:09:19,391 - INFO - Precision: 0.4732
2025-08-31 21:09:19,395 - INFO - Recall   : 

## LDA INV/UP EXPLORATION SUBSAMPLE AVERAGE

In [36]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs_subsample_averaging(train_list, test_list, ["inv", "up"],  selected_channels, t_min=0.1, t_max=0.25, subsample_size=5)
model_subsample4 = GridSearchCV(model_lda, param_grid_lda, scoring="roc_auc", cv=5)

train_and_test_model(X_train, X_test, y_train, y_test, model_subsample4, "LDA INV/UP EXPLORATION SUBSAMPLE AVERAGE = 5", condition_in_out = False)

2025-08-31 21:09:38,276 - INFO - shape: (3748, 6, 20)
2025-08-31 21:09:38,380 - INFO - 
== LDA INV/UP EXPLORATION SUBSAMPLE AVERAGE = 5 ==
2025-08-31 21:09:39,201 - INFO - 
== TRAIN ==
2025-08-31 21:09:39,208 - INFO - AUC      : 0.9306
2025-08-31 21:09:39,210 - INFO - Accuracy : 0.8519
2025-08-31 21:09:39,215 - INFO - F1       : 0.8554
2025-08-31 21:09:39,220 - INFO - Precision: 0.8390
2025-08-31 21:09:39,224 - INFO - Recall   : 0.8725
2025-08-31 21:09:39,225 - INFO - 
Confusion matrix (train):
2025-08-31 21:09:39,225 - INFO -             Predicted inv  Predicted up
Actual inv           1551           315
Actual up             240          1642
2025-08-31 21:09:39,231 - INFO - 
== TEST ==
2025-08-31 21:09:39,236 - INFO - AUC      : 0.9371
2025-08-31 21:09:39,237 - INFO - Accuracy : 0.8584
2025-08-31 21:09:39,241 - INFO - F1       : 0.8562
2025-08-31 21:09:39,244 - INFO - Precision: 0.8736
2025-08-31 21:09:39,248 - INFO - Recall   : 0.8395
2025-08-31 21:09:39,250 - INFO - 
Confusion mat