In [1]:
import sklearn
import mne
import numpy as np
import glob
import os
import sys
import logging

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from joblib import parallel_backend
import multiprocessing
from sklearn.decomposition import PCA
from pywt import wavedec

In [2]:
def split_train_test_path_list(data_path, file_name_template, train_ratio):
    file_list = sorted(glob.glob(os.path.join(data_path, file_name_template)))
    np.random.shuffle(file_list)
    split_id = int(len(file_list) * train_ratio)

    train_list = file_list[:split_id]
    test_list = file_list[split_id:]

    return train_list, test_list


In [3]:
def read_all_epochs(data_path, file_name_template, events, picks=None, t_min = -0.2, t_max = 0.5):
    file_list = sorted(glob.glob(os.path.join(data_path, file_name_template)))
    np.random.shuffle(file_list)

    epochs_list = []
    for file_path in train_list:
        epoch = mne.read_epochs(file_path, preload=True)
        epochs_list.append(epoch)

    epochs_list = mne.concatenate_epochs(epochs_list)

    epochs_event_1 = epochs_list[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_event_2 = epochs_list[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)

    return epochs_event_1, epochs_event_2

In [4]:
def read_eeg_epochs(train_list, test_list):
    epochs_train_list = []
    epochs_test_list = []

    for file_path in train_list:
        with mne.utils.use_log_level("ERROR"):
            epoch_train = mne.read_epochs(file_path, preload=True)
            epochs_train_list.append(epoch_train)

    for file_path in test_list:
        with mne.utils.use_log_level("ERROR"):
            epoch_test = mne.read_epochs(file_path, preload=True)
            epochs_test_list.append(epoch_test)

    epochs_train = mne.concatenate_epochs(epochs_train_list)
    epochs_test = mne.concatenate_epochs(epochs_test_list)

    return epochs_train, epochs_test

In [6]:
import numpy as np

a = np.array([5, 2, 8, -3, 7])
idx = np.min(a)
print(idx)         # => 3
print(a[idx])

-3
8


In [5]:
from scipy.signal import find_peaks
import numpy as np

y = np.array([0, 2, 1, 3, 0, 1, 2, 1])
peaks, _ = find_peaks(y)
print(peaks)        # => [1 3 6]
print(y[peaks])

[1 3 6]
[2 3 2]


In [20]:
import numpy as np
from scipy.signal import find_peaks
from scipy.stats import skew, kurtosis

def get_window_idx(center, width, times_arr):
    return np.where((times_arr >= center - width/2) & (times_arr <= center + width/2))[0]

def zero_crossings(sig):
    return np.where(np.diff(np.sign(sig)))[0].size

def extract_peak_features(X, times, window_size=0.04, baseline_correction = False):

    n_epochs, n_channels, n_times = X.shape
    features = []

    peak_window_N170 =(0.12, 0.22)
    peak_window_P100=(0.05, 0.15)
    peak_window_P300 = (0.2, 0.3)

    win_mask_N170 = (times >= peak_window_N170[0]) & (times <= peak_window_N170[1])
    win_mask_P100 = (times >= peak_window_P100[0]) & (times <= peak_window_P100[1])
    win_mask_P300 = (times >= peak_window_P300[0]) & (times <= peak_window_P300[1])

    for ep in range(n_epochs):
        feats_ep = []
        for ch in range(n_channels):
            signal = X[ep, ch, :]


            sig_win_P100 = signal[win_mask_P100]
            times_win_P100 = times[win_mask_P100]

            peaks, _ = find_peaks(sig_win_P100, distance=100)
            if len(peaks) == 0:
                idx1 = np.argmax(sig_win_P100)
            else:
                idx1 = peaks[0]
            time1 = times_win_P100[idx1]
            amp1 = float(sig_win_P100[idx1])
            if baseline_correction:
                amp1 -= float(sig_win_P100[0])


            sig_win_N170 = signal[win_mask_N170]
            times_win_N170 = times[win_mask_N170]

            peaks_min, _ = find_peaks(-sig_win_N170, distance=100)
            if len(peaks_min) == 0:
                idx2 = np.argmin(sig_win_N170)
            else:
                idx2 = peaks_min[0]
            time2 = times_win_N170[idx2]
            amp2 = float(sig_win_N170[idx2])
            if baseline_correction:
                amp2 -= float(sig_win_N170[0])


            sig_win_P300 = signal[win_mask_P300]
            times_win_P300 = times[win_mask_P300]

            peaks, _ = find_peaks(sig_win_P300, distance=100)
            if len(peaks) == 0:
                idx3 = np.argmax(sig_win_P300)
            else:
                idx3 = peaks[0]
            time3 = times_win_P300[idx3]
            amp3 = float(sig_win_P300[idx3])
            if baseline_correction:
                amp3 -= float(sig_win_P300[0])

            win1 = get_window_idx(time1, window_size, times)
            win2 = get_window_idx(time2, window_size, times)
            win3 = get_window_idx(time3, window_size, times)

            feats_ep.extend([time1, time2, time3, amp1, amp2, amp3])

            for win in [win1, win2, win3]:
                if len(win) > 0:
                    sig_win = signal[win]
                    t_win = times[win]

                    # 7. Zero-crossing rate
                    feat_zc = zero_crossings(sig_win)
                    # 8. Peak-to-peak amplitude
                    feat_ptp = np.ptp(sig_win)

                    if baseline_correction:
                        sig_win = sig_win - sig_win[0]

                    # Feature calculations:
                    # 1. RMS
                    feat_rms = np.sqrt(np.mean(sig_win ** 2))
                    # 2. Variance
                    feat_var = np.var(sig_win)
                    # 3. Std
                    feat_std = np.std(sig_win)
                    # 4. Skewness
                    feat_skew = skew(sig_win)
                    # 5. Kurtosis
                    feat_kurt = kurtosis(sig_win)
                    # 6. Area under the curve
                    feat_auc = np.sum(sig_win)
                    # 9. Slope
                    # Fit line: polyfit returns [slope, intercept]
                    slope = np.polyfit(t_win, sig_win, 1)[0] if len(sig_win) > 1 else np.nan
                    # 10. Mean
                    feat_mean = np.mean(sig_win)
                    # 11. Min
                    feat_min = np.min(sig_win)
                    # 12. Max
                    feat_max = np.max(sig_win)
                    # 13. Median
                    feat_median = np.median(sig_win)

                    feats_ep.extend([
                        feat_rms, feat_var, feat_std, feat_skew, feat_kurt, feat_auc, feat_zc, feat_ptp, slope, feat_mean, feat_min, feat_max, feat_median
                    ])
                else:
                    feats_ep.extend([np.nan]*14)

        features.append(feats_ep)
    return np.array(features)  # shape: (n_epochs, n_channels*3*14)


In [6]:
def extract_n170_features(X, times, n170_window=(0.12, 0.22)):
    """
    X: array (n_epochs, n_channels, n_times)
    times: array (n_times,) w sekundach
    n170_window: tuple (start, end) w sekundach
    Zwracane cechy (per kanał):
      [ min_amp_N170, latency_N170, mean_N170 ]
    """
    n_epochs, n_channels, _ = X.shape
    win_mask = (times >= n170_window[0]) & (times <= n170_window[1])
    feats = np.full((n_epochs, n_channels, 3), np.nan, dtype=float)

    if not np.any(win_mask):
        return feats.reshape(n_epochs, -1)

    t_win = times[win_mask]

    for ep in range(n_epochs):
        for ch in range(n_channels):
            sig = X[ep, ch, :]
            sig_win = sig[win_mask]

            if sig_win.size == 0:
                continue  # zostają NaNy

            idx_min = int(np.argmin(sig_win))
            min_amp = float(sig_win[idx_min])
            latency = float(t_win[idx_min])

            mean_amp = float(np.mean(sig_win))

            feats[ep, ch, 0] = min_amp
            feats[ep, ch, 1] = latency
            feats[ep, ch, 2] = mean_amp

    return feats

In [9]:
def band_power(sig, fs, band):
    fft_vals = np.fft.rfft(sig)
    power = np.abs(fft_vals) ** 2
    freqs = np.fft.rfftfreq(len(sig), 1/fs)
    idx = np.where((freqs >= band[0]) & (freqs < band[1]))[0]
    return np.sum(power[idx])

def extract_band_power_features(X, times, fs=128):
    n_epochs, n_channels, n_times = X.shape
    features = []

    window = (0.12, 0.22)  # 120–220 ms
    win_mask = (times >= window[0]) & (times <= window[1])

    bands = {
        'delta': (1, 4),
        'theta': (4, 8),
        'alpha': (8, 13),
        'beta': (13, 30),
        'gamma': (30, 45)
    }
    band_names = list(bands.keys())

    for ep in range(n_epochs):
        feats_ep = []
        for ch in range(n_channels):
            sig_win = X[ep, ch, win_mask]
            if len(sig_win) < 2: # if window is empty  fullfill with NaN
                feats_ep.extend([np.nan]*len(bands))
                continue
            for band in bands.values():
                bp = band_power(sig_win, fs, band)
                feats_ep.append(bp)
        features.append(feats_ep)
    return np.array(features)  # shape: (n_epochs, n_channels*len(bands))


In [10]:
def get_X_and_Y_from_epochs(train_list, test_list, events, picks=None, t_min = -0.2, t_max = 0.5):

    epochs_train, epochs_test = read_eeg_epochs(train_list, test_list)

    #####---------------------------------------------------------------------------------------------------------

    epochs_train_list_event1 = epochs_train[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_train_list_event2 = epochs_train[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_train = np.concatenate((epochs_train_list_event1, epochs_train_list_event2), axis=0)

    labels_up_train = [0] * len(epochs_train_list_event1)
    labels_inv_train = [1] * len(epochs_train_list_event2)
    y_train = np.concatenate((labels_up_train, labels_inv_train), axis=0)

    ######--------------------------------------------------------------------------------------------------------

    epochs_test_list_event1 = epochs_test[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_test_list_event2 = epochs_test[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_test = np.concatenate((epochs_test_list_event1, epochs_test_list_event2), axis=0)

    labels_up_test = [0] * len(epochs_test_list_event1)
    labels_inv_test = [1] * len(epochs_test_list_event2)
    y_test = np.concatenate((labels_up_test, labels_inv_test), axis=0)

    logging.info(f"shape: {X_train.shape}")


    return X_train, X_test, y_train, y_test

In [11]:
def subsample_average(data, subsample_size = 5):
    data_copy = data.copy()
    averaged_data = []

    while len(data_copy) >= subsample_size:
        indices = np.random.choice(len(data_copy), subsample_size, replace=False)

        selected = data_copy[indices]
        averaged = np.mean(selected, axis=0)
        averaged_data.append(averaged)

        mask = np.ones(len(data_copy), dtype=bool)
        mask[indices] = False
        data_copy = data_copy[mask]

    return np.array(averaged_data)


In [7]:
def get_X_and_Y_from_epochs_with_base_feature_extraction(train_list, test_list, events, picks=None):

    t_min = -0.2
    t_max = 0.5
    epochs_train, epochs_test = read_eeg_epochs(train_list, test_list)

    #####---------------------------------------------------------------------------------------------------------

    epochs_train_list_event1 = epochs_train[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_train_list_event2 = epochs_train[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_train = np.concatenate((epochs_train_list_event1, epochs_train_list_event2), axis=0)

    labels_up_train = [0] * len(epochs_train_list_event1)
    labels_inv_train = [1] * len(epochs_train_list_event2)
    y_train = np.concatenate((labels_up_train, labels_inv_train), axis=0)

    ######--------------------------------------------------------------------------------------------------------

    epochs_test_list_event1 = epochs_test[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_test_list_event2 = epochs_test[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_test = np.concatenate((epochs_test_list_event1, epochs_test_list_event2), axis=0)

    labels_up_test = [0] * len(epochs_test_list_event1)
    labels_inv_test = [1] * len(epochs_test_list_event2)
    y_test = np.concatenate((labels_up_test, labels_inv_test), axis=0)

    #####---------------------------------------------------------------------------------------------------------

    times = np.linspace(t_min, t_max, X_train.shape[2])
    # Ekstrakcja cech peaków
    X_train_feats = extract_n170_features(X_train, times)
    X_test_feats = extract_n170_features(X_test, times)

    logging.info(f"shape: {X_train_feats.shape}")

    return X_train_feats, X_test_feats, y_train, y_test

In [8]:
def get_X_and_Y_from_epochs_subsample_averaging_with_peak_feature_extraction(train_list, test_list, events, picks=None, t_min = -0.2, t_max = 0.5, subsample_size = 5):

    epochs_train, epochs_test = read_eeg_epochs(train_list, test_list)

    #####---------------------------------------------------------------------------------------------------------

    epochs_train_list_event1 = epochs_train[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    subsample_average_train_list_event1 = subsample_average(epochs_train_list_event1, subsample_size=subsample_size)
    epochs_train_list_event2 = epochs_train[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    subsample_average_train_list_event2 = subsample_average(epochs_train_list_event2, subsample_size=subsample_size)
    X_train = np.concatenate((subsample_average_train_list_event1, subsample_average_train_list_event2), axis=0)

    labels_up_train = [0] * len(subsample_average_train_list_event1)
    labels_inv_train = [1] * len(subsample_average_train_list_event2)
    y_train = np.concatenate((labels_up_train, labels_inv_train), axis=0)

    ######--------------------------------------------------------------------------------------------------------

    epochs_test_list_event1 = epochs_test[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    subsample_average_test_list_event1 = subsample_average(epochs_test_list_event1, subsample_size=subsample_size)
    epochs_test_list_event2 = epochs_test[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    subsample_average_test_list_event2 = subsample_average(epochs_test_list_event2, subsample_size=subsample_size)
    X_test = np.concatenate((subsample_average_test_list_event1, subsample_average_test_list_event2), axis=0)

    labels_up_test = [0] * len(subsample_average_test_list_event1)
    labels_inv_test = [1] * len(subsample_average_test_list_event2)
    y_test = np.concatenate((labels_up_test, labels_inv_test), axis=0)

##### ----------------------------------------------------------------------------------------------------------------

    times = np.linspace(t_min, t_max, X_train.shape[2])
    # Ekstrakcja cech peaków
    X_train_feats = extract_peak_features(X_train, times)
    X_test_feats = extract_peak_features(X_test, times)

    logging.info(f"shape: {X_train_feats.shape}")

    return X_train_feats, X_test_feats, y_train, y_test

In [25]:
def get_X_and_Y_from_epochs_subsample_averaging(train_list, test_list, events, picks=None, t_min = -0.2, t_max = 0.5, subsample_size = 5):

    epochs_train, epochs_test = read_eeg_epochs(train_list, test_list)

    #####---------------------------------------------------------------------------------------------------------

    epochs_train_list_event1 = epochs_train[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    subsample_average_train_list_event1 = subsample_average(epochs_train_list_event1, subsample_size=subsample_size)
    epochs_train_list_event2 = epochs_train[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    subsample_average_train_list_event2 = subsample_average(epochs_train_list_event2, subsample_size=subsample_size)
    X_train = np.concatenate((subsample_average_train_list_event1, subsample_average_train_list_event2), axis=0)

    labels_up_train = [0] * len(subsample_average_train_list_event1)
    labels_inv_train = [1] * len(subsample_average_train_list_event2)
    y_train = np.concatenate((labels_up_train, labels_inv_train), axis=0)

    ######--------------------------------------------------------------------------------------------------------

    epochs_test_list_event1 = epochs_test[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    subsample_average_test_list_event1 = subsample_average(epochs_test_list_event1, subsample_size=subsample_size)
    epochs_test_list_event2 = epochs_test[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    subsample_average_test_list_event2 = subsample_average(epochs_test_list_event2, subsample_size=subsample_size)
    X_test = np.concatenate((subsample_average_test_list_event1, subsample_average_test_list_event2), axis=0)

    labels_up_test = [0] * len(subsample_average_test_list_event1)
    labels_inv_test = [1] * len(subsample_average_test_list_event2)
    y_test = np.concatenate((labels_up_test, labels_inv_test), axis=0)

    logging.info(f"shape: {X_train.shape}")


    return X_train, X_test, y_train, y_test

In [22]:
def get_X_and_Y_from_epochs_with_feature_extraction(train_list, test_list, events, picks=None, window_size = 0.02, baseline_correction = False):

    t_min = -0.2
    t_max = 0.5
    epochs_train, epochs_test = read_eeg_epochs(train_list, test_list)

    #####---------------------------------------------------------------------------------------------------------

    epochs_train_list_event1 = epochs_train[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_train_list_event2 = epochs_train[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_train = np.concatenate((epochs_train_list_event1, epochs_train_list_event2), axis=0)

    labels_up_train = [0] * len(epochs_train_list_event1)
    labels_inv_train = [1] * len(epochs_train_list_event2)
    y_train = np.concatenate((labels_up_train, labels_inv_train), axis=0)

    ######--------------------------------------------------------------------------------------------------------

    epochs_test_list_event1 = epochs_test[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_test_list_event2 = epochs_test[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_test = np.concatenate((epochs_test_list_event1, epochs_test_list_event2), axis=0)

    labels_up_test = [0] * len(epochs_test_list_event1)
    labels_inv_test = [1] * len(epochs_test_list_event2)
    y_test = np.concatenate((labels_up_test, labels_inv_test), axis=0)

    #####---------------------------------------------------------------------------------------------------------

    times = np.linspace(t_min, t_max, X_train.shape[2])
    # Ekstrakcja cech peaków
    X_train_feats = extract_peak_features(X_train, times,window_size, baseline_correction)
    X_test_feats = extract_peak_features(X_test, times, window_size, baseline_correction)

    logging.info(f"shape: {X_train_feats.shape}")

    return X_train_feats, X_test_feats, y_train, y_test

In [23]:
def get_X_and_Y_from_epochs_with_PCA(train_list, test_list, events, picks=None):

    t_min = -0.2
    t_max = 0.5
    epochs_train, epochs_test = read_eeg_epochs(train_list, test_list)

    #####---------------------------------------------------------------------------------------------------------

    epochs_train_list_event1 = epochs_train[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_train_list_event2 = epochs_train[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_train = np.concatenate((epochs_train_list_event1, epochs_train_list_event2), axis=0)

    labels_up_train = [0] * len(epochs_train_list_event1)
    labels_inv_train = [1] * len(epochs_train_list_event2)
    y_train = np.concatenate((labels_up_train, labels_inv_train), axis=0)

    ######--------------------------------------------------------------------------------------------------------

    epochs_test_list_event1 = epochs_test[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_test_list_event2 = epochs_test[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_test = np.concatenate((epochs_test_list_event1, epochs_test_list_event2), axis=0)

    labels_up_test = [0] * len(epochs_test_list_event1)
    labels_inv_test = [1] * len(epochs_test_list_event2)
    y_test = np.concatenate((labels_up_test, labels_inv_test), axis=0)

    #####---------------------------------------------------------------------------------------------------------

    pca = PCA(n_components=0.9)
    X_train_reshaped = X_train.reshape(X_train.shape[0], -1)
    X_test_reshaped = X_test.reshape(X_test.shape[0], -1)
    X_train_pca = pca.fit_transform(X_train_reshaped)
    X_test_pca = pca.transform(X_test_reshaped)

    print("Ilość komponentów PCA:", pca.n_components_)

    logging.info(f"shape: {X_train_pca.shape}")

    return X_train_pca, X_test_pca, y_train, y_test

In [19]:
def wavelet_avg_features(data, type_wav='db4'):

    n_epochs, n_channels, n_times = data.shape
    all_features = []

    for i in range(n_epochs):
        coeffs = wavedec(data, type_wav,level=3)
        features = []
        for ch in range(n_channels):
            cD_Energy = np.mean([
                             np.sum(np.square(coeffs[3])),np.sum(np.square(coeffs[2])),
                             np.sum(np.square(coeffs[1]))])
            cA_Energy = np.sum(np.square(coeffs[0]))
            D_Entropy = np.mean([
                             np.sum(np.square(coeffs[3]) * np.log(np.square(coeffs[3]))),
                             np.sum(np.square(coeffs[2]) * np.log(np.square(coeffs[2]))),
                             np.sum(np.square(coeffs[1]) * np.log(np.square(coeffs[1])))])

            A_Entropy = np.sum(np.square(coeffs[0]) * np.log(np.square(coeffs[0])))
            D_mean = np.mean([np.mean(coeffs[3]),np.mean(coeffs[2]),np.mean(coeffs[1])])
            A_mean = np.mean(coeffs[0])
            D_std = np.mean([np.std(coeffs[3]),np.std(coeffs[2]),np.std(coeffs[1])])
            A_std = np.std(coeffs[0])
            features = [cD_Energy,cA_Energy,D_Entropy,A_Entropy,D_mean,A_mean,D_std,A_std]
        all_features.append(features)
    return np.array(all_features)


In [12]:
def get_X_and_Y_from_epochs_with_WT(train_list, test_list, events, picks=None, t_min = -0.2, t_max = 0.5):

    epochs_train, epochs_test = read_eeg_epochs(train_list, test_list)

    #####---------------------------------------------------------------------------------------------------------

    epochs_train_list_event1 = epochs_train[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_train_list_event2 = epochs_train[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_train = np.concatenate((epochs_train_list_event1, epochs_train_list_event2), axis=0)

    labels_up_train = [0] * len(epochs_train_list_event1)
    labels_inv_train = [1] * len(epochs_train_list_event2)
    y_train = np.concatenate((labels_up_train, labels_inv_train), axis=0)

    ######--------------------------------------------------------------------------------------------------------

    epochs_test_list_event1 = epochs_test[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_test_list_event2 = epochs_test[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_test = np.concatenate((epochs_test_list_event1, epochs_test_list_event2), axis=0)

    labels_up_test = [0] * len(epochs_test_list_event1)
    labels_inv_test = [1] * len(epochs_test_list_event2)
    y_test = np.concatenate((labels_up_test, labels_inv_test), axis=0)

    #####---------------------------------------------------------------------------------------------------------

    X_train_feats = wavelet_avg_features(X_train)
    X_test_feats = wavelet_avg_features(X_test)

    logging.info(f"shape: {X_train_feats.shape}")

    return X_train_feats, X_test_feats, y_train, y_test

In [13]:
def get_X_and_Y_from_epochs_with_band_power_feature_extraction(train_list, test_list, events, picks=None):

    t_min = -0.2
    t_max = 0.5
    epochs_train, epochs_test = read_eeg_epochs(train_list, test_list)

    #####---------------------------------------------------------------------------------------------------------

    epochs_train_list_event1 = epochs_train[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_train_list_event2 = epochs_train[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_train = np.concatenate((epochs_train_list_event1, epochs_train_list_event2), axis=0)

    labels_up_train = [0] * len(epochs_train_list_event1)
    labels_inv_train = [1] * len(epochs_train_list_event2)
    y_train = np.concatenate((labels_up_train, labels_inv_train), axis=0)

    ######--------------------------------------------------------------------------------------------------------

    epochs_test_list_event1 = epochs_test[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_test_list_event2 = epochs_test[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_test = np.concatenate((epochs_test_list_event1, epochs_test_list_event2), axis=0)

    labels_up_test = [0] * len(epochs_test_list_event1)
    labels_inv_test = [1] * len(epochs_test_list_event2)
    y_test = np.concatenate((labels_up_test, labels_inv_test), axis=0)

    #####---------------------------------------------------------------------------------------------------------

    times = np.linspace(t_min, t_max, X_train.shape[2])
    # Ekstrakcja cech peaków
    X_train_feats = extract_band_power_features(X_train, times)
    X_test_feats = extract_band_power_features(X_test, times)

    logging.info(f"shape: {X_train_feats.shape}")

    return X_train_feats, X_test_feats, y_train, y_test

In [11]:
def train_and_test_model(X_train, X_test, y_train, y_test, pipeline, gridSerach = False):

    pipeline.fit(X_train, y_train)

    # predict test data
    y_test_pred = pipeline.predict(X_test)
    test_score = accuracy_score(y_test, y_test_pred)

    # predict train data
    y_train_pred = pipeline.predict(X_train)
    train_score = accuracy_score(y_train, y_train_pred)

    print(f"test_score: {test_score:.4f}")
    print(f"train_score: {train_score:.4f}")

    if gridSerach:
        print(f"The best parameters: {pipeline.best_params_}")
        print(f"The best accuracy: {pipeline.best_score_:.4f}")

In [8]:
log_file = "training_log_3.txt"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(log_file, mode="a"), # a to overwrite
        logging.StreamHandler()
    ]
)

class StreamToLogger:
    def __init__(self, logger, level):
        self.logger = logger
        self.level = level
        self.line_buffer = ""

    def write(self, message):
        if message.strip():
            self.logger.log(self.level, message.strip())

    def flush(self):
        pass

sys.stdout = StreamToLogger(logging.getLogger(), logging.INFO)
sys.stderr = StreamToLogger(logging.getLogger(), logging.ERROR)


In [17]:
dir_path = 'D:\studia\magisterka\dane EEG\BADANIE_POLITYCZNE_2022_eeg_bdfy\EEG_preprocessed'
file_name_template = "s*.bdf-epo.fif"
train_ratio = 0.8
selected_channels = ['P5', 'P6', 'P7', 'P8','PO7', 'PO8']

flatten_transformer = FunctionTransformer(lambda X: X.reshape(X.shape[0], -1))

## MODEL 1: time-range 0-250 Logistic Regression

In [8]:
# model_1 = Pipeline(steps=[('reshape', flatten_transformer), ('scaler', StandardScaler()), ('logisticRegression', LogisticRegression(max_iter=10000))])
#
# train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
# X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs(train_list, test_list, ["up", "inv"], t_min = 0.0, t_max = 0.25)
#
# train_and_test_model(X_train, X_test, y_train, y_test, model_1)

### MODEL 2: Support Vector Machine with grid search


In [9]:
# model_2 = Pipeline(steps=[('reshape', flatten_transformer), ('scaler', StandardScaler()), ('svc', SVC())])
#
# param_grid = dict(
#     svc__kernel=['linear'],
#     svc__C=[0.1, 1.0],
#     svc__gamma=[0.001, 0.01],
# )
#
# logging.info("Rozpoczynam trenowanie modelu...")
# train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
# X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs(train_list, test_list, ["up", "inv"], t_min = 0.0, t_max = 0.25)
# logging.info("Rozpoczęto GridSearchCV.")
# with parallel_backend('multiprocessing'):
#     grid_search_model_2 = GridSearchCV(model_2, param_grid, cv=3, scoring='accuracy', n_jobs = -1, verbose=3)
#     train_and_test_model(X_train, X_test, y_train, y_test, grid_search_model_2, True)
#
#
# logging.info(f"Najlepsze parametry: {grid_search_model_2.best_params_}")
# logging.info(f"Najlepszy wynik cross-validation: {grid_search_model_2.best_score_}")
# logging.info("Trenowanie zakończone.")

In [13]:
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs(train_list, test_list, ["in", "out"])

Not setting metadata
18683 matching events found
Applying baseline correction (mode: mean)
Not setting metadata
4653 matching events found
Applying baseline correction (mode: mean)
['Fp1', 'AF7', 'AF3', 'F1', 'F3', 'F5', 'F7', 'FT7', 'FC5', 'FC3', 'FC1', 'C1', 'C3', 'C5', 'T7', 'TP7', 'CP5', 'CP3', 'CP1', 'P1', 'P3', 'P5', 'P7', 'P9', 'PO7', 'PO3', 'O1', 'Iz', 'Oz', 'POz', 'Pz', 'CPz', 'Fpz', 'Fp2', 'AF8', 'AF4', 'AFz', 'Fz', 'F2', 'F4', 'F6', 'F8', 'FT8', 'FC6', 'FC4', 'FC2', 'FCz', 'Cz', 'C2', 'C4', 'C6', 'T8', 'TP8', 'CP6', 'CP4', 'CP2', 'P2', 'P4', 'P6', 'P8', 'P10', 'PO8', 'PO4', 'O2', 'EXG1', 'EXG2', 'EXG3', 'EXG4']


In [26]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_rel


def compare_events_per_epoch_peak(
    event1, event2, sfreq, channels,
    *,
    tmin=0.120, tmax=0.200,      # okno analizy (N170)
    tmin_epoch=-0.200,           # początek epoki (s)
    peak='min',                  # 'min' (N170) lub 'max'
    alpha=0.05,
    fdr='bh'                     # None lub 'bh' (Benjamini–Hochberg)
):
    """
    Porównuje średnią amplitudę w oknie, amplitudę peaku i latencję peaku
    między dwoma warunkami (event1, event2) dla każdego kanału,
    szukając peaku **per epoka**.

    Parameters
    ----------
    event1, event2 : ndarray [n_epochs, n_channels, n_times]
    sfreq : float, Hz
    channels : list[str]
    tmin, tmax : float, granice okna (s)
    tmin_epoch : float, początek epoki (s)
    peak : 'min' lub 'max'
    alpha : poziom istotności
    fdr : None lub 'bh' dla korekcji wielokrotnych porównań

    Returns
    -------
    pd.DataFrame z kolumnami:
        channel, mean_in/out, p_mean, peak_in/out, p_peak,
        lat_in/lat_out (s), p_lat, oraz flagi istotności (z FDR jeśli podano).
    """
    # --- sanity i dopasowanie liczby epok do t-testu sparowanego
    X1 = np.asarray(event1); X2 = np.asarray(event2)
    assert X1.ndim == 3 and X2.ndim == 3, "Wejście musi mieć kształt [n_epochs, n_channels, n_times]"
    assert X1.shape[1:] == X2.shape[1:], "Musi być ta sama liczba kanałów i próbek czasu"
    n1, n_channels, n_times = X1.shape
    n2 = X2.shape[0]
    n = min(n1, n2)
    if n1 != n2:
        X1 = X1[:n]; X2 = X2[:n]

    if channels is None or len(channels) != n_channels:
        channels = [f"ch{idx}" for idx in range(n_channels)]

    # --- wektor czasu i okno
    times = np.arange(n_times) / float(sfreq) + float(tmin_epoch)
    idx_tmin = int(np.argmin(np.abs(times - tmin)))
    idx_tmax = int(np.argmin(np.abs(times - tmax)))
    if idx_tmax <= idx_tmin:
        raise ValueError("tmax musi być > tmin (sekundy).")

    if peak == 'max':
        peak_fun = np.argmax
    elif peak == 'min':
        peak_fun = np.argmin
    else:
        raise ValueError("peak ∈ {'min','max'}")

    rows = []
    for ch_idx, ch in enumerate(channels):
        d1 = X1[:, ch_idx, idx_tmin:idx_tmax]  # [n, n_tw]
        d2 = X2[:, ch_idx, idx_tmin:idx_tmax]

        # średnia amplituda w oknie (per epoka)
        mean1 = d1.mean(axis=1)
        mean2 = d2.mean(axis=1)
        _, p_mean = ttest_rel(mean1, mean2, nan_policy='omit')

        # --- peak i latencja **per epoka**
        peak_idx1 = peak_fun(d1, axis=1)                 # [n]
        peak_idx2 = peak_fun(d2, axis=1)                 # [n]
        amp1 = d1[np.arange(n), peak_idx1]               # amplituda w peaku, per epoka
        amp2 = d2[np.arange(n), peak_idx2]
        # latencja w sekundach: start okna + indeks/sfreq
        lat1 = times[idx_tmin] + peak_idx1 / float(sfreq)
        lat2 = times[idx_tmin] + peak_idx2 / float(sfreq)

        # testy
        _, p_peak = ttest_rel(amp1, amp2, nan_policy='omit')
        _, p_lat  = ttest_rel(lat1,  lat2,  nan_policy='omit')

        rows.append({
            "channel": ch,
            "mean_in": float(np.nanmean(mean1)),
            "mean_out": float(np.nanmean(mean2)),
            "p_mean": float(p_mean),

            "peak_in": float(np.nanmean(amp1)),
            "peak_out": float(np.nanmean(amp2)),
            "p_peak": float(p_peak),

            "lat_in": float(np.nanmean(lat1)),
            "lat_out": float(np.nanmean(lat2)),
            "p_lat": float(p_lat),
        })

    df = pd.DataFrame(rows)
    return df

In [16]:

event1, event2 = read_all_epochs(dir_path, file_name_template, ["in", "out"], selected_channels)

results = compare_events_per_epoch_peak(event1, event2, 128, selected_channels)

print(results)

NameError: name 'dir_path' is not defined

In [33]:
def summarize_significant_channels(df, alpha=0.05, use_fdr=True,
                                   metrics=("mean", "peak", "lat")):
    """
    Zwraca słownik i czytelny tekst z listą kanałów,
    na których zaobserwowano istotne różnice dla wybranych metryk.

    Parameters
    ----------
    df : pd.DataFrame
        Wynik compare_events_per_epoch_peak(...) lub Twojej compare_events(...).
    alpha : float
        Poziom istotności.
    use_fdr : bool
        Jeśli True, używa kolumn p_*_fdr, jeśli istnieją; w przeciwnym razie p_*.
    metrics : tuple[str]
        Podzbiór z {"mean","peak","lat"}.

    Returns
    -------
    summary_dict : dict
        Np. {"mean": ["P3","PO7"], "peak": ["P4"], "lat": []}
    summary_text : str
        Zwięzły opis do wydruku.
    """
    summary = {}
    lines = []

    for m in metrics:
        # wybór właściwej kolumny p
        p_col_fdr = f"p_{m}_fdr"
        p_col_raw = f"p_{m}"
        if use_fdr and p_col_fdr in df.columns:
            p_col = p_col_fdr
            note = "po FDR"
        else:
            p_col = p_col_raw
            note = "bez korekcji"

        if p_col not in df.columns:
            summary[m] = []
            lines.append(f"- {m}: brak kolumny {p_col} w df.")
            continue

        sig_mask = df[p_col] < alpha
        chans = df.loc[sig_mask, "channel"].tolist()
        summary[m] = chans

        if chans:
            lines.append(f"- {m} ({note}, α={alpha}): {', '.join(chans)}")
        else:
            lines.append(f"- {m} ({note}, α={alpha}): brak istotnych kanałów")

    # kanały istotne w którejkolwiek metryce (uniona)
    any_sig = sorted(set().union(*summary.values())) if summary else []
    if any_sig:
        lines.append(f"\nKanały istotne w ≥1 metryce: {', '.join(any_sig)}")
    else:
        lines.append("\nBrak kanałów istotnych w jakiejkolwiek metryce.")

    return summary, "\n".join(lines)

In [34]:
summary, text = summarize_significant_channels(results, alpha=0.05, use_fdr=True)
print(text)

- mean (bez korekcji, α=0.05): brak istotnych kanałów
- peak (bez korekcji, α=0.05): brak istotnych kanałów
- lat (bez korekcji, α=0.05): P5, PO8

Kanały istotne w ≥1 metryce: P5, PO8


'Fp1', 'AF7', 'AF3', 'F1', 'F3', 'F5', 'F7', 'FT7', 'FC5', 'FC3', 'FC1', 'C1', 'C3', 'C5', 'T7', 'TP7', 'CP5', 'CP3', 'CP1', 'P1', 'P3', 'P5', 'P7', 'P9', 'PO7', 'PO3', 'O1', 'Iz', 'Oz', 'POz', 'Pz', 'CPz', 'Fpz', 'Fp2', 'AF8', 'AF4', 'AFz', 'Fz', 'F2', 'F4', 'F6', 'F8', 'FT8', 'FC6', 'FC4', 'FC2', 'FCz', 'Cz', 'C2', 'C4', 'C6', 'T8', 'TP8', 'CP6', 'CP4', 'CP2', 'P2', 'P4', 'P6', 'P8', 'P10', 'PO8', 'PO4', 'O2', 'EXG1', 'EXG2', 'EXG3', 'EXG4'

## Model 3: LinearDiscriminantAnalysis

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV

model_lda = Pipeline(steps=[
    ('reshape', flatten_transformer),
    ('scaler', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis())
])

logging.info("Rozpoczynam trenowanie modelu LDA...")
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)

X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs_with_feature_extraction(train_list, test_list, ["in", "out"], selected_channels, window_size=0.03, baseline_correction = True)

train_and_test_model(X_train, X_test, y_train, y_test, model_lda)

logging.info("Trenowanie zakończone.")


KeyboardInterrupt: 

## Model 4 Gradient Boosting Classifier


In [24]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

model_gbc = Pipeline(steps=[
    ('reshape', flatten_transformer),
    ('scaler', StandardScaler()),
    ('gbc', GradientBoostingClassifier(n_estimators=200))
])

logging.info("Rozpoczynam trenowanie modelu GBC...")
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)

X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs_with_feature_extraction(train_list, test_list, ["inv", "up"], selected_channels)

train_and_test_model(X_train, X_test, y_train, y_test, model_gbc)

logging.info("Trenowanie zakończone.")

In [13]:
model_gbc = Pipeline(steps=[
    ('reshape', flatten_transformer),
    ('scaler', StandardScaler()),
    ('gbc', GradientBoostingClassifier(n_estimators=200))
])

logging.info("Rozpoczynam trenowanie modelu GBC...")
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)

X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs(train_list, test_list, ["inv/in", "inv/out"], selected_channels)

train_and_test_model(X_train, X_test, y_train, y_test, model_gbc)

logging.info("Trenowanie zakończone.")

NameError: name 'get_X_and_Y_from_epochs' is not defined

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

model_gbc = Pipeline(steps=[
    ('reshape', flatten_transformer),
    ('scaler', StandardScaler()),
    ('gbc', GradientBoostingClassifier(n_estimators=200))
])

logging.info("Rozpoczynam trenowanie modelu GBC...")
train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)

X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs_with_WT(train_list, test_list, ["in", "out"],
                                                                   selected_channels)

train_and_test_model(X_train, X_test, y_train, y_test, model_gbc)

logging.info("Trenowanie zakończone.")

NameError: name 'get_X_and_Y_from_epochs_with_WT' is not defined

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

model_gbc = Pipeline(steps=[
    ('reshape', flatten_transformer),
    ('scaler', StandardScaler()),
    ('gbc', GradientBoostingClassifier())
])

param_grid = {
    'gbc__n_estimators': [100, 200],
    'gbc__learning_rate': [0.1, 0.05, 0.01],
    'gbc__max_depth': [3, 4, 5],
    'gbc__subsample': [1.0, 0.8],
    'gbc__min_samples_split': [2, 5, 10]
}

train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)

X_train, X_test, y_train, y_test = get_X_and_Y_from_epochs_with_feature_extraction(train_list, test_list, ["in", "out"],  selected_channels, window_size=0.03)

grid_search = GridSearchCV(model_gbc, param_grid, cv=3, n_jobs=4, verbose=2)

logging.info("Rozpoczynam trenowanie modelu GBC z GridSearch...")
grid_search.fit(X_train, y_train)

logging.info("Trenowanie zakończone.")
logging.info(f"Najlepsze parametry: {grid_search.best_params_}")
logging.info(f"Najlepszy wynik cross-val: {grid_search.best_score_:.4f}")

# Testowanie na test secie
test_score = grid_search.score(X_test, y_test)
logging.info(f"Dokładność na zbiorze testowym: {test_score:.4f}")

In [16]:
#### TESTING to find the best window to looking for peaks

dir_path = 'D:\studia\magisterka\dane EEG\BADANIE_POLITYCZNE_2022_eeg_bdfy\EEG_preprocessed'
file_name_template = "s*.bdf-epo.fif"
train_ratio = 0.8
selected_channels = ['P7', 'P8']

def extract_N170_peak_features2(X, times):

    peak_window=(0.12, 0.22)
    n_epochs, n_channels, n_times = X.shape
    win_mask = (times >= peak_window[0]) & (times <= peak_window[1])
    summary = []

    for ch in range(n_channels):
        count_good = 0
        for ep in range(n_epochs):
            signal = X[ep, ch, :]
            sig_win = signal[win_mask]

            peaks_min, _ = find_peaks(-sig_win, distance=100) ## distance is set to 100, to find exactly one peak (because number of timepoints in my epoch is 90)
            n_min = len(peaks_min)
            if n_min == 1:
                count_good += 1
        percent = 100 * count_good / n_epochs
        print(f"Channel {ch} ({ch if hasattr(X, 'ch_names') else ''}): {percent:.1f}% epoch with n170 peak")
        summary.append(percent)

    return summary

def extract_P100_peak_features2(X, times):

    peak_window=(0.05, 0.15)
    n_epochs, n_channels, n_times = X.shape
    win_mask = (times >= peak_window[0]) & (times <= peak_window[1])
    summary = []

    for ch in range(n_channels):
        count_good = 0
        for ep in range(n_epochs):
            signal = X[ep, ch, :]
            sig_win = signal[win_mask]

            peaks_max, _ = find_peaks(sig_win, distance=100) ## distance is set to 100, to find exactly one peak (because number of timepoints in my epoch is 90)
            n_max = len(peaks_max)
            if n_max == 1:
                count_good += 1
        percent = 100 * count_good / n_epochs
        print(f"Channel {ch} ({ch if hasattr(X, 'ch_names') else ''}): {percent:.1f}% epoch with p100 peak")
        summary.append(percent)

    return summary

def extract_P300_peak(X, times):

    peak_window=(0.2, 0.3)
    n_epochs, n_channels, n_times = X.shape
    win_mask = (times >= peak_window[0]) & (times <= peak_window[1])
    summary = []

    for ch in range(n_channels):
        count_good = 0
        for ep in range(n_epochs):
            signal = X[ep, ch, :]
            sig_win = signal[win_mask]
            peaks_max, _ = find_peaks(sig_win, distance=100) ## distance is set to 100, to find exactly one peak (because number of timepoints in my epoch is 90)
            n_max = len(peaks_max)

            if n_max == 1:
                count_good += 1
        percent = 100 * count_good / n_epochs
        print(f"Channel {ch} ({ch if hasattr(X, 'ch_names') else ''}): {percent:.1f}% epoch with p300 peak")
        summary.append(percent)

    return summary

def get_X_and_Y_from_epochs2(train_list, test_list, events, picks=None, t_min = -0.2, t_max = 0.5):

    epochs, epochs_test = read_eeg_epochs(train_list, test_list)

    #####---------------------------------------------------------------------------------------------------------

    epochs_list_event1 = epochs[events[0]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    epochs_list_event2 = epochs[events[1]].get_data(picks=picks, tmin=t_min, tmax=t_max)
    X_train = np.concatenate((epochs_list_event1, epochs_list_event2), axis=0)

    times = np.linspace(t_min, t_max, X_train.shape[2])
    print(X_train.shape)
    X_train_feats = extract_N170_peak_features2(X_train, times, window_size=0.02)


train_list, test_list = split_train_test_path_list(dir_path, file_name_template, train_ratio)
get_X_and_Y_from_epochs2(train_list, test_list, ["in", "out"], selected_channels)




Not setting metadata
18695 matching events found
Applying baseline correction (mode: mean)
Not setting metadata
4641 matching events found
Applying baseline correction (mode: mean)
(18695, 2, 90)


TypeError: extract_N170_peak_features2() got an unexpected keyword argument 'window_size'