Step 1: Import libraries

In [19]:
!pip install mne



In [20]:
!pip install pywavelets
# then restart the kernel



In [21]:
# ---- Core data wrangling ----
import os                      # file paths and saving output files
import re                      # regular expressions for delimiter detection
import csv                     # CSV constants (e.g., quoting options)
import warnings                # optionally silence non-critical warnings

import numpy as np             # arrays and numeric operations
import pandas as pd            # tables / DataFrames

# ---- Signal processing + EEG tooling ----
import mne                     # MNE-Python for EEG data structures & filtering
from scipy.signal import resample  # optional NumPy-based resampling (we'll prefer MNE's)
import pywt                    # wavelet transforms for wICA
from sklearn.decomposition import FastICA  # ICA for artifact removal

# ---- Display settings (optional) ----
pd.options.display.max_columns = None   # never truncate the column list when a DataFrame is shown
pd.options.display.width = 0            # width 0, intended to let pandas fit the notebook width

# ---- Quality of life ----
# Hides every warning to keep the output tidy; comment out to see them all.
warnings.simplefilter("ignore")


Step 2 - Load the EEG CSV

In [22]:
# -------- CONFIG: point this to your CSV --------
path = r"C:\Self Learning\Research Papers\UOW Research Papers\eSport Players with EEG Data\EEG Data\Jack\Jack_EPOC_147367_2023.02.15T16.20.09Z.md.mc.pm.fe.bp.csv"  # raw EEG CSV path
expected_min_cols = 170                             # sanity check: expect at least this many columns in the raw file
encodings_to_try = ["utf-8-sig", "utf-8", "latin-1"]  # try multiple encodings until one works

# Candidate separators to test; r"\s+" means "any whitespace" (regex)
candidates = {
    ",":       {"is_regex": False},   # comma
    ";":       {"is_regex": False},   # semicolon
    "\t":      {"is_regex": False},   # tab
    "|":       {"is_regex": False},   # pipe
    r"\s+":    {"is_regex": True},    # regex: one-or-more whitespace
}

# -------- STEP 1: decode a text sample for analysis --------
text = None                              # decoded text placeholder
used_encoding = None                     # which encoding worked

# Read the bytes exactly once. A missing or unreadable file now surfaces as the
# real OSError instead of being reported as a decoding failure.
with open(path, "rb") as f:
    raw = f.read()                       # entire file as bytes

# Try each encoding in order until one decodes without error
for enc in encodings_to_try:
    try:
        text = raw.decode(enc, errors="strict")
    except (UnicodeDecodeError, LookupError):
        # UnicodeDecodeError: bytes are invalid for this encoding.
        # LookupError: the encoding name itself is unknown. Either way, try the next one.
        continue
    used_encoding = enc                  # remember the encoding that worked
    break                                # stop on first successful decode

# If none worked, fail fast with a helpful message
if text is None:
    raise RuntimeError("Could not decode file with utf-8-sig, utf-8, or latin-1.")

# Split into lines and keep a short sample to search for header and delimiter
lines = text.splitlines()                # list of lines without line terminators
sample_n = min(200, len(lines))          # inspect at most the first 200 lines
sample_lines = lines[:sample_n]          # the sample segment

# -------- STEP 2: detect the header line and the best delimiter --------
def field_count(line, sep, is_regex):
    """Return how many fields 'line' contains when split by 'sep'.

    Parameters
    ----------
    line : str
        One line of text (leading/trailing whitespace is ignored).
    sep : str
        Separator: a regex pattern when ``is_regex`` is true, otherwise a literal.
    is_regex : bool
        Whether ``sep`` must be interpreted as a regular expression.

    Returns
    -------
    int
        0 for empty / whitespace-only lines, otherwise the number of fields.

    Notes
    -----
    Single-character literal separators are counted with ``csv.reader`` so that
    a separator inside a double-quoted field (``a,"b,c",d``) does not inflate
    the count. Regex and multi-character separators use a plain split.
    """
    stripped = line.strip()
    if not stripped:                     # empty / whitespace-only lines have 0 fields
        return 0
    if is_regex:                         # regex split (e.g., whitespace)
        return len(re.split(sep, stripped))
    if len(sep) == 1:                    # csv.reader only accepts 1-character delimiters
        try:
            row = next(csv.reader([stripped], delimiter=sep, quotechar='"', doublequote=True))
            return len(row)
        except csv.Error:
            pass                         # fall through to the plain split below
    return len(stripped.split(sep))      # literal split without quote handling

# Score every (non-empty sample line, candidate separator) pair by its field count.
# Each entry is (field_count, line_index, separator, is_regex).
scored_splits = (
    (field_count(text_line, delim, opts["is_regex"]), line_no, delim, opts["is_regex"])
    for line_no, text_line in enumerate(sample_lines)
    if text_line.strip()
    for delim, opts in candidates.items()
)

# max() returns the FIRST entry with the largest count, so the earliest line wins ties.
best = max(scored_splits, key=lambda entry: entry[0], default=None)

# Nothing to score means there was no non-empty line in the sample
if best is None:
    raise RuntimeError("Could not identify any plausible header line.")

# Unpack the winning header location and delimiter
best_count, header_idx, best_sep, best_is_regex = best

# Heuristic only: a header with few fields is suspicious but not fatal
if best_count < 50:
    print(f"⚠️ Detected only {best_count} fields on line {header_idx}. Continuing anyway.")

# -------- STEP 3: read the file from the detected header line --------
# All parser options in one place. The python engine is needed for regex
# separators; the C engine handles literal separators.
read_kwargs = dict(
    encoding=used_encoding,              # encoding that decoded the file above
    header=0,                            # first row after skiprows holds the column names
    skiprows=header_idx,                 # drop every line before the header
    dtype=str,                           # keep raw strings (preserve exact text)
    quotechar='"',                       # standard CSV quote character
    doublequote=True,                    # "" inside a quoted field is a literal quote
    na_filter=False,                     # do not turn empty strings into NaN
    sep=best_sep,                        # detected separator (literal or regex pattern)
    engine="python" if best_is_regex else "c",
)

# Parse the CSV into a DataFrame
df = pd.read_csv(path, **read_kwargs)

# -------- STEP 4: diagnostics and hard check --------
print(f"Detected encoding: {used_encoding}")
print(f"Detected header line index: {header_idx}")
print(f"Detected separator: {'REGEX ' if best_is_regex else ''}{best_sep}")
print(f"Shape: {df.shape}")

# Soft warning first, with the start of the header line to help diagnose a bad parse
too_few_columns = df.shape[1] < expected_min_cols
if too_few_columns:
    hdr_preview = sample_lines[header_idx][:200].replace("\t", "\\t")  # first ~200 chars, tabs made visible
    print(f"⚠️ Columns < {expected_min_cols}. Header preview: {hdr_preview}")

# Hard stop on wildly wrong parses (adjust/remove if your files legitimately have fewer columns)
assert df.shape[1] >= expected_min_cols, (
    f"Only {df.shape[1]} columns parsed; expected ≥ {expected_min_cols}."
)

# Quick look at the first column names and rows
print(df.columns.tolist()[:12])
print(df.head(3))

# Working copy named 'data' so later steps do not alter 'df'
data = df.copy()


Detected encoding: utf-8-sig
Detected header line index: 1
Detected separator: ,
Shape: (104228, 170)
['Timestamp', 'OriginalTimestamp', 'EEG.Counter', 'EEG.Interpolated', 'EEG.AF3', 'EEG.F7', 'EEG.F3', 'EEG.FC5', 'EEG.T7', 'EEG.P7', 'EEG.O1', 'EEG.O2']
           Timestamp  OriginalTimestamp EEG.Counter EEG.Interpolated  \
0  1676478009.622028  1676478009.622347   21.000000         0.000000   
1  1676478009.629847  1676478009.630067   22.000000         0.000000   
2  1676478009.637667  1676478009.637887   23.000000         0.000000   

       EEG.AF3       EEG.F7       EEG.F3      EEG.FC5       EEG.T7  \
0  3831.794922  4352.307617  4011.794922  4031.281982  4431.282227   
1  3832.820557  4346.666504  4013.846191  4034.871826  4429.743652   
2  3829.743652  4347.692383  4010.256348  4038.461426  4428.717773   

        EEG.P7       EEG.O1       EEG.O2       EEG.P8       EEG.T8  \
0  4326.153809  4316.922852  4007.692383  4249.230957  3736.923096   
1  4328.717773  4317.948730  4007.17

Step 3 - Data preprocessing

In [23]:
# -------- 3.1. Quick exploration (optional prints) --------
# Explore the untouched parse result 'df' so this cell shows the same thing on every run.
df.info()                                  # prints dtypes/shape summary

# Show a preview of the first rows to check parsing quality
print(df.head())

# -------- 3.2. Keep only the columns from E to R (Excel letters) --------
start_idx = 4                              # E corresponds to 0-based index 4
stop_idx_exclusive = 18                    # R is 17; iloc stop is exclusive so use 18

# Make sure we don't go past the actual number of columns
stop_idx_exclusive = min(stop_idx_exclusive, df.shape[1])

# Subset columns E..R from 'df' rather than from 'data' itself. Slicing 'data'
# in place made the cell non-idempotent: a second run re-sliced the 14-column
# frame and silently kept only 10 columns.
data = df.iloc[:, start_idx:stop_idx_exclusive].copy()

# Sanity check the new shape (should be 14 columns if E..R existed)
print("After E..R column keep:", data.shape)

# -------- 3.3. Extract EEG rows 23,040..76,800 (inclusive, 1-based indices) --------
TARGET_N = 76_800                          # reference row count for the report below
start_row_1based = 23_040                  # first wanted row (1-based)
end_row_1based   = 76_800                  # last wanted row (1-based, inclusive)

# Translate to a 0-based half-open [start, end) range, clamped to the rows we have
n_rows = data.shape[0]
start_iloc = max(0, start_row_1based - 1)
end_iloc_exclusive = min(n_rows, end_row_1based)

# Take the requested window
data_sliced = data.iloc[start_iloc:end_iloc_exclusive].copy()

# Report how the available row count compares with the reference count
print(f"Total rows in data: {n_rows}")
surplus = n_rows - TARGET_N                # > 0: extra rows, < 0: rows missing
if surplus > 0:
    print(f"ℹ️ Dataset has MORE than 76,800 rows: {n_rows} (+{surplus}).")
elif surplus < 0:
    print(f"⚠️ Dataset has FEWER than 76,800 rows: {n_rows}. Slice may be shorter than requested.")
else:
    print("✅ Dataset has exactly 76,800 rows.")

print(f"Requested rows (1-based): {start_row_1based}..{end_row_1based}")
print(f"Sliced (0-based iloc): {start_iloc}:{end_iloc_exclusive}")
print(f"data_sliced shape: {data_sliced.shape}")
print(data_sliced.head())                  # preview of the window

# Rows past the 76,800th are collected separately and only reported
if surplus > 0:
    extra_start, extra_end = TARGET_N, n_rows
    extra = data.iloc[extra_start:extra_end].copy()
    print(f"\nExtra rows beyond 76,800: {surplus}")
    print(f"Extra slice (0-based iloc): {extra_start}:{extra_end} | shape: {extra.shape}")

# -------- 3.4. Keep the last 53,000 rows from the sliced window --------
TARGET_53K = 53_000                        # number of trailing rows to keep

n_curr = data_sliced.shape[0]              # rows currently in the window
if n_curr < TARGET_53K:
    # Too few rows to trim: keep the whole window
    data_last_53k = data_sliced.copy()
    print(f"Dataset has only {n_curr} rows (< {TARGET_53K}); keeping all.")
else:
    # Dropping the first n_drop rows leaves exactly the last TARGET_53K rows
    n_drop = n_curr - TARGET_53K
    data_last_53k = data_sliced.iloc[n_drop:].copy()
    print(f"Trimmed {n_drop} rows from the start to keep the last {TARGET_53K}.")

print(f"data_last_53k shape: {data_last_53k.shape}")

# Renumber rows from 0 after trimming (convenience)
data_last_53k = data_last_53k.reset_index(drop=True)

# -------- 3.5. Save the filtered table to CSV next to the source file --------
# NOTE(review): the file name says "adam" while the configured source path points
# at the "Jack" recording — confirm the intended name before sharing outputs.
source_dir = os.path.dirname(path)
output_path = os.path.join(source_dir, "adam_eeg_modified.csv")  # output file path

# BOM-prefixed UTF-8 (Excel-friendly), no index column, quotes only where needed
csv_options = {
    "index": False,
    "encoding": "utf-8-sig",
    "quoting": csv.QUOTE_MINIMAL,
}
data_last_53k.to_csv(output_path, **csv_options)

print(f"✅ Saved: {output_path} | shape: {data_last_53k.shape}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104228 entries, 0 to 104227
Columns: 170 entries, Timestamp to POW.AF4.Gamma
dtypes: object(170)
memory usage: 135.2+ MB
           Timestamp  OriginalTimestamp EEG.Counter EEG.Interpolated  \
0  1676478009.622028  1676478009.622347   21.000000         0.000000   
1  1676478009.629847  1676478009.630067   22.000000         0.000000   
2  1676478009.637667  1676478009.637887   23.000000         0.000000   
3  1676478009.645486  1676478009.645607   24.000000         0.000000   
4  1676478009.653306  1676478009.653527   25.000000         0.000000   

       EEG.AF3       EEG.F7       EEG.F3      EEG.FC5       EEG.T7  \
0  3831.794922  4352.307617  4011.794922  4031.281982  4431.282227   
1  3832.820557  4346.666504  4013.846191  4034.871826  4429.743652   
2  3829.743652  4347.692383  4010.256348  4038.461426  4428.717773   
3  3817.948730  4340.512695  4002.564209  4033.333252  4428.205078   
4  3809.743652  4327.179688  3993.333252  4028

Step 4 - Create a 2D array

In [24]:
# Turn the filtered table (string cells) into a float array in one step.
# The float cast raises if any E..R cell is not numeric text.
arr = data_last_53k.to_numpy(copy=True).astype(float)   # rows x columns, (~53,000, 14)

# The pipeline expects exactly the 14 E..R columns. Adjust/remove if your file differs.
n_cols_found = arr.shape[1]
assert n_cols_found == 14, f"Expected 14 columns (E..R); got {arr.shape[1]}."

# MNE wants (n_channels, n_times), so swap the two axes
arr_T = np.transpose(arr)                   # (14, ~53,000)

# Shape summary for a quick sanity check
print("Array shapes — before transpose:", arr.shape, "| after transpose:", arr_T.shape)

# Checkpoint on disk next to the source CSV for later reuse
checkpoint_path = os.path.join(os.path.dirname(path), "adam_eeg_arr_T.npy")
np.save(checkpoint_path, arr_T)


Array shapes — before transpose: (53000, 14) | after transpose: (14, 53000)


Step 5 - EEG preprocessing in MNE

In [25]:
# ===== Preprocessing to MATCH the "second code" steps =====
# (1) 60 Hz notch  →  (2) 0.1 Hz high-pass  →  (3) bad timepoints via mean±6*std (interpolate)
# (4) bad channels BEFORE wICA (interpolate)  →  (5) CAR BEFORE wICA
# (6) wICA = wavelet (db4, lvl=3) on data, ICA on low-freq (approx) coeffs, inverse ICA, wavelet recon
# (7) save + preview + status

import os
import numpy as np
import pandas as pd
import mne
import pywt
from sklearn.decomposition import FastICA

# -------- Parameters you can tune (aligned to "second code") --------
sfreq_original = 128.0                     # sampling rate of the recording (Hz)
notch_freq     = 60.0                      # notch centre frequency (Hz), as in the second code
highpass_hz    = 0.1                       # high-pass cutoff (Hz) against slow drifts
z_thresh_time  = 6.0                       # per-channel timepoint rejection at mean ± 6*std
ica_components = 25                        # requested ICA components (capped inside the wICA helper)

# -------- Build an MNE Raw object from the array (channels x time) --------
n_channels, n_times = arr_T.shape
ch_names = [f"EEG{number}" for number in range(1, n_channels + 1)]   # generic labels EEG1..EEGn
ch_types = ["eeg" for _ in range(n_channels)]

info = mne.create_info(ch_names=ch_names, sfreq=sfreq_original, ch_types=ch_types)
raw = mne.io.RawArray(arr_T.astype(float), info)

# -------- (1) Notch filter @ 60 Hz, then (2) high-pass @ 0.1 Hz --------
# Both calls filter the Raw object in place.
raw.notch_filter(freqs=notch_freq)
raw.filter(l_freq=highpass_hz, h_freq=None)

# Filtered samples as a plain array (channels x time) for the custom cleaning below
data_filt = raw.get_data()

# -------- (3) Detect & remove bad time points via mean ± 6*std (per channel), then interpolate --------
# Per-channel statistics, kept as column vectors so they broadcast over time
chan_means = data_filt.mean(axis=1, keepdims=True)
chan_stds  = data_filt.std(axis=1, keepdims=True)
upper_thr  = chan_means + z_thresh_time * chan_stds
lower_thr  = chan_means - z_thresh_time * chan_stds

# Samples outside the band become NaN in a fresh array; data_filt stays untouched
bad_mask = (data_filt > upper_thr) | (data_filt < lower_thr)
eeg_cleaned = np.where(bad_mask, np.nan, data_filt)

# Fill the NaN gaps channel by channel with linear interpolation over sample index
for ci, y in enumerate(eeg_cleaned):       # y is a view of row ci, so writes land in eeg_cleaned
    missing = np.isnan(y)
    if not missing.any():
        continue
    good = ~missing
    if good.sum() >= 2:
        y[missing] = np.interp(
            np.flatnonzero(missing),       # positions to fill
            np.flatnonzero(good),          # positions with valid samples
            y[good],                       # the valid samples themselves
        )
    else:
        y[missing] = 0.0                   # fallback if too few valid points

# Wrap back to Raw for channel-level ops
raw_cleaned = mne.io.RawArray(eeg_cleaned.astype(float), raw.info.copy())

# -------- (4) Detect & interpolate bad channels BEFORE wICA --------
# The "second code" calls mne.preprocessing.find_bad_channels_maxwell (MEG-specific).
# It is attempted here to mirror that intent; when it fails, an EEG-friendly
# heuristic on channel variance is used instead, and the fallback is reported
# rather than taken silently.
bad_labels = []
try:
    from mne.preprocessing import find_bad_channels_maxwell
    bad_labels = find_bad_channels_maxwell(raw_cleaned, verbose=True)
    # A tuple result is flattened to one list of names (first two entries).
    if isinstance(bad_labels, tuple):
        bad_labels = list(bad_labels[0]) + list(bad_labels[1])
    if not isinstance(bad_labels, (list, tuple)):
        bad_labels = list(bad_labels)
except Exception as maxwell_err:
    print(
        f"find_bad_channels_maxwell not usable here ({type(maxwell_err).__name__}); "
        "falling back to the channel-variance heuristic."
    )
    # Robust z-score (median / MAD) of per-channel variance.
    # A classic mean/std z-score over n values can never exceed sqrt(n - 1)
    # (about 3.6 for 14 channels), so the former "z > 5" rule could not fire.
    chan_var = np.var(eeg_cleaned, axis=1)
    var_median = np.median(chan_var)
    var_mad = np.median(np.abs(chan_var - var_median))
    if var_mad > 0:
        var_z = 0.6745 * (chan_var - var_median) / var_mad   # 0.6745 scales MAD to std units
    else:
        var_z = np.zeros_like(chan_var)                      # no spread -> nothing to flag
    bad_idx = np.where(var_z > 5.0)[0].tolist()
    bad_labels = [raw_cleaned.ch_names[i] for i in bad_idx]

raw_cleaned.info["bads"] = list(dict.fromkeys(bad_labels))  # unique & ordered
if len(raw_cleaned.info["bads"]) > 0:
    if raw_cleaned.get_montage() is None:
        # EEG interpolation needs sensor positions and none are set on this Raw,
        # so report the flagged channels and keep them unchanged.
        print(
            f"⚠️ Channels flagged as bad: {raw_cleaned.info['bads']}. "
            "No channel positions (montage) are set, so they cannot be interpolated "
            "and are kept unchanged."
        )
        raw_cleaned.info["bads"] = []
    else:
        raw_cleaned.interpolate_bads(reset_bads=True)
else:
    # Ensure bads is cleared if none detected
    raw_cleaned.info["bads"] = []

# -------- (5) Re-reference BEFORE wICA --------
# Common Average Reference: first registered as a projector, then applied to the data.
raw_cleaned.set_eeg_reference(ref_channels="average", projection=True)
raw_cleaned.apply_proj()

# -------- (6) Wavelet-Enhanced ICA (wICA): wavelet first, ICA on low-freq coeffs, inverse, then wavelet recon --------
def wica_wavelet_then_ica(data, n_components=25, wavelet="db4", level=3, random_state=42):
    """
    Wavelet decomposition, ICA round-trip on the approximation coefficients, wavelet reconstruction.

    Parameters
    ----------
    data : ndarray, shape (channels, time)
        Input signals, one row per channel.
    n_components : int
        Requested ICA components; capped at the number of channels.
    wavelet : str
        Wavelet name passed to ``pywt.wavedec`` / ``pywt.waverec``.
    level : int
        Decomposition level.
    random_state : int
        Seed forwarded to ``FastICA``.

    Returns
    -------
    ndarray, shape (channels, time)
        Reconstructed signals with exactly the input shape.

    Steps:
      a) Wavelet (wavedec) along time for every channel -> approximation coeffs coeffs[0]
      b) ICA on the stacked approximation coeffs, arranged as (time x channels)
      c) Inverse ICA, result written back into coeffs[0]
      d) Wavelet reconstruction (waverec), trimmed/padded to the input length

    NOTE(review): no independent component is removed or thresholded between the
    forward and inverse ICA, so when n_components equals the channel count the
    round-trip presumably reproduces the approximation coefficients up to
    numerical error. Confirm whether a component-rejection step is intended.
    """
    data = np.asarray(data, dtype=float)
    n_chan, n_times = data.shape

    # 1) Wavelet-decompose each channel (wavedec is applied per channel)
    coeffs_list = [
        pywt.wavedec(data[ci], wavelet=wavelet, level=level) for ci in range(n_chan)
    ]

    # Stack the approximation coefficients as (channels x T_approx)
    A_mat = np.vstack([coeffs[0] for coeffs in coeffs_list])

    # 2) ICA on approx coeffs (time x channels)
    A_T = A_mat.T  # shape (T_approx x channels)
    ica = FastICA(n_components=min(n_components, A_T.shape[1]), random_state=random_state, max_iter=1000)
    sources = ica.fit_transform(A_T)              # (T_approx x n_comp)
    A_rec_T = ica.inverse_transform(sources).T    # (channels x T_approx)

    # 3) Put the reconstructed approximation back and rebuild each channel.
    # waverec may return a length that differs from the input (e.g. one extra
    # sample for odd-length input), so each row is fitted to n_times here.
    # Writing the reconstruction straight into a fixed-width row would raise
    # a broadcasting error in that case.
    cleaned = np.empty((n_chan, n_times), dtype=float)
    for ci, coeffs_ci in enumerate(coeffs_list):
        coeffs_ci[0] = A_rec_T[ci]
        rec = np.asarray(pywt.waverec(coeffs_ci, wavelet=wavelet), dtype=float)
        n_rec = rec.shape[0]
        if n_rec >= n_times:
            cleaned[ci] = rec[:n_times]           # drop surplus samples at the end
        else:
            cleaned[ci, :n_rec] = rec
            cleaned[ci, n_rec:] = rec[-1]         # pad the end with the last sample

    return cleaned

# Input for wICA: the cleaned, re-referenced samples (channels x time)
X_in = raw_cleaned.get_data()

# Run wICA (wavelet first, then ICA on the approximation coefficients)
wica_settings = dict(n_components=ica_components, wavelet="db4", level=3, random_state=42)
X_wica = wica_wavelet_then_ica(X_in, **wica_settings)

# Back into a Raw object, reusing a copy of the channel info
raw_wica = mne.io.RawArray(X_wica.astype(float), raw_cleaned.info.copy())

# -------- (7) Save outputs and a short preview --------
# NOTE(review): the output names say "adam" while the configured source path points
# at the "Jack" recording — confirm the intended names.
base_dir = os.path.dirname(path)
clean_npy_path   = os.path.join(base_dir, "adam_eeg_clean.npy")               # after cleaning + CAR (pre-wICA)
wica_npy_path    = os.path.join(base_dir, "adam_eeg_clean_wica.npy")          # after wICA (final)
preview_csv_path = os.path.join(base_dir, "adam_eeg_clean_wica_preview.csv")  # first seconds as CSV

np.save(clean_npy_path, raw_cleaned.get_data())
np.save(wica_npy_path,  raw_wica.get_data())

# Short preview: the first few seconds, laid out (time x channels) for readability
seconds_preview = 5
samples_preview = int(seconds_preview * raw_wica.info["sfreq"])
preview = raw_wica.get_data()[:, :samples_preview].T
preview_table = pd.DataFrame(preview, columns=raw_wica.ch_names)
preview_table.to_csv(preview_csv_path, index=False, encoding="utf-8-sig")

# -------- Final status report --------
print("✅ Preprocessing complete (MATCHES the 'second code' step order)")
print(" - Channels:", raw_wica.info['nchan'])
print(" - Sampling rate (Hz):", raw_wica.info['sfreq'])
print(" - Clean data shape (channels x time):", raw_wica.get_data().shape)
print(" - Saved:", clean_npy_path)
print(" - Saved:", wica_npy_path)
print(" - Preview CSV:", preview_csv_path)


Creating RawArray with float64 data, n_channels=14, n_times=53000
    Range : 0 ... 52999 =      0.000 ...   414.055 secs
Ready.
Filtering raw data in 1 contiguous segment
Setting up band-stop filter from 59 - 61 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandstop filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 59.35
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 59.10 Hz)
- Upper passband edge: 60.65 Hz
- Upper transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 60.90 Hz)
- Filter length: 845 samples (6.602 s)

Filtering raw data in 1 contiguous segment
Setting up high-pass filter at 0.1 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal highpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuatio

Step 6 - Segment to (14, 200) shape

In [26]:
# -------- Segment the preprocessed EEG (from raw_wica) into (segments, channels, seg_len) --------

# Which cleaned signal to segment:
#   - raw_wica    : output of the wICA step (default)
#   - raw_cleaned : the signal BEFORE wICA (uncomment the alternative line to use it)
source_raw = raw_wica
# source_raw = raw_cleaned

# Samples as a (n_channels, n_times) array, plus the metadata needed below
X = source_raw.get_data()
sfreq = source_raw.info["sfreq"]                # sampling rate in Hz
n_channels, n_samples = X.shape

# ---- Segmentation parameters ----
seg_len = 200                                   # segment length in samples
# To set the length in seconds instead:
# seg_seconds = 0.8                             # e.g., 0.8 s at 250 Hz would be 200 samples
# seg_len = int(round(seg_seconds * sfreq))

# Number of whole, non-overlapping segments that fit (any remainder is dropped)
n_segments, _leftover = divmod(n_samples, seg_len)

# A recording shorter than one segment cannot be segmented
if n_segments == 0:
    raise ValueError(
        f"Recording too short for seg_len={seg_len} samples at sfreq={sfreq} Hz. "
        f"Need ≥ {seg_len} samples, but only have {n_samples}."
    )

# Keep only the leading samples that split evenly into segments
usable = n_segments * seg_len
X_trim = X[:, :usable]

# (channels, segments, seg_len) -> swap the first two axes -> (segments, channels, seg_len)
segments = np.swapaxes(X_trim.reshape(n_channels, n_segments, seg_len), 0, 1)

# Shapes and timing summary
print(f"Source (channels x time): {X.shape}")
print(f"Sampling rate (Hz): {sfreq}")
print(f"Segment length (samples): {seg_len}  →  {seg_len / sfreq:.3f} seconds per segment")
print(f"Usable samples: {usable} / {n_samples}  (dropped tail: {n_samples - usable})")
print(f"segments.shape: {segments.shape}")      # (n_segments, n_channels, seg_len)

Source (channels x time): (14, 53000)
Sampling rate (Hz): 128.0
Segment length (samples): 200  →  1.562 seconds per segment
Usable samples: 53000 / 53000  (dropped tail: 0)
segments.shape: (265, 14, 200)


In [27]:
import os                   # file/folder path utilities
import numpy as np          # saving/loading .npy arrays
import csv                  # keep if you’re also writing CSVs elsewhere (safe to remove if unused)

# Inputs taken from earlier cells:
#   - `path`     : location of the original CSV (its folder receives the output)
#   - `segments` : segmented array, shape (n_segments, n_channels, seg_len)

# Output folder = folder of the original CSV; created if it does not exist yet
out_dir = os.path.dirname(path)
os.makedirs(out_dir, exist_ok=True)

# ---- Save segments as a NumPy array (.npy) ----
adam_eeg_pp_path = os.path.join(out_dir, "jack_eeg_pp.npy")   # target file
with open(adam_eeg_pp_path, "wb") as npy_file:
    np.save(npy_file, segments)                               # write the array to disk
print(f"✅ Saved segments → {adam_eeg_pp_path} | shape: {segments.shape}")  # confirm save

✅ Saved segments → C:\Self Learning\Research Papers\UOW Research Papers\eSport Players with EEG Data\EEG Data\Jack\jack_eeg_pp.npy | shape: (265, 14, 200)
