Cell 1 — Install packages

In [1]:
# Install required packages (run once)
%pip install -q mne PyWavelets scikit-learn seaborn imbalanced-learn
%pip install -q azureml-core azure-ai-ml azure-identity

print("✅ Packages installed (if no errors above).")


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
✅ Packages installed (if no errors above).


Cell 2 — Load libraries + set seeds

In [2]:
# Basic utilities
import os  # file ops
import random  # RNG

# Arrays
import numpy as np  # numpy arrays

# EEG processing
import mne  # EEG processing
import pywt  # wavelets

# ML tools
from sklearn.decomposition import FastICA  # ICA
from sklearn.model_selection import StratifiedKFold  # CV
from sklearn.metrics import confusion_matrix, roc_auc_score  # metrics

# Balancing
from imblearn.over_sampling import SMOTE, RandomOverSampler  # SMOTE and fallback

# Helpers
from collections import defaultdict  # grouping
from typing import Optional, Union, Sequence, Dict, Tuple, List  # typing

# Plotting
import matplotlib.pyplot as plt  # plotting
import seaborn as sns  # heatmaps

# TensorFlow / Keras
import tensorflow as tf  # TF
from tensorflow import keras  # keras
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, CSVLogger  # callbacks

# Try enabling interactive logs (optional)
try:
    tf.keras.utils.enable_interactive_logging()
except Exception:
    pass

# Set seeds
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

print("✅ Imports done + seeds set.")


2026-01-18 10:22:19.174495: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-18 10:22:19.474688: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768731739.575114    3146 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768731739.601478    3146 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768731739.844713    3146 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

✅ Imports done + seeds set.


Cell 3 — Azure download + load AD arrays + build labels

In [3]:
# Azure ML workspace tools
from azureml.core import Workspace, Datastore, Dataset

print("[STEP 1] Connecting to Azure ML workspace...")

# Workspace details (given)
subscription_id = "eccc04ba-d8b0-4f70-864a-b4a6753bfc72"
resource_group  = "somnasnest"
workspace_name  = "SomnasNest"

# Folder path in datastore (given)
folder_path = "UI/2026-01-17_162622_UTC/Data Array/"

# Connect to workspace
ws = Workspace.get(
    name=workspace_name,
    subscription_id=subscription_id,
    resource_group=resource_group
)

print("[STEP 1] Workspace connected ✅")

# Get default datastore (workspaceblobstore)
datastore = ws.get_default_datastore()
print("[STEP 1] Default datastore loaded ✅")

# Build FileDataset from folder
ds = Dataset.File.from_files(path=(datastore, folder_path))
print("[STEP 1] FileDataset created ✅")

# Download locally
local_dir = "./_ad_data_array"
os.makedirs(local_dir, exist_ok=True)

print(f"[STEP 1] Downloading data to: {local_dir} ...")
ds.download(target_path=local_dir, overwrite=True)
print("[STEP 1] Download complete ✅")

# Load arrays
arrays = {}  # store loaded arrays

print("[STEP 1] Loading .npy files...")
for root, _, files in os.walk(local_dir):
    for f in files:
        p = os.path.join(root, f)
        if f.lower().endswith(".npy"):
            arrays[f] = np.load(p, allow_pickle=False)

# Safety check
if "ad_negative.npy" not in arrays or "ad_positive.npy" not in arrays:
    raise FileNotFoundError("❌ Could not find ad_negative.npy and ad_positive.npy in downloaded folder.")

# Extract arrays
X_neg = arrays["ad_negative.npy"].astype(np.float32)  # (31,127,150000)
X_pos = arrays["ad_positive.npy"].astype(np.float32)  # (46,127,150000)

print("✅ Arrays loaded:")
print("  ad_negative.npy:", X_neg.shape, X_neg.dtype)
print("  ad_positive.npy:", X_pos.shape, X_pos.dtype)

# Build labels
y_neg = np.zeros((X_neg.shape[0],), dtype=np.int32)  # label 0
y_pos = np.ones((X_pos.shape[0],), dtype=np.int32)   # label 1

# Combine into one dataset
X_all_trials = np.concatenate([X_neg, X_pos], axis=0)  # (77,127,150000)
y_all_trials = np.concatenate([y_neg, y_pos], axis=0)  # (77,)

print("[STEP 1] Combined dataset:")
print("  X_all_trials:", X_all_trials.shape)
print("  y_all_trials:", y_all_trials.shape)
print("  Class counts:", np.unique(y_all_trials, return_counts=True))


[STEP 1] Connecting to Azure ML workspace...
[STEP 1] Workspace connected ✅
[STEP 1] Default datastore loaded ✅
[STEP 1] FileDataset created ✅
[STEP 1] Downloading data to: ./_ad_data_array ...
{'infer_column_types': 'False', 'activity': 'download'}
{'infer_column_types': 'False', 'activity': 'download', 'activityApp': 'FileDataset'}
[STEP 1] Download complete ✅
[STEP 1] Loading .npy files...
✅ Arrays loaded:
  ad_negative.npy: (31, 127, 150000) float32
  ad_positive.npy: (46, 127, 150000) float32
[STEP 1] Combined dataset:
  X_all_trials: (77, 127, 150000)
  y_all_trials: (77,)
  Class counts: (array([0, 1], dtype=int32), array([31, 46]))


Class DeploymentTemplateOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


Cell 4 — Helper + leakage-safe preprocessing classes

In [4]:
# Helper: map channel names
def _names_from_index_mapping(n_channels, index_to_name):
    if index_to_name is None:
        return [f"EEG{i+1}" for i in range(n_channels)]
    keys = list(index_to_name.keys())
    is_zero_based = (0 in keys) and (1 not in keys)
    names = []
    for i in range(n_channels):
        key = i if is_zero_based else (i + 1)
        names.append(index_to_name.get(key, f"EEG{i+1}"))
    return names

# Helper: make MNE Raw
def _make_raw(eeg, sfreq, ch_names, use_standard_1020=True):
    ch_types = ['eog' if str(n).upper().startswith("EOG") else 'eeg' for n in ch_names]
    info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types=ch_types)
    raw = mne.io.RawArray(eeg.astype(np.float32, copy=False), info, verbose=False)

    montage_applied = False
    if use_standard_1020:
        try:
            mont = mne.channels.make_standard_montage("standard_1020")
            raw.set_montage(mont, match_case=False, on_missing="ignore")
            montage_applied = True
        except Exception:
            montage_applied = False

    return raw, montage_applied

# Wavelet ICA
class WaveletICA:
    def __init__(self, wavelet="db4", level=3, n_components=10, random_state=42):
        self.wavelet = wavelet
        self.level = level
        self.n_components = n_components
        self.random_state = random_state
        self.ica_ = None
        self._n_ch = None

    def fit(self, X):
        C = X.shape[0]
        self._n_ch = C
        coeffs = pywt.wavedec(X, wavelet=self.wavelet, level=self.level, axis=1)
        A = coeffs[0]
        k = int(min(self.n_components, C))
        self.ica_ = FastICA(n_components=k, random_state=self.random_state)
        S = self.ica_.fit_transform(A.T)
        A_denoised = self.ica_.inverse_transform(S).T
        coeffs[0] = A_denoised
        _ = pywt.waverec(coeffs, wavelet=self.wavelet, axis=1)
        return self

    def transform(self, X):
        assert self.ica_ is not None, "WaveletICA not fitted yet."
        coeffs = pywt.wavedec(X, wavelet=self.wavelet, level=self.level, axis=1)
        A = coeffs[0]
        S = self.ica_.transform(A.T)
        A_denoised = self.ica_.inverse_transform(S).T
        coeffs[0] = A_denoised
        Y = pywt.waverec(coeffs, wavelet=self.wavelet, axis=1)

        if Y.shape[1] < X.shape[1]:
            Y = np.pad(Y, ((0, 0), (0, X.shape[1] - Y.shape[1])), mode="constant")
        elif Y.shape[1] > X.shape[1]:
            Y = Y[:, :X.shape[1]]

        return Y.astype(np.float32, copy=False)

# Leakage-safe preprocessor (same as your sleep deprivation pipeline)
class EEGPreprocessor:
    def __init__(
        self,
        *,
        index_to_name=None,
        use_standard_1020=True,
        resample_to=None,
        notch_freqs=50.0,
        highpass=0.05,
        bad_point_z=6.0,
        bad_channel_z=5.0,
        interpolate_bad_channels=False,
        car=True,
        use_wica=True,
        wica_components=10,
        wica_wavelet="db4",
        wica_level=3,
        wica_random_state=42
    ):
        self.index_to_name = index_to_name
        self.use_standard_1020 = use_standard_1020
        self.resample_to = resample_to
        self.notch_freqs = notch_freqs
        self.highpass = highpass
        self.bad_point_z = bad_point_z
        self.bad_channel_z = bad_channel_z
        self.interpolate_bad_channels = interpolate_bad_channels
        self.car = car
        self.use_wica = use_wica

        self._sfreq_out = None
        self._train_mu = None
        self._train_sd = None
        self._robust_med = None
        self._robust_mad = None
        self._train_eeg_names = None

        self._wica = WaveletICA(
            wavelet=wica_wavelet,
            level=wica_level,
            n_components=wica_components,
            random_state=wica_random_state
        )

    @property
    def sfreq_out(self):
        assert self._sfreq_out is not None, "Preprocessor not run yet."
        return float(self._sfreq_out)

    def _filter_and_reference(self, raw):
        if self.resample_to is not None and float(self.resample_to) != float(raw.info["sfreq"]):
            raw.resample(self.resample_to, npad="auto")
        self._sfreq_out = float(raw.info["sfreq"])

        if self.notch_freqs is not None:
            raw.notch_filter(freqs=self.notch_freqs, verbose=False)

        if self.highpass is not None:
            raw.filter(l_freq=self.highpass, h_freq=None, verbose=False)

        if self.car:
            raw.set_eeg_reference("average", projection=True)
            raw.apply_proj()

    def _repair_transients_with_train_stats(self, raw):
        X = raw.get_data()
        mu = self._train_mu
        sd = self._train_sd
        assert mu is not None and sd is not None, "Training stats not set."

        hi = mu + self.bad_point_z * sd
        lo = mu - self.bad_point_z * sd
        mask = (X > hi) | (X < lo)

        if np.any(mask):
            X_fixed = X.copy()
            t = np.arange(X.shape[1], dtype=float)
            for ch in range(X.shape[0]):
                m = mask[ch]
                if m.any():
                    good = ~m
                    if good.sum() >= 2:
                        X_fixed[ch, m] = np.interp(t[m], t[good], X_fixed[ch, good])
            raw._data = X_fixed

    def fit(self, X_train, sfreq):
        C = X_train.shape[0]
        ch_names = _names_from_index_mapping(C, self.index_to_name)

        raw_train, montage_applied = _make_raw(X_train, sfreq, ch_names, self.use_standard_1020)
        self._filter_and_reference(raw_train)

        Xt = raw_train.get_data()
        self._train_mu = Xt.mean(axis=1, keepdims=True)
        self._train_sd = Xt.std(axis=1, keepdims=True) + 1e-12

        if self.use_wica:
            self._wica.fit(Xt)

        return self

    def transform(self, X, sfreq):
        C = X.shape[0]
        ch_names = _names_from_index_mapping(C, self.index_to_name)

        raw, _ = _make_raw(X, sfreq, ch_names, self.use_standard_1020)
        self._filter_and_reference(raw)
        self._repair_transients_with_train_stats(raw)

        Xf = raw.get_data()
        if self.use_wica:
            Xf = self._wica.transform(Xf)

        return Xf.astype(np.float32, copy=False), self.sfreq_out

    def fit_transform(self, X_train, sfreq):
        self.fit(X_train, sfreq)
        X_clean, fs_out = self.transform(X_train, sfreq)
        return X_clean, fs_out

print("✅ Preprocessing classes loaded.")


✅ Preprocessing classes loaded.


Cell 5 — Set sampling frequency + Preprocess all trials

In [5]:
print("[STEP 2] Starting preprocessing...")

# IMPORTANT:
# Your data arrays do not include fs inside the .npy files.
# If you already know the sampling rate, set it here.
# If not, use the correct value from your dataset documentation.
fs = 1000.0  # <-- CHANGE THIS if your dataset uses a different sampling rate

print(f"[STEP 2] Using fs={fs} Hz")

CHANNEL_MAP = None  # keep same as before

# Create preprocessor (same settings)
pre = EEGPreprocessor(
    index_to_name=CHANNEL_MAP,
    use_standard_1020=True,
    resample_to=None,
    notch_freqs=[50.0, 100.0, 150.0],
    highpass=0.05,
    bad_point_z=6.0,
    bad_channel_z=5.0,
    interpolate_bad_channels=False,
    car=True,
    use_wica=True,
    wica_components=10,
    wica_wavelet="db4",
    wica_level=3,
    wica_random_state=42
)

# Fit preprocessor on a subset (like before)
max_calib_trials = min(10, X_all_trials.shape[0])
print(f"[STEP 2] Fitting preprocessor on first {max_calib_trials} trials...")

calib_trials = X_all_trials[:max_calib_trials]  # take first few
X_calib = np.concatenate(calib_trials, axis=1).astype(np.float32, copy=False)  # concat time

X_calib_clean, fs_out = pre.fit_transform(X_calib, fs)
print(f"[STEP 2] Preprocessor fitted ✅ | fs_out={fs_out} Hz")

# Apply to all trials
data_clean = []
for i in range(X_all_trials.shape[0]):
    X_clean, _ = pre.transform(X_all_trials[i], fs)
    data_clean.append(X_clean.astype(np.float32, copy=False))
    if (i + 1) % 5 == 0:
        print(f"[STEP 2] Preprocessed {i+1}/{X_all_trials.shape[0]} trials...")

data_clean = np.array(data_clean, dtype=np.float32)

print("[STEP 2] Done preprocessing ✅")
print("  data_clean shape:", data_clean.shape)
print("  labels shape    :", y_all_trials.shape)
print("  class counts    :", np.unique(y_all_trials, return_counts=True))


[STEP 2] Starting preprocessing...
[STEP 2] Using fs=1000.0 Hz
[STEP 2] Fitting preprocessor on first 10 trials...
EEG channel type selected for re-referencing
Adding average EEG reference projection.
1 projection items deactivated
Average reference projection was added, but has not been applied yet. Use the apply_proj method to apply it.
Created an SSP operator (subspace dimension = 1)
1 projection items activated
SSP projectors applied...
EEG channel type selected for re-referencing
Adding average EEG reference projection.
1 projection items deactivated
Average reference projection was added, but has not been applied yet. Use the apply_proj method to apply it.
Created an SSP operator (subspace dimension = 1)
1 projection items activated
SSP projectors applied...
[STEP 2] Preprocessor fitted ✅ | fs_out=1000.0 Hz
EEG channel type selected for re-referencing
Adding average EEG reference projection.
1 projection items deactivated
Average reference projection was added, but has not been a

Cell 6 — Augmentation

In [6]:
print("[STEP 3] Starting augmentation (segmentation)...")

def augment_data(trial_data, label, segment_size=100):
    segments = []
    labels = []
    n_segments = trial_data.shape[1] // segment_size

    for i in range(n_segments):
        seg = trial_data[:, i*segment_size:(i+1)*segment_size]
        segments.append(seg.astype(np.float32, copy=False))
        labels.append(int(label))

    return segments, labels

augmented = []
aug_targets = []

for trial, y in zip(data_clean, y_all_trials):
    segs, ys = augment_data(trial, y, segment_size=100)
    augmented.extend(segs)
    aug_targets.extend(ys)

augmented = np.array(augmented, dtype=np.float32)  # (N,127,100)
aug_targets = np.array(aug_targets, dtype=np.int32)

print("[STEP 3] Augmentation done ✅")
print("  augmented shape:", augmented.shape)
print("  aug_targets shape:", aug_targets.shape)
print("  class counts:", np.unique(aug_targets, return_counts=True))


[STEP 3] Starting augmentation (segmentation)...
[STEP 3] Augmentation done ✅
  augmented shape: (115500, 127, 100)
  aug_targets shape: (115500,)
  class counts: (array([0, 1], dtype=int32), array([46500, 69000]))


Cell 7 — “Fair selection”

In [7]:
print("[STEP 4] Fair selection step...")

# Group by class (no subject IDs exist in this array dataset)
class_data = {0: [], 1: []}

for x, y in zip(augmented, aug_targets):
    class_data[int(y)].append(x)

max_per_class = 20000  # adjust if needed

selected_data = []
selected_targets = []

for cls in [0, 1]:
    picked = class_data[cls][:max_per_class]
    selected_data.extend(picked)
    selected_targets.extend([cls] * len(picked))

selected_data = np.array(selected_data, dtype=np.float32)
selected_targets = np.array(selected_targets, dtype=np.int32)

print("[STEP 4] Selection done ✅")
print("  selected_data shape:", selected_data.shape)
print("  selected_targets shape:", selected_targets.shape)
print("  class counts:", np.unique(selected_targets, return_counts=True))


[STEP 4] Fair selection step...
[STEP 4] Selection done ✅
  selected_data shape: (40000, 127, 100)
  selected_targets shape: (40000,)
  class counts: (array([0, 1], dtype=int32), array([20000, 20000]))


Cell 8 — Reshape for Conformer input

In [8]:
print("[STEP 5] Reshaping data for Conformer...")

# (N, C, T) -> (N, C, T, 1)
X_all = selected_data[..., np.newaxis].astype(np.float32, copy=False)
y_all = selected_targets.astype(np.int32, copy=False)

print("[STEP 5] Done ✅")
print("  X_all shape:", X_all.shape)
print("  y_all shape:", y_all.shape)
print("  class counts:", np.unique(y_all, return_counts=True))


[STEP 5] Reshaping data for Conformer...
[STEP 5] Done ✅
  X_all shape: (40000, 127, 100, 1)
  y_all shape: (40000,)
  class counts: (array([0, 1], dtype=int32), array([20000, 20000]))


Cell 9 — Conformer model

In [11]:
# ==================== Cell 9 — Conformer Model (UNCHANGED ARCHITECTURE) ====================
# This is the SAME model architecture you provided.
# Only use create_conformer_model(input_shape=(127, 100, 1)) later in training.

import tensorflow as tf  # tensorflow ops
from tensorflow import keras  # keras base
from tensorflow.keras import layers  # keras layers
import numpy as np  # numpy arrays

print("[STEP 9] Loading Conformer model code (unchanged)...")

# ==================== GLU Activation ====================
class GLU(layers.Layer):
    """Gated Linear Unit - splits input in half and applies gating"""
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, x):
        a, b = tf.split(x, 2, axis=-1)  # split channels into 2 halves
        return a * tf.nn.sigmoid(b)  # gate: a * sigmoid(b)

# ==================== Depthwise Conv1D ====================
class DepthwiseConv1D(layers.Layer):
    """
    Custom Depthwise 1D Convolution using groups.
    TF >= 2.4 required for Conv1D(groups=...).
    """
    def __init__(self, kernel_size, padding='same', **kwargs):
        super().__init__(**kwargs)
        self.kernel_size = kernel_size  # kernel size
        self.padding = padding  # padding mode

    def build(self, input_shape):
        self.channels = input_shape[-1]  # number of channels
        self.conv = layers.Conv1D(
            filters=self.channels,  # output channels = input channels
            kernel_size=self.kernel_size,  # kernel size
            padding=self.padding,  # same padding
            groups=self.channels,  # depthwise conv
            use_bias=False  # no bias
        )

    def call(self, x):
        return self.conv(x)  # apply depthwise conv

# ==================== Transformer-XL Relative Multi-Head Attention ====================
class TransformerXLMultiHeadAttention(layers.Layer):
    """
    Transformer-XL relative multi-head attention with:
    - relative embeddings
    - relative shift trick
    - content bias u and position bias v
    """
    def __init__(self, d_model, num_heads, dropout_rate=0.1, max_len=5000, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model  # embedding dim
        self.num_heads = num_heads  # num heads
        self.dropout_rate = dropout_rate  # dropout
        self.max_len = max_len  # max sequence length

        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.depth = d_model // num_heads  # per-head dim

    def build(self, input_shape):
        self.layer_norm = layers.LayerNormalization(epsilon=1e-6)  # pre-norm

        # Q, K, V projections
        self.wq = layers.Dense(self.d_model, use_bias=False)
        self.wk = layers.Dense(self.d_model, use_bias=False)
        self.wv = layers.Dense(self.d_model, use_bias=False)

        # relative projection W_r
        self.w_r = layers.Dense(self.d_model, use_bias=False)

        # sinusoidal relative position embeddings (2*max_len+1, d_model)
        pos = np.arange(-self.max_len, self.max_len + 1, dtype=np.float32)[:, None]
        dim = np.arange(self.d_model, dtype=np.float32)[None, :]
        angle = pos / (10000 ** (2 * (dim // 2) / self.d_model))
        pe = np.where(dim % 2 == 0, np.sin(angle), np.cos(angle)).astype(np.float32)

        self.rel_pos_emb = self.add_weight(
            name='rel_pos_emb',
            shape=(2 * self.max_len + 1, self.d_model),
            initializer=keras.initializers.Constant(pe),
            trainable=False
        )

        # biases u and v (per head)
        self.u_bias = self.add_weight(
            name='u_bias',
            shape=(self.num_heads, self.depth),
            initializer='zeros',
            trainable=True
        )
        self.v_bias = self.add_weight(
            name='v_bias',
            shape=(self.num_heads, self.depth),
            initializer='zeros',
            trainable=True
        )

        self.dense_out = layers.Dense(self.d_model)  # output projection
        self.dropout_attn = layers.Dropout(self.dropout_rate)  # dropout on attn weights
        self.dropout_out = layers.Dropout(self.dropout_rate)  # dropout on output

    def rel_shift(self, x):
        """
        Relative shift trick.
        Input:  (B, H, L, 2L-1)
        Output: (B, H, L, L)
        """
        batch_size = tf.shape(x)[0]
        heads = tf.shape(x)[1]
        seq_len = tf.shape(x)[2]

        x_padded = tf.pad(x, [[0, 0], [0, 0], [0, 0], [1, 0]])  # pad left
        x_padded = tf.reshape(x_padded, [batch_size, heads, 2 * seq_len, seq_len])  # reshape
        x_shifted = x_padded[:, :, 1:, :]  # drop first row
        x_shifted = tf.reshape(x_shifted, [batch_size, heads, seq_len, 2 * seq_len - 1])  # reshape back
        return x_shifted[:, :, :, :seq_len]  # keep first L columns

    def call(self, x, mask=None, training=False):
        batch_size = tf.shape(x)[0]
        seq_len = tf.shape(x)[1]

        # ensure seq_len <= max_len
        tf.debugging.assert_less_equal(seq_len, self.max_len, message="seq_len too large for max_len")

        x_norm = self.layer_norm(x)  # pre-norm

        q = self.wq(x_norm)  # (B, L, D)
        k = self.wk(x_norm)  # (B, L, D)
        v = self.wv(x_norm)  # (B, L, D)

        # reshape to (B, L, H, d)
        q = tf.reshape(q, [batch_size, seq_len, self.num_heads, self.depth])
        k = tf.reshape(k, [batch_size, seq_len, self.num_heads, self.depth])
        v = tf.reshape(v, [batch_size, seq_len, self.num_heads, self.depth])

        # transpose to (B, H, L, d)
        q = tf.transpose(q, [0, 2, 1, 3])
        k = tf.transpose(k, [0, 2, 1, 3])
        v = tf.transpose(v, [0, 2, 1, 3])

        # relative indices for L -> (2L-1)
        r_indices = tf.range(2 * seq_len - 1, dtype=tf.int32)
        r_indices_centered = r_indices - (seq_len - 1) + self.max_len
        r_emb = tf.gather(self.rel_pos_emb, r_indices_centered)  # (2L-1, D)

        # project relative embedding
        r = self.w_r(r_emb)  # (2L-1, D)
        r = tf.reshape(r, [2 * seq_len - 1, self.num_heads, self.depth])  # (2L-1, H, d)
        r = tf.transpose(r, [1, 0, 2])  # (H, 2L-1, d)

        # AC = (Q+u)K^T
        q_with_u = q + self.u_bias[None, :, None, :]
        AC = tf.matmul(q_with_u, k, transpose_b=True)  # (B, H, L, L)

        # BD = (Q+v)R^T then rel_shift
        q_with_v = q + self.v_bias[None, :, None, :]
        BD = tf.einsum('bhld,hrd->bhlr', q_with_v, r)  # (B, H, L, 2L-1)
        BD_shifted = self.rel_shift(BD)  # (B, H, L, L)

        attn_score = AC + BD_shifted  # combine
        attn_score = attn_score / tf.math.sqrt(tf.cast(self.depth, tf.float32))  # scale

        # optional mask
        if mask is not None:
            mask = tf.cast(mask, attn_score.dtype)
            attn_score += (1.0 - mask) * -1e9

        attn_weights = tf.nn.softmax(attn_score, axis=-1)  # softmax
        attn_weights = self.dropout_attn(attn_weights, training=training)  # dropout

        attn_output = tf.matmul(attn_weights, v)  # (B, H, L, d)

        # concat heads: (B, L, D)
        attn_output = tf.transpose(attn_output, [0, 2, 1, 3])
        attn_output = tf.reshape(attn_output, [batch_size, seq_len, self.d_model])

        output = self.dense_out(attn_output)  # output proj
        output = self.dropout_out(output, training=training)  # dropout
        return output

# ==================== Feed Forward Module ====================
class FeedForwardModule(layers.Layer):
    """FFN with 4x expansion and swish"""
    def __init__(self, d_model, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        self.layer_norm = layers.LayerNormalization(epsilon=1e-6)  # pre-norm
        self.dense1 = layers.Dense(self.d_model * 4)  # expand 4x
        self.swish = layers.Activation('swish')  # swish
        self.dropout1 = layers.Dropout(self.dropout_rate)  # dropout
        self.dense2 = layers.Dense(self.d_model)  # project back
        self.dropout2 = layers.Dropout(self.dropout_rate)  # dropout

    def call(self, x, training=False):
        x_norm = self.layer_norm(x)
        x = self.dense1(x_norm)
        x = self.swish(x)
        x = self.dropout1(x, training=training)
        x = self.dense2(x)
        x = self.dropout2(x, training=training)
        return x

# ==================== Convolution Module ====================
class ConvolutionModule(layers.Layer):
    """Conv module with GLU + depthwise conv"""
    def __init__(self, d_model, kernel_size=32, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.kernel_size = kernel_size
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        self.layer_norm = layers.LayerNormalization(epsilon=1e-6)  # pre-norm
        self.pointwise_conv1 = layers.Conv1D(self.d_model * 2, kernel_size=1)  # expand
        self.glu = GLU()  # GLU gate
        self.depthwise_conv = DepthwiseConv1D(kernel_size=self.kernel_size, padding='same')  # depthwise
        self.batch_norm = layers.BatchNormalization()  # batch norm
        self.swish = layers.Activation('swish')  # swish
        self.pointwise_conv2 = layers.Conv1D(self.d_model, kernel_size=1)  # project back
        self.dropout = layers.Dropout(self.dropout_rate)  # dropout

    def call(self, x, training=False):
        x_norm = self.layer_norm(x)
        x = self.pointwise_conv1(x_norm)
        x = self.glu(x)
        x = self.depthwise_conv(x)
        x = self.batch_norm(x, training=training)
        x = self.swish(x)
        x = self.pointwise_conv2(x)
        x = self.dropout(x, training=training)
        return x

# ==================== Conformer Block ====================
class ConformerBlock(layers.Layer):
    """FFN(0.5) -> MHSA -> Conv -> FFN(0.5) -> LN"""
    def __init__(self, d_model, num_heads, kernel_size=32, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.ffn1 = FeedForwardModule(d_model, dropout_rate)
        self.mhsa = TransformerXLMultiHeadAttention(d_model, num_heads, dropout_rate)
        self.conv = ConvolutionModule(d_model, kernel_size, dropout_rate)
        self.ffn2 = FeedForwardModule(d_model, dropout_rate)
        self.layer_norm = layers.LayerNormalization(epsilon=1e-6)

    def call(self, x, mask=None, training=False):
        x = x + 0.5 * self.ffn1(x, training=training)
        x = x + self.mhsa(x, mask=mask, training=training)
        x = x + self.conv(x, training=training)
        x = x + 0.5 * self.ffn2(x, training=training)
        return self.layer_norm(x)

# ==================== Conformer Encoder (Vision Adaptation) ====================
class ConformerEncoder(keras.Model):
    """Vision-style Conformer encoder for binary classification"""
    def __init__(self, input_shape_tuple, num_blocks=4, d_model=128, num_heads=4, kernel_size=32, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.input_shape_tuple = input_shape_tuple
        self.num_blocks = num_blocks
        self.d_model = d_model

        # Conv subsampling (same architecture)
        self.conv_subsample = keras.Sequential([
            layers.Conv2D(d_model // 2, kernel_size=3, strides=2, padding='same'),
            layers.ReLU(),
            layers.Conv2D(d_model, kernel_size=3, strides=2, padding='same'),
            layers.ReLU(),
        ])

        self.conformer_blocks = [
            ConformerBlock(d_model, num_heads, kernel_size, dropout_rate)
            for _ in range(num_blocks)
        ]

        self.global_pool = layers.GlobalAveragePooling1D()
        self.final_dropout = layers.Dropout(dropout_rate)
        self.classifier = layers.Dense(1, activation='sigmoid')

    def call(self, x, training=False):
        x = self.conv_subsample(x, training=training)

        batch_size = tf.shape(x)[0]
        height = tf.shape(x)[1]
        width = tf.shape(x)[2]

        x = tf.reshape(x, [batch_size, height * width, self.d_model])

        for block in self.conformer_blocks:
            x = block(x, mask=None, training=training)

        x = self.global_pool(x)
        x = self.final_dropout(x, training=training)
        return self.classifier(x)

# ==================== Model Creation ====================
def create_conformer_model(input_shape=(127, 100, 1)):
    """Create and compile Conformer model (same architecture)"""
    model = ConformerEncoder(
        input_shape_tuple=input_shape,
        num_blocks=4,
        d_model=128,
        num_heads=4,
        kernel_size=32,
        dropout_rate=0.1
    )

    dummy = tf.zeros((1,) + input_shape)  # dummy input
    _ = model(dummy, training=False)  # build model

    model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),  # optimizer
    loss='binary_crossentropy',  # loss
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5)]  # metric
)


    return model

print("[STEP 9] ✅ Conformer model code ready.")
print("[STEP 9] Example: model = create_conformer_model(input_shape=(127, 100, 1))")


[STEP 9] Loading Conformer model code (unchanged)...
[STEP 9] ✅ Conformer model code ready.
[STEP 9] Example: model = create_conformer_model(input_shape=(127, 100, 1))


Cell 10 — 5-Fold CV + train-only normalization + SMOTE + EarlyStopping + LR scheduling

In [12]:
print("[STEP 6] Starting cross-validation...")

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

fold_metrics = []
fold_conf_mats = []

# Input shape for Conformer (H,W,C) = (127, 100, 1)
input_shape = (X_all.shape[1], X_all.shape[2], X_all.shape[3])
print("[STEP 6] Model input shape:", input_shape)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_all, y_all), start=1):
    print("\n" + "=" * 70)
    print(f"[FOLD {fold}/{n_splits}] Splitting data...")

    # Split
    X_train_raw = X_all[train_idx].astype(np.float32, copy=False)
    y_train_raw = y_all[train_idx].astype(np.int32, copy=False)
    X_val_raw = X_all[val_idx].astype(np.float32, copy=False)
    y_val_raw = y_all[val_idx].astype(np.int32, copy=False)

    print(f"[FOLD {fold}] Train size: {X_train_raw.shape[0]} | Val size: {X_val_raw.shape[0]}")
    print(f"[FOLD {fold}] Train class count:", np.unique(y_train_raw, return_counts=True))
    print(f"[FOLD {fold}] Val class count  :", np.unique(y_val_raw, return_counts=True))

    # -------- Normalization (train only) --------
    print(f"[FOLD {fold}] Normalizing (train stats only)...")

    eps = 1e-6
    train_mean = np.mean(X_train_raw, axis=(0, 2, 3), keepdims=True).astype(np.float32)
    train_std = np.std(X_train_raw, axis=(0, 2, 3), keepdims=True).astype(np.float32)
    train_std = np.maximum(train_std, eps)

    X_train_norm = (X_train_raw - train_mean) / train_std
    X_val_norm = (X_val_raw - train_mean) / train_std

    print(f"[FOLD {fold}] Normalization done ✅")

    # -------- SMOTE (train only) --------
    print(f"[FOLD {fold}] Applying SMOTE (train only)...")

    X_train_2d = X_train_norm.reshape(X_train_norm.shape[0], -1)
    unique_before, counts_before = np.unique(y_train_raw, return_counts=True)
    minority_n = int(np.min(counts_before))

    if minority_n >= 2:
        k_neighbors = max(1, min(5, minority_n - 1))
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_bal_2d, y_bal = smote.fit_resample(X_train_2d, y_train_raw)
        print(f"[FOLD {fold}] SMOTE applied ✅ (k={k_neighbors})")
    else:
        ros = RandomOverSampler(random_state=42)
        X_bal_2d, y_bal = ros.fit_resample(X_train_2d, y_train_raw)
        print(f"[FOLD {fold}] SMOTE not possible → used RandomOverSampler ⚠️")

    X_train_bal = X_bal_2d.reshape(-1, *input_shape).astype(np.float32)
    y_train_bal = y_bal.astype(np.float32)

    print(f"[FOLD {fold}] Train balanced shape:", X_train_bal.shape)
    print(f"[FOLD {fold}] Train balanced counts:", np.unique(y_train_bal, return_counts=True))

    # -------- Build model --------
    print(f"[FOLD {fold}] Building Conformer model...")

    model = create_conformer_model(input_shape=input_shape)  # SAME ARCH, only input shape changes

    print(f"[FOLD {fold}] Model built ✅")

    # Callbacks
    early_stop = EarlyStopping(
        monitor="val_loss",
        patience=15,
        restore_best_weights=True,
        verbose=1
    )
    lr_plateau = ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=5,
        min_lr=1e-6,
        verbose=1
    )
    ckpt = ModelCheckpoint(
        filepath=f"ad_fold_{fold:02d}_best.keras",
        monitor="val_loss",
        save_best_only=True,
        verbose=1
    )
    csv_logger = CSVLogger(f"ad_fold_{fold:02d}_log.csv", append=False)

    callbacks = [early_stop, lr_plateau, ckpt, csv_logger]

    # Train
    print(f"[FOLD {fold}] Training started...")
    history = model.fit(
        X_train_bal, y_train_bal,
        validation_data=(X_val_norm, y_val_raw.astype(np.float32)),
        epochs=200,
        batch_size=16,
        shuffle=True,
        verbose=1,
        callbacks=callbacks
    )
    print(f"[FOLD {fold}] Training finished ✅")

    # Evaluate
    print(f"[FOLD {fold}] Evaluating...")
    val_loss, val_acc, val_auc = model.evaluate(X_val_norm, y_val_raw.astype(np.float32), verbose=1)
    print(f"[FOLD {fold}] val_loss={val_loss:.4f} | val_acc={val_acc:.4f} | val_auc={val_auc:.4f}")

    # Predictions
    y_prob = model.predict(X_val_norm, verbose=1).reshape(-1)
    y_pred = (y_prob > 0.5).astype(int)

    # Confusion matrix
    cm = confusion_matrix(y_val_raw, y_pred)
    fold_conf_mats.append(cm)
    print(f"[FOLD {fold}] Confusion matrix:\n{cm}")

    # Metrics from CM
    TN, FP, FN, TP = cm.ravel()
    accuracy = (TP + TN) / max((TP + TN + FP + FN), 1)
    precision = TP / max((TP + FP), 1)
    recall = TP / max((TP + FN), 1)
    specificity = TN / max((TN + FP), 1)
    f1 = (2 * precision * recall) / max((precision + recall), 1e-12)

    try:
        auc_val = float(roc_auc_score(y_val_raw, y_prob))
    except Exception:
        auc_val = float("nan")

    fold_metrics.append({
        "fold": fold,
        "val_loss": float(val_loss),
        "val_acc": float(val_acc),
        "auc": float(auc_val),
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "specificity": float(specificity),
        "f1": float(f1)
    })

    # Plot CM
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"Fold {fold} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.show()

print("\n" + "=" * 70)
print("[STEP 6] Cross-validation finished ✅")

# Summaries
def safe_mean(vals):
    return float(np.nanmean(np.array(vals, dtype=float)))

def safe_std(vals):
    return float(np.nanstd(np.array(vals, dtype=float)))

avg = {k: safe_mean([m[k] for m in fold_metrics]) for k in fold_metrics[0].keys() if k != "fold"}
std = {k: safe_std([m[k] for m in fold_metrics]) for k in avg.keys()}

print("\n[FINAL] Average CV metrics (mean ± std):")
for k in avg:
    print(f"  {k:12s}: {avg[k]:.4f} ± {std[k]:.4f}")

cm_sum = np.sum(np.stack(fold_conf_mats, axis=0), axis=0)
print("\n[FINAL] Summed confusion matrix across folds:")
print(cm_sum)

plt.figure(figsize=(5, 4))
sns.heatmap(cm_sum, annot=True, fmt="d", cmap="Blues")
plt.title("Summed Confusion Matrix (All Folds)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()


[STEP 6] Starting cross-validation...
[STEP 6] Model input shape: (127, 100, 1)

[FOLD 1/5] Splitting data...
[FOLD 1] Train size: 32000 | Val size: 8000
[FOLD 1] Train class count: (array([0, 1], dtype=int32), array([16000, 16000]))
[FOLD 1] Val class count  : (array([0, 1], dtype=int32), array([4000, 4000]))
[FOLD 1] Normalizing (train stats only)...
[FOLD 1] Normalization done ✅
[FOLD 1] Applying SMOTE (train only)...
[FOLD 1] SMOTE applied ✅ (k=5)
[FOLD 1] Train balanced shape: (32000, 127, 100, 1)
[FOLD 1] Train balanced counts: (array([0., 1.], dtype=float32), array([16000, 16000]))
[FOLD 1] Building Conformer model...
[FOLD 1] Model built ✅
[FOLD 1] Training started...
Epoch 1/200
[1m   1/2000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m20:14:58[0m 36s/step - accuracy: 0.4375 - loss: 0.7833[1m   2/2000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:18:02[0m 10s/step - accuracy: 0.5156 - loss: 0.8604 

KeyboardInterrupt: 