In [19]:
from __future__ import annotations
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
from dataclasses import dataclass
from typing import List, Tuple, Optional, Dict, Any

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix

data_dir = '../data/'
model_dir = '../checkpoints/'

In [22]:
@dataclass(frozen=True)
class FeatureSpec:
    channel: int
    shapelet_idx: int
    shapelet_len: int


def _zscore(x: np.ndarray, axis=None, eps: float = 1e-8) -> np.ndarray:
    m = x.mean(axis=axis, keepdims=True)
    s = x.std(axis=axis, keepdims=True)
    return (x - m) / (s + eps)


def min_sliding_distance(
    x: np.ndarray,
    s: np.ndarray,
    per_window_z: bool = False,
    eps: float = 1e-8,
) -> float:
    """
    Minimal Euclidean distance between a 1D time series x (len T)
    and a shapelet s (len L), computed over all T-L+1 windows.

    If per_window_z=True, each window and the shapelet are z-normalized
    before computing distances (useful for amplitude/offset invariance).
    """
    T = x.shape[0]
    L = s.shape[0]
    if L > T:
        return np.inf  # cannot slide shapelet longer than the series

    # rolling windows view: shape (T-L+1, L)
    W = sliding_window_view(x, L)

    if per_window_z:
        # z-normalize each window (row-wise) and the shapelet once
        Wn = _zscore(W, axis=1, eps=eps)
        sn = _zscore(s, axis=0, eps=eps)
        # broadcast subtract → (T-L+1, L)
        d2 = np.square(Wn - sn).sum(axis=1)
    else:
        d2 = np.square(W - s).sum(axis=1)

    return float(np.sqrt(d2.min()))


def build_shapelet_features(
    X: np.ndarray,                        # (n_samples, n_channels, n_timepoints)
    shapelets: List[List[np.ndarray]],    # list over channels -> list of 1D arrays
    *,
    per_window_z: bool = True,
    global_channel_z: bool = False,
    verbose: bool = True,
) -> Tuple[np.ndarray, List[FeatureSpec]]:
    """
    Returns:
      F: (n_samples, n_features) shapelet-distance features
      specs: list mapping feature index -> (channel, shapelet_idx, shapelet_len)
    """
    n_samples, n_channels, n_time = X.shape
    assert len(shapelets) == n_channels, "Provide shapelets per channel."

    # Optional: z-normalize each channel globally per sample (not window-wise).
    # This de-biases scale differences before the per-window computation.
    Xn = X.copy()
    if global_channel_z:
        for i in range(n_samples):
            for c in range(n_channels):
                Xn[i, c] = _zscore(Xn[i, c])

    # Precompute total features and a spec map
    specs: List[FeatureSpec] = []
    for c in range(n_channels):
        for j, shp in enumerate(shapelets[c]):
            specs.append(FeatureSpec(channel=c, shapelet_idx=j, shapelet_len=len(shp)))
    n_features = len(specs)

    if verbose:
        total_shapelets = sum(len(lst) for lst in shapelets)
        print(f"[build] samples={n_samples}, channels={n_channels}, time={n_time}")
        print(f"[build] total shapelets={total_shapelets}, features={n_features}")
        if per_window_z:
            print("[build] distance = min Euclidean on z-scored windows")
        elif global_channel_z:
            print("[build] distance = min Euclidean (channels globally z-scored)")
        else:
            print("[build] distance = min Euclidean (raw)")

    F = np.empty((n_samples, n_features), dtype=np.float32)

    # Compute features
    # Feature index traverses channel-major then shapelet index
    f_idx = 0
    for c in range(n_channels):
        S_c = shapelets[c]
        if verbose:
            print(f"[build] channel {c}: {len(S_c)} shapelets")
        for j, shp in enumerate(S_c):
            # ensure 1D float array
            s = np.asarray(shp, dtype=float).ravel()
            for i in range(n_samples):
                x = np.asarray(Xn[i, c], dtype=float).ravel()
                F[i, f_idx] = min_sliding_distance(x, s, per_window_z=per_window_z)
            f_idx += 1

    return F, specs


def summarize_top_features(
    importances: np.ndarray,
    specs: List[FeatureSpec],
    k: int = 20,
    title: str = "Top features",
) -> List[Dict[str, Any]]:
    idx = np.argsort(importances)[::-1]  # descending
    out = []
    for r in idx[:k]:
        spec = specs[r]
        out.append({
            "rank": len(out) + 1,
            "feature_index": int(r),
            "channel": spec.channel,
            "shapelet_idx": spec.shapelet_idx,
            "shapelet_len": spec.shapelet_len,
            "importance": float(importances[r]),
        })
    print(f"\n{title} (top {min(k, len(importances))}):")
    for row in out:
        print(
            f"#{row['rank']:>2}  f={row['feature_index']:>4}  "
            f"[ch={row['channel']}, shp={row['shapelet_idx']}, L={row['shapelet_len']}]  "
            f"imp={row['importance']:.6f}"
        )
    return out

def holdout_eval(model, F, y, test_size=0.25, random_state=0):
    X_tr, X_te, y_tr, y_te = train_test_split(
        F, y, test_size=test_size, stratify=y, random_state=random_state
    )
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    acc = accuracy_score(y_te, y_pred)

    # Optional extras
    try:
        auc = roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])
    except Exception:
        auc = None
    f1 = f1_score(y_te, y_pred)
    cm = confusion_matrix(y_te, y_pred)

    print(f"[holdout] accuracy={acc:.3f} | f1={f1:.3f} | auc={auc if auc is not None else 'n/a'}")
    print("[holdout] confusion matrix:\n", cm)
    return {"accuracy": acc, "f1": f1, "auc": auc, "cm": cm}


def demo_train_and_rank(
    X: np.ndarray,
    y: np.ndarray,
    shapelets: List[List[np.ndarray]],
    *,
    per_window_z: bool = True,
    global_channel_z: bool = False,
    random_state: int = 0,
    n_perm_repeats: int = 10,
    use_random_forest: bool = True,
    rf_kwargs: Optional[dict] = None,
) -> Dict[str, Any]:
    """
    Build features, fit models, and compute feature importance.
    Returns a dict with features, mapping, models, and importance arrays.
    """
    # 1) Build features
    F, specs = build_shapelet_features(
        X, shapelets,
        per_window_z=per_window_z,
        global_channel_z=global_channel_z,
        verbose=True,
    )

    results: Dict[str, Any] = {"F": F, "specs": specs}

    # 2) Model A: Logistic Regression with L1 (sparse, interpretable weights)
    logi = make_pipeline(
        StandardScaler(with_mean=True, with_std=True),
        LogisticRegressionCV(
            Cs=20,
            penalty="l1",
            solver="liblinear",
            scoring="roc_auc",
            cv=5,
            max_iter=2000,
            n_jobs=None,
            random_state=random_state,
            refit=True,
        )
    )
    logi.fit(F, y)
    results["logistic_pipeline"] = logi

    # Extract absolute coefficients as a crude importance
    lr = logi.named_steps["logisticregressioncv"]
    # coef_ shape (1, n_features) for binary → flatten
    coef_abs = np.abs(lr.coef_.ravel())
    results["coef_abs"] = coef_abs
    summarize_top_features(coef_abs, specs, k=20, title="L1-LogReg | |coef|")

    # 3) Model B (optional): Random Forest + impurity importance
    if use_random_forest:
        rf = RandomForestClassifier(
            n_estimators=400,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=random_state,
            n_jobs=-1,
            **(rf_kwargs or {})
        )
        rf.fit(F, y)
        results["rf"] = rf
        rf_imp = rf.feature_importances_
        results["rf_importance"] = rf_imp
        summarize_top_features(rf_imp, specs, k=20, title="RandomForest | impurity importance")

    # 4) Permutation importance (model-agnostic). Use the better of the two models by AUC.
    # Compute quick train AUCs to decide which model to permute on (not perfect, but simple).
    from sklearn.metrics import roc_auc_score
    auc_logi = roc_auc_score(y, logi.predict_proba(F)[:, 1])
    model_for_perm = logi
    model_name = "LogReg"
    if use_random_forest:
        auc_rf = roc_auc_score(y, rf.predict_proba(F)[:, 1])
        if auc_rf > auc_logi:
            model_for_perm = rf
            model_name = "RF"

    print(f"\n[perm] Using {model_name} for permutation importance (train AUC baseline).")
    perm = permutation_importance(
        model_for_perm, F, y,
        scoring="roc_auc",
        n_repeats=n_perm_repeats,
        random_state=random_state,
        n_jobs=-1,
    )
    perm_mean = perm.importances_mean
    perm_std = perm.importances_std
    results["perm_mean"] = perm_mean
    results["perm_std"] = perm_std
    summarize_top_features(perm_mean, specs, k=20, title="Permutation importance | mean ΔAUC")

    eval_res = holdout_eval(logi, F, y, test_size=0.25, random_state=0)

    return results, eval_res

In [4]:
import torch

import sys, os
sys.path.insert(0, os.path.abspath(".."))

from models.Shapelet import ShapeBottleneckModel
from data_provider.data_loader import UEAloader

In [18]:
root_path = data_dir + 'FingerMovements'
data = UEAloader(root_path, flag="TRAIN")

@dataclass
class Config:
    epsilon: float = 1.0
    distance_func: str = 'euclidean'
    memory_efficient: bool = True
    seq_len: int = 50
    enc_in: int = 28
    num_class: int = 2
    pool: str = 'max'
    sbm_cls: str = 'linear'
    dropout: float = 0.0
    lambda_div: float = 0.1
    lambda_reg: float = 0.1

config = Config()

path_large = model_dir + 'SBM/FingerMovements/dnn-FCN_seed-0_k-10_div-0.1_reg-0.1_eps-1.0_beta-constant_dfunc-euclidean_cls-linear'
                            
model_large =  ShapeBottleneckModel(num_shapelet=[10, 10, 10, 10, 10, 10], shapelet_len=[0.05, 0.1, 0.2, 0.3, 0.5, 0.8], pool='max', configs=config).eval()
model_large.load_state_dict(torch.load(f"{path_large}/checkpoint.pth"))

shapelets = [[] for i in range(28)]
for i in range(5):
    sh = model_large.shapelets[i].weights
    for j in range(10):
        for ch in range(28):
            shapelets[ch].append(sh[j, ch, :].cpu().detach().numpy())

316


In [23]:
X = np.array([x[0].numpy().T for x in data])
y = np.array([x[1].item() for x in data])

_ = demo_train_and_rank(
    X, y, shapelets,
    per_window_z=False,         # z-normalize each window + shapelet (recommended)
    global_channel_z=False,    # or True, if you want an extra global channel z-norm
    n_perm_repeats=8,
    use_random_forest=False,
)

[build] samples=316, channels=28, time=50
[build] total shapelets=1400, features=1400
[build] distance = min Euclidean (raw)
[build] channel 0: 50 shapelets
[build] channel 1: 50 shapelets
[build] channel 2: 50 shapelets
[build] channel 3: 50 shapelets
[build] channel 4: 50 shapelets
[build] channel 5: 50 shapelets
[build] channel 6: 50 shapelets
[build] channel 7: 50 shapelets
[build] channel 8: 50 shapelets
[build] channel 9: 50 shapelets
[build] channel 10: 50 shapelets
[build] channel 11: 50 shapelets
[build] channel 12: 50 shapelets
[build] channel 13: 50 shapelets
[build] channel 14: 50 shapelets
[build] channel 15: 50 shapelets
[build] channel 16: 50 shapelets
[build] channel 17: 50 shapelets
[build] channel 18: 50 shapelets
[build] channel 19: 50 shapelets
[build] channel 20: 50 shapelets
[build] channel 21: 50 shapelets
[build] channel 22: 50 shapelets
[build] channel 23: 50 shapelets
[build] channel 24: 50 shapelets
[build] channel 25: 50 shapelets
[build] channel 26: 50 shap

In [25]:
root_path = data_dir + 'SelfRegulationSCP1'
data = UEAloader(root_path, flag="TRAIN")

@dataclass
class Config:
    epsilon: float = 1.0
    distance_func: str = 'euclidean'
    memory_efficient: bool = True
    seq_len: int = 896
    enc_in: int = 6
    num_class: int = 2
    pool: str = 'max'
    sbm_cls: str = 'linear'
    dropout: float = 0.0
    lambda_div: float = 0.1
    lambda_reg: float = 0.1

config = Config()

path_large = model_dir + 'SBM/SelfRegulationSCP1/dnn-FCN_seed-0_k-10_div-0.1_reg-0.1_eps-1.0_beta-constant_dfunc-euclidean_cls-linear'
                            
model_large =  ShapeBottleneckModel(num_shapelet=[10, 10, 10, 10, 10, 10], shapelet_len=[0.05, 0.1, 0.2, 0.3, 0.5, 0.8], pool='max', configs=config).eval()
model_large.load_state_dict(torch.load(f"{path_large}/checkpoint.pth"))

shapelets = [[] for i in range(6)]
for i in range(5):
    sh = model_large.shapelets[i].weights
    for j in range(10):
        for ch in range(6):
            shapelets[ch].append(sh[j, ch, :].cpu().detach().numpy())

X = np.array([x[0].numpy().T for x in data])
y = np.array([x[1].item() for x in data])

_ = demo_train_and_rank(
    X, y, shapelets,
    per_window_z=False,         # z-normalize each window + shapelet (recommended)
    global_channel_z=False,    # or True, if you want an extra global channel z-norm
    n_perm_repeats=8,
    use_random_forest=False,
)

268
[build] samples=268, channels=6, time=896
[build] total shapelets=300, features=300
[build] distance = min Euclidean (raw)
[build] channel 0: 50 shapelets
[build] channel 1: 50 shapelets
[build] channel 2: 50 shapelets
[build] channel 3: 50 shapelets
[build] channel 4: 50 shapelets
[build] channel 5: 50 shapelets

L1-LogReg | |coef| (top 20):
# 1  f= 297  [ch=5, shp=47, L=448]  imp=7.230367
# 2  f= 258  [ch=5, shp=8, L=45]  imp=3.856688
# 3  f= 150  [ch=3, shp=0, L=45]  imp=2.672370
# 4  f= 254  [ch=5, shp=4, L=45]  imp=2.660120
# 5  f=  51  [ch=1, shp=1, L=45]  imp=2.543900
# 6  f= 193  [ch=3, shp=43, L=448]  imp=2.314850
# 7  f= 136  [ch=2, shp=36, L=269]  imp=2.029783
# 8  f= 287  [ch=5, shp=37, L=269]  imp=1.443823
# 9  f=  11  [ch=0, shp=11, L=90]  imp=1.237195
#10  f=  37  [ch=0, shp=37, L=269]  imp=1.169815
#11  f= 206  [ch=4, shp=6, L=45]  imp=0.904482
#12  f=  26  [ch=0, shp=26, L=180]  imp=0.853962
#13  f=   4  [ch=0, shp=4, L=45]  imp=0.651980
#14  f= 158  [ch=3, shp=8, 

In [26]:
root_path = data_dir + 'SelfRegulationSCP2'
data = UEAloader(root_path, flag="TRAIN")

@dataclass
class Config:
    epsilon: float = 1.0
    distance_func: str = 'euclidean'
    memory_efficient: bool = True
    seq_len: int = 1152
    enc_in: int = 7
    num_class: int = 2
    pool: str = 'max'
    sbm_cls: str = 'linear'
    dropout: float = 0.0
    lambda_div: float = 0.1
    lambda_reg: float = 0.1

config = Config()

path_large = model_dir + 'SBM/SelfRegulationSCP2/dnn-FCN_seed-0_k-10_div-0.1_reg-0.1_eps-1.0_beta-constant_dfunc-euclidean_cls-linear'
                            
model_large =  ShapeBottleneckModel(num_shapelet=[10, 10, 10, 10, 10, 10], shapelet_len=[0.05, 0.1, 0.2, 0.3, 0.5, 0.8], pool='max', configs=config).eval()
model_large.load_state_dict(torch.load(f"{path_large}/checkpoint.pth"))

shapelets = [[] for i in range(7)]
for i in range(5):
    sh = model_large.shapelets[i].weights
    for j in range(10):
        for ch in range(7):
            shapelets[ch].append(sh[j, ch, :].cpu().detach().numpy())

X = np.array([x[0].numpy().T for x in data])
y = np.array([x[1].item() for x in data])

_ = demo_train_and_rank(
    X, y, shapelets,
    per_window_z=False,         # z-normalize each window + shapelet (recommended)
    global_channel_z=False,    # or True, if you want an extra global channel z-norm
    n_perm_repeats=8,
    use_random_forest=False,
)

200
[build] samples=200, channels=7, time=1152
[build] total shapelets=350, features=350
[build] distance = min Euclidean (raw)
[build] channel 0: 50 shapelets
[build] channel 1: 50 shapelets
[build] channel 2: 50 shapelets
[build] channel 3: 50 shapelets
[build] channel 4: 50 shapelets
[build] channel 5: 50 shapelets
[build] channel 6: 50 shapelets

L1-LogReg | |coef| (top 20):
# 1  f=   0  [ch=0, shp=0, L=58]  imp=0.000000
# 2  f= 349  [ch=6, shp=49, L=576]  imp=0.000000
# 3  f= 348  [ch=6, shp=48, L=576]  imp=0.000000
# 4  f= 347  [ch=6, shp=47, L=576]  imp=0.000000
# 5  f= 346  [ch=6, shp=46, L=576]  imp=0.000000
# 6  f= 345  [ch=6, shp=45, L=576]  imp=0.000000
# 7  f= 344  [ch=6, shp=44, L=576]  imp=0.000000
# 8  f= 343  [ch=6, shp=43, L=576]  imp=0.000000
# 9  f= 342  [ch=6, shp=42, L=576]  imp=0.000000
#10  f= 341  [ch=6, shp=41, L=576]  imp=0.000000
#11  f= 340  [ch=6, shp=40, L=576]  imp=0.000000
#12  f= 339  [ch=6, shp=39, L=346]  imp=0.000000
#13  f= 338  [ch=6, shp=38, L=34