In [None]:
# ---- 0) Setup & Imports
# List the attached GPU(s); `|| true` keeps the shell command from reporting failure
# when nvidia-smi is unavailable (e.g. on a CPU-only runtime).
!nvidia-smi -L || true
# Standard-library modules imported once here for the rest of the notebook.
import os, sys, json, math, random, io, zipfile, glob, gc, shutil, time
from pathlib import Path

# Fast IO
# Pinned versions of polars / pyarrow / datasets, installed right before `import polars`.
# NOTE(review): the recorded output of this cell shows pip dependency-conflict messages
# for this polars pin (cudf-polars) and for fsspec (gcsfs) — confirm they are acceptable.
!pip -q install polars==1.4.1 pyarrow==16.1.0 datasets==2.20.0
import polars as pl

# Torch
# torch is pinned; torchvision and torchaudio are left unpinned and are resolved
# from the CUDA 12.1 wheel index given by --index-url.
!pip -q install torch==2.3.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

GPU 0: Tesla T4 (UUID: GPU-75323e3c-b732-86a5-7635-412da39d837a)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.5/31.5 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-polars-cu12 25.6.0 requires polars<1.29,>=1.25, but you have polars 1.4.1 which is incompatible.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.5.0 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7

In [None]:
# Core
import numpy as np
import pandas as pd

In [None]:
# ---- 1) Paths
# Project root on the mounted Drive plus its three working sub-folders:
# raw downloads, intermediate files, and saved artifacts.
DRIVE_ROOT = Path('/content/drive/MyDrive/mindease_behavior')
RAW, INTER, ART = (
    DRIVE_ROOT.joinpath(folder) for folder in ('raw', 'intermediate', 'artifacts')
)
# Folder creation is intentionally left disabled in this cell:
# for p in [RAW, INTER, ART]: p.mkdir(parents=True, exist_ok=True)

In [None]:
# Locations of the raw inputs under RAW: the Aalto keystroke archive and the Balabit checkout.
AALTO_ZIP = RAW.joinpath('aalto.zip')
BALABIT_DIR = RAW.joinpath('balabit')

In [None]:
# ---- 2) Download datasets

# Aalto 136M keystrokes: direct "Data" link (zip ~1.4GB). If this fails, visit the page and right-click copy link.
# We pull the anchor the page points to; if the URL changes, open the page and copy the "Text files of the keystrokes in a .zip".
AALTO_ZIP = RAW/'aalto.zip'
if not AALTO_ZIP.exists():
    # Attempt to fetch the target zip linked on the page. The link resolves under the same domain.
    # If you get a 403 due to hotlinking, open the page, copy the zip URL, and paste here.
    !wget -O "{AALTO_ZIP}" "https://userinterfaces.aalto.fi/136Mkeystrokes/data/Keystrokes.zip"

# Balabit mouse dynamics: clone repository (small)
BALABIT_DIR = RAW/'balabit'
if not BALABIT_DIR.exists():
    !git clone https://github.com/balabit/Mouse-Dynamics-Challenge "{BALABIT_DIR}"

print("Downloads present:")
print("Aalto zip exists:", AALTO_ZIP.exists(), AALTO_ZIP.stat().st_size if AALTO_ZIP.exists() else 0)
print("Balabit folder exists:", BALABIT_DIR.exists())

/content/drive/MyDrive/mindease_behavior/raw/aalto_136m.zip: No such file or directory
Cloning into '/content/drive/MyDrive/mindease_behavior/raw/balabit'...
remote: Enumerating objects: 1711, done.[K
remote: Total 1711 (delta 0), reused 0 (delta 0), pack-reused 1711 (from 1)[K
Receiving objects: 100% (1711/1711), 42.60 MiB | 11.68 MiB/s, done.
Resolving deltas: 100% (3/3), done.
Updating files: 100% (1678/1678), done.
Downloads present:
Aalto zip exists: False 0
Balabit folder exists: True


In [None]:
# Folder that receives the extracted Aalto archive.
AALTO_DIR = RAW.joinpath('aalto')

In [None]:
# ---- 3) Unpack Aalto zip (large!) → to RAW/aalto/
AALTO_DIR = RAW/'aalto'
if not AALTO_DIR.exists():
    # Check the archive first: creating AALTO_DIR before a failing open would leave
    # an empty folder behind, and every later run would then skip extraction.
    if not AALTO_ZIP.exists():
        raise FileNotFoundError(f"Aalto archive not found: {AALTO_ZIP} (run the download cell first)")
    AALTO_DIR.mkdir(parents=True, exist_ok=True)
    try:
        with zipfile.ZipFile(AALTO_ZIP, 'r') as zf:
            zf.extractall(AALTO_DIR)
    except BaseException:
        # Corrupt archive or interrupted extraction: remove the partial folder so the
        # `exists()` guard above does not treat it as a completed unpack next time.
        shutil.rmtree(AALTO_DIR, ignore_errors=True)
        raise

In [None]:
# ---- 4) Preprocessing configs
# Single dictionary holding the windowing parameters for both modalities plus the
# training hyper-parameters; it is persisted to ART/config.json below.
CFG = {
    "keystroke": {
        "window_sec": 30,
        "stride_sec": 15,
        "max_users": 30,        # reduce if RAM/bandwidth tight; increase if you can
        "max_sessions_per_user": 6,  # cap per-user sessions from Aalto
        "min_events_per_window": 20  # skip super-sparse windows
    },
    "mouse": {
        "window_sec": 30,
        "stride_sec": 15,
        "min_events_per_window": 30
    },
    "embedding_dim": 96,
    "batch_size": 256,
    "epochs": 6,               # small for a one-week sprint
    "lr": 2e-3,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
}
# The folder-creation line in the paths cell is commented out, so make sure the
# artifacts folder exists before writing into it.
ART.mkdir(parents=True, exist_ok=True)
with open(ART/'config.json', 'w') as f:
    json.dump(CFG, f, indent=2)
print("CFG:", CFG)


CFG: {'keystroke': {'window_sec': 30, 'stride_sec': 15, 'max_users': 30, 'max_sessions_per_user': 6, 'min_events_per_window': 20}, 'mouse': {'window_sec': 30, 'stride_sec': 15, 'min_events_per_window': 30}, 'embedding_dim': 96, 'batch_size': 256, 'epochs': 6, 'lr': 0.002, 'device': 'cuda'}


In [None]:
# Output folder alias and the two candidate sample-list files looked up in INTER.
OUT_DIR = INTER
BLEND_LIST = INTER.joinpath("keystroke_sample_files_blend.txt")
SIZE_LIST = INTER.joinpath('keystroke_sample_files_by_size.txt')

In [None]:
# Pick the first sample list that exists (blend list preferred over size list);
# stop the cell with a readable message when neither is present.
for _candidate in (BLEND_LIST, SIZE_LIST):
    if _candidate.exists():
        SAMPLE_LIST = _candidate
        break
else:
    raise SystemExit("No sample list found in INTER. Expected one of:\n"
                     f"- {BLEND_LIST}\n- {SIZE_LIST}")

# One path per line; blank lines are ignored and surrounding whitespace is removed.
with open(SAMPLE_LIST, 'r') as f:
    aalto_files = [entry for entry in map(str.strip, f) if entry]

print(f"Using sample list: {SAMPLE_LIST.name}  |  files: {len(aalto_files):,}")

Using sample list: keystroke_sample_files_blend.txt  |  files: 1,000


In [None]:
# Fill in keystroke window parameters only where CFG does not define them yet;
# values that are already present are left untouched.
_ks_cfg = CFG.setdefault("keystroke", {})
for _param, _fallback in (
    ("window_sec", 10),
    ("stride_sec", 5),
    ("min_events_per_window", 8),
):
    _ks_cfg.setdefault(_param, _fallback)
print(f"Window params → window={CFG['keystroke']['window_sec']}s, "
      f"stride={CFG['keystroke']['stride_sec']}s, "
      f"min_events={CFG['keystroke']['min_events_per_window']}")

Window params → window=30s, stride=15s, min_events=20


In [None]:
# =========================
# Keystroke windowization (Aalto .txt, concat per user, lenient windows)
# =========================
# --- Helper functions (only define if missing in your notebook) ---
def _defined(name: str) -> bool:
    """Return True when ``name`` is bound to a callable in this module's globals."""
    namespace = globals()
    if name not in namespace:
        return False
    return callable(namespace[name])

if not _defined('parse_keystroke_file_aalto_txt'):
    def parse_keystroke_file_aalto_txt(path: str) -> pl.DataFrame | None:
        """Read one tab-separated Aalto keystroke file into a tidy frame.

        Headers are lower-cased and stripped. The file must provide
        participant_id, test_section_id, press_time and release_time.
        Both time columns are divided by 1000 (the original docstring states the
        result is in seconds).

        Returns a frame with columns ['user', 'session', 'press_time', 'dwell'],
        keeping only rows with non-null values and a dwell within [0, 5], or
        None when the file cannot be read, lacks a required column, or has no
        usable rows.
        """
        try:
            raw = pl.read_csv(
                path,
                separator='\t',
                has_header=True,
                infer_schema_length=10000,
                truncate_ragged_lines=True,
                ignore_errors=True,
                low_memory=True,
            )
        except Exception:
            return None

        raw = raw.rename({name: name.lower().strip() for name in raw.columns})
        required = ('participant_id', 'test_section_id', 'press_time', 'release_time')
        if not all(name in raw.columns for name in required):
            return None

        # Shared expressions: the raw press/release stamps as floats.
        press_raw = pl.col('press_time').cast(pl.Float64)
        release_raw = pl.col('release_time').cast(pl.Float64)
        events = raw.select([
            pl.col('participant_id').cast(pl.Utf8).alias('user'),
            pl.col('test_section_id').cast(pl.Utf8).alias('session'),
            (press_raw / 1000.0).alias('press_time'),
            ((release_raw - press_raw) / 1000.0).alias('dwell'),
        ])

        is_usable = (
            pl.col('press_time').is_not_null()
            & pl.col('dwell').is_not_null()
            & (pl.col('dwell') >= 0.0)
            & (pl.col('dwell') <= 5.0)
        )
        events = events.filter(is_usable)
        if events.is_empty():
            return None
        return events

if not _defined('concat_per_user_keystrokes'):
    def concat_per_user_keystrokes(frames: list[pl.DataFrame]) -> pd.DataFrame:
        """Chain each user's sessions into one timeline; returns pandas ['user','t','dwell'].

        Every session is rebased so that its first key press sits at the running
        offset, and the next session starts 1 second after the last press of the
        previous one. Without the rebase the raw ``press_time`` values would be
        added on top of the offset, so the gap between sessions would be the
        sessions' absolute time stamps rather than the intended 1 second.

        Parameters
        ----------
        frames : list of polars frames with columns
            ['user', 'session', 'press_time', 'dwell'].

        Returns
        -------
        pandas.DataFrame with columns ['user', 't', 'dwell']; empty (same
        columns) when there is nothing to concatenate. Users appear in order of
        first occurrence; sessions are visited in sorted session-id order.
        """
        empty = pd.DataFrame(columns=['user','t','dwell'])
        if not frames:
            return empty

        pdf = (
            pl.concat(frames, how='vertical')
            .select(['user','session','press_time','dwell'])
            .to_pandas()
        )
        # Drop unusable rows up front: a NaN press time inside a session would
        # otherwise turn the running offset into NaN for all later sessions.
        pdf = pdf.dropna(subset=['press_time','dwell'])

        pieces = []
        for u, g in pdf.groupby('user', sort=False):
            t_offset = 0.0
            for _sid, seg in g.groupby('session'):
                seg = seg.sort_values('press_time')
                press = seg['press_time'].to_numpy(dtype=np.float64)
                if press.size == 0:
                    continue
                # Session-relative time (first press at 0) shifted by the running offset.
                t = (press - press[0]) + t_offset
                dwell = seg['dwell'].to_numpy(dtype=np.float32)
                pieces.append(pd.DataFrame({'user': u, 't': t, 'dwell': dwell}))
                t_offset = float(t[-1] + 1.0)  # 1s gap between sessions

        if not pieces:
            return empty
        return pd.concat(pieces, ignore_index=True)

if not _defined('make_keystroke_windows_concat'):
    def make_keystroke_windows_concat(pdf: pd.DataFrame, window_sec: int, stride_sec: int, min_events: int):
        """Slide fixed-length windows over each user's timeline.

        Parameters
        ----------
        pdf : pandas frame with columns ['user', 't', 'dwell'].
        window_sec : window length, in the same unit as ``t``.
        stride_sec : step between window starts; must be positive.
        min_events : windows with fewer events than this are skipped.

        Returns
        -------
        list of tuples (user, 'concat', t_start, t_end, seq) where ``seq`` is a
        float32 array of shape [N, 2] holding (dwell, inter-key gap) per event.
        The first event of a window gets an inter-key gap of 0.

        Raises
        ------
        ValueError if ``stride_sec`` is not positive (the window loop would
        otherwise never advance).
        """
        if stride_sec <= 0:
            raise ValueError(f"stride_sec must be positive, got {stride_sec!r}")

        out = []
        for u, g in pdf.groupby('user'):
            g = g.sort_values('t')
            # Keep time stamps in float64: float32 only carries ~7 significant
            # digits, which quantises inter-key gaps on long timelines.
            t = g['t'].to_numpy(dtype=np.float64)
            dw = g['dwell'].to_numpy(dtype=np.float32)
            if t.size < 3:
                continue
            t0, tmax = float(t.min()), float(t.max())
            start = t0
            while start + window_sec <= tmax + 1e-6:
                m = (t >= start) & (t < start + window_sec)
                if int(m.sum()) >= min_events:
                    t_sub = t[m]
                    dw_sub = dw[m]
                    ikg = np.diff(t_sub, prepend=t_sub[0])  # inter-key gap
                    # Down-cast only the final features, after the differences are taken.
                    seq = np.stack([dw_sub, ikg], axis=1).astype(np.float32)
                    out.append((u, 'concat', float(start), float(start+window_sec), seq))
                start += stride_sec
        return out

  # --- Parse ONLY the sampled files, with light progress prints ---
# Parse every file named in the sample list, keeping only non-empty results,
# and report progress after each block of 200 files.
t0 = time.time()
parsed = []
for i, fp in enumerate(aalto_files, start=1):
    dfp = parse_keystroke_file_aalto_txt(fp)
    unusable = dfp is None or dfp.is_empty()
    if not unusable:
        parsed.append(dfp)
    if i % 200:
        continue
    took = time.time() - t0
    print(f"Parsed {i}/{len(aalto_files)} files  |  parsed_ok={len(parsed)}  |  {took:.1f}s elapsed")

# Nothing usable at all means the paths or the file schema are wrong.
if len(parsed) == 0:
    raise SystemExit("Parsed 0 usable keystroke files from the sample list — check paths/schema.")

Parsed 200/1000 files  |  parsed_ok=200  |  200.7s elapsed
Parsed 400/1000 files  |  parsed_ok=400  |  271.7s elapsed
Parsed 600/1000 files  |  parsed_ok=600  |  342.7s elapsed
Parsed 800/1000 files  |  parsed_ok=800  |  412.9s elapsed
Parsed 1000/1000 files  |  parsed_ok=1000  |  488.0s elapsed


In [None]:
from pathlib import Path
import os, pandas as pd

# Rank all extracted Aalto .txt files by size on disk and save the largest
# TOPK paths (one per line) as the sample list for the batch cell.
AALTO_TXT_DIR = RAW.joinpath('aalto', 'Keystrokes', 'files')
all_files = sorted(str(txt_path) for txt_path in Path(AALTO_TXT_DIR).glob('*.txt'))


def _size_or_zero(file_path: str) -> int:
    """Size in bytes from file metadata; 0 when the file has vanished."""
    try:
        return os.stat(file_path).st_size
    except FileNotFoundError:
        return 0


# Get file sizes (metadata-only, fast)
sizes = [_size_or_zero(file_path) for file_path in all_files]

df = (
    pd.DataFrame({'file': all_files, 'bytes': sizes})
    .sort_values('bytes', ascending=False)
    .reset_index(drop=True)
)

TOPK = 10_000
top_df = df.head(TOPK)  # head() already caps at the frame length
OUT_LIST = INTER.joinpath('keystroke_sample_files_top10k.txt')
top_df['file'].to_csv(OUT_LIST, index=False, header=False)
print("Saved:", OUT_LIST, "count:", len(top_df))

Saved: /content/drive/MyDrive/mindease_behavior/intermediate/keystroke_sample_files_top10k.txt count: 10000


In [None]:
# ============================================
# FAST BATCH KEYSTROKE WINDOWIZATION (10k files)
# - Uses your precomputed lists (by_size or blend)
# - Copies by tar streaming (fewer Drive roundtrips)
# - Polars scans many files directly (no merge)
# - Original window config: 30s / 15s / min 20 events
# ============================================
import os, time, shlex, subprocess, gc
from pathlib import Path
import numpy as np
import polars as pl

# ---------- Paths ----------
DRIVE_ROOT = Path('/content/drive/MyDrive/mindease_behavior')
RAW   = DRIVE_ROOT/'raw'
INTER = DRIVE_ROOT/'intermediate'
ART   = DRIVE_ROOT/'artifacts'
for p in [RAW, INTER, ART]: p.mkdir(parents=True, exist_ok=True)

AALTO_DIR = RAW/'aalto'
LOCAL_ROOT = Path('/content/aalto_batches')   # local SSD workspace
LOCAL_ROOT.mkdir(parents=True, exist_ok=True)

# ---------- Choose which saved list to use ----------
# Alternative lists kept for reference; point SAMPLE_LIST at one of them to switch.
BLEND_LIST = INTER/'keystroke_sample_files_blend.txt'
SIZE_LIST  = INTER/'keystroke_sample_files_by_size.txt'

# This cell uses the size-ranked top-10k list (bigger files first).
SAMPLE_LIST = INTER/'keystroke_sample_files_top10k.txt'
assert SAMPLE_LIST.exists(), f"Sample list not found: {SAMPLE_LIST}"

# Load *all* file paths in the chosen list; the `with` block closes the handle,
# which the bare open() inside a comprehension never did.
with open(SAMPLE_LIST) as _list_fh:
    all_list_files = [ln.strip() for ln in _list_fh if ln.strip()]
print(f"Sample list: {SAMPLE_LIST.name}  |  total paths in list: {len(all_list_files):,}")

Sample list: keystroke_sample_files_top10k.txt  |  total paths in list: 10,000


In [None]:
# ---------- Window config (original) ----------
# Reuse CFG when an earlier cell defined it, otherwise start from an empty dict,
# then force the keystroke window parameters to the original 30s / 15s / 20 events.
CFG = globals().get('CFG', {})
CFG.setdefault('keystroke', {}).update(
    window_sec=30,
    stride_sec=15,
    min_events_per_window=20,
)
print("Window params:", CFG['keystroke'])

Window params: {'window_sec': 30, 'stride_sec': 15, 'max_users': 30, 'max_sessions_per_user': 6, 'min_events_per_window': 20}


In [None]:
# ---------- Window config (original) ----------
# Same content as the preceding cell: take CFG from the kernel if present and
# overwrite the three keystroke window parameters with the original values.
CFG = globals().get('CFG', {})
CFG.setdefault('keystroke', {})
for _param, _value in (
    ("window_sec", 30),
    ("stride_sec", 15),
    ("min_events_per_window", 20),
):
    CFG['keystroke'][_param] = _value
print("Window params:", CFG['keystroke'])

# ---------- Scale controls ----------
BATCH_SIZE  = 1_000       # local memory friendly; try 2000 if you have headroom
TOTAL_FILES = 2000      # target number of files to process from the list

# Never ask for more files than the list holds.
TOTAL_FILES = min(TOTAL_FILES, len(all_list_files))
files_to_use = all_list_files[:TOTAL_FILES]
# Ceiling division written with floor division on the negated count.
num_batches = -(-len(files_to_use) // BATCH_SIZE)
print(f"Planning {num_batches} batch(es): {len(files_to_use):,} files, batch size {BATCH_SIZE}")

# ---------- Utils ----------
def read_header_cols(path_str: str) -> list[str]:
    """Return the tab-separated fields of a file's first line ([] for an empty line).

    Only the first line is read. Undecodable bytes are dropped and the trailing
    line terminator is removed before splitting.
    """
    with open(path_str, mode='r', encoding='utf-8', errors='ignore') as handle:
        header = handle.readline()
    header = header.rstrip('\n\r')
    if not header:
        return []
    return header.split('\t')

def normalize(col: str) -> str:
    """Canonical header form: surrounding whitespace removed, then lower-cased."""
    trimmed = col.strip()
    return trimmed.lower()

def _lazy_column_names(lf) -> list[str]:
    """Column names of a LazyFrame, resolved via collect_schema() where available."""
    try:
        return list(lf.collect_schema().names())
    except AttributeError:
        # Polars versions without collect_schema(): fall back to the attribute.
        return list(lf.columns)


def _stage_batch_locally(batch_idx: int, batch_paths: list[str], batch_dir) -> list[str]:
    """Copy the batch files into ``batch_dir`` through one tar pipe.

    Returns the local copies that actually exist, in the order of ``batch_paths``.
    """
    t0 = time.time()
    filelist_txt = batch_dir / "files.txt"
    with open(filelist_txt, "w") as f:
        for p in batch_paths:
            f.write(p + "\n")

    print(f"[batch {batch_idx}] copying {len(batch_paths)} files via tar stream ...")
    subprocess.run(
        ["bash","-lc",
         f"tar -cf - -T {shlex.quote(str(filelist_txt))} | (cd {shlex.quote(str(batch_dir))} && tar -xf -)"],
        check=True
    )
    print(f"[batch {batch_idx}] copy done in {time.time()-t0:.1f}s")

    # The copies are looked up under batch_dir at the source path minus its leading '/'.
    local_paths = []
    for p in batch_paths:
        lp = batch_dir.joinpath(p.lstrip('/'))
        if lp.exists():
            local_paths.append(str(lp))
    return local_paths


def _scan_batch_files(local_paths: list[str], header_sample: int = 500):
    """Lazily scan the batch's TSV files and lower-case the headers found in them.

    Headers are sniffed from at most ``header_sample`` files. Columns whose
    normalised name is press_time/release_time are read as Float64 and
    participant_id/test_section_id as Utf8; with infer_schema_length=0 no
    dtype inference is done for the remaining columns.
    """
    observed_headers = set()
    for path in local_paths[:header_sample]:
        observed_headers.update(read_header_cols(path))

    # original header -> normalised (stripped, lower-case) header
    lower_map = {orig: normalize(orig) for orig in observed_headers}

    time_keys = {"press_time", "release_time"}
    id_keys   = {"participant_id", "test_section_id"}
    schema_overrides = {}
    for orig, low in lower_map.items():
        if low in time_keys:
            schema_overrides[orig] = pl.Float64
        elif low in id_keys:
            schema_overrides[orig] = pl.Utf8

    scan_kwargs = dict(
        separator='\t',
        has_header=True,
        infer_schema_length=0,
        ignore_errors=True,
        null_values=["", "NA", "NaN"],
        low_memory=True,
    )
    try:
        lazy = pl.scan_csv(local_paths, schema_overrides=schema_overrides, **scan_kwargs)
    except TypeError:
        # older polars uses 'dtypes' instead of 'schema_overrides'
        lazy = pl.scan_csv(local_paths, dtypes=schema_overrides, **scan_kwargs)

    # Rename only headers that exist in the scan's schema; a sniffed header
    # variant that the scan does not expose must not reach rename().
    present = set(_lazy_column_names(lazy))
    rename_map = {orig: low for orig, low in lower_map.items() if orig in present}
    return lazy.rename(rename_map)


def _window_keystrokes(lazy):
    """Derive per-event features and group them into sliding windows per (user, session).

    Window length, stride and the minimum event count come from CFG['keystroke'].
    """
    lazy = (
        lazy
        .with_columns([
            pl.col('participant_id').cast(pl.Utf8).alias('user'),
            pl.col('test_section_id').cast(pl.Utf8).alias('session'),
            (pl.col('press_time').cast(pl.Float64) / 1000.0).alias('press_s'),
            ((pl.col('release_time').cast(pl.Float64) - pl.col('press_time').cast(pl.Float64)) / 1000.0).alias('dwell_s'),
        ])
        .filter(
            pl.col('press_s').is_not_null() &
            pl.col('dwell_s').is_not_null() &
            (pl.col('dwell_s') >= 0.0) & (pl.col('dwell_s') <= 5.0)
        )
        .with_columns(
            # group_by_dynamic is given a datetime index column, so turn the
            # seconds value into a datetime counted from 1970-01-01.
            (pl.datetime(1970,1,1) + pl.duration(seconds=pl.col('press_s'))).alias('press_dt')
        )
        .select(['user','session','press_s','dwell_s','press_dt'])
        .sort(['user','session','press_dt'])
    )

    every  = f"{CFG['keystroke']['stride_sec']}s"
    period = f"{CFG['keystroke']['window_sec']}s"

    return (
        lazy
        .group_by_dynamic(
            index_column='press_dt',
            every=every,
            period=period,
            group_by=['user','session'],
            closed='left',
            label='left',
            start_by='datapoint'
        )
        .agg([
            pl.min('press_s').alias('t_start'),
            pl.max('press_s').alias('t_end'),
            pl.col('press_s').implode().alias('pt_list'),
            pl.col('dwell_s').implode().alias('dwell_list'),
            pl.len().alias('n_events'),
        ])
        .filter(pl.col('n_events') >= CFG['keystroke']['min_events_per_window'])
    )


def _build_seq_row(pt_list, dwell_list):
    """Turn one window's press times and dwells into a float32 [N, 2] array (dwell, inter-key gap).

    Non-finite entries are dropped; returns None when nothing usable remains.
    """
    pt = np.asarray(pt_list, dtype=np.float64).reshape(-1)
    dw = np.asarray(dwell_list, dtype=np.float64).reshape(-1)
    n = min(pt.size, dw.size)
    if n == 0:
        return None
    pt, dw = pt[:n], dw[:n]
    m = np.isfinite(pt) & np.isfinite(dw)
    if not np.any(m):
        return None
    pt, dw = pt[m], dw[m]
    ikg = np.diff(pt, prepend=pt[:1])  # first gap is 0
    return np.column_stack((dw.astype(np.float32), ikg.astype(np.float32)))


def process_batch(batch_idx: int, batch_paths: list[str]):
    """Windowize one batch of Aalto keystroke files and save the result as an .npz.

    Steps: copy the files to the local workspace, scan them lazily with Polars,
    build sliding windows per (user, session), pack each window into a
    (user, session, t_start, t_end, seq) tuple and write them to
    INTER/keystroke_windows_batchNNN.npz under the key 'windows'.

    The batch is skipped when its output file already exists. The local
    workspace for the batch is removed afterwards, also when the batch is
    skipped for lack of files or fails part-way.

    Parameters
    ----------
    batch_idx : index used in the workspace folder and output file names.
    batch_paths : source file paths for this batch.
    """
    import shutil  # local import: not part of this cell's own import line

    batch_dir = LOCAL_ROOT / f"batch_{batch_idx:03d}"
    out_npz   = INTER / f"keystroke_windows_batch{batch_idx:03d}.npz"
    if out_npz.exists():
        print(f"[batch {batch_idx}] already exists -> skipping")
        return

    batch_dir.mkdir(parents=True, exist_ok=True)
    try:
        # 1-2) Copy with tar stream (Drive -> local) and collect the local paths
        local_paths = _stage_batch_locally(batch_idx, batch_paths, batch_dir)
        if not local_paths:
            print(f"[batch {batch_idx}] nothing copied; skipping")
            return

        # 3-4) Header sniff + lazy scan with lower-cased headers
        lazy = _scan_batch_files(local_paths)

        req = ['participant_id','test_section_id','press_time','release_time']
        available = set(_lazy_column_names(lazy))
        missing_after_rename = [c for c in req if c not in available]
        if missing_after_rename:
            print(f"[batch {batch_idx}] Warning: some required cols absent in union schema: {missing_after_rename}")

        # 5) Build features & dynamic windows
        grouped = _window_keystrokes(lazy)
        try:
            win_df = grouped.collect(streaming=True)
        except Exception:
            # Retry with the default engine if the streaming collect fails.
            win_df = grouped.collect()
        print(f"[batch {batch_idx}] windows table rows:", win_df.shape[0])

        # 6) Pack sequences & save batch npz
        ks_windows = []
        for row in win_df.iter_rows(named=True):
            seq = _build_seq_row(row['pt_list'], row['dwell_list'])
            if seq is None or seq.shape[0] < CFG['keystroke']['min_events_per_window']:
                continue
            ks_windows.append((row['user'], row['session'], float(row['t_start']), float(row['t_end']), seq))

        # Keyword arguments of savez_compressed name the arrays to store, so only
        # 'windows' is passed. On NumPy versions whose savez_compressed has no
        # allow_pickle parameter, an extra allow_pickle=True would be written
        # into the archive as a second array.
        np.savez_compressed(out_npz, windows=np.array(ks_windows, dtype=object))
        print(f"[batch {batch_idx}] saved: {out_npz}  windows: {len(ks_windows)}")
    finally:
        # 7) Cleanup local to free space (best effort)
        shutil.rmtree(batch_dir, ignore_errors=True)
        gc.collect()

# ---------- Run batches ----------
for b in range(num_batches):
    lo, hi = b*BATCH_SIZE, min((b+1)*BATCH_SIZE, len(files_to_use))
    batch_paths = files_to_use[lo:hi]
    process_batch(b, batch_paths)

# ---------- Combine all batch npz into one ----------
# Every batch file found in INTER is merged, including files written by earlier runs.
all_npz = sorted(INTER.glob("keystroke_windows_batch*.npz"))
print("Combining", len(all_npz), "batch files ...")
if len(all_npz) > num_batches:
    # Make the mismatch visible instead of silently mixing runs.
    print(f"Warning: {len(all_npz)} batch files on disk but this run planned {num_batches}; "
          "batch files from earlier runs are part of the combined output.")
all_windows = []
for npz in all_npz:
    # NpzFile keeps the archive open until closed; the array is fully loaded inside the block.
    with np.load(npz, allow_pickle=True) as batch_data:
        arr = batch_data['windows']
    if len(arr) > 0:
        all_windows.extend(arr.tolist())

KS_WIN_NPZ = INTER / "keystroke_windows.npz"
# Only 'windows' is passed: keyword arguments of savez_compressed name the arrays to
# store, so allow_pickle=True could end up in the archive as an extra array.
np.savez_compressed(KS_WIN_NPZ, windows=np.array(all_windows, dtype=object))
print("Final keystroke windows:", len(all_windows))
print("Saved combined:", KS_WIN_NPZ)

Window params: {'window_sec': 30, 'stride_sec': 15, 'max_users': 30, 'max_sessions_per_user': 6, 'min_events_per_window': 20}
Planning 2 batch(es): 2,000 files, batch size 1000
[batch 0] copying 1000 files via tar stream ...
[batch 0] copy done in 409.1s


  missing_after_rename = [c for c in req if c not in lazy.columns]


[batch 0] windows table rows: 20150
[batch 0] saved: /content/drive/MyDrive/mindease_behavior/intermediate/keystroke_windows_batch000.npz  windows: 20150
[batch 1] copying 1000 files via tar stream ...
[batch 1] copy done in 227.1s


  missing_after_rename = [c for c in req if c not in lazy.columns]


[batch 1] windows table rows: 18003
[batch 1] saved: /content/drive/MyDrive/mindease_behavior/intermediate/keystroke_windows_batch001.npz  windows: 18003
Combining 10 batch files ...
Final keystroke windows: 172053
Saved combined: /content/drive/MyDrive/mindease_behavior/intermediate/keystroke_windows.npz


In [None]:
# Re-clone Balabit with git-lfs so data files are real
!apt-get -y -qq install git-lfs
!git lfs install
!rm -rf "{BALABIT_DIR}"
!git clone https://github.com/balabit/Mouse-Dynamics-Challenge "{BALABIT_DIR}"

# Quick sanity: count ALL regular files (no extension filter)
import os
cnt = 0
for root, dirs, files in os.walk(BALABIT_DIR):
    for f in files:
        p = os.path.join(root, f)
        if os.path.isfile(p):
            cnt += 1
print("Balabit regular files:", cnt)

Git LFS initialized.
Cloning into '/content/drive/MyDrive/mindease_behavior/raw/balabit'...
remote: Enumerating objects: 1711, done.[K
remote: Total 1711 (delta 0), reused 0 (delta 0), pack-reused 1711 (from 1)[K
Receiving objects: 100% (1711/1711), 42.60 MiB | 11.37 MiB/s, done.
Resolving deltas: 100% (3/3), done.
Updating files: 100% (1678/1678), done.
Balabit regular files: 1704


In [None]:
# Make windows easier to get: short windows tolerate short sessions.
CFG['mouse'].update(
    window_sec=6,
    stride_sec=3,
    min_events_per_window=6,
)

In [None]:
# =========================
# Mouse windowization (no-extension files, robust reader + concat per user)
# =========================
import os, csv, glob, numpy as np, pandas as pd
from pathlib import Path

# Make windows easier to get
_mouse_overrides = dict(
    window_sec=6,            # small to tolerate short sessions
    stride_sec=3,
    min_events_per_window=6,
)
CFG['mouse'].update(_mouse_overrides)

# The mouse reader walks the Balabit checkout from this folder.
BALABIT_ROOT = BALABIT_DIR

def list_all_files(root: Path):
    """Return the sorted, de-duplicated paths of regular files below ``root``
    whose full path contains 'training_files' or 'test_files'."""
    markers = ('training_files', 'test_files')
    found = set()
    for folder, _subdirs, names in os.walk(root):
        for name in names:
            full_path = os.path.join(folder, name)
            if not os.path.isfile(full_path):
                continue
            # keep only those under training_files/ or test_files/
            if any(marker in full_path for marker in markers):
                found.add(full_path)
    return sorted(found)

def sniff_sep(path, default=','):
    """Guess the field delimiter of a text file from its first lines.

    Up to 50 lines are sampled and handed to csv.Sniffer with the candidate
    delimiters comma, tab, semicolon, pipe and space.

    Parameters
    ----------
    path : file to inspect.
    default : delimiter returned when the file cannot be read or no delimiter
        can be determined.

    Returns
    -------
    The detected delimiter, or ``default``.
    """
    try:
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            # zip() stops at whichever ends first, so files shorter than 50 lines
            # are sampled in full. Calling next(f) 50 times raised StopIteration
            # on such files, which sent them straight to the default.
            sample = ''.join(line for _, line in zip(range(50), f))
        return csv.Sniffer().sniff(sample, delimiters=',\t;| ').delimiter
    except Exception:
        # Best effort: unreadable file or undetectable dialect -> caller's default.
        return default

def read_balabit_file_any(path):
    """
    Return DataFrame with columns t(sec), x, y. Tolerates header/no-header and any delimiter.
    Known Balabit columns: record_ts, client_ts, button, state, x, y

    Reading is attempted with the sniffed delimiter, then with tab, then with
    whitespace and no header. The time column is the first match among
    record_ts / client_ts / time / timestamp / t, else the first column; x and y
    are the columns of that name, else the last two columns. Rows where any of
    the three is not numeric are dropped, and time is shifted to start at 0.

    Time unit: a median positive step above 10 is taken to mean milliseconds and
    the values are divided by 1000; anything else is kept as is. Microsecond
    stamps are not auto-detected.

    Returns None when the file cannot be read, the columns cannot be told apart,
    or fewer than 3 usable rows remain.
    """
    sep = sniff_sep(path, default=',')
    attempts = (
        {'sep': sep},                      # sniffed delimiter, header inferred
        {'sep': '\t'},                     # try tab
        {'sep': r'\s+', 'header': None},   # last resort: whitespace (replaces deprecated delim_whitespace=True)
    )
    df = None
    for read_kwargs in attempts:
        try:
            df = pd.read_csv(path, engine='python', on_bad_lines='skip', **read_kwargs)
            break
        except Exception:
            continue
    if df is None:
        return None

    cols = [str(c).strip().lower() for c in df.columns]
    df.columns = cols

    # choose time col
    tcol = None
    for c in ('record_ts','client_ts','time','timestamp','t'):
        if c in cols:
            tcol = c; break
    if tcol is None:
        tcol = cols[0]  # guess first col

    # choose x,y
    xcol = 'x' if 'x' in cols else (cols[-2] if len(cols)>=2 else None)
    ycol = 'y' if 'y' in cols else (cols[-1] if len(cols)>=1 else None)
    if xcol is None or ycol is None or xcol==tcol or ycol==tcol:
        return None

    t = pd.to_numeric(df[tcol], errors='coerce').astype(float)
    x = pd.to_numeric(df[xcol], errors='coerce').astype(float)
    y = pd.to_numeric(df[ycol], errors='coerce').astype(float)
    keep = (~t.isna()) & (~x.isna()) & (~y.isna())
    t, x, y = t[keep].values, x[keep].values, y[keep].values
    if t.size < 3:
        return None

    # normalize time units → seconds
    dt = np.diff(t, prepend=t[0])
    pos = dt[dt > 0]
    med_dt = np.median(pos) if pos.size else 0.0
    if med_dt > 10:       # looks like ms
        t = t / 1000.0
    # A former branch divided by 1e6 when the median step was below 1e-3. Small
    # steps mean the values are already fine-grained, so shrinking them further
    # could only compress the timeline; that branch is gone.

    # shift to zero
    t = t - t.min()

    return pd.DataFrame({'t': t, 'x': x, 'y': y})

def user_from_path(p: str):
    """Return the path component that directly follows 'training_files' or
    'test_files' (layout: training_files/<USER>/... or test_files/<USER>/...),
    or 'unknown' when no such component exists."""
    segments = Path(p).parts
    split_dirs = {'training_files', 'test_files'}
    # Walk (segment, following segment) pairs; a marker in last position has no follower.
    for segment, follower in zip(segments, segments[1:]):
        if segment in split_dirs:
            return follower
    return 'unknown'

def seq_from_df(df: pd.DataFrame):
    """Build per-event motion features from a frame with columns t, x, y.

    Rows are sorted by ``t``. For each event the features are
    [dx, dy, dt, speed, accel, jerk], where dx/dy/dt are differences to the
    previous event (0 for the first event), dt is clipped to at least 1e-3, and
    speed, accel and jerk are successive difference quotients over dt.

    Time stamps are kept in float64 while the differences are taken. float32
    carries about 7 significant digits, so on long concatenated timelines the
    steps between events would be quantised before they are even computed.
    Only the finished feature matrix is cast to float32.

    Returns
    -------
    (t, seq): ``t`` is the sorted float64 time vector and ``seq`` a float32
    array of shape [N, 6]; (None, None) when there are fewer than 3 rows.
    """
    df = df.sort_values('t')
    t = df['t'].to_numpy(dtype=np.float64)
    x = df['x'].to_numpy(dtype=np.float32)
    y = df['y'].to_numpy(dtype=np.float32)
    if len(t) < 3:
        return None, None
    dx = np.diff(x, prepend=x[0])
    dy = np.diff(y, prepend=y[0])
    # Clip so the divisions below never see a zero or negative step.
    dt = np.clip(np.diff(t, prepend=t[0]), 1e-3, None)
    speed = np.sqrt(dx**2 + dy**2) / dt
    accel = np.diff(speed, prepend=speed[0]) / dt
    jerk  = np.diff(accel, prepend=accel[0]) / dt
    seq = np.stack([dx, dy, dt, speed, accel, jerk], axis=1).astype(np.float32)
    return t, seq

def make_mouse_windows_concat(root, window_sec, stride_sec, min_events):
    """Build sliding mouse-movement windows per user from the files below ``root``.

    All readable files of a user are chained into one timeline (each file is
    shifted to start 1 second after the previous one ends), motion features are
    computed over that timeline, and fixed-length windows are cut from it.

    Parameters
    ----------
    root : folder searched for training_files/ and test_files/ data.
    window_sec : window length in seconds.
    stride_sec : step between window starts; must be positive.
    min_events : windows with fewer events than this are skipped.

    Returns
    -------
    list of tuples (user, t_start, t_end, seq) with ``seq`` the feature rows
    that fall into [t_start, t_end).

    Raises
    ------
    ValueError if ``stride_sec`` is not positive (the window loop would
    otherwise never advance).
    """
    if stride_sec <= 0:
        raise ValueError(f"stride_sec must be positive, got {stride_sec!r}")

    files = list_all_files(Path(root))
    print("Files considered:", len(files))
    # read all files into per-user buckets
    per_user = {}
    for fp in files:
        df = read_balabit_file_any(fp)
        if df is None or len(df) < 3:
            continue
        uid = user_from_path(fp)
        per_user.setdefault(uid, []).append(df)

    rows = []
    for uid, dfs in per_user.items():
        if not dfs:
            continue
        # concatenate with offsets so they form a continuous stream
        t_offset = 0.0
        big = []
        for df in sorted(dfs, key=lambda d: d['t'].min()):
            dfa = df.copy()
            dfa['t'] = dfa['t'] + t_offset
            big.append(dfa)
            t_offset = float(dfa['t'].max() + 1.0)  # 1s gap before the next file
        bigdf = pd.concat(big, ignore_index=True)

        t, seq = seq_from_df(bigdf)
        if t is None:
            continue
        t0, tmax = float(t.min()), float(t.max())
        start = t0
        while start + window_sec <= tmax + 1e-6:
            m = (t >= start) & (t < start + window_sec)
            if int(m.sum()) >= min_events:
                rows.append((uid, float(start), float(start + window_sec), seq[m]))
            start += stride_sec
    print(f"[Balabit concat] users={len(per_user)} windows={len(rows)}")
    return rows

# Build the mouse windows with the parameters currently stored in CFG['mouse'].
mouse_windows = make_mouse_windows_concat(
    BALABIT_ROOT,
    CFG['mouse']['window_sec'],
    CFG['mouse']['stride_sec'],
    CFG['mouse']['min_events_per_window']
)

# Save (object array for variable-length seqs)
MOUSE_WIN_NPZ = INTER / 'mouse_windows.npz'
# Only 'windows' is passed: keyword arguments of savez_compressed name the arrays to
# store, so allow_pickle=True could end up in the archive as an extra array.
np.savez_compressed(MOUSE_WIN_NPZ, windows=np.array(mouse_windows, dtype=object))
print("Mouse windows:", len(mouse_windows))
print("Saved:", MOUSE_WIN_NPZ)

Files considered: 1676
[Balabit concat] users=10 windows=215980
Mouse windows: 215980
Saved: /content/drive/MyDrive/mindease_behavior/intermediate/mouse_windows.npz


In [None]:
# ---- 7) Robust NPZ inspector + loader (fixes "0 sequences" issue)
import os, glob, pprint, numpy as np
from pathlib import Path

# Project folders used by the inspector; the artifacts folder is created if absent.
DRIVE_ROOT = Path('/content/drive/MyDrive/mindease_behavior')
INTER = DRIVE_ROOT.joinpath('intermediate')
ART = DRIVE_ROOT.joinpath('artifacts')
ART.mkdir(parents=True, exist_ok=True)

# The two window archives to inspect.
KS_NPZ = INTER.joinpath('keystroke_windows.npz')
MOUSE_NPZ = INTER.joinpath('mouse_windows.npz')

def describe_file(p: Path, label: str):
    """Print one line about ``p``: path, existence and size in MB, or a MISSING note."""
    if not p.exists():
        print(f"{label}: MISSING -> {p}")
        return
    print(f"{label}: {p}  |  exists={p.exists()}  |  sizeMB={p.stat().st_size/1e6:.2f}")
# Report presence and size of both archives, keystroke first.
for _label, _archive in (("Keystroke", KS_NPZ), ("Mouse", MOUSE_NPZ)):
    describe_file(_archive, _label)

def inspect_npz(npz_path: Path, prefer_key='windows', peek=3):
    """Print the keys of an .npz file plus a short preview of one of its arrays.

    Args:
        npz_path: file to open (loaded with allow_pickle=True, so only use on
            files you created yourself).
        prefer_key: array name to preview; the first stored key is used when
            this name is absent.
        peek: maximum number of leading items to print.

    Returns:
        (data, key): the opened NpzFile (left open for the caller) and the key
        that was previewed, or (None, None) when the file does not exist.
    """
    if not npz_path.exists():
        print("  → file missing")
        return None, None

    def safe_repr(x):
        # Truncate long reprs so one large array cannot flood the cell output.
        text = repr(x)
        suffix = "..." if len(text) > 200 else ""
        return text[:200] + suffix

    data = np.load(npz_path, allow_pickle=True)
    print("Keys:", list(data.files))
    key = data.files[0] if prefer_key not in data.files else prefer_key
    arr = data[key]
    print(f"Chosen key: {key} | dtype={arr.dtype} | ndim={arr.ndim} | shape={getattr(arr,'shape',None)} | size={arr.size}")

    # Previewing is best-effort: any failure is reported but never raised.
    try:
        if arr.ndim != 0:
            n = min(peek, len(arr))
            print(f"First {n} items:")
            for i in range(n):
                print(" ", i, "→", safe_repr(arr[i]))
        else:
            # A 0-D object array wraps a single Python object.
            obj = arr.item()
            print("0-D object; type(obj):", type(obj))
            if not isinstance(obj, (list, tuple)):
                print("Scalar object:", safe_repr(obj))
            else:
                print("First items:", [safe_repr(obj[i]) for i in range(min(peek, len(obj)))])
    except Exception as e:
        print("Peek error:", e)
    return data, key

print("\n--- Inspect keystroke NPZ ---")
ks_data, ks_key = inspect_npz(KS_NPZ)

print("\n--- Inspect mouse NPZ ---")
mouse_data, mouse_key = inspect_npz(MOUSE_NPZ)

# --- Load robustly (fixed for 5-field keystrokes AND 4-field mouse rows) ---
def extract_sequences_from_array(arr, min_len=4):
    """Pull (sequence, metadata) pairs out of the window containers saved in this notebook.

    Accepted inputs:
      - an object array / list whose rows are
          * [user, session, t0, t1, seq]   (5+ fields, keystrokes)
          * [user, t0, t1, seq]            (4 fields, mouse)
      - a 0-D object array wrapping such a list/tuple
      - a raw 2-D sequence array

    Args:
        arr: container described above.
        min_len: sequences with fewer rows than this are dropped.

    Returns:
        (seqs, metas)
          - seqs: list of 2-D float32 arrays
          - metas: list of (user, session, t0, t1); fields that are missing or
            cannot be parsed fall back to ("?", "?", 0.0, 0.0)
    """
    fallback_meta = ("?", "?", 0.0, 0.0)

    # --- Normalise the outer container into a Python list/tuple of rows ---
    items = arr.item() if (isinstance(arr, np.ndarray) and arr.ndim == 0) else arr
    if isinstance(items, np.ndarray) and items.dtype == object:
        try:
            items = items.tolist()
        except Exception:
            items = [items]
    if not isinstance(items, (list, tuple)):
        items = [items]

    seqs, metas = [], []
    for el in items:
        # Object-typed rows are easier to index as plain lists.
        if isinstance(el, np.ndarray) and el.dtype == object:
            el = el.tolist()

        seq, meta = None, fallback_meta

        if isinstance(el, (list, tuple)):
            n_fields = len(el)
            if n_fields >= 4:
                # Rows with metadata always carry the sequence in the last slot.
                seq = np.asarray(el[-1])
                try:
                    if n_fields >= 5:
                        # [user, session, t0, t1, seq]
                        meta = (str(el[0]), str(el[1]), float(el[2]), float(el[3]))
                    else:
                        # [user, t0, t1, seq]  (mouse, no session)
                        meta = (str(el[0]), "?", float(el[1]), float(el[2]))
                except Exception:
                    # Best effort: keep the sequence, fall back to placeholder metadata.
                    pass
            elif n_fields == 1 and isinstance(el[0], np.ndarray) and el[0].ndim == 2:
                seq = el[0]
            else:
                # Possibly a raw 2-D sequence given as nested lists.
                candidate = np.asarray(el)
                if candidate.ndim == 2:
                    seq = candidate
        elif isinstance(el, np.ndarray) and el.ndim == 2:
            seq = el

        # Keep only well-formed sequences that are long enough.
        if seq is None or seq.ndim != 2 or seq.shape[0] < min_len:
            continue
        seqs.append(seq.astype(np.float32))
        metas.append(meta)

    return seqs, metas

def load_sequences(npz_path: Path, prefer_key='windows', min_len=4):
    """Load window sequences and their metadata from a saved .npz file.

    Args:
        npz_path: file written by the windowing cells.
        prefer_key: array name to read; falls back to the first stored key.
        min_len: sequences with fewer rows than this are dropped.

    Returns:
        (seqs, metas) as produced by `extract_sequences_from_array`.
    """
    # allow_pickle=True is needed for object arrays; unpickling can execute
    # arbitrary code, so only load files produced by this notebook.
    # The context manager closes the underlying zip handle (it used to leak).
    with np.load(npz_path, allow_pickle=True) as data:
        key = prefer_key if prefer_key in data.files else data.files[0]
        arr = data[key]  # fully materialised here, safe to use after close
    seqs, metas = extract_sequences_from_array(arr, min_len=min_len)
    print(f"Loaded {len(seqs):,} sequences from {npz_path.name} (key='{key}')")
    return seqs, metas

print("\n--- Load robustly (fixed typo) ---")
# Re-declare the window file locations (same paths as KS_NPZ / MOUSE_NPZ defined earlier in this cell).
KS_NPZ = Path('/content/drive/MyDrive/mindease_behavior/intermediate/keystroke_windows.npz')
MOUSE_NPZ = Path('/content/drive/MyDrive/mindease_behavior/intermediate/mouse_windows.npz')
# ks_seqs / mouse_seqs (and their metadata lists) are globals used by the stats and training cells below.
ks_seqs, ks_meta = load_sequences(KS_NPZ, prefer_key='windows', min_len=4)
mouse_seqs, mouse_meta = load_sequences(MOUSE_NPZ, prefer_key='windows', min_len=4)
# Show counts and the shape of the first sequence (None when nothing was loaded).
print(f"Keystroke sequences: {len(ks_seqs):,}  example: {ks_seqs[0].shape if ks_seqs else None}")
print(f"Mouse sequences:     {len(mouse_seqs):,}  example: {mouse_seqs[0].shape if mouse_seqs else None}")

# Recompute channel-wise normalization stats and save
def compute_channel_stats(seqs, sample_cap=20000, max_rows=1_000_000, seed=None):
    """Estimate per-channel mean and std from a random sample of sequences.

    Args:
        seqs: list of 2-D arrays (rows are time steps, columns are channels).
        sample_cap: maximum number of sequences to sample (without replacement).
        max_rows: stop adding sequences once this many rows were collected.
        seed: optional seed for a private random generator so the sample (and
            therefore the saved statistics) is reproducible. With the default
            None the global `np.random` state is used, exactly as before.

    Returns:
        (mu, sd): float32 vectors with one entry per channel, or (None, None)
        when `seqs` is empty. A small epsilon is added to `sd` so later
        divisions cannot hit zero.
    """
    if not seqs:
        return None, None

    n_pick = min(sample_cap, len(seqs))
    if seed is None:
        idx = np.random.choice(len(seqs), size=n_pick, replace=False)
    else:
        idx = np.random.default_rng(seed).choice(len(seqs), size=n_pick, replace=False)

    rows = 0
    chunks = []
    for i in idx:
        s = seqs[i]
        chunks.append(s)
        rows += s.shape[0]
        if rows >= max_rows:
            # Enough rows for a stable estimate; avoid concatenating everything.
            break

    X = np.concatenate(chunks, axis=0)
    mu = X.mean(axis=0).astype(np.float32)
    sd = (X.std(axis=0) + 1e-8).astype(np.float32)
    return mu, sd

# Per-channel normalization statistics for each modality; these globals are
# passed to train_encoder and also written to disk for the inference cells.
ks_mean, ks_std = compute_channel_stats(ks_seqs)
mouse_mean, mouse_std = compute_channel_stats(mouse_seqs)
print("KS mean/std:", ks_mean, ks_std)
print("Mouse mean/std:", mouse_mean, mouse_std)

# Persist the statistics next to the other artifacts.
# NOTE(review): compute_channel_stats returns (None, None) for an empty list; saving None
# here would presumably store an object array rather than a missing value — confirm before
# relying on `is None` checks after reloading.
ART = Path('/content/drive/MyDrive/mindease_behavior/artifacts')
ART.mkdir(parents=True, exist_ok=True)
np.savez(ART/'norm_stats.npz', ks_mean=ks_mean, ks_std=ks_std, mouse_mean=mouse_mean, mouse_std=mouse_std)
print("Saved stats:", ART/'norm_stats.npz')

Keystroke: /content/drive/MyDrive/mindease_behavior/intermediate/keystroke_windows.npz  |  exists=True  |  sizeMB=37.79
Mouse: /content/drive/MyDrive/mindease_behavior/intermediate/mouse_windows.npz  |  exists=True  |  sizeMB=73.22

--- Inspect keystroke NPZ ---
Keys: ['windows', 'allow_pickle']
Chosen key: windows | dtype=object | ndim=2 | shape=(172053, 5) | size=860265
First 3 items:
  0 → array(['100032', '1091453', 1473275637.037, 1473275646.683,
       array([[ 0.142     ,  0.        ],
              [ 0.104     ,  0.07000017],
              [ 0.103     ,  0.04900002],
              ...
  1 → array(['100032', '1091468', 1473275649.051, 1473275661.7480001,
       array([[ 0.104     ,  0.        ],
              [ 0.072     ,  0.07999992],
              [ 0.079     ,  0.07299995],
          ...
  2 → array(['100032', '1091487', 1473275664.148, 1473275672.755,
       array([[ 0.167     ,  0.        ],
              [ 0.08      ,  0.09500003],
              [ 0.088     ,  0.16799998]

In [None]:
# ---- 8) Small temporal encoder + InfoNCE
import torch
from torch import nn
import torch.nn.functional as F

class TCNEncoder(nn.Module):
    """Three-layer 1-D convolutional encoder producing one unit-length embedding per sequence.

    Args:
        in_ch: number of input channels C.
        emb_dim: size of the output embedding.
        hidden: number of channels in the two hidden convolution layers.
    """

    def __init__(self, in_ch, emb_dim=64, hidden=128):
        super().__init__()
        # Layer order (and therefore the `net.<index>` state_dict keys) must stay
        # stable: saved checkpoints are keyed on these positions.
        layers = [
            nn.Conv1d(in_ch, hidden, kernel_size=5, padding=2),
            nn.BatchNorm1d(hidden),
            nn.ReLU(inplace=True),
            nn.Conv1d(hidden, hidden, kernel_size=5, padding=2),
            nn.BatchNorm1d(hidden),
            nn.ReLU(inplace=True),
            nn.Conv1d(hidden, emb_dim, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
        ]
        self.net = nn.Sequential(*layers)
        # Plain average over time; a masked mean would be nicer if we had masks.
        self.pool = nn.AdaptiveAvgPool1d(1)

    def forward(self, x):
        """Encode x of shape [B, C, T] into L2-normalized embeddings of shape [B, emb_dim]."""
        features = self.net(x)                     # [B, emb, T]
        pooled = self.pool(features).squeeze(-1)   # [B, emb]
        return F.normalize(pooled, dim=1)


class InfoNCELoss(nn.Module):
    """Symmetric cross-view InfoNCE loss between two batches of embeddings.

    For every sample i, the positive is the same index in the other view and
    the negatives are all other samples of the other view. The loss is the
    mean of the two directions (view1 -> view2 and view2 -> view1).

    Args:
        temperature: softmax temperature applied to the cosine similarities.
        diag_val: kept for backward compatibility. The previous implementation
            wrote this value onto the diagonal of a [2B, 2B] similarity matrix,
            but only the off-diagonal [B, B] blocks were ever read, so it never
            influenced the loss. It is stored but unused.
    """

    def __init__(self, temperature=0.2, diag_val=-1e4):
        super().__init__()
        self.t = float(temperature)
        self.diag_val = float(diag_val)

    def forward(self, z1, z2):
        """Return the scalar loss for z1, z2 of shape [B, D] (already L2-normalized)."""
        B = z1.shape[0]

        # Do the similarity math in float32 even when the caller runs under CUDA
        # autocast. `torch.autocast` is used directly so this class does not rely
        # on an `amp` name imported by a later notebook cell.
        with torch.autocast(device_type="cuda", enabled=False):
            a = z1.float()
            b = z2.float()
            # Only the cross-view block is needed; logits[i, j] = <z1_i, z2_j> / t.
            logits = (a @ b.T) / self.t  # [B, B]
            targets = torch.arange(B, device=logits.device)

            # Gradients flow to both z1 and z2 through either direction.
            loss_12 = F.cross_entropy(logits, targets)
            loss_21 = F.cross_entropy(logits.T, targets)
            loss = 0.5 * (loss_12 + loss_21)

        return loss

In [None]:
# ---- 9) Training wrapper
from torch.cuda import amp

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

def train_encoder(
    seqs,
    mean, std,
    in_ch,
    out_len,                 # fixed time steps for training (crop/pad)
    batch_size=256,
    epochs=8,
    lr=1e-3,
    emb_dim=64,
    hidden=128,
    aug_strength=1.0,
    ckpt_path: Path = None,
):
    """Train a TCNEncoder with a two-view contrastive (InfoNCE) objective.

    Args:
        seqs: training sequences, handed to TwoViewDataset.
        mean, std: normalization statistics, handed to TwoViewDataset.
        in_ch: number of input channels of the encoder.
        out_len: fixed number of time steps per sample (passed to TwoViewDataset).
        batch_size: samples per step; incomplete final batches are dropped.
        epochs: number of passes over the data.
        lr: AdamW learning rate (weight decay is fixed at 1e-4).
        emb_dim, hidden: encoder sizes.
        aug_strength: augmentation strength, handed to TwoViewDataset.
        ckpt_path: when given, the state dict of the epoch with the lowest
            average training loss is saved there.

    Returns:
        The model after the LAST epoch (left in train mode). Note this may
        differ from the checkpoint on disk, which holds the best-loss epoch.

    Raises:
        ValueError: if the data yields no full batch (fewer sequences than
            `batch_size`), which previously surfaced as a ZeroDivisionError.
    """
    ds = TwoViewDataset(seqs, mean, std, out_len=out_len, aug_strength=aug_strength)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True, drop_last=True)
    if len(dl) == 0:
        # drop_last=True discards the only (partial) batch, so nothing would be trained.
        raise ValueError(
            f"Not enough sequences ({len(ds)}) to form a single batch of size {batch_size}; "
            "lower batch_size or provide more data."
        )

    model = TCNEncoder(in_ch, emb_dim=emb_dim, hidden=hidden).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    # Mixed precision is only enabled on CUDA; on CPU the scaler is a no-op.
    scaler = amp.GradScaler(enabled=(device.type=='cuda'))
    criterion = InfoNCELoss(temperature=0.2)

    model.train()
    best_loss = float('inf')
    for ep in range(1, epochs+1):
        running = 0.0
        n_seen = 0
        for v1, v2 in dl:
            v1 = v1.to(device, non_blocking=True).float()
            v2 = v2.to(device, non_blocking=True).float()
            opt.zero_grad(set_to_none=True)
            with amp.autocast(enabled=(device.type=='cuda')):
                z1 = model(v1)
                z2 = model(v2)
                loss = criterion(z1, z2)
            scaler.scale(loss).backward()
            scaler.step(opt)
            scaler.update()
            running += loss.item() * v1.size(0)
            n_seen += v1.size(0)
        # Sample-weighted mean loss; equals running / (len(dl) * batch_size) with drop_last=True.
        avg = running / n_seen
        print(f"Epoch {ep:02d} | loss {avg:.4f}")
        if avg < best_loss:
            best_loss = avg
            if ckpt_path is not None:
                torch.save({'model': model.state_dict(), 'in_ch': in_ch, 'emb_dim': emb_dim, 'hidden': hidden}, ckpt_path)
    return model

def export_onnx(model, in_ch, out_len, onnx_path: Path):
    """Export `model` to ONNX with dynamic batch and time axes.

    Args:
        model: encoder taking an input of shape [B, C, T]; it is switched to
            eval mode and stays in eval mode afterwards.
        in_ch: number of input channels C used for the tracing input.
        out_len: number of time steps T used for the tracing input.
        onnx_path: destination file.

    The graph input is named 'x' (dynamic axes 0 = batch, 2 = time) and the
    output 'z' (dynamic axis 0 = batch). Requires the `onnx` package.
    """
    model.eval()
    # Build the tracing input on the model's own device instead of the
    # notebook-global `device`, so a model moved to another device still exports.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')
    dummy = torch.zeros(1, in_ch, out_len, device=model_device)
    torch.onnx.export(
        model, dummy, str(onnx_path),
        input_names=['x'], output_names=['z'],
        opset_version=17, do_constant_folding=True,
        dynamic_axes={'x': {0: 'batch', 2: 'time'}, 'z': {0: 'batch'}}
    )
    print("Exported:", onnx_path)

Device: cuda


In [None]:
!pip install onnx onnxruntime

Collecting onnx
  Downloading onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.0 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m87.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m117.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46

In [None]:
# ---- 10) Kick off training (GPU recommended)
# Choose training time lengths (crop/pad per sample)
# For keystrokes (30s windows), per-event granularity varies; 128 steps is reasonable.
# For mouse (6s windows), 128 steps also OK (we pad/crop).
KS_TLEN    = 128
MOUSE_TLEN = 128

# Keystroke channels: [dwell, inter-key-gap] -> infer from one sample
# (falls back to 2 when no keystroke sequences were loaded)
ks_in_ch = ks_seqs[0].shape[1] if ks_seqs else 2
# Mouse channels: [dx, dy, dt, speed, accel, jerk]
# (falls back to 6 when no mouse sequences were loaded)
mouse_in_ch = mouse_seqs[0].shape[1] if mouse_seqs else 6

# Modality-specific hyperparams
EPOCHS_KS    = 10
EPOCHS_MOUSE = 8
BATCH        = 256 if device.type == 'cuda' else 64    # smaller if CPU
LR           = 1e-3

# Paths (all artifacts live under ART)
KS_CKPT   = ART/'encoder_keystroke.pt'
MOUSE_CKPT= ART/'encoder_mouse.pt'
KS_ONNX   = ART/'encoder_keystroke.onnx'
MOUSE_ONNX= ART/'encoder_mouse.onnx'

# Train one encoder per modality. train_encoder saves the lowest-training-loss
# epoch to the checkpoint path but returns the model from the last epoch.
print("Training keystroke encoder...")
ks_model = train_encoder(
    ks_seqs, ks_mean, ks_std, ks_in_ch, KS_TLEN,
    batch_size=BATCH, epochs=EPOCHS_KS, lr=LR, emb_dim=64, hidden=128,
    aug_strength=1.0, ckpt_path=KS_CKPT
)

print("Training mouse encoder...")
mouse_model = train_encoder(
    mouse_seqs, mouse_mean, mouse_std, mouse_in_ch, MOUSE_TLEN,
    batch_size=BATCH, epochs=EPOCHS_MOUSE, lr=LR, emb_dim=64, hidden=128,
    aug_strength=1.0, ckpt_path=MOUSE_CKPT
)

# Export ONNX (uses the returned last-epoch models, not the .pt checkpoints).
# Requires the `onnx` package to be installed, otherwise torch raises OnnxExporterError.
export_onnx(ks_model, ks_in_ch, KS_TLEN, KS_ONNX)
export_onnx(mouse_model, mouse_in_ch, MOUSE_TLEN, MOUSE_ONNX)

Training keystroke encoder...
Epoch 01 | loss 2.3510
Epoch 02 | loss 2.1202
Epoch 03 | loss 2.1001
Epoch 04 | loss 2.1140
Epoch 05 | loss 2.0622
Epoch 06 | loss 2.0391
Epoch 07 | loss 2.0069
Epoch 08 | loss 1.9868
Epoch 09 | loss 1.9609
Epoch 10 | loss 1.9477
Training mouse encoder...
Epoch 01 | loss 2.0295
Epoch 02 | loss 1.8002
Epoch 03 | loss 1.7680
Epoch 04 | loss 1.7463
Epoch 05 | loss 1.7290
Epoch 06 | loss 1.7199
Epoch 07 | loss 1.7585
Epoch 08 | loss 1.7168


OnnxExporterError: Module onnx is not installed!

In [None]:
print(mouse_model)

TCNEncoder(
  (net): Sequential(
    (0): Conv1d(6, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (7): ReLU(inplace=True)
  )
  (pool): AdaptiveAvgPool1d(output_size=1)
)


In [None]:
export_onnx(ks_model, ks_in_ch, KS_TLEN, KS_ONNX)
export_onnx(mouse_model, mouse_in_ch, MOUSE_TLEN, MOUSE_ONNX)

Exported: /content/drive/MyDrive/mindease_behavior/artifacts/encoder_keystroke.onnx
Exported: /content/drive/MyDrive/mindease_behavior/artifacts/encoder_mouse.onnx


In [None]:
# ---- 11) Save everything needed for inference
# The bundle collects the normalization statistics together with the fixed
# time length and channel count used for each encoder, so an inference
# client can rebuild the exact input layout without this notebook's globals.
BUNDLE = ART/'foundation_bundle.npz'
np.savez(
    BUNDLE,
    ks_mean=ks_mean, ks_std=ks_std, mouse_mean=mouse_mean, mouse_std=mouse_std,
    # scalars are stored as 1-element int32 arrays
    ks_tlen=np.array([KS_TLEN], dtype=np.int32),
    mouse_tlen=np.array([MOUSE_TLEN], dtype=np.int32),
    ks_in_ch=np.array([ks_in_ch], dtype=np.int32),
    mouse_in_ch=np.array([mouse_in_ch], dtype=np.int32),
)
print("Saved bundle:", BUNDLE)
# List the artifact paths defined above (this does not check that the files exist).
print("Artifacts:")
print(" -", KS_CKPT)
print(" -", MOUSE_CKPT)
print(" -", KS_ONNX)
print(" -", MOUSE_ONNX)

Saved bundle: /content/drive/MyDrive/mindease_behavior/artifacts/foundation_bundle.npz
Artifacts:
 - /content/drive/MyDrive/mindease_behavior/artifacts/encoder_keystroke.pt
 - /content/drive/MyDrive/mindease_behavior/artifacts/encoder_mouse.pt
 - /content/drive/MyDrive/mindease_behavior/artifacts/encoder_keystroke.onnx
 - /content/drive/MyDrive/mindease_behavior/artifacts/encoder_mouse.onnx


In [None]:
# If you have a GPU runtime in Colab, use the GPU build of onnxruntime.
import sys, subprocess, torch

has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)

def pip(cmd):
    """Run `python -m pip <cmd>` with the kernel's interpreter.

    `cmd` is split on whitespace; a non-zero pip exit status raises
    subprocess.CalledProcessError.
    """
    print(">>", cmd)
    subprocess.check_call([sys.executable, "-m", "pip"] + cmd.split())

# Install onnxruntime only when it cannot be imported at all.
try:
    import onnxruntime as ort
except Exception:
    if has_cuda:
        # The CPU and GPU wheels provide the same module; remove the CPU one first.
        pip("uninstall -y onnxruntime")
        pip("install -U onnxruntime-gpu onnx")
    else:
        pip("install -U onnx onnxruntime")
    import onnxruntime as ort
print("onnxruntime version:", ort.__version__)

# Show providers actually available
available_providers = ort.get_available_providers()
print("Available providers:", available_providers)
if has_cuda and 'CUDAExecutionProvider' not in available_providers:
    # An already-installed CPU-only onnxruntime makes the import above succeed,
    # so the GPU build is never installed and inference silently runs on CPU.
    print("[WARNING] CUDA is available but onnxruntime has no CUDAExecutionProvider; "
          "inference will run on CPU. To use the GPU, uninstall onnxruntime, "
          "install onnxruntime-gpu and restart the runtime.")

CUDA available: True
onnxruntime version: 1.22.1
Available providers: ['AzureExecutionProvider', 'CPUExecutionProvider']


In [None]:
# Freeze encoders and generate embeddings
import os, math, numpy as np
from pathlib import Path
import onnxruntime as ort

# Project folders on Drive (re-declared so this cell can run on its own).
DRIVE_ROOT = Path('/content/drive/MyDrive/mindease_behavior')
INTER = DRIVE_ROOT/'intermediate'
ART   = DRIVE_ROOT/'artifacts'
ART.mkdir(parents=True, exist_ok=True)

# Paths to encoders and stats
KS_ONNX   = ART/'encoder_keystroke.onnx'
MOUSE_ONNX= ART/'encoder_mouse.onnx'
STATS_NPZ = ART/'norm_stats.npz'

# Fail early with the offending path if a previous stage was not run.
assert KS_ONNX.exists(), f"Missing {KS_ONNX}"
assert MOUSE_ONNX.exists(), f"Missing {MOUSE_ONNX}"
assert STATS_NPZ.exists(), f"Missing {STATS_NPZ}"

# Load normalization (these overwrite the in-memory ks_mean/ks_std/mouse_mean/mouse_std globals)
stats = np.load(STATS_NPZ, allow_pickle=True)
ks_mean, ks_std     = stats['ks_mean'], stats['ks_std']
mouse_mean, mouse_std = stats['mouse_mean'], stats['mouse_std']
# NOTE(review): values read back from an .npz are presumably always arrays, so the
# `is None` branches below look unreachable — confirm.
print("KS mean/std shapes:", None if ks_mean is None else ks_mean.shape, None if ks_std is None else ks_std.shape)
print("Mouse mean/std shapes:", None if mouse_mean is None else mouse_mean.shape, None if mouse_std is None else mouse_std.shape)

# Robust NPZ loaders (match your saved formats)
def load_keystroke_windows(npz_path: Path, min_len=4):
    """Load keystroke windows saved as rows of [user, session, t0, t1, seq].

    Args:
        npz_path: .npz file containing a 'windows' object array.
        min_len: sequences with fewer rows than this are skipped.

    Returns:
        (seqs, metas): list of 2-D float32 arrays and a parallel list of
        (user, session, t0, t1) tuples with user/session as str and times as float.
    """
    # allow_pickle=True is required for object arrays; unpickling can run
    # arbitrary code, so only load files produced by this notebook.
    # The `with` block closes the zip handle, which was previously leaked.
    with np.load(npz_path, allow_pickle=True) as data:
        arr = data['windows']
    seqs, metas = [], []
    # rows: ['user','session', t0, t1, seq]
    for row in arr:
        row = row.tolist() if isinstance(row, np.ndarray) and row.dtype==object else row
        if isinstance(row, (list, tuple)) and len(row) >= 5:
            user, sess, t0, t1, seq = row[0], row[1], float(row[2]), float(row[3]), np.asarray(row[-1])
            if seq.ndim==2 and seq.shape[0] >= min_len:
                seqs.append(seq.astype(np.float32))
                metas.append((str(user), str(sess), t0, t1))
    return seqs, metas

def load_mouse_windows(npz_path: Path, min_len=4):
    """Load mouse windows saved as rows of [user, t0, t1, seq] (no session field).

    Args:
        npz_path: .npz file containing a 'windows' object array.
        min_len: sequences with fewer rows than this are skipped.

    Returns:
        (seqs, metas): list of 2-D float32 arrays and a parallel list of
        (user, t0, t1) tuples with user as str and times as float.
    """
    # allow_pickle=True is required for object arrays; unpickling can run
    # arbitrary code, so only load files produced by this notebook.
    # The `with` block closes the zip handle, which was previously leaked.
    with np.load(npz_path, allow_pickle=True) as data:
        arr = data['windows']
    seqs, metas = [], []
    # rows: ['user', t0, t1, seq]  (no session)
    for row in arr:
        row = row.tolist() if isinstance(row, np.ndarray) and row.dtype==object else row
        if isinstance(row, (list, tuple)) and len(row) >= 4:
            user, t0, t1, seq = row[0], float(row[1]), float(row[2]), np.asarray(row[-1])
            if seq.ndim==2 and seq.shape[0] >= min_len:
                seqs.append(seq.astype(np.float32))
                metas.append((str(user), t0, t1))
    return seqs, metas

# Reload the raw windows from disk (overwrites the ks_seqs / mouse_seqs globals).
# Note the mouse metadata here is a 3-tuple (user, t0, t1), unlike the 4-tuple
# produced by extract_sequences_from_array earlier in the notebook.
KS_WIN = INTER/'keystroke_windows.npz'
MOUSE_WIN = INTER/'mouse_windows.npz'
ks_seqs, ks_meta = load_keystroke_windows(KS_WIN)
mouse_seqs, mouse_meta = load_mouse_windows(MOUSE_WIN)
print(f"Loaded: keystroke={len(ks_seqs):,}  mouse={len(mouse_seqs):,}")

# Choose providers: prefer CUDA (with CPU fallback) when onnxruntime offers it, else CPU only.
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if 'CUDAExecutionProvider' in ort.get_available_providers() else ['CPUExecutionProvider']
print("Using providers:", providers)

# Create ORT sessions
ks_sess    = ort.InferenceSession(str(KS_ONNX), providers=providers)
mouse_sess = ort.InferenceSession(str(MOUSE_ONNX), providers=providers)

# Our ONNX expects input named 'x' of shape [B, C, T]; it outputs 'z' of shape [B, D]
# (names are read back from the session instead of being hard-coded)
ks_input_name    = ks_sess.get_inputs()[0].name
mouse_input_name = mouse_sess.get_inputs()[0].name
ks_output_name   = ks_sess.get_outputs()[0].name
mouse_output_name= mouse_sess.get_outputs()[0].name

# Helper: crop/pad variable length to fixed T for batching
def crop_pad_to_len(x, out_len):
    """Return `x` (shape [T, C]) with exactly `out_len` rows.

    - T == out_len: `x` is returned unchanged (same object).
    - T >  out_len: a contiguous window starting at a random offset drawn
      from the global `np.random` state is returned.
    - T <  out_len: zero rows of the same dtype are appended at the end.
    """
    n_steps, n_ch = x.shape
    missing = out_len - n_steps
    if missing == 0:
        return x
    if missing < 0:
        # Random crop: the result depends on the global NumPy RNG state.
        offset = np.random.randint(0, n_steps - out_len + 1)
        return x[offset:offset + out_len]
    tail = np.zeros((missing, n_ch), dtype=x.dtype)
    return np.concatenate([x, tail], axis=0)

# If your ONNX was exported with dynamic time axis, you *can* feed varying T,
# but batching requires same T per batch. We'll use a fixed TLEN for speed.
# These match the values used in the training cell (128 for both modalities).
KS_TLEN    = 128
MOUSE_TLEN = 128
BATCH = 1024  # sequences per ORT call; adjust if you see OOM

def normalize(x, mean, std):
    """Standardize `x` as (x - mean) / std; return `x` untouched if either statistic is None."""
    have_stats = mean is not None and std is not None
    return (x - mean) / std if have_stats else x

def embed_all(seqs, sess, input_name, output_name, mean, std, out_len, batch, tag=""):
    """Run an ONNX encoder over all sequences and return their embeddings.

    Args:
        seqs: list of [T, C] arrays.
        sess: onnxruntime InferenceSession of the encoder.
        input_name, output_name: graph input / output names.
        mean, std: per-channel normalization statistics (or None to skip).
        out_len: fixed number of time steps each sequence is cropped/padded to.
        batch: number of sequences per session call.
        tag: label used in the progress lines.

    Returns:
        float32 array of shape (N, D), in the same order as `seqs`. For an
        empty `seqs` an array of shape (0, 0) is returned (D is unknown
        without running the model) instead of crashing in np.concatenate.
    """
    N = len(seqs)
    if N == 0:
        return np.zeros((0, 0), dtype=np.float32)

    embs = []
    for i in range(0, N, batch):
        chunk = seqs[i:i+batch]
        # crop/pad + normalize + transpose each sequence to (C, T).
        # NOTE(review): padding happens before normalization, so padded steps
        # become -mean/std rather than 0 — confirm this matches training.
        X = np.stack(
            [normalize(crop_pad_to_len(s, out_len), mean, std).T for s in chunk],
            axis=0,
        ).astype(np.float32)  # (B,C,T)
        z = sess.run([output_name], {input_name: X})[0]  # (B,D)
        embs.append(z.astype(np.float32))
        # Progress line every 50 batches.
        if ((i//batch) % 50)==0:
            print(f"[{tag}] {i+len(chunk):,}/{N:,}")
    return np.concatenate(embs, axis=0)  # (N,D)

# ---- Keystroke embeddings ----
ks_Z = embed_all(ks_seqs, ks_sess, ks_input_name, ks_output_name,
                 ks_mean, ks_std, KS_TLEN, BATCH, tag="KS")
print("Keystroke embeddings:", ks_Z.shape)

# Save embeddings together with the metadata of each window (row i of `z`
# belongs to user[i], session[i], t0[i], t1[i]).
# NOTE: `allow_pickle` belongs to np.load; np.savez* stores unrecognised keyword
# arguments as arrays, so passing it here only added a junk 'allow_pickle' entry.
KS_EMB = ART/'embeddings_keystroke.npz'
np.savez_compressed(
    KS_EMB,
    user=np.array([u for (u, s, t0, t1) in ks_meta], dtype=object),
    session=np.array([s for (u, s, t0, t1) in ks_meta], dtype=object),
    t0=np.array([t0 for (u, s, t0, t1) in ks_meta], dtype=np.float64),
    t1=np.array([t1 for (u, s, t0, t1) in ks_meta], dtype=np.float64),
    z=ks_Z.astype(np.float32),
)
print("Saved:", KS_EMB)

# ---- Mouse embeddings (per 6s window) ----
mouse_Z = embed_all(mouse_seqs, mouse_sess, mouse_input_name, mouse_output_name,
                    mouse_mean, mouse_std, MOUSE_TLEN, BATCH, tag="MOUSE")
print("Mouse embeddings:", mouse_Z.shape)

# Mouse metadata has no session field: (user, t0, t1).
MOUSE_EMB = ART/'embeddings_mouse.npz'
np.savez_compressed(
    MOUSE_EMB,
    user=np.array([u for (u, t0, t1) in mouse_meta], dtype=object),
    t0=np.array([t0 for (u, t0, t1) in mouse_meta], dtype=np.float64),
    t1=np.array([t1 for (u, t0, t1) in mouse_meta], dtype=np.float64),
    z=mouse_Z.astype(np.float32),
)
print("Saved:", MOUSE_EMB)

KS mean/std shapes: (2,) (2,)
Mouse mean/std shapes: (6,) (6,)
Loaded: keystroke=172,053  mouse=215,980
Using providers: ['CPUExecutionProvider']
[KS] 1,024/172,053
[KS] 52,224/172,053
[KS] 103,424/172,053
[KS] 154,624/172,053
Keystroke embeddings: (172053, 64)
Saved: /content/drive/MyDrive/mindease_behavior/artifacts/embeddings_keystroke.npz
[MOUSE] 1,024/215,980
[MOUSE] 52,224/215,980
[MOUSE] 103,424/215,980
[MOUSE] 154,624/215,980
[MOUSE] 205,824/215,980
Mouse embeddings: (215980, 64)
Saved: /content/drive/MyDrive/mindease_behavior/artifacts/embeddings_mouse.npz


In [None]:
# Pool 5 consecutive mouse windows per user to approximate 30s (6s * 5)
from collections import defaultdict

MOUSE_EMB = ART/'embeddings_mouse.npz'
me = np.load(MOUSE_EMB, allow_pickle=True)
u, t0, t1, z = me['user'], me['t0'], me['t1'], me['z']  # z: [M, 64]

# Group window embeddings by user.
by_user = defaultdict(list)
for i in range(len(u)):
    by_user[str(u[i])].append((float(t0[i]), float(t1[i]), z[i]))

pooled_user, pooled_t0, pooled_t1, pooled_z = [], [], [], []
K = 5  # group size (5*6s ≈ 30s)
for uid, lst in by_user.items():
    lst.sort(key=lambda x: x[0])  # sort by start time
    # non-overlapping groups of K; a trailing remainder of fewer than K windows is dropped
    for i in range(0, len(lst)-K+1, K):
        seg = lst[i:i+K]
        zs = np.stack([e[2] for e in seg], axis=0)  # (K,64)
        pooled_user.append(uid)
        pooled_t0.append(seg[0][0])    # start of the first window in the group
        pooled_t1.append(seg[-1][1])   # end of the last window in the group
        pooled_z.append(zs.mean(axis=0))  # mean pool (result is not re-normalized)

# np.stack([]) raises, so handle the case where no user has K windows.
if pooled_z:
    pooled_z = np.stack(pooled_z, axis=0).astype(np.float32)
else:
    pooled_z = np.zeros((0, z.shape[1]), dtype=np.float32)

# NOTE: `allow_pickle` is a np.load argument; np.savez* stores unrecognised keyword
# arguments as arrays, so it is not passed when saving.
MOUSE_EMB_30 = ART/'embeddings_mouse_pooled30.npz'
np.savez_compressed(
    MOUSE_EMB_30,
    user=np.array(pooled_user, dtype=object),
    t0=np.array(pooled_t0, dtype=np.float64),
    t1=np.array(pooled_t1, dtype=np.float64),
    z=pooled_z,
)
print("Saved pooled mouse 30s embeddings:", MOUSE_EMB_30, pooled_z.shape)

Saved pooled mouse 30s embeddings: /content/drive/MyDrive/mindease_behavior/artifacts/embeddings_mouse_pooled30.npz (43192, 64)


# ============================
# Stage B: Global head (weak labels from face CSV)
# ============================

In [None]:
import os, math, json, time
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_recall_fscore_support, brier_score_loss
from sklearn.preprocessing import StandardScaler
import joblib

In [None]:
# Project folders on Drive; the artifacts and labels folders are created if missing.
DRIVE_ROOT = Path('/content/drive/MyDrive/mindease_behavior')
INTER = DRIVE_ROOT/'intermediate'
ART   = DRIVE_ROOT/'artifacts'
LAB   = DRIVE_ROOT/'labels'
ART.mkdir(parents=True, exist_ok=True)
LAB.mkdir(parents=True, exist_ok=True)

# ---- Inputs (edit path if needed)
FACE_CSV      = LAB/'face_stress_30s.csv'     # upload your CSV here in Drive
KS_EMB_NPZ    = ART/'embeddings_keystroke.npz'           # from Stage A freeze step
MOUSE30_NPZ   = ART/'embeddings_mouse_pooled30.npz'      # optional

# ---- Hyperparams / policies
CONF_COVERAGE_MIN = 0.30     # use only windows with good face coverage
Y_BIN_LOW  = 0.30            # confident "no-stress" if stress_prob <= 0.30
Y_BIN_HIGH = 0.70            # confident "stress"    if stress_prob >= 0.70
VAL_SIZE   = 0.25            # val split (small data → simple split)
RAND_SEED  = 42
# Evaluated once, when this cell runs: True only if the pooled mouse file already exists.
USE_MOUSE  = MOUSE30_NPZ.exists()  # fuse if pooled mouse exists

In [None]:
# ============================
# B1) Load & prep label CSV
# ============================
def load_face_csv(csv_path: Path, coverage_min=None):
    """Load the face-derived weak-label CSV and normalize it for joining.

    Steps: map known column aliases (case-insensitive) to canonical names,
    fill defaults for optional columns, coerce types, keep only confident
    rows with enough coverage, and snap times onto a 30 s grid.

    Args:
        csv_path: CSV with at least start time, end time and stress probability
            columns (any of the aliases listed below).
        coverage_min: minimum `coverage` a row needs to be kept. Defaults to
            the notebook-level CONF_COVERAGE_MIN.

    Returns:
        DataFrame with columns t0_unix, t1_unix (= t0_unix + 30), stress_prob,
        confident, coverage, user_id, session_id (plus any other CSV columns).

    Raises:
        ValueError: if several CSV columns map to the same canonical name.
        KeyError: if a required column (t0/t1/stress) is missing.
    """
    if coverage_min is None:
        coverage_min = CONF_COVERAGE_MIN

    df = pd.read_csv(csv_path)

    # canonical name -> accepted (lower-cased, stripped) aliases
    aliases = {
        't0_unix':     ('t0', 't0_unix', 'start', 'start_time'),
        't1_unix':     ('t1', 't1_unix', 'end', 'end_time'),
        'stress_prob': ('stress', 'stress_prob', 'p_stress'),
        'confident':   ('conf', 'confident'),
        'coverage':    ('cov', 'coverage', 'face_coverage'),
        'user_id':     ('user', 'user_id'),
        'session_id':  ('session', 'session_id'),
    }
    canonical = {alias: target for target, names in aliases.items() for alias in names}
    rename_map = {
        c: canonical[c.strip().lower()]
        for c in df.columns
        if c.strip().lower() in canonical
    }
    df = df.rename(columns=rename_map)

    dupes = df.columns[df.columns.duplicated()].unique().tolist()
    if dupes:
        raise ValueError(f"Several CSV columns map to the same name: {dupes}")

    # Defaults are applied AFTER renaming. Adding the default user_id first
    # created a second 'user_id' column whenever the CSV used an alias such as
    # 'user' or 'User_ID'.
    if 'user_id' not in df.columns:
        df['user_id'] = 'harsh'
    if 'confident' not in df.columns:
        df['confident'] = 1
    if 'coverage' not in df.columns:
        df['coverage'] = 1.0
    if 'session_id' not in df.columns:
        # single-session default
        df['session_id'] = 's1'

    # types (unparseable numbers become NaN)
    for col in ['t0_unix','t1_unix','stress_prob','coverage']:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df['confident'] = df['confident'].astype(int)
    df['user_id']   = df['user_id'].astype(str)
    df['session_id']= df['session_id'].astype(str)

    # filter usable rows
    df = df.dropna(subset=['t0_unix','t1_unix','stress_prob'])
    df = df[(df['confident'] == 1) & (df['coverage'] >= coverage_min)].copy()

    # snap times to a 30s grid to ease joins (assuming CSV already logged on 30s cadence);
    # t1 is redefined as t0 + 30 regardless of the logged end time.
    def snap30(x):
        return (np.floor(x/30.0)*30.0).astype(np.float64)
    df['t0_unix'] = snap30(df['t0_unix'])
    df['t1_unix'] = df['t0_unix'] + 30.0

    print(f"[labels] usable rows: {len(df)} (after confident & coverage filter)")
    return df

labels = load_face_csv(FACE_CSV)

[labels] usable rows: 56 (after confident & coverage filter)


In [None]:
# ============================
# B2) Load embeddings + join
# ============================
def load_keystroke_emb(npz_path: Path):
    """Load keystroke embeddings and build their join-key table.

    Args:
        npz_path: file with arrays 'z', 'user', 'session', 't0', 't1'.

    Returns:
        (df, Z): `df` has one row per embedding with columns user_id,
        session_id, t0_unix, t1_unix (both times floored to a 30 s grid);
        `Z` is the [N, D] embedding matrix, row-aligned with `df`.
    """
    # allow_pickle=True is needed for the object-typed user/session arrays;
    # only load files produced by this notebook. The `with` block closes the
    # zip handle, which was previously leaked.
    with np.load(npz_path, allow_pickle=True) as zf:
        Z = zf['z']          # [N, D]
        users = zf['user']   # object array
        sess  = zf['session']
        t0    = zf['t0'].astype(np.float64)
        t1    = zf['t1'].astype(np.float64)
    # NOTE(review): t1 is floored independently here, while load_face_csv sets
    # t1_unix = t0_unix + 30. Windows that are not exactly 30 s long therefore
    # can never match a label row on (t0_unix, t1_unix) — confirm this is intended.
    df = pd.DataFrame({
        'user_id': users.astype(str),
        'session_id': sess.astype(str),
        't0_unix': np.floor(t0/30.0)*30.0,
        't1_unix': np.floor(t1/30.0)*30.0,
    })
    return df, Z

def load_mouse30_emb(npz_path: Path):
    """Load pooled (~30 s) mouse embeddings and build their join-key table.

    Args:
        npz_path: file with arrays 'z', 'user', 't0', 't1'.

    Returns:
        (df, Z): `df` has one row per embedding with columns user_id,
        t0_unix, t1_unix (both floored to a 30 s grid) and a constant
        session_id of 's1'; `Z` is the [M, D] embedding matrix, row-aligned with `df`.
    """
    # allow_pickle=True is needed for the object-typed user array; only load
    # files produced by this notebook. The `with` block closes the zip handle,
    # which was previously leaked.
    with np.load(npz_path, allow_pickle=True) as zf:
        Z = zf['z']          # [M, D]
        users = zf['user']
        t0    = zf['t0'].astype(np.float64)
        t1    = zf['t1'].astype(np.float64)
    df = pd.DataFrame({
        'user_id': users.astype(str),
        't0_unix': np.floor(t0/30.0)*30.0,
        't1_unix': np.floor(t1/30.0)*30.0,
    })
    df['session_id'] = 's1'  # mouse pooled had no session; fake one for key consistency
    return df, Z

# Keystroke embeddings are mandatory for Stage B; mouse embeddings are optional.
assert KS_EMB_NPZ.exists(), f"Missing embeddings at {KS_EMB_NPZ}"
ks_df, ks_Z = load_keystroke_emb(KS_EMB_NPZ)

# Re-evaluate USE_MOUSE here so it reflects the file's presence at this point.
mouse_df, mouse_Z = None, None
if USE_MOUSE and MOUSE30_NPZ.exists():
    mouse_df, mouse_Z = load_mouse30_emb(MOUSE30_NPZ)
    USE_MOUSE = True
else:
    USE_MOUSE = False

# Keep only user 'harsh' for join (since your CSV will be user_id='harsh')
# .copy() keeps the original row index, which is later used to index into ks_Z / mouse_Z.
ks_df_h = ks_df[ks_df['user_id'] == 'harsh'].copy()
if USE_MOUSE:
    mouse_df_h = mouse_df[mouse_df['user_id'] == 'harsh'].copy()

# Inner join labels -> embeddings on exact (user_id, session_id, t0, t1):
# only label rows that have a matching embedding key are kept.
joined = labels.merge(ks_df_h, on=['user_id','session_id','t0_unix','t1_unix'], how='inner', suffixes=('','_ks'))
if len(joined) == 0:
    print("\n[WARNING] No keystroke embeddings matched your face-label rows.")
    print("This usually means your keystroke embeddings come from PUBLIC datasets, not your live session.")
    print("To proceed, you need to collect your own keystroke windows while logging face, then run the ONNX encoder to get embeddings.")
else:
    print(f"[join] Keystroke matches: {len(joined)}")

# NOTE: this mouse join uses the labels' own session_id, whereas load_mouse30_emb assigns
# session_id='s1' to every mouse row; joined_m is recomputed with aligned session ids
# further down in this cell, so this count is informational only.
if USE_MOUSE:
    joined_m = labels.merge(mouse_df_h, on=['user_id','session_id','t0_unix','t1_unix'], how='inner', suffixes=('','_mouse'))
    print(f"[join] Mouse(30s) matches: {len(joined_m)}")
else:
    joined_m = None

# Build feature matrix X and targets y from joins
def build_dataset_from_join(jdf, base_df, Z, z_tag='ks', y_bin_low=None, y_bin_high=None):
    """Turn a labels-to-embeddings join into feature matrices and targets.

    Args:
        jdf: labels joined with `base_df` on (user_id, session_id, t0_unix,
            t1_unix); must contain a 'stress_prob' column. None/empty yields
            (None, None, None).
        base_df: key table whose index values are row positions in `Z`.
        Z: embedding matrix indexed by `base_df`'s index.
        z_tag: unused; kept for interface compatibility.
        y_bin_low, y_bin_high: thresholds of the "confident" binary subset;
            default to the notebook-level Y_BIN_LOW / Y_BIN_HIGH.

    Returns:
        (dataset, j2, key_cols) where `dataset` is a dict with
          X_all / y_prob_all : all matched rows (regression set)
          X_bin / y_bin      : rows with stress_prob <= low (0) or >= high (1)
          mask_bin           : boolean mask selecting the binary rows in X_all
        `j2` is the joined frame (one row per label/embedding pair, with the
        embedding row in column 'idx') and `key_cols` the join-key names.
    """
    if jdf is None or len(jdf)==0:
        return None, None, None
    lo = Y_BIN_LOW if y_bin_low is None else y_bin_low
    hi = Y_BIN_HIGH if y_bin_high is None else y_bin_high

    # map each (user_id, session_id, t0, t1) row in jdf to embedding row index in base_df
    key_cols = ['user_id','session_id','t0_unix','t1_unix']
    base_key = base_df[key_cols].reset_index().rename(columns={'index':'idx'})
    j2 = jdf.merge(base_key, on=key_cols, how='left')
    # keep valid
    j2 = j2.dropna(subset=['idx'])
    j2['idx'] = j2['idx'].astype(int)
    # `jdf` already holds one row per matching embedding, so when k embeddings
    # share a key the merge above yields k*k rows. Dropping exact duplicates
    # restores one row per (label row, embedding) pair.
    j2 = j2.drop_duplicates().reset_index(drop=True)

    X = Z[j2['idx'].values]
    y_prob = j2['stress_prob'].values.astype(np.float32)
    # binary mask for extreme labels
    mask_lo = y_prob <= lo
    mask_hi = y_prob >= hi
    mask_bin = mask_lo | mask_hi
    y_bin = np.where(mask_hi, 1, 0)[mask_bin]
    X_bin = X[mask_bin]
    # also keep regression set (all confident rows)
    return {
        'X_all': X, 'y_prob_all': y_prob,
        'X_bin': X_bin, 'y_bin': y_bin,
        'mask_bin': mask_bin
    }, j2, key_cols

# Keystroke dataset; ks_ds / ks_join are None when the join above was empty.
ks_ds, ks_join, _ = build_dataset_from_join(joined, ks_df_h, ks_Z, 'ks')
if USE_MOUSE:
    # For mouse we used fake 'session_id'='s1' for base; align labels too:
    labels_mouse = labels.copy()
    labels_mouse['session_id'] = 's1'
    # Recompute the mouse join with the aligned session id (replaces the earlier joined_m).
    joined_m = labels_mouse.merge(mouse_df_h, on=['user_id','session_id','t0_unix','t1_unix'], how='inner', suffixes=('','_mouse'))
    mouse_ds, mouse_join, _ = build_dataset_from_join(joined_m, mouse_df_h, mouse_Z, 'mouse')
else:
    mouse_ds, mouse_join = None, None

# Fuse features if both available
def fuse(ks_ds, mouse_ds):
    """Concatenate keystroke and mouse features for windows present in both joins.

    Reads the notebook globals `ks_join` and `mouse_join` to align rows on
    (user_id, session_id, t0_unix, t1_unix), and Y_BIN_LOW / Y_BIN_HIGH for
    the binary subset.

    Returns:
        A dataset dict with the same keys as build_dataset_from_join's result
        (X_all, y_prob_all, X_bin, y_bin, mask_bin), where X_* hold keystroke
        features followed by mouse features; or None when either input is
        None or the two joins share no key.
    """
    if ks_ds is None or mouse_ds is None:
        return None

    # Align by exact keys inside the joined DataFrames; idx / jdx are row
    # positions in the respective X_all matrices.
    # NOTE(review): mouse_join carries session_id 's1' for every row, so keystroke
    # rows with a different session_id presumably never match — confirm.
    keys = ['user_id','session_id','t0_unix','t1_unix']
    ks_keys = ks_join[keys].assign(idx=np.arange(len(ks_join)))
    mouse_keys = mouse_join[keys].assign(jdx=np.arange(len(mouse_join)))
    common = pd.merge(ks_keys, mouse_keys, on=keys, how='inner')
    if len(common) == 0:
        return None

    ks_rows = common['idx'].values
    mouse_rows = common['jdx'].values
    X = np.concatenate([ks_ds['X_all'][ks_rows], mouse_ds['X_all'][mouse_rows]], axis=1)
    y_prob = ks_ds['y_prob_all'][ks_rows]  # same y by construction

    # binary "extreme" subset
    mask_hi = y_prob >= Y_BIN_HIGH
    mask_bin = (y_prob <= Y_BIN_LOW) | mask_hi
    return {
        'X_all': X,
        'y_prob_all': y_prob,
        'X_bin': X[mask_bin],
        'y_bin': np.where(mask_hi, 1, 0)[mask_bin],
        'mask_bin': mask_bin,
    }

# Fuse only when both per-modality datasets exist (each is a dict, or None when its join was empty).
fused_ds = fuse(ks_ds, mouse_ds) if (ks_ds and mouse_ds) else None

# Choose which dataset to train on (preference: fused > ks)
train_source = 'ks'
ds = ks_ds
if fused_ds:
    train_source = 'fused'
    ds = fused_ds

# Stop the notebook here when there is nothing to train on; SystemExit is raised
# deliberately so the message is shown instead of a downstream error.
if ds is None or ds['X_all'] is None:
    raise SystemExit("\n[STOP] No training data after join. Record keystroke/mouse during face logging and re-run embeddings.")

print(f"[train] Using source: {train_source}  | samples(all)={ds['X_all'].shape[0]}  | samples(bin)={ds['X_bin'].shape[0]}")


This usually means your keystroke embeddings come from PUBLIC datasets, not your live session.
To proceed, you need to collect your own keystroke windows while logging face, then run the ONNX encoder to get embeddings.
[join] Mouse(30s) matches: 0


SystemExit: 
[STOP] No training data after join. Record keystroke/mouse during face logging and re-run embeddings.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
