In [4]:
import os, glob, json
import numpy as np
import pandas as pd
from typing import List
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.onnx

In [5]:
# -----------------------
# Config
# -----------------------
SEQ_LEN     = 32
FEATS       = ["Hold_Time", "DD", "UD"]   # extend later if you add engineered features
BATCH_SIZE  = 64
EPOCHS      = 50
PATIENCE    = 5
LR          = 1e-3
DEVICE      = torch.device("cpu")

In [6]:
# -----------------------
# Find project root & dataset paths robustly
# -----------------------
def find_project_root() -> str:
    """
    Try current dir and up to 4 parents to locate a folder that contains 'backend/ml/dataset/raw'.
    If not found, return current working directory.
    """
    cwd = os.getcwd()
    candidates = [cwd]
    # try parents
    cur = cwd
    for _ in range(4):
        cur = os.path.dirname(cur)
        if cur and cur not in candidates:
            candidates.append(cur)
    for base in candidates:
        raw_dir = os.path.join(base, "backend", "ml", "dataset", "raw")
        if os.path.isdir(raw_dir):
            return base
    return cwd

PROJECT_ROOT = find_project_root()
RAW_DIR  = os.path.join(PROJECT_ROOT, "backend", "ml", "dataset", "raw")
PROC_DIR = os.path.join(PROJECT_ROOT, "backend", "ml", "dataset", "processed")
MODEL_DIR = os.path.join(PROJECT_ROOT, "backend", "ml", "models")
EVAL_DIR  = os.path.join(PROJECT_ROOT, "backend", "ml", "evaluation")

os.makedirs(PROC_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)

def find_raw_csvs() -> List[str]:
    # Prefer repo raw folder
    repo_paths = sorted(glob.glob(os.path.join(RAW_DIR, "*.csv")))
    if repo_paths:
        print(f"Using raw CSVs from repo: {RAW_DIR}  (found {len(repo_paths)})")
        return repo_paths
    # Fallback to /mnt/data for quick tests
    mnt_paths = sorted(glob.glob("/mnt/data/*.csv"))
    if mnt_paths:
        print(f"Using raw CSVs from /mnt/data  (found {len(mnt_paths)})")
        return mnt_paths
    return []

raw_files = find_raw_csvs()
if not raw_files:
    raise FileNotFoundError(
        f"No CSV files found.\n"
        f"Looked in:\n - {RAW_DIR}/*.csv\n - /mnt/data/*.csv\n"
        f"Tip: put your raw files in {RAW_DIR}"
    )

print("CWD:", os.getcwd())
print("Example file:", raw_files[0])

Using raw CSVs from repo: c:\Users\geloq\OneDrive\Desktop\pd-keyboard-app\backend\ml\dataset\raw  (found 6)
CWD: c:\Users\geloq\OneDrive\Desktop\pd-keyboard-app\backend\ml\notebooks
Example file: c:\Users\geloq\OneDrive\Desktop\pd-keyboard-app\backend\ml\dataset\raw\100_.tie5Roanl_keystroke_raw.csv


In [8]:
# -----------------------
# Robust CSV loader
# -----------------------
def sniff_sep(sample: str) -> str:
    # prefer ; if it appears more than commas in header line
    first = sample.splitlines()[0]
    if first.count(";") > first.count(","): 
        return ";"
    return ","

def load_csv_robust(path: str) -> pd.DataFrame:
    # Try several strategies to avoid ParserError (bad rows, odd quoting, different sep, encodings)
    try:
        return pd.read_csv(path)  # fastest path
    except Exception:
        pass

    # Read a small header to guess delimiter
    with open(path, "rb") as f:
        head = f.read(8192)
    try:
        sample = head.decode("utf-8", errors="ignore")
    except Exception:
        sample = head.decode("latin-1", errors="ignore")
    sep = sniff_sep(sample)

    # Try python engine with autodetected sep and robust options
    for enc in ("utf-8", "latin-1"):
        try:
            df = pd.read_csv(
                path,
                sep=sep,
                engine="python",
                encoding=enc,
                on_bad_lines="skip",     # skip malformed rows
                quoting=0,               # QUOTE_MINIMAL
                skip_blank_lines=True
            )
            if len(df) == 0:
                continue
            return df
        except Exception:
            continue

    # Final fallback: manually filter rows with wrong field counts
    rows = []
    expected_fields: Optional[int] = None
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            parts = [p.strip() for p in line.rstrip("\n").split(sep)]
            if expected_fields is None:
                expected_fields = len(parts)
                rows.append(parts)
            else:
                if len(parts) == expected_fields:
                    rows.append(parts)
                else:
                    # skip malformed line
                    continue
    df = pd.DataFrame(rows[1:], columns=rows[0]) if rows else pd.DataFrame()
    return df

In [9]:
# -----------------------
# Load & normalize all raw CSVs
# -----------------------
dfs = []
for fp in raw_files:
    df = load_csv_robust(fp)

    # Required columns; some datasets use slightly different casing—normalize column names
    df.columns = [c.strip() for c in df.columns]
    rename_map = {
        "user_id":"User_ID", "session_id":"Session_ID",
        "press_time":"Press_Time", "release_time":"Release_Time",
        "hold_time":"Hold_Time"
    }
    for k,v in rename_map.items():
        if k in df.columns and v not in df.columns: df.rename(columns={k:v}, inplace=True)

    required = ["User_ID", "Session_ID", "Press_Time", "Release_Time"]
    miss = [c for c in required if c not in df.columns]
    if miss:
        raise ValueError(f"{os.path.basename(fp)} missing required columns: {miss}")

    # Derive Hold_Time/flight times if absent
    if "Hold_Time" not in df.columns:
        df["Hold_Time"] = pd.to_numeric(df["Release_Time"] - df["Press_Time"], errors="coerce")

    if ("DD" not in df.columns) or ("UD" not in df.columns):
        df = df.sort_values(["User_ID","Session_ID","Press_Time"]).reset_index(drop=True)
        def flights(g):
            g = g.sort_values("Press_Time").copy()
            g["DD"] = g["Press_Time"].diff().fillna(0.0)
            g["UD"] = g["Press_Time"].values - g["Release_Time"].shift(1).fillna(g["Press_Time"]).values
            return g
        df = df.groupby(["User_ID","Session_ID"], group_keys=False).apply(flights)

    keep = ["User_ID","Session_ID","Press_Time","Release_Time","Hold_Time","DD","UD"]
    for extra in ["Key_Pressed","Key_Pressed_Previous","Characters_Count"]:
        if extra in df.columns: keep.append(extra)

    # numeric cleanup
    for c in ["Hold_Time","DD","UD","Press_Time","Release_Time"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    df = df.dropna(subset=["Hold_Time","DD","UD","Press_Time","Release_Time"]).reset_index(drop=True)

    dfs.append(df[keep].copy())

raw = pd.concat(dfs, ignore_index=True)

# tame long tails
for c in ["Hold_Time","DD","UD"]:
    lo, hi = raw[c].quantile(0.001), raw[c].quantile(0.999)
    raw[c] = raw[c].clip(lo, hi).fillna(0.0)


In [10]:
# -----------------------
# Weak labels via per-session IQR (1 = timing anomaly)
# -----------------------
def iqr_mask(s: pd.Series):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = max(q3 - q1, 1e-6)
    lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
    return (s < lower) | (s > upper)

labels = np.zeros(len(raw), dtype=np.int64)
for (_, _), g in raw.groupby(["User_ID","Session_ID"]):
    idx = g.index
    mask = iqr_mask(g["Hold_Time"]) | iqr_mask(g["DD"]) | iqr_mask(g["UD"])
    labels[idx] = mask.astype(np.int64)
raw["label"] = labels

# -----------------------
# Build windows (stride=1)
# -----------------------
def build_windows(df: pd.DataFrame, seq_len=SEQ_LEN, feat_cols=FEATS):
    X_list, y_list = [], []
    for (_uid, _sid), g in df.groupby(["User_ID","Session_ID"]):
        g = g.sort_values(["Press_Time","Release_Time"]).reset_index(drop=True)
        feats = g[feat_cols].values.astype(np.float32)
        labs  = g["label"].values.astype(np.int64)
        T = len(g)
        if T < seq_len: continue
        for i in range(T - seq_len + 1):
            X_list.append(feats[i:i+seq_len])
            y_list.append(labs[i:i+seq_len])
    if not X_list:
        raise ValueError("No sequences created. Consider reducing SEQ_LEN or check raw files.")
    X = np.stack(X_list, axis=0)
    y = np.stack(y_list, axis=0)
    return X, y

X, y = build_windows(raw, SEQ_LEN, FEATS)
print("Built windows:", X.shape, y.shape)


Built windows: (7965, 32, 3) (7965, 32)


In [11]:
# -----------------------
# Split → Standardize → Save processed
# -----------------------
N = len(X)
idx = np.arange(N)
idx_train, idx_tmp = train_test_split(idx, test_size=0.30, random_state=42, shuffle=True)
idx_val, idx_test = train_test_split(idx_tmp, test_size=0.50, random_state=42, shuffle=True)

X_train, y_train = X[idx_train], y[idx_train]
X_val,   y_val   = X[idx_val],   y[idx_val]
X_test,  y_test  = X[idx_test],  y[idx_test]

mean = X_train.reshape(-1, X_train.shape[-1]).mean(axis=0)
std  = X_train.reshape(-1, X_train.shape[-1]).std(axis=0) + 1e-6
def standardize(a): return (a - mean) / std

X_train = standardize(X_train).astype(np.float32)
X_val   = standardize(X_val).astype(np.float32)
X_test  = standardize(X_test).astype(np.float32)

np.save(os.path.join(PROC_DIR, "X_train.npy"), X_train)
np.save(os.path.join(PROC_DIR, "y_train.npy"), y_train)
np.save(os.path.join(PROC_DIR, "X_val.npy"),   X_val)
np.save(os.path.join(PROC_DIR, "y_val.npy"),   y_val)
np.save(os.path.join(PROC_DIR, "X_test.npy"),  X_test)
np.save(os.path.join(PROC_DIR, "y_test.npy"),  y_test)

with open(os.path.join(PROC_DIR, "feature_config.json"), "w") as f:
    json.dump({
        "seq_len": SEQ_LEN,
        "features": FEATS,
        "scaler_mean": mean.tolist(),
        "scaler_scale": std.tolist()
    }, f, indent=2)

print(f"Processed arrays saved to: {PROC_DIR}")

Processed arrays saved to: c:\Users\geloq\OneDrive\Desktop\pd-keyboard-app\backend\ml\dataset\processed


In [12]:
# -----------------------
# Define TCN
# -----------------------
class Chomp1d(nn.Module):
    def __init__(self, chomp): super().__init__(); self.chomp = chomp
    def forward(self, x): return x[:, :, :-self.chomp] if self.chomp > 0 else x

class TemporalBlock(nn.Module):
    def __init__(self, in_ch, out_ch, k, stride, dilation, padding):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(in_ch, out_ch, k, stride=stride, padding=padding, dilation=dilation),
            Chomp1d(padding), nn.ReLU(), nn.BatchNorm1d(out_ch),
            nn.Conv1d(out_ch, out_ch, k, stride=stride, padding=padding, dilation=dilation),
            Chomp1d(padding), nn.ReLU(), nn.BatchNorm1d(out_ch),
        )
        self.down = nn.Conv1d(in_ch, out_ch, 1) if in_ch != out_ch else None
        self.relu = nn.ReLU()
    def forward(self, x):
        out = self.net(x)
        res = x if self.down is None else self.down(x)
        return self.relu(out + res)

class TCN(nn.Module):
    def __init__(self, input_size, output_size, channels=(32,32,32), kernel_size=3, dilations=(1,2,4)):
        super().__init__()
        layers = []
        in_c = input_size
        for c, d in zip(channels, dilations):
            pad = (kernel_size - 1) * d
            layers.append(TemporalBlock(in_c, c, kernel_size, 1, d, pad))
            in_c = c
        self.tcn  = nn.Sequential(*layers)
        self.head = nn.Conv1d(in_c, output_size, kernel_size=1)
    def forward(self, x):          # [B,L,F]
        x = x.transpose(1,2)       # [B,F,L]
        h = self.tcn(x)            # [B,C,L]
        y = self.head(h)           # [B,2,L]
        return y.transpose(1,2)    # [B,L,2]

In [13]:
# -----------------------
# Torch DataLoaders
# -----------------------
def to_tensors(Xa, ya): return torch.tensor(Xa, dtype=torch.float32), torch.tensor(ya, dtype=torch.long)
Xtr_t, ytr_t = to_tensors(X_train, y_train)
Xva_t, yva_t = to_tensors(X_val,   y_val)
Xte_t, yte_t = to_tensors(X_test,  y_test)

train_loader = DataLoader(TensorDataset(Xtr_t, ytr_t), batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(TensorDataset(Xva_t, yva_t), batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(TensorDataset(Xte_t, yte_t), batch_size=BATCH_SIZE, shuffle=False)

# -----------------------
# Train + Early stopping
# -----------------------
INPUT_SIZE  = X_train.shape[2]
OUTPUT_SIZE = int(np.max([y_train.max(), y_val.max(), y_test.max()])) + 1  # should be 2

model = TCN(INPUT_SIZE, OUTPUT_SIZE).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

best_val = float("inf"); patience = 0
best_path = os.path.join(MODEL_DIR, "tcn_model.pt")

for epoch in range(EPOCHS):
    model.train(); tr_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        logits = model(xb)                              # [B,L,2]
        loss = criterion(logits.reshape(-1, OUTPUT_SIZE), yb.reshape(-1))
        loss.backward(); optimizer.step()
        tr_loss += loss.item() * xb.size(0)
    tr_loss /= len(train_loader.dataset)

    model.eval(); va_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            logits = model(xb)
            loss = criterion(logits.reshape(-1, OUTPUT_SIZE), yb.reshape(-1))
            va_loss += loss.item() * xb.size(0)
    va_loss /= len(val_loader.dataset)
    print(f"Epoch {epoch+1:02d} | train {tr_loss:.4f} | val {va_loss:.4f}")

    if va_loss < best_val:
        best_val = va_loss; patience = 0
        torch.save(model.state_dict(), best_path)
    else:
        patience += 1
        if patience >= PATIENCE:
            print("Early stopping."); break

Epoch 01 | train 0.3521 | val 0.1980
Epoch 02 | train 0.1600 | val 0.1363
Epoch 03 | train 0.1219 | val 0.1138
Epoch 04 | train 0.0983 | val 0.0855
Epoch 05 | train 0.0824 | val 0.0765
Epoch 06 | train 0.0682 | val 0.0669
Epoch 07 | train 0.0626 | val 0.0588
Epoch 08 | train 0.0557 | val 0.0495
Epoch 09 | train 0.0489 | val 0.0460
Epoch 10 | train 0.0431 | val 0.0456
Epoch 11 | train 0.0414 | val 0.0405
Epoch 12 | train 0.0397 | val 0.0412
Epoch 13 | train 0.0375 | val 0.0346
Epoch 14 | train 0.0384 | val 0.0336
Epoch 15 | train 0.0320 | val 0.0470
Epoch 16 | train 0.0389 | val 0.0312
Epoch 17 | train 0.0311 | val 0.0328
Epoch 18 | train 0.0306 | val 0.0329
Epoch 19 | train 0.0287 | val 0.0315
Epoch 20 | train 0.0313 | val 0.0305
Epoch 21 | train 0.0292 | val 0.0297
Epoch 22 | train 0.0308 | val 0.0274
Epoch 23 | train 0.0255 | val 0.0380
Epoch 24 | train 0.0243 | val 0.0302
Epoch 25 | train 0.0256 | val 0.0290
Epoch 26 | train 0.0262 | val 0.0306
Epoch 27 | train 0.0238 | val 0.0302
E

In [14]:
# -----------------------
# Evaluate
# -----------------------
model.load_state_dict(torch.load(best_path)); model.eval()
all_preds, all_tgts = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(DEVICE)
        pred = model(xb).argmax(dim=-1).cpu().numpy()
        all_preds.append(pred); all_tgts.append(yb.numpy())

all_preds = np.concatenate(all_preds, axis=0).reshape(-1)
all_tgts  = np.concatenate(all_tgts,  axis=0).reshape(-1)

prec, rec, f1, _ = precision_recall_fscore_support(all_tgts, all_preds, average='binary')
metrics = {"precision": float(prec), "recall": float(rec), "f1_score": float(f1)}
with open(os.path.join(EVAL_DIR, "tcn_metrics.json"), "w") as f:
    json.dump(metrics, f, indent=2)
print("Test metrics:", metrics)

Test metrics: {'precision': 0.9798201359947357, 'recall': 0.9695062398263701, 'f1_score': 0.974635902470954}


In [16]:
# -----------------------
# Export ONNX + feature config
# -----------------------
dummy = torch.randn(1, SEQ_LEN, INPUT_SIZE, device=DEVICE)
onnx_path = os.path.join(MODEL_DIR, "tcn_model.onnx")
torch.onnx.export(
    model, dummy, onnx_path,
    input_names=["input"], output_names=["output"],
    dynamic_axes={"input": {0:"batch", 1:"seq_len"}, "output": {0:"batch", 1:"seq_len"}},
    opset_version=14
)
print("Exported ONNX:", onnx_path)

with open(os.path.join(EVAL_DIR, "tcn_feature_config.json"), "w") as f:
    json.dump({
        "seq_len": SEQ_LEN,
        "features": FEATS,
        "scaler_mean": mean.tolist(),
        "scaler_scale": std.tolist()
    }, f, indent=2)
print("Saved feature config:", os.path.join(EVAL_DIR, "tcn_feature_config.json"))

  torch.onnx.export(


Exported ONNX: c:\Users\geloq\OneDrive\Desktop\pd-keyboard-app\backend\ml\models\tcn_model.onnx
Saved feature config: c:\Users\geloq\OneDrive\Desktop\pd-keyboard-app\backend\ml\evaluation\tcn_feature_config.json
