In [1]:
import os, glob
import numpy as np
import pandas as pd

from scipy.signal import savgol_filter, find_peaks

PROJECT_ROOT = r"C:\Users\aibel\OneDrive\Desktop\Heizel Ann Joseph\Parkinsons Disease"
DATA_ROOT = os.path.join(PROJECT_ROOT, "data", "PaHaW_dataset")
PAHAW_PUBLIC = os.path.join(DATA_ROOT, "PaHaW_public")

OUT_DIR = os.path.join(PROJECT_ROOT, "data", "processed", "per_file_seqs")
os.makedirs(OUT_DIR, exist_ok=True)

print("Output dir:", OUT_DIR)


Output dir: C:\Users\aibel\OneDrive\Desktop\Heizel Ann Joseph\Parkinsons Disease\data\processed\per_file_seqs


In [2]:
def load_svc(path):
    data = np.loadtxt(path, skiprows=1)
    df = pd.DataFrame({
        "y": data[:, 0],
        "x": data[:, 1],
        "time": data[:, 2] - np.min(data[:, 2]),
        "pen": data[:, 3],
        "azim": data[:, 4],
        "alt": data[:, 5],
        "press": data[:, 6],
    })
    return df


def preprocess_df(df):
    df = df.copy()

    if len(df) >= 7:
        df['x_s'] = savgol_filter(df['x'], 7, 2)
        df['y_s'] = savgol_filter(df['y'], 7, 2)
    else:
        df['x_s'] = df['x']
        df['y_s'] = df['y']

    t = df['time'].values.astype(float)
    vx = np.gradient(df['x_s'], t)
    vy = np.gradient(df['y_s'], t)

    df['speed'] = np.sqrt(vx**2 + vy**2)

    ax = np.gradient(vx, t)
    ay = np.gradient(vy, t)

    denom = (vx**2 + vy**2)**1.5
    denom[denom == 0] = np.nan
    df['curvature'] = np.abs(vx*ay - vy*ax) / denom

    return df


In [3]:
def segment_by_pen(df):
    strokes = []
    in_stroke = False
    start = 0

    for i, pen in enumerate(df['pen']):
        if pen == 1 and not in_stroke:
            start = i
            in_stroke = True
        elif pen == 0 and in_stroke:
            strokes.append((start, i-1))
            in_stroke = False

    if in_stroke:
        strokes.append((start, len(df)-1))

    return strokes


def split_stroke_by_velocity(df, start, end, prom=0.05, dist=8):
    seg = df.iloc[start:end+1]
    speed = seg['speed'].values

    peaks, _ = find_peaks(speed,
                          prominence=prom*np.nanmax(speed),
                          distance=dist)

    if len(peaks) == 0:
        return [(start, end)]

    cuts = [start] + [start + int(p) for p in peaks] + [end]
    subs = []

    for i in range(len(cuts)-1):
        s, e = cuts[i], cuts[i+1]
        if e - s >= 6:
            subs.append((s, e))

    return subs


In [4]:
def compute_substroke_feature_vector(seg, df_global):
    duration = seg['time'].iloc[-1] - seg['time'].iloc[0]

    dx = seg['x_s'].max() - seg['x_s'].min()
    dy = seg['y_s'].max() - seg['y_s'].min()
    amplitude = np.sqrt(dx*dx + dy*dy)

    mean_speed = seg['speed'].mean()
    mean_press = seg['press'].mean()
    mean_curv  = seg['curvature'].mean()

    # Beta proxy
    beta_A = mean_speed
    beta_a = 2.0
    beta_b = 2.0

    # Ellipse proxy
    ell_a = amplitude
    ell_b = amplitude / 2 if amplitude != 0 else 0
    ell_e = np.sqrt(1 - (ell_b**2)/(ell_a**2)) if ell_a != 0 else 0

    f_speed_high = 1.0 if mean_speed > np.nanmedian(df_global['speed']) else 0.0
    f_press_high = 1.0 if mean_press > np.nanmedian(df_global['press']) else 0.0
    f_curv_high  = 1.0 if mean_curv  > np.nanmedian(df_global['curvature']) else 0.0

    return np.array([
        duration, amplitude, mean_speed, mean_press,
        beta_A, beta_a, beta_b,
        ell_a, ell_b, ell_e,
        f_speed_high, f_press_high, f_curv_high,
        mean_curv   # ðŸ‘ˆ 14th feature (CRITICAL)
    ], dtype=float)


In [5]:
def process_file(svc_path):
    df = load_svc(svc_path)
    df = preprocess_df(df)

    pen_strokes = segment_by_pen(df)

    features = []
    for (s, e) in pen_strokes:
        subs = split_stroke_by_velocity(df, s, e)
        for (ss, ee) in subs:
            seg = df.iloc[ss:ee+1]
            if len(seg) < 6:
                continue
            vec = compute_substroke_feature_vector(seg, df)
            features.append(vec)

    if len(features) == 0:
        return None

    return np.vstack(features)


In [6]:
svc_files = sorted(glob.glob(os.path.join(PAHAW_PUBLIC, "*", "*.svc")))
print("Total svc files:", len(svc_files))

saved = 0

for svc in svc_files:
    feats = process_file(svc)
    if feats is None:
        continue

    fname = os.path.basename(svc).replace(".svc", ".npz")
    out_path = os.path.join(OUT_DIR, fname)

    np.savez_compressed(out_path, X=feats)
    saved += 1

print("Saved sequences:", saved)


Total svc files: 597


  a = -(dx2)/(dx1 * (dx1 + dx2))
  c = dx1 / (dx2 * (dx1 + dx2))
  out[tuple(slice1)] = a * f[tuple(slice2)] + b * f[tuple(slice3)] + c * f[tuple(slice4)]
  a = -(dx2)/(dx1 * (dx1 + dx2))
  c = dx1 / (dx2 * (dx1 + dx2))
  a = -(dx2)/(dx1 * (dx1 + dx2))
  c = dx1 / (dx2 * (dx1 + dx2))
  out[tuple(slice1)] = a * f[tuple(slice2)] + b * f[tuple(slice3)] + c * f[tuple(slice4)]
  a = -(dx2)/(dx1 * (dx1 + dx2))
  c = dx1 / (dx2 * (dx1 + dx2))
  a = -(dx2)/(dx1 * (dx1 + dx2))
  c = dx1 / (dx2 * (dx1 + dx2))
  out[tuple(slice1)] = a * f[tuple(slice2)] + b * f[tuple(slice3)] + c * f[tuple(slice4)]
  a = -(dx2)/(dx1 * (dx1 + dx2))
  c = dx1 / (dx2 * (dx1 + dx2))
  a = -(dx2)/(dx1 * (dx1 + dx2))
  c = dx1 / (dx2 * (dx1 + dx2))
  out[tuple(slice1)] = a * f[tuple(slice2)] + b * f[tuple(slice3)] + c * f[tuple(slice4)]
  a = -(dx2)/(dx1 * (dx1 + dx2))
  c = dx1 / (dx2 * (dx1 + dx2))
  a = -(dx2)/(dx1 * (dx1 + dx2))
  c = dx1 / (dx2 * (dx1 + dx2))
  out[tuple(slice1)] = a * f[tuple(slice2)] + b * f[tup

Saved sequences: 597


In [7]:
sample_npz = sorted(glob.glob(os.path.join(OUT_DIR, "*.npz")))[0]
d = np.load(sample_npz)

print("Sample file:", os.path.basename(sample_npz))
print("Feature shape:", d['X'].shape)  # (N_substrokes, 14)


Sample file: 00001__1_1.npz
Feature shape: (63, 14)
