In [1]:
# Cell 1 â€” imports & paths
import numpy as np, pandas as pd, pickle, math, os
from pathlib import Path
import matplotlib.pyplot as plt
%matplotlib inline

PROJECT = Path(r"C:\Users\aibel\Desktop\Heizel Ann Joseph\Parkinsons Disease")
PROCESSED = PROJECT / "data" / "processed"
PAHAW_PUBLIC = PROJECT / "data" / "PaHaW_dataset" / "PaHaW_public"
PER_FILE_DIR = PROCESSED / "per_file_seqs"   # previously created .npz per-file sequences
PATIENT_INDEX = PROCESSED / "patient_index.pkl"   # created during preprocessing
ENCODER_PATH = PROCESSED / "pd_encoder.h5"        # encoder (BLSTM -> embedding)
PATIENT_CLF_PATH = PROCESSED / "patient_level_clf.h5"  # patient classifier
PER_FILE_DIR.exists(), PATIENT_INDEX.exists(), ENCODER_PATH.exists(), PATIENT_CLF_PATH.exists()


(True, True, True, True)

In [13]:

import numpy as np, pandas as pd, pickle
from pathlib import Path
from tensorflow.keras.models import load_model

PROJECT = Path(r"C:\Users\aibel\Desktop\Heizel Ann Joseph\Parkinsons Disease")
PROCESSED = PROJECT / "data" / "processed"
PATIENT_INDEX = PROCESSED / "patient_index.pkl"
ENCODER_PATH = PROCESSED / "pd_encoder.h5"
PATIENT_CLF_PATH = PROCESSED / "patient_level_clf.h5"
PAHAW_PUBLIC = PROJECT / "data" / "PaHaW_dataset" / "PaHaW_public"

# load resources
assert PATIENT_INDEX.exists(), "patient_index.pkl missing"
ENC = load_model(str(ENCODER_PATH), compile=False)
PAT_CLF = load_model(str(PATIENT_CLF_PATH), compile=False)
with open(PATIENT_INDEX, "rb") as f:
    patient_rows = pickle.load(f)

timesteps = ENC.input_shape[1]
features = ENC.input_shape[2]

def load_per_file_seq(npz_path):
    d = np.load(str(npz_path), allow_pickle=True)
    seq = d['seq'].astype(np.float32)
    T, F = seq.shape
    if F != features:
        raise ValueError(f"Feature dim mismatch: expected {features}, got {F} for {npz_path}")
    if T < timesteps:
        pad = np.zeros((timesteps - T, features), dtype=np.float32)
        seq_p = np.vstack([seq, pad])
    else:
        seq_p = seq[:timesteps, :]
    return seq_p

SUBJECT = "00025"
row = next((r for r in patient_rows if r['subject'] == SUBJECT), None)
if row is None:
    raise ValueError(f"Subject {SUBJECT} not present in patient_index.pkl")

print("Subject:", SUBJECT, "true label:", row.get('label'))
per_file_info = []
for pstr in row['paths']:
    try:
        seq_p = load_per_file_seq(pstr)
    except Exception as e:
        per_file_info.append((Path(pstr).name, np.nan, "load_err"))
        continue
    emb = ENC.predict(seq_p.reshape(1,timesteps,features), verbose=0)
    emb_vec = emb.mean(axis=1).ravel() if emb.ndim==3 else emb.ravel()
    prob = float(PAT_CLF.predict(emb_vec.reshape(1,-1)).ravel()[0])
    per_file_info.append((Path(pstr).name, prob, seq_p.shape[0]))

df = pd.DataFrame(per_file_info, columns=['file','prob','timesteps']).sort_values('prob', ascending=False).reset_index(drop=True)
pd.set_option('display.max_rows', None)
print("\nPer-file probabilities (high->low):")
print(df.to_string(index=False))

# Aggregations
probs = df['prob'].dropna().to_numpy()
weights = df['timesteps'].to_numpy(dtype=float)
mean_p = float(np.mean(probs)) if len(probs)>0 else np.nan
median_p = float(np.median(probs)) if len(probs)>0 else np.nan
majority = int(np.sum(probs>=0.5) >= (len(probs)/2)) if len(probs)>0 else np.nan
wmean = float(np.average(probs, weights=weights)) if len(probs)>0 and weights.sum()>0 else mean_p

print("\nAggregations:")
print(f" mean prob = {mean_p:.4f}")
print(f" median prob = {median_p:.4f}")
print(f" weighted-mean (by timesteps) = {wmean:.4f}")
print(f" majority vote (per-file >=0.5) = {majority}  (count >=0.5 = {int((probs>=0.5).sum())}/{len(probs)})")

print("\nSuggested labels by rule:")
print(" mean  ->", "PD" if mean_p>=0.5 else "H")
print(" median->", "PD" if median_p>=0.5 else "H")
print(" wmean ->", "PD" if wmean>=0.5 else "H")
print(" majority->", "PD" if majority==1 else "H")


Subject: 00025 true label: 1

Per-file probabilities (high->low):
          file     prob  timesteps
00025__3_1.npz 0.509000        267
00025__1_1.npz 0.505612        267
00025__4_1.npz 0.504479        267
00025__7_1.npz 0.495415        267
00025__6_1.npz 0.495038        267
00025__2_1.npz 0.489056        267
00025__8_1.npz 0.485159        267
00025__5_1.npz 0.481189        267

Aggregations:
 mean prob = 0.4956
 median prob = 0.4952
 weighted-mean (by timesteps) = 0.4956
 majority vote (per-file >=0.5) = 0  (count >=0.5 = 3/8)

Suggested labels by rule:
 mean  -> H
 median-> H
 wmean -> H
 majority-> H


In [14]:
import numpy as np, pandas as pd, pickle, matplotlib.pyplot as plt
from pathlib import Path
from tensorflow.keras.models import load_model

PROJECT = Path(r"C:\Users\aibel\Desktop\Heizel Ann Joseph\Parkinsons Disease")
PROCESSED = PROJECT / "data" / "processed"
PATIENT_INDEX = PROCESSED / "patient_index.pkl"
ENCODER_PATH = PROCESSED / "pd_encoder.h5"
PATIENT_CLF_PATH = PROCESSED / "patient_level_clf.h5"
PAHAW_PUBLIC = PROJECT / "data" / "PaHaW_dataset" / "PaHaW_public"

ENC = load_model(str(ENCODER_PATH), compile=False)
PAT_CLF = load_model(str(PATIENT_CLF_PATH), compile=False)
with open(PATIENT_INDEX, "rb") as f:
    patient_rows = pickle.load(f)
timesteps = ENC.input_shape[1]
features = ENC.input_shape[2]

def load_per_file_seq(npz_path):
    d = np.load(str(npz_path), allow_pickle=True)
    seq = d['seq'].astype(np.float32)
    T, F = seq.shape
    if T < timesteps:
        pad = np.zeros((timesteps - T, F), dtype=np.float32)
        seq = np.vstack([seq, pad])
    else:
        seq = seq[:timesteps,:]
    return seq

SUBJECT = "00025"
row = next((r for r in patient_rows if r['subject']==SUBJECT), None)
print("Subject:", SUBJECT, "true label:", row.get('label'))

per_file = []
svc_paths_for_plot = []
for pstr in row['paths']:
    try:
        seq = load_per_file_seq(pstr)
    except Exception as e:
        per_file.append((Path(pstr).name, np.nan))
        continue
    emb = ENC.predict(seq.reshape(1,timesteps,features), verbose=0)
    emb_vec = emb.mean(axis=1).ravel() if emb.ndim==3 else emb.ravel()
    prob = float(PAT_CLF.predict(emb_vec.reshape(1,-1)).ravel()[0])
    per_file.append((Path(pstr).name, prob))
    svc_candidate = PAHAW_PUBLIC / Path(pstr).name.replace(".npz", ".svc")
    if svc_candidate.exists():
        svc_paths_for_plot.append(svc_candidate)

df = pd.DataFrame(per_file, columns=['file','prob']).sort_values('prob', ascending=False).reset_index(drop=True)
print(df.to_string(index=False))

probs = df['prob'].dropna().to_numpy()
weights = np.array([int(name.split("__")[-1].split(".")[0].split("_")[-1]) if True else 1 for name in df['file']]) # fallback weights
mean_p = float(np.mean(probs))
median_p = float(np.median(probs))
majority = int(np.sum(probs>=0.5) >= (len(probs)/2))
wmean = float(np.average(probs, weights=weights)) if weights.sum()>0 else mean_p

print("\nAggregations:")
print(f" mean = {mean_p:.4f}  -> {'PD' if mean_p>=0.5 else 'H'}")
print(f" median = {median_p:.4f} -> {'PD' if median_p>=0.5 else 'H'}")
print(f" wmean = {wmean:.4f} -> {'PD' if wmean>=0.5 else 'H'}")
print(f" majority = {int((probs>=0.5).sum())}/{len(probs)} -> {'PD' if majority==1 else 'H'}")

if len(svc_paths_for_plot)>0:
    nshow = min(4, len(svc_paths_for_plot))
    plt.figure(figsize=(6, 2.2*nshow))
    for i, sp in enumerate(svc_paths_for_plot[:nshow]):
        lines = open(sp, 'r', errors='ignore').read().splitlines()
        pts = []
        for ln in lines:
            parts = ln.strip().split()
            if len(parts)>=8 and parts[0].isdigit():
                try:
                    _, x,y, *_ = parts[:8]
                    pts.append([float(x), float(y)])
                except:
                    continue
        if not pts:
            continue
        pts = np.array(pts)
        plt.subplot(nshow,1,i+1)
        plt.plot(pts[:,0], pts[:,1], linewidth=0.6, marker='.', markersize=1)
        plt.gca().invert_yaxis(); plt.axis('equal'); plt.title(sp.name)
    plt.suptitle(f"Sample spirals for subject {SUBJECT}")
    plt.show()
else:
    print("No .svc files found for plotting for this subject.")


Subject: 00025 true label: 1
          file     prob
00025__3_1.npz 0.509000
00025__1_1.npz 0.505612
00025__4_1.npz 0.504479
00025__7_1.npz 0.495415
00025__6_1.npz 0.495038
00025__2_1.npz 0.489056
00025__8_1.npz 0.485159
00025__5_1.npz 0.481189

Aggregations:
 mean = 0.4956  -> H
 median = 0.4952 -> H
 wmean = 0.4956 -> H
 majority = 3/8 -> H
No .svc files found for plotting for this subject.
