In [1]:
# Cell A — imports & constants
from pathlib import Path
import numpy as np, pickle, os
import pandas as pd

# Project root. Defaults to the original machine-specific location; set the
# PD_PROJECT_DIR environment variable to run the notebook on another machine
# without editing this cell.
PROJECT = Path(os.environ.get(
    "PD_PROJECT_DIR",
    r"C:\Users\aibel\Desktop\Heizel Ann Joseph\Parkinsons Disease",
))

# Every other location is derived from PROJECT.
PER_FILE_DIR = PROJECT / "data" / "processed" / "per_file_seqs"   # per-file .npz sequences
META_FILE = PROJECT / "data" / "PaHaW_dataset" / "PaHaW_files" / "corpus_PaHaW.xlsx"
MODEL_PATH = PROJECT / "data" / "processed" / "pd_blstm_model.h5"   # update if needed
OUT_DIR = PROJECT / "data" / "processed"
# Side effect: creates the output directory (and parents) if it is missing.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Sanity report: the three "exists" lines print booleans, not paths.
print("PROJECT:", PROJECT)
print("PER_FILE_DIR exists:", PER_FILE_DIR.exists())
print("META exists:", META_FILE.exists())
print("BLSTM model path (for encoder):", MODEL_PATH.exists())


PROJECT: C:\Users\aibel\Desktop\Heizel Ann Joseph\Parkinsons Disease
PER_FILE_DIR exists: True
META exists: True
BLSTM model path (for encoder): True


In [2]:
# Cell B — build patient index mapping subject -> per-file npz list and label
from collections import Counter

# Subject metadata. IDs are zero-padded to 5 characters because they are looked
# up below using the subject prefix taken from the per-file .npz names.
meta = pd.read_excel(META_FILE)
meta['ID'] = meta['ID'].astype(str).str.zfill(5)
meta_map = dict(zip(meta['ID'], meta['Disease']))  # 'PD' or 'H'

# collect per-file .npz (created earlier by 04_feature_merge)
npz_files = sorted(PER_FILE_DIR.glob("*.npz"))
print("Per-file npz count:", len(npz_files))

# Group file paths by subject: the subject id is whatever precedes the first
# "__" in the file stem.
patient_index = {}
for npz_path in npz_files:
    sid = npz_path.stem.split("__")[0]
    patient_index.setdefault(sid, []).append(str(npz_path))

# Subjects without a metadata row still fall back to label 0 below (unchanged
# behaviour), but that fallback is no longer silent: a systematic ID mismatch
# would otherwise mislabel every subject as healthy without any visible sign.
missing_meta = sorted(sid for sid in patient_index if sid not in meta_map)
if missing_meta:
    print("WARNING:", len(missing_meta),
          "subject(s) not found in metadata; labelled 0 by default:", missing_meta)

# One record per subject, ordered by subject id; label is 1 for 'PD', else 0.
rows = []
for sid, paths in sorted(patient_index.items()):
    lab = 1 if meta_map.get(sid, "H") == "PD" else 0
    rows.append({'subject': sid, 'paths': paths, 'label': int(lab), 'n_files': len(paths)})

outp = OUT_DIR / "patient_index.pkl"
with open(outp, "wb") as f:
    pickle.dump(rows, f)
print("Saved patient_index len:", len(rows), "->", outp)

# quick peek: each pair is (files per subject, number of subjects with that count)
print("subjects by sample count (top 8):")
cnts = Counter([r['n_files'] for r in rows])
print(cnts.most_common(8))


Per-file npz count: 597
Saved patient_index len: 75 -> C:\Users\aibel\Desktop\Heizel Ann Joseph\Parkinsons Disease\data\processed\patient_index.pkl
subjects by sample count (top 8):
[(8, 72), (7, 3)]


In [3]:
# Cell C — create encoder (extract intermediate layer output)
from tensorflow.keras.models import load_model, Model

print("Loading BLSTM model:", MODEL_PATH)
model = load_model(MODEL_PATH, compile=False)
print("Model loaded. Summary (run to inspect layer names):")
model.summary()

# Auto-select the encoder layer by (auto-generated) layer name, in priority
# order: last Bidirectional, else last Dropout, else last Dense. "Last" means
# closest to the model output, hence the reversed scan.
encoder_layer_name = None
for keyword in ('bidirectional', 'dropout', 'dense'):
    encoder_layer_name = next(
        (layer.name for layer in reversed(model.layers) if keyword in layer.name),
        None,
    )
    if encoder_layer_name is not None:
        break

print("Selected encoder layer:", encoder_layer_name)
if encoder_layer_name is None:
    # Fail with an actionable message instead of an opaque get_layer(None) error.
    raise RuntimeError(
        "Could not auto-select an encoder layer: no layer name contains "
        "'bidirectional', 'dropout' or 'dense'. Available layers: "
        f"{[layer.name for layer in model.layers]}"
    )

# The encoder shares the trained model's input and stops at the selected layer.
encoder = Model(inputs=model.input, outputs=model.get_layer(encoder_layer_name).output)
# optionally save encoder (not mandatory)
encoder_save = OUT_DIR / "pd_encoder.h5"
encoder.save(encoder_save)
print("Encoder saved to:", encoder_save)


Loading BLSTM model: C:\Users\aibel\Desktop\Heizel Ann Joseph\Parkinsons Disease\data\processed\pd_blstm_model.h5
Model loaded. Summary (run to inspect layer names):
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_1 (Masking)         (None, 267, 13)           0         
                                                                 
 bidirectional_2 (Bidirectio  (None, 267, 192)         84480     
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 267, 192)          0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 96)               92544     
 nal)                                                            
                                                                 
 dropout_4 (Dropout)

In [6]:
# Cell D — encode per-file sequences and aggregate per-subject (fixed; pads/truncates to encoder input length)
import numpy as np, pickle, os
from tqdm import tqdm   # safe console progress bar
from tensorflow.keras.models import load_model

# load patient index
# NOTE(review): pickle.load executes arbitrary code from the file; only acceptable
# here because patient_index.pkl is expected to be produced locally by this notebook.
with open(OUT_DIR / "patient_index.pkl", "rb") as f:
    patient_rows = pickle.load(f)
print("Subjects found:", len(patient_rows))

# load encoder (Keras)
ENCODER = load_model(OUT_DIR / "pd_encoder.h5", compile=False)
print("Encoder loaded. Encoder input shape:", ENCODER.input_shape, "output shape:", ENCODER.output_shape)

# Determine expected timesteps (None, timesteps, features)
# input_shape can be like (None, 267, 13) => timesteps = 267
inp_shape = ENCODER.input_shape
if len(inp_shape) != 3:
    raise RuntimeError(f"Unexpected encoder.input_shape: {inp_shape}")
expected_timesteps, expected_features = inp_shape[1], inp_shape[2]

print("Expected timesteps:", expected_timesteps, "features:", expected_features)


def _fit_to_length(seq, timesteps):
    """Return `seq` (T, F) with exactly `timesteps` rows.

    Shorter sequences are zero-padded at the end, longer ones keep only their
    first `timesteps` rows. If `timesteps` is None (variable-length input) the
    sequence is returned unchanged.
    """
    n_steps = seq.shape[0]
    if timesteps is None or n_steps == timesteps:
        return seq
    if n_steps < timesteps:
        # pad at the end with zeros (Masking layer must ignore zeros)
        pad = np.zeros((timesteps - n_steps, seq.shape[1]), dtype=seq.dtype)
        return np.vstack([seq, pad])
    return seq[:timesteps, :]


patient_embeddings = []
labels = []
ids = []

skipped_files = 0
total_files = 0

for r in tqdm(patient_rows, desc="Subjects"):
    per_file_embs = []
    for pstr in r['paths']:
        total_files += 1
        try:
            # The context manager closes the .npz archive as soon as the array
            # has been read, so file handles do not pile up across the loop.
            # NOTE(review): allow_pickle=True can execute code from the file;
            # drop it if 'seq' is stored as a plain numeric array.
            with np.load(pstr, allow_pickle=True) as d:
                seq = d['seq'].astype(np.float32)  # shape (T, F)

            # ensure feature dimension matches (a None feature dim accepts any width)
            if seq.ndim != 2 or (expected_features is not None and seq.shape[1] != expected_features):
                print("Skipping file (feature dim mismatch):", pstr, "expected features:", expected_features, "found:", seq.shape)
                skipped_files += 1
                continue

            # (T, F) -> (1, timesteps, F): a batch holding this single sequence
            seq_p = _fit_to_length(seq, expected_timesteps)[np.newaxis, ...]

            out = ENCODER.predict(seq_p, verbose=0)
            arr = np.asarray(out)

            if arr.ndim == 3:
                # encoder returns sequences (1, timesteps, units): pool across time
                # NOTE(review): this mean also covers zero-padded timesteps;
                # confirm that is intended before using a sequence-returning encoder.
                emb = arr.mean(axis=1).ravel()
            else:
                emb = arr.ravel()

            per_file_embs.append(emb)
        except Exception as e:
            # Best effort: a file that cannot be read or encoded is reported and skipped.
            print("Skipping file", pstr, "error:", repr(e))
            skipped_files += 1
            continue

    if len(per_file_embs) == 0:
        print("No embeddings for subject", r['subject'], " — skipping subject")
        continue

    # Subject embedding = mean of that subject's per-file embeddings.
    subj_emb = np.mean(np.vstack(per_file_embs), axis=0)
    patient_embeddings.append(subj_emb)
    labels.append(r['label'])
    ids.append(r['subject'])

if len(patient_embeddings) == 0:
    raise RuntimeError("No patient embeddings were produced. Check for earlier printed errors.")

# Rows of X, y and ids are aligned: one entry per subject that produced an embedding.
X = np.vstack(patient_embeddings)
y = np.array(labels)
ids = np.array(ids)

out_file = OUT_DIR / "patient_embeddings.npz"
np.savez_compressed(out_file, X=X, y=y, ids=ids)

print("Done. Total files seen:", total_files, "skipped files:", skipped_files)
print("Saved patient embeddings:", out_file)
print("X shape:", X.shape, "y shape:", y.shape, "n_ids:", len(ids))


Exception ignored in: <function tqdm.__del__ at 0x000001AF939948B0>
Traceback (most recent call last):
  File "c:\Users\aibel\Desktop\Heizel Ann Joseph\Parkinsons Disease\.venv\lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\aibel\Desktop\Heizel Ann Joseph\Parkinsons Disease\.venv\lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


Subjects found: 75
Encoder loaded. Encoder input shape: (None, 267, 13) output shape: (None, 96)
Expected timesteps: 267 features: 13


Subjects: 100%|██████████| 75/75 [00:35<00:00,  2.14it/s]

Done. Total files seen: 597 skipped files: 0
Saved patient embeddings: C:\Users\aibel\Desktop\Heizel Ann Joseph\Parkinsons Disease\data\processed\patient_embeddings.npz
X shape: (75, 96) y shape: (75,) n_ids: 75



