# Data Analysis

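In this notebook, we load the framewise displacement data (PPMI and HCP) and perform exploratory data analysis by generating visualizations of the data. We then build a combined dataset and train and evaluate sequence classifiers (GRU, LSTM, and Transformer) on it.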

In [17]:
# add modules to path
import sys
sys.path.insert(1, '../src')

# library imports
import torch
import random
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# project imports
import data_processing as dp
import train_eval as te
import models_nn as mnn

# autoreload all modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Load both cohorts through the project's data_processing helpers
# (per the original note the data comes from .npz files — not verifiable from this cell).
# PPMI: yields the data dictionary plus two key collections; later cells label
# the `pd_keys` entries 1 and the `control_keys` entries 0.
data_dict_ppmi, pd_keys, control_keys = dp.load_ppmi_data()
# HCP: yields a single dictionary; a slice of its keys is later added as extra label-0 samples.
data_dict_hcp = dp.load_hcp_data()

Loaded 320/364 PD subjects and 44/88 Control subjects
Loaded 1445 subject-run combinations


In [19]:
# ----------------------- Hyper-parameters ----------------------- #
MAX_LEN = 200  # truncate / zero-pad sequences to this length
BATCH_SIZE = 32
EPOCHS = 100
LR = 2e-3
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
RNG_SEED = 42
# Number of HCP entries added as extra label-0 samples.
# NOTE(review): 44 PPMI controls + 276 HCP entries = 320, which equals the PD
# count printed by the PPMI loader — presumably chosen to balance the two
# classes; confirm before changing.
N_HCP_CONTROLS = 276

# Seed every RNG the notebook touches so that runs are repeatable.
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)
torch.manual_seed(RNG_SEED)

# Shuffled list of all PPMI keys. It is not consumed by the dataset built
# below; it is kept because other cells may rely on it and because the
# shuffle advances the `random` module's state.
keys = pd_keys + control_keys
random.shuffle(keys)

# Only the first N_HCP_CONTROLS keys (in dictionary order) are used.
hcp_keys = list(data_dict_hcp.keys())[:N_HCP_CONTROLS]

# Each `datasets` entry is (data dictionary, keys to take from it, label);
# `metric_map` names the metric to read for the entry at the same position.
dataset = dp.MotionDataset(
    datasets=[
        (data_dict_ppmi, pd_keys, 1),         # PPMI PD subjects, label 1
        (data_dict_ppmi, control_keys, 0),    # PPMI Control subjects, label 0
        (data_dict_hcp, hcp_keys, 0),         # HCP subjects, treated as controls (label 0)
    ],
    metric_map=[
        "framewise_displacement",              # for PPMI PD
        "framewise_displacement",              # for PPMI Control
        "framewise_displacement_equivalent"    # for HCP
    ],
    max_len=MAX_LEN,  # use the constant above instead of a second hard-coded 200
)

# 80/20 train/validation split, seeded with the same RNG_SEED.
train_loader, val_loader = te.train_val_split(
    dataset, val_size=0.2, random_state=RNG_SEED, batch_size=BATCH_SIZE
)


In [20]:
# Sanity check: pull a single batch from the training loader and display the
# shape of its second element (presumably the label tensor — TODO confirm).
# The recorded output, torch.Size([32]), matches BATCH_SIZE.
# NOTE(review): if train_loader is a shuffling torch DataLoader, creating this
# iterator draws from the global torch RNG, so running or skipping this cell
# may change the random state seen by training — confirm.
next(iter(train_loader))[1].shape

torch.Size([32])

In [None]:
# Candidate classifiers, all constructed up front (in this order) with their
# default arguments before any training starts.
models = dict(
    GRU=mnn.RNNClassifier(cell="gru"),
    LSTM=mnn.RNNClassifier(cell="lstm"),
    Transformer=mnn.TransformerClassifier(),
)

# Train each model in turn on the same split, then report its validation
# metrics, confusion matrix and per-class report.
for name, model in models.items():
    print(f"\nTraining {name}…")
    te.train(model, train_loader, val_loader, device=DEVICE, epochs=EPOCHS, lr=LR)

    # evaluate() returns five values; only the first three are reported here.
    acc, bal_acc, auc, _, _ = te.evaluate(model, val_loader, device=DEVICE)
    print(f"{name} final  ACC={acc:.3f}  BAL_ACC={bal_acc:.3f}  AUC={auc:.3f}")

    # Validation-set predictions, compared against the targets.
    preds, targets = te.get_predictions(model, val_loader, device=DEVICE)
    cm = confusion_matrix(targets, preds)
    print("Confusion matrix:\n", cm)
    print(classification_report(targets, preds, digits=3))



Training GRU…
Epoch 05/100  val_acc=0.523  val_bal_acc=0.539  val_auc=0.586
Epoch 10/100  val_acc=0.523  val_bal_acc=0.554  val_auc=0.596
Epoch 15/100  val_acc=0.570  val_bal_acc=0.581  val_auc=0.674
Epoch 20/100  val_acc=0.648  val_bal_acc=0.621  val_auc=0.697
Epoch 25/100  val_acc=0.625  val_bal_acc=0.613  val_auc=0.690
Epoch 30/100  val_acc=0.688  val_bal_acc=0.680  val_auc=0.704
Epoch 35/100  val_acc=0.688  val_bal_acc=0.680  val_auc=0.726
Epoch 40/100  val_acc=0.680  val_bal_acc=0.664  val_auc=0.756
Epoch 45/100  val_acc=0.688  val_bal_acc=0.685  val_auc=0.741
Epoch 50/100  val_acc=0.719  val_bal_acc=0.710  val_auc=0.791
Epoch 55/100  val_acc=0.703  val_bal_acc=0.703  val_auc=0.776
Epoch 60/100  val_acc=0.703  val_bal_acc=0.706  val_auc=0.763
Epoch 65/100  val_acc=0.695  val_bal_acc=0.696  val_auc=0.755
Epoch 70/100  val_acc=0.711  val_bal_acc=0.708  val_auc=0.770
Epoch 75/100  val_acc=0.711  val_bal_acc=0.714  val_auc=0.771
Epoch 80/100  val_acc=0.727  val_bal_acc=0.725  val_auc