# 01 — Preprocess NACC/SCAN MRI Volumes

This notebook:
1. Loads raw NACC UDS clinical data (CSV)
2. Constructs ordinal CDR labels and conversion labels
3. Performs subject-level train/val/test split
4. Builds a manifest CSV linking each NIfTI scan to its labels and split
5. Verifies the preprocessing pipeline (NIfTI volume → normalised 128³ tensor) on a sample scan

In [None]:
# Mount Drive (Colab)
# from google.colab import drive
# drive.mount('/content/drive')

import sys

# Root of the project checkout; it must be importable so `config` (and the
# `data` package used in later cells) can be found.
PROJECT_ROOT = '/content/drive/MyDrive/alzheimer-research'  # adjust path

# Keep the project root first on sys.path, but do not prepend it again when
# this cell is re-run: the original unconditional insert added one duplicate
# entry per execution.
if sys.path[:1] != [PROJECT_ROOT]:
    sys.path.insert(0, PROJECT_ROOT)

from config import Config

# Shared configuration object used by every later cell.
cfg = Config()
# NOTE(review): presumably creates the output directories named in the
# config — confirm against config.py.
cfg.ensure_dirs()

## 1. Load NACC UDS Data

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# === CONFIGURE THESE PATHS ===
NACC_CSV_PATH = 'path/to/nacc_uds_longitudinal.csv'  # NACC UDS export
MRI_DIR = 'path/to/nifti_volumes/'                    # directory of .nii.gz files
# Later cells build output file names with `OUTPUT_DIR / '<name>'`, which
# only works on a Path. Wrapping here is a no-op if the config already
# stores a Path and prevents a TypeError if it stores a string.
OUTPUT_DIR = Path(cfg.embedding_dir)

# Load NACC UDS data (one row per visit; summarised below by subject and CDR)
uds = pd.read_csv(NACC_CSV_PATH)
print(f'Total visits: {len(uds)}')
print(f'Unique subjects: {uds["NACCID"].nunique()}')
print(f'CDR distribution:\n{uds["CDRGLOB"].value_counts().sort_index()}')

## 2. Build Labels

In [None]:
from data.label_construction import (
    map_cdr_to_ordinal,
    build_conversion_labels,
    subject_level_split,
)

# Attach an ordinal class to every visit, derived from its global CDR score,
# then show how many visits fall into each class.
uds['ordinal_label'] = uds['CDRGLOB'].apply(map_cdr_to_ordinal)
print(
    'Ordinal class distribution:',
    uds['ordinal_label'].value_counts().sort_index(),
    sep='\n',
)

# Conversion labels are built from the full visit table; the column names and
# the look-ahead window are passed explicitly so the helper makes no
# assumptions about the NACC schema.
conversion_df = build_conversion_labels(
    uds,
    **{
        'window_months': cfg.conversion_window_months,
        'subject_col': 'NACCID',
        'date_col': 'VISITDATE',
        'cdr_col': 'CDRGLOB',
    },
)
print(f'\nMCI subjects: {len(conversion_df)}')
print(f'Status distribution:\n{conversion_df["status"].value_counts()}')

## 3. Subject-Level Split

In [None]:
# Baseline = each subject's earliest visit that carries a usable ordinal label.
#
# GroupBy.first() is deliberately not used here: it returns the first
# *non-null value of each column independently*, so a "baseline" row could be
# stitched together from several different visits. Selecting a whole row keeps
# every baseline field from the same visit.
labelled_visits = uds.dropna(subset=['NACCID', 'ordinal_label'])
baseline = (
    labelled_visits
    # NOTE(review): VISITDATE is sorted with whatever dtype read_csv gave it;
    # string dates only order chronologically if they are ISO-formatted —
    # confirm, or parse the column to datetime upstream.
    .sort_values('VISITDATE', kind='stable')   # stable: ties keep file order
    .drop_duplicates(subset='NACCID', keep='first')
    # Order by subject ID, matching the order groupby() produced before, so
    # the seeded split sees the subjects in the same order.
    .sort_values('NACCID')
    .reset_index(drop=True)
)

# Subjects without any labelled visit cannot be stratified; report them
# instead of passing NaN labels to the splitter.
n_unlabelled = uds['NACCID'].nunique() - len(baseline)
if n_unlabelled:
    print(f'Excluded {n_unlabelled} subjects with no valid ordinal label')

subjects = baseline['NACCID'].values
baseline_labels = baseline['ordinal_label'].values

splits = subject_level_split(
    subjects, baseline_labels,
    train_frac=cfg.train_frac,
    val_frac=cfg.val_frac,
    seed=cfg.seed,
)

# A subject appearing in two partitions would leak data between them.
pooled_ids = [sid for name in ('train', 'val', 'test') for sid in splits[name]]
assert len(pooled_ids) == len(set(pooled_ids)), \
    'Subject leakage: at least one NACCID appears in more than one split'

print(f'Train: {len(splits["train"])} subjects')
print(f'Val:   {len(splits["val"])} subjects')
print(f'Test:  {len(splits["test"])} subjects')

# Save splits. Path() makes the `/` join work even if OUTPUT_DIR is a string,
# and mkdir makes the cell safe to run before the directory exists.
save_dir = Path(OUTPUT_DIR)
save_dir.mkdir(parents=True, exist_ok=True)
for split_name, ids in splits.items():
    np.save(save_dir / f'{split_name}_subject_ids.npy', ids)

## 4. Build Manifest CSV

Links each NIfTI scan to its NACCID, visit date, CDR, ordinal label, split, and conversion info.

In [None]:
import glob

# Discover NIfTI files and match to UDS visits
nii_files = sorted(glob.glob(str(Path(MRI_DIR) / '**/*.nii.gz'), recursive=True))
print(f'Found {len(nii_files)} NIfTI files')

# Extension matched by the glob above; stripped from each file name below.
NIFTI_SUFFIX = '.nii.gz'

# Build manifest — adapt this mapping to your NACC file naming convention
# Typical: NACCID_SCANDATE.nii.gz or similar
manifest_rows = []
for nii_path in nii_files:
    # Remove only the trailing extension. The previous
    # `.stem.replace('.nii', '')` deleted *every* '.nii' in the name, which
    # corrupts names that contain that substring elsewhere.
    fname = Path(nii_path).name[:-len(NIFTI_SUFFIX)]
    # === ADAPT THIS PARSING TO YOUR FILE NAMING ===
    # Example: parts = fname.split('_')
    # naccid = parts[0]
    # scan_date = parts[1]
    # For now, placeholder: replace with actual parsing logic that appends
    # one dict per scan to manifest_rows.

# After building manifest_rows, convert to DataFrame:
# manifest = pd.DataFrame(manifest_rows)
# manifest.to_csv(OUTPUT_DIR / 'nacc_mri_manifest.csv', index=False)
print('TODO: Adapt file naming parser for your NACC NIfTI naming convention')

## 5. Verify Preprocessing

Test the preprocessing pipeline on a single volume.

In [None]:
from data.preprocessing import MRIPreprocessor

# Preprocessor configured with the target volume shape from the config.
preprocessor = MRIPreprocessor(target_shape=cfg.mri_volume_shape)

# Test on one file. This check used to be commented out, so the cell verified
# nothing; it now runs whenever at least one volume was discovered and
# otherwise says why it was skipped.
if nii_files:
    sample_path = nii_files[0]
    tensor = preprocessor.process_to_tensor(sample_path)
    print(f'Output shape: {tensor.shape}')      # Expected: (1, 128, 128, 128)
    print(f'Value range: [{tensor.min():.2f}, {tensor.max():.2f}]')
    print(f'Mean: {tensor.mean():.4f}, Std: {tensor.std():.4f}')
else:
    print(f'No NIfTI files found under {MRI_DIR!r}; preprocessing not verified')