# 00 — Preview VitalDB Data

This notebook:
1. Discovers VitalDB cases with `ECG_II` and `ART`.
2. Loads a single case (the first one found) at 100 Hz.
3. Preprocesses ECG and detects R-peaks.
4. Computes per-beat SBP/DBP/MBP from `ART` between consecutive R-peaks.
5. Saves a beat-level table to `data/interim/beat_table.parquet`.

> Ensure you've installed dependencies: `vitaldb`, `pandas`, `numpy`, `scipy`, `pyarrow`, `matplotlib`, `neurokit2`.

In [None]:
# Optional installs (uncomment locally); %pip installs into the running kernel's environment
# %pip install vitaldb neurokit2 wfdb pandas numpy scipy pyarrow matplotlib

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import neurokit2 as nk
import vitaldb

# Project directories, relative to the working directory
# (change these when the notebook is run from outside the repo root).
DATA_RAW = Path('data') / 'raw'
DATA_INTERIM = Path('data') / 'interim'

# Create both directories up front; already-existing ones are left untouched.
for _data_dir in (DATA_RAW, DATA_INTERIM):
    _data_dir.mkdir(parents=True, exist_ok=True)

## 1) Find cases with ECG+ART

In [None]:
# Case IDs that expose both tracks. The IDs are sorted so that the case picked
# as caseids[0] further down does not depend on whatever order find_cases
# happens to return them in (keeps Restart & Run All reproducible).
caseids = sorted(int(c) for c in vitaldb.find_cases(['ECG_II','ART']))
print(f'Found {len(caseids)} cases with ECG_II and ART')
caseids[:10]

## 2) Load one case at 100 Hz

In [None]:
fs = 100               # sampling rate requested from VitalDB, in Hz
interval = 1.0 / fs    # seconds between consecutive samples

# Fail with a readable message instead of a bare IndexError when discovery
# returned nothing.
if not caseids:
    raise RuntimeError('No cases with ECG_II and ART were found; nothing to load.')
cid = caseids[0]

# Both tracks are requested in one call; column 0 is treated as ECG_II and
# column 1 as ART, i.e. the order in which they were requested.
arr = vitaldb.load_case(cid, ['ECG_II','ART'], interval)
print(arr.shape)

ecg = arr[:, 0]
art = arr[:, 1]
# Time axis in seconds, starting at 0 for the first loaded sample.
t = np.arange(len(ecg)) * interval

# Basic sample-level frame (pandas is already imported in the imports cell).
df = pd.DataFrame({'t': t, 'ECG_II': ecg, 'ART': art, 'subject_id': cid})
df.head()

## 3) Quick plots

In [None]:
# Length of the preview window in seconds; the sample count is derived from fs
# so the slice and the title stay in agreement if the sampling rate changes.
preview_s = 50
n_preview = int(preview_s * fs)

fig, ax = plt.subplots()
ax.plot(t[:n_preview], ecg[:n_preview])
ax.set_title(f'ECG_II (first {preview_s} s)')
ax.set_xlabel('Time (s)')
ax.set_ylabel('mV')
plt.show()

In [None]:
# Length of the preview window in seconds; the sample count is derived from fs
# so the slice and the title stay in agreement if the sampling rate changes.
preview_s = 50
n_preview = int(preview_s * fs)

fig, ax = plt.subplots()
ax.plot(t[:n_preview], art[:n_preview])
ax.set_title(f'ART (first {preview_s} s)')
ax.set_xlabel('Time (s)')
ax.set_ylabel('mmHg')
plt.show()

## 4) R-peaks + per-beat labels (SBP/DBP/MBP)

In [None]:
def compute_beat_table(art_signal, r_peaks, fs, subject_id, min_samples=5):
    """Build one row of SBP/DBP/MBP per interval between consecutive R-peaks.

    Parameters
    ----------
    art_signal : array-like
        Arterial pressure samples, on the same sample grid as the R-peak indices.
    r_peaks : array-like of int
        Sample indices of the detected R-peaks, in increasing order.
    fs : float
        Sampling rate in Hz; used to turn the R-peak index into a time.
    subject_id : any
        Value copied into the ``subject_id`` column of every row.
    min_samples : int, default 5
        Intervals with fewer samples than this are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``subject_id, beat_idx, r_time, SBP, DBP, MBP``. ``beat_idx``
        is the position of the interval's first R-peak in ``r_peaks``, so
        skipped intervals leave gaps in the numbering.

    Notes
    -----
    * SBP and DBP are the max and min over the half-open interval
      ``[r_peaks[i], r_peaks[i+1])``.
    * MBP is the trapezoidal mean over the closed interval (the sample at the
      next R-peak is included). With ``k`` intervals between samples the area
      is divided by ``k``, so a constant signal yields exactly that constant.
      Integrating only the half-open samples while dividing by the full
      interval length would bias the mean low by ``(n - 1) / n``.
    * Intervals containing any non-finite ART sample are skipped, since their
      labels would otherwise be NaN.
    """
    art_signal = np.asarray(art_signal, dtype=float)
    r_peaks = np.asarray(r_peaks, dtype=int)

    records = []
    for beat_idx, (start, stop) in enumerate(zip(r_peaks[:-1], r_peaks[1:])):
        beat = art_signal[start:stop]
        if len(beat) < min_samples:
            continue

        # Same samples plus the one at the next R-peak, so the trapezoid rule
        # spans the whole R-R interval.
        closed = art_signal[start:stop + 1]
        if not np.isfinite(closed).all():
            continue

        n_intervals = len(closed) - 1
        # Trapezoid area with unit spacing; the 1/fs factor cancels against
        # the interval duration, so it is left out on both sides.
        area = closed.sum() - 0.5 * (closed[0] + closed[-1])

        records.append((
            subject_id,
            beat_idx,
            float(start / fs),
            float(beat.max()),
            float(beat.min()),
            float(area / n_intervals),
        ))

    return pd.DataFrame(
        records,
        columns=['subject_id', 'beat_idx', 'r_time', 'SBP', 'DBP', 'MBP'],
    )


# Clean the ECG and detect R-peaks; the peak sample indices are read from the
# 'ECG_R_Peaks' entry of the info dict returned by ecg_peaks.
ecg_clean = nk.ecg_clean(ecg, sampling_rate=fs)
_, rdict = nk.ecg_peaks(ecg_clean, sampling_rate=fs)
r_samples = rdict['ECG_R_Peaks']
r_times = r_samples / fs

beat_df = compute_beat_table(art, r_samples, fs, cid)
beat_df.head()

### Save beat table

In [None]:
# Write the beat-level table into the interim data directory (row index is
# not stored) and show the resulting path as the cell output.
outp = DATA_INTERIM.joinpath('beat_table.parquet')
beat_df.to_parquet(path=outp, index=False)
outp