# Subject Data Exploration Notebook
Corresponds to `dev.ipynb` in the dbs_in_the_wild repository.

- loads subject data for exploration and quality checks
- investigates data completeness, data types, and general structure

In [None]:
# import public packages
import numpy as np
import importlib
import pandas as pd

import matplotlib.pyplot as plt

In [None]:
from dbs_home.load_raw import main_load_raw, load_watch_raw, load_ema_raw
from dbs_home.plot_data import plot_helpers, plot_compliance
from dbs_home.preprocessing import acc_preprocessing

from dbs_home.preprocessing.acc_preprocessing import calc_svm, bandpass_filter

## Loading EMA and accelerometer data

In [None]:
# Select the subject and session to examine.
# Example values:
sub_id, ses_id = "hm26", "ses01"

In [None]:
# Reload the project modules so that edits made to their source are picked
# up without restarting the kernel. The order matches the original cell.
# NOTE(review): if main_load_raw binds names from load_ema_raw or
# load_watch_raw via from-imports, it would have to be reloaded after
# them -- confirm against the package.
for _module in (acc_preprocessing, main_load_raw, load_ema_raw, load_watch_raw):
    importlib.reload(_module)

# Load the selected subject/session; only accelerometer and EMA data are
# requested here, step counts and electrophysiology are switched off.
dat = main_load_raw.loadSubject(
    sub=sub_id,
    ses=ses_id,
    incl_ACC=True,  # loads raw unfiltered acc-data
    incl_EMA=True,
    incl_STEPS=False,
    incl_EPHYS=False,
    verbose=True,
)

## Calculating Signal Vector Magnitude (SVM)

Using bandpass filtered acc data

In [None]:
# Attach the filtered triaxial acc-data and the SVM to the data object,
# in the same layout as the raw data: one list entry per day.
# NOTE: feature extraction based on SVM and filtered data would require
# its own dataClass structure, serving as the main object from which the
# feature-extraction functions are called.

dat.filt_triax_acc = []
dat.acc_svm = []

# The three per-day sequences are walked in lockstep.
per_day_inputs = zip(dat.watch_days, dat.acc_data, dat.acc_times)

for day_str, day_acc, day_times in per_day_inputs:
    accDay = acc_preprocessing.AccelDay(
        day=day_str,
        timestamps=day_times,
        raw_triax_acc=day_acc,
        INCL_SVM=True,
    )
    dat.filt_triax_acc.append(accDay.filt_triax_acc)
    dat.acc_svm.append(accDay.acc_svm)
    

## Visualizing available ACC and EMA data

In [None]:
# Reload the plotting modules so that local edits to them take effect.
for _plot_module in (plot_compliance, plot_helpers):
    importlib.reload(_plot_module)

# TODO: debug the plotting functionality for the case that not all
# data types are included
# TODO: check new EMA versions

# Overview plot of which data are present; LFP inputs are excluded and
# the figure is not saved.
plot_compliance.plot_data_presence(
    self=dat,
    incl_ACC=True,
    incl_LFP_EVENTS=False,
    incl_LFP_chronic=False,
    incl_EMA=True,
    incl_EMA_NIGHT=True,
    incl_EMA_MORNING=True,
    SAVE_PLOT=False,
    dpi=300,
)

## Data inspection and structure overview

In [None]:
# Report the class of the loaded data container.
print("Type of the object: ", type(dat))

In [None]:
# Tabulate every non-callable, non-dunder attribute of the loaded object
# together with the type of its current value.

attr_rows = []
for attr_name in dir(dat):
    # skip methods/other callables and double-underscore names
    if callable(getattr(dat, attr_name)) or attr_name.startswith("__"):
        continue
    attr_rows.append(
        {"Attribute": attr_name, "Type": type(getattr(dat, attr_name))}
    )

df_attrs = pd.DataFrame(attr_rows)
df_attrs

# TODO Luisa: period_firstday is None - remove? gets initialized in load_ephys.py -> ask Jeroen
# TODO dat.plot_days and ema_days are exactly the same
# TODO acc_dict empty?


#### General Info

In [None]:
# Show the patient info stored on the loaded object. This covers the
# patient as a whole, not only the selected session. As the last
# expression of the cell, its value is rendered by the notebook.
dat.pt_info

In [None]:
# Print the subject/session metadata stored on the loaded object, one
# line per attribute. Attributes are read lazily, in the listed order.
for _label, _attr_name in (
    ("Subject ID: ", "sub"),
    ("Session ID: ", "ses"),
    ("First day: ", "ses_firstday"),
    ("Last day: ", "ses_lastday"),
    ("Which days are included in the data: ", "ema_days"),
):
    print(_label, getattr(dat, _attr_name))

### Accelerometer data

#### dat.acc_data
- list of arrays of arrays of integers
- one array per day - and then one array for x y z acc data
- only days where accelerometer data were collected



In [None]:
# Number of per-day entries in acc_data, followed by the shape of each
# day's array.
print("Number of days with accelerometer data: ", len(dat.acc_data))
shape = [day_arr.shape for day_arr in dat.acc_data]
print("Shape of the data: ", shape)

#### dat.acc_times

- list of lists
- one list per day containing datetime times (the timestamps corresponding to the acc_data above)
- each list has as many timestamps as there are data points for that day in acc_data

In [None]:
# Number of per-day timestamp lists, the shape of each list once converted
# to an array, and one sample timestamp (first day, fourth entry).
print("Number of days with accelerometer data: ", len(dat.acc_times))
shape = [np.array(day_times).shape for day_times in dat.acc_times]
print("Shape of the data: ", shape)
print("Examplary entry/datetime: ", dat.acc_times[0][3])

#### dat.acc_svm

- list of arrays of floats
- each array has the same length as acc_data
- here, the triaxial data is summarized into signal vector magnitude -> only one value instead of 3 per time point

In [None]:
# Number of per-day SVM arrays, followed by the shape of each one.
print("Number of days with accelerometer data: ", len(dat.acc_svm))
shape = [svm_arr.shape for svm_arr in dat.acc_svm]
print("Shape of the data: ", shape)

### EMA data

- there are 3 different questionnaire types: EMA, MORNING, EVENING
- per questionnaire type there are 2 attributes (both are lists): _reports and _reports_questions
    - _reports: contains single_EMA_reports (the patient's answers to the questionnaires and their times etc.)
    - _reports_questions: contains the question labels (corresponding to Q1,..., Q15)
- _reports: contains submitted as well as non-submitted questionnaires

#### dat.EMA_reports

In [None]:
# Count the EMA reports, then show the type and content of the first one.
print("Number of total EMA questionnaires", len(dat.EMA_reports))

# Bound only after the count has been printed, so an empty list still
# fails at the same point as before.
first_report = dat.EMA_reports[0]
print("Type of list entrys: ", type(first_report))
print("Examplary EMA report: ", first_report)

#### dat.EMA_reports_questions

In [None]:
# Show the question labels stored for the EMA questionnaire. As the last
# expression of the cell, its value is rendered by the notebook.
dat.EMA_reports_questions