In [1]:
from datetime import datetime, time
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# Root of the study data; raw device exports live in a sub-folder of it.
DATA_DIR = Path("./pistachio_1_data")
DEVICE_DATA_DIR = DATA_DIR / "SMRawDec27"

cohort_df = pd.read_csv(DATA_DIR / "cohort_analysis.csv")
# sheet_name=None returns a dict of every sheet in the workbook keyed by sheet
# name; the assert guards against extra sheets appearing in a new export.
pdi_sheets = pd.read_excel(DATA_DIR / "Artificial_PDI_12-27.xlsx", sheet_name=None)
assert len(pdi_sheets) == 1
pdi_start_dates_df = pdi_sheets["PDI start dates"]

# Left merge keeps every cohort row, with or without a matching PDI row.
dyads_df = pd.merge(cohort_df, pdi_start_dates_df, on="ID", how="left")
dyads_df = dyads_df.set_index("ID")
# make_dyad_df reads dyads_df.loc[dyad, col] as a scalar, which only holds when
# each ID labels exactly one row (a duplicated key on either side of the merge
# would silently break that).
assert dyads_df.index.is_unique, "Duplicate dyad IDs after merging cohort and PDI tables"
# Show a preview rather than the whole table.
dyads_df.head(10)

Unnamed: 0_level_0,Therapy Start,Therapy End,Arm,Diagnosis,Diag.ADHD,Diag.ASD,Diag.Anxiety,Diag.SAD,Child.Age,Child sex,...,Pre.ECBI.Prob,Post.ECBI.Prob,CDI start date,PDI start date,PDI end date,Medication,Type of medication,Medication start date,Week,Therapy session
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4/1/22,,Sham,ADHD and PTSD and Conduct disorder and Reactiv...,Y,N,N,N,6,Male,...,,,2022-04-01,NaT,NaT,Child was on meds but dosage not changed,,NaT,,
2,4/1/22,7/8/22,Sham,No ASD or ADHD,N,N,N,N,3,Male,...,26.0,7.0,2022-04-01,2022-05-27,2022-07-08,No medication added,,NaT,,
3,4/8/22,7/15/22,Active,ADHD,Y,N,N,N,4,Female,...,7.0,4.0,2022-04-08,2022-05-25,2022-07-15,Ritalin 5mg,Stimulant,2022-05-29,8.0,7.0
4,4/11/22,8/19/22,Sham,No ASD or ADHD,N,N,N,N,4,Male,...,10.0,3.0,2022-04-11,2022-06-13,2022-08-19,No medication added,,NaT,,
5,4/20/22,7/18/22,Active,ASD,N,Y,N,N,4,Male,...,0.0,12.0,2022-04-20,2022-06-01,2022-07-18,No medication added,,NaT,,
6,5/2/22,12/8/22,Sham,Anxiety,N,N,Y,N,5,Male,...,32.0,30.0,2022-05-02,2022-07-18,2022-12-08,Focalin 5mg,Stimulant,2022-06-07,5.0,5.0
7,4/19/22,9/2/22,Sham,No ASD or ADHD,N,N,N,N,4,Female,...,18.0,23.0,2022-04-19,2022-06-14,2022-09-02,No medication added,,NaT,,
8,5/6/22,9/2/22,Sham,ADHD,Y,N,N,N,5,Male,...,23.0,23.0,2022-05-06,2022-06-17,2022-09-02,No medication added,,NaT,,
9,5/12/22,9/1/22,Active,ADHD,Y,N,N,N,5,Female,...,23.0,20.0,2022-05-12,2022-06-30,2022-09-01,Ritalin 5mg,Stimulant,2022-06-06,5.0,5.0
10,5/13/22,,Active,No ASD or ADHD,N,N,N,N,3,Male,...,,,2022-05-16,NaT,NaT,No medication added,,NaT,,


In [None]:
# Clock time separating one "sleep night" from the next: summarize_sleep_details
# attributes records stamped at or after this time to the following calendar date.
NIGHTTIME_CUTOFF = time(18, 0)
# Look-ahead horizons, in minutes; make_dyad_df adds one
# tantrum_within_<N>m column per entry.
TANTRUM_INTERVAL_MINUTES = [15, 30, 45, 60]


def make_dyad_df(dyad: int) -> pd.DataFrame:
    """Build the per-epoch feature table for a single dyad.

    Starts from the child's Garmin "Epoch" rows and adds, for each row:

    * activity seconds per intensity level (from "EpochLog"),
    * heart-rate mean/std/max/min over the preceding 10, 30 and 60 minutes,
    * the 15-minute average stress level (from "StressDetails"),
    * the daily stress summary of each of the previous 1-5 days,
    * sleep-stage totals for the night attributed to the row's own date and to
      each of the 4 dates before it,
    * ``tantrum_within_<N>m`` flags for every N in TANTRUM_INTERVAL_MINUTES,
    * every column of the module-level ``dyads_df`` row for this dyad.

    Args:
        dyad: Integer dyad ID. It is zero-padded to three digits to locate the
            device folders and used as-is to look up the row in ``dyads_df``.

    Returns:
        One row per epoch, with ``ActivityDateTime`` as a regular column.
    """
    dyad_code = f"{dyad:03d}"

    ## Activity
    # Data every 15 minutes
    epoch_df = get_child_garmin_df(dyad_code, "Epoch")
    epoch_log_df = get_child_garmin_df(dyad_code, "EpochLog")

    # Add total activity seconds by intensity to each epoch
    epoch_df_ext = epoch_df.copy().set_index("ActivityDateTime")
    for intensity in ["SEDENTARY", "ACTIVE", "HIGHLY_ACTIVE"]:
        epoch_df_ext = add_activity_intensity_to_epoch_df(
            epoch_df_ext, epoch_log_df, intensity
        )

    ## Sleep
    # Daily summaries of sleep (e.g., total REM for a night)
    # NOTE: sleep_df is loaded and parsed but not used further down; it is kept
    # so that a missing or malformed Sleep export still fails here as before.
    sleep_df = get_child_garmin_df(dyad_code, "Sleep")
    sleep_df["CalendarDate"] = pd.to_datetime(
        sleep_df["CalendarDate"], format="%m/%d/%Y"
    )
    # Each detected sleep stage with its own row w/start time, duration
    sleep_details_df = get_child_garmin_df(dyad_code, "SleepDetails")
    # Summarize sleep details
    sleep_summary_df = summarize_sleep_details(
        sleep_details_df, nighttime_cutoff=NIGHTTIME_CUTOFF
    )

    ## Heart rate
    hr_df = get_child_garmin_df(dyad_code, "HeartRate")
    hr_df["ActivityTime"] = pd.to_datetime(
        hr_df["ActivityTime"], format="%m/%d/%Y %I:%M:%S %p"
    )
    # most_recent_hrs binary-searches ActivityTime, so enforce the ordering it
    # assumes instead of trusting the row order of the CSV.
    hr_df = hr_df.sort_values("ActivityTime", kind="stable")

    ## Stress
    # Daily stress summary, re-indexed by calendar date so it can be joined on dates
    stress_df = get_child_garmin_df(dyad_code, "Stress")
    stress_df["ActivityDateTime"] = pd.to_datetime(stress_df["ActivityDateTime"])
    stress_df = stress_df.set_index(stress_df["ActivityDateTime"].dt.date).drop(
        ["ActivityDateTime"], axis=1
    )

    # High resolution stress (every 3 minutes)
    stress_details_df = get_child_garmin_df(dyad_code, "StressDetails")
    # Rollup stress_details_df to 15-minute bins, averaging StressLevelValue (ignoring -1)
    stress_details_df["ActivityDate"] = pd.to_datetime(
        stress_details_df["ActivityDate"]
    )
    stress_details_df = stress_details_df[stress_details_df["StressLevelValue"] != -1]
    stress_details_summary_df = (
        stress_details_df.set_index("ActivityDate")
        .resample("15min")["StressLevelValue"]
        .mean()
        .to_frame()
        .rename(columns={"StressLevelValue": "StressLevelValueAverage"})
    )

    ## EMA logs
    parent_ilumivu_df = get_parent_ilumivu_df(dyad_code, dyads_df)
    tantrums_df = tantrum_onsets_from_parent_ilumivu(parent_ilumivu_df)

    ## Now combine all the dataframes
    combined_df = epoch_df_ext.copy()
    # Parse the epoch timestamps once instead of once per row per interval.
    epoch_times = pd.to_datetime(epoch_df_ext.index)
    for interval in ["10m", "30m", "60m"]:
        lookback = pd.Timedelta(interval)
        # One variable-length Series of heart rates per epoch row.
        recent_hrs_by_time = pd.Series(
            [
                most_recent_hrs(hr_df, epoch_time, lookback=lookback)
                for epoch_time in epoch_times
            ],
            index=epoch_df_ext.index,
        )
        combined_df[f"hr_moving_avg_{interval}"] = recent_hrs_by_time.map(np.mean)
        combined_df[f"hr_moving_std_{interval}"] = recent_hrs_by_time.map(np.std)
        combined_df[f"hr_moving_max_{interval}"] = recent_hrs_by_time.map(np.max)
        combined_df[f"hr_moving_min_{interval}"] = recent_hrs_by_time.map(np.min)

    # Join the 15-minute stress averages through a temporary parsed-timestamp column.
    combined_df["ActivityDateTimeDt"] = pd.to_datetime(combined_df.index)
    combined_df = combined_df.join(
        stress_details_summary_df, on="ActivityDateTimeDt", how="left"
    )
    combined_df = combined_df.drop(columns=["ActivityDateTimeDt"])

    # Look at the daily stress summary of each of the previous 1 to 5 days
    stress_lookback_days = 5
    for day in range(1, stress_lookback_days + 1):
        temp_col = f"StressDate_T-{day}"
        temp_df = stress_df.copy().add_suffix(f"_T-{day}")
        combined_df[temp_col] = pd.to_datetime(combined_df.index).date - pd.Timedelta(
            days=day
        )
        combined_df = combined_df.join(temp_df, on=temp_col, how="left")
        combined_df = combined_df.drop(columns=[temp_col])

    # Look at sleep for the past 0 to 4 days (sleep from the previous night is marked by the waking date)
    sleep_lookback_days = 5
    for day in range(sleep_lookback_days):
        temp_col = f"SleepNightT_{day}"
        temp_df = sleep_summary_df.copy().add_suffix(f"_T-{day}")
        combined_df[temp_col] = pd.to_datetime(combined_df.index).date - pd.Timedelta(
            days=day
        )
        combined_df = combined_df.join(temp_df, on=temp_col, how="left")
        combined_df = combined_df.drop(columns=[temp_col])

    # Flag epochs that are followed by a tantrum onset within each horizon.
    activity_times = pd.to_datetime(combined_df.index)
    tantrum_starts = tantrums_df.sort_values().to_numpy()
    for interval in TANTRUM_INTERVAL_MINUTES:
        combined_df[f"tantrum_within_{interval}m"] = activity_times.map(
            lambda x: has_tantrum_within_period_minutes(tantrum_starts, x, interval)
        )

    # Broadcast the dyad-level cohort attributes onto every epoch row.
    for col in dyads_df.columns:
        combined_df[col] = dyads_df.loc[dyad, col]

    # The rename only matters if the index lost its name along the way and
    # reset_index therefore produced a column called "index".
    combined_df = combined_df.reset_index().rename(
        columns={"index": "ActivityDateTime"}
    )
    return combined_df


def get_child_garmin_df(dyad: str, csv_name: str) -> pd.DataFrame:
    """Read one Garmin export CSV for the child of a dyad.

    Args:
        dyad: Dyad code used in the folder name ``<dyad>_C``.
        csv_name: Export name following ``garmin`` in the file name,
            e.g. ``"Epoch"`` or ``"SleepDetails"``.

    Returns:
        The CSV contents as a DataFrame, with no parsing beyond ``pd.read_csv``.

    Raises:
        ValueError: If zero or more than one file matches.
    """
    # Example name: pistachio003_c_garminActivity_20220325_20220721.csv
    # Need flanking underscores bc some names (e.g., garminActivity) are prefixes
    pattern = f"*_garmin{csv_name}_*.csv"
    garmin_dir = DEVICE_DATA_DIR / f"{dyad}_C" / "Garmin"
    matches = sorted(garmin_dir.glob(pattern))
    if len(matches) != 1:
        # Same exception type the bare single-element unpack used to raise,
        # but with enough context to find the offending folder.
        raise ValueError(
            f"Expected exactly one file matching {pattern!r} in {garmin_dir}, "
            f"found {len(matches)}"
        )
    return pd.read_csv(matches[0])


def summarize_sleep_details(
    sleep_details_df: pd.DataFrame, nighttime_cutoff: time
) -> pd.DataFrame:
    """Summarize sleep details similar to the garminSleep CSV, but using a different cutoff than midnight.

    Each record is attributed to a "sleep night" date: its own calendar date
    when its clock time is before ``nighttime_cutoff``, otherwise the next
    calendar date. ``Duration`` is then summed per (date, ``SleepStage``).

    Args:
        sleep_details_df: Needs ``ActivityDateTime`` (anything
            ``pd.to_datetime`` accepts), ``SleepStage`` and ``Duration``
            columns. It is not modified.
        nighttime_cutoff: Clock time at or after which a record counts toward
            the following date.

    Returns:
        DataFrame indexed by sleep-night date (``datetime.date`` values, index
        name ``"SleepNightT-1"``) with one column per sleep stage; stages
        absent on a date are 0.
    """
    sleep_date_col = "SleepNightT-1"
    timestamps = pd.to_datetime(sleep_details_df["ActivityDateTime"])
    sleep_nights = timestamps.apply(
        lambda dt: dt.date()
        if dt.time() < nighttime_cutoff
        else dt.date() + pd.Timedelta(days=1)
    )
    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original columns and its unparsed ActivityDateTime values.
    labelled = sleep_details_df.assign(**{sleep_date_col: sleep_nights})
    sleep_summary = (
        labelled.groupby([sleep_date_col, "SleepStage"])["Duration"]
        .sum()
        .unstack(fill_value=0)
    )
    return sleep_summary


def add_activity_intensity_to_epoch_df(
    epoch_df_ext: pd.DataFrame,
    epoch_log_df: pd.DataFrame,
    intensity: str,
) -> pd.DataFrame:
    """Add an ``activity_seconds_<intensity>`` column to ``epoch_df_ext``.

    Sums ``ActiveTimeInSeconds`` in ``epoch_log_df`` per
    (``ActivityDateTime``, ``Intensity``), picks the entries for the given
    ``intensity`` at the timestamps in ``epoch_df_ext.index``, and stores them
    in a new column named after the lower-cased intensity. Rows without a
    value are set to 0.

    ``epoch_df_ext`` is modified in place and also returned.
    """
    # Series with a two-level (ActivityDateTime, Intensity) index.
    grouped = epoch_log_df.groupby(["ActivityDateTime", "Intensity"])[
        "ActiveTimeInSeconds"
    ].sum()
    # NOTE(review): `grouped` is a Series, so the drop(columns=...) below looks
    # like a no-op - TODO confirm and remove.
    # NOTE(review): how .loc treats timestamps that are missing from `grouped`
    # depends on the pandas version - verify against the one in use.
    intensity_seconds = grouped.loc[(epoch_df_ext.index, intensity)].drop(  # pyright: ignore[reportCallIssue,reportArgumentType]
        columns=["Intensity"]
    )
    # Keep only the timestamp level so the assignment below aligns on the epoch index.
    intensity_seconds.index = intensity_seconds.index.get_level_values(0)
    col_name = f"activity_seconds_{intensity.lower()}"  # pyright: ignore[reportCallIssue]
    epoch_df_ext[col_name] = intensity_seconds
    # Epochs with no value for this intensity come through as NaN; treat them as zero seconds.
    epoch_df_ext[col_name] = epoch_df_ext[col_name].fillna(0)
    return epoch_df_ext


def get_ilumivu_df(subject_dir: Path) -> pd.DataFrame:
    """Read the single Ilumivu CSV export found under a subject directory.

    Args:
        subject_dir: Directory expected to contain exactly one sub-directory
            whose name includes "Ilumivu" (or the "Illumivu" misspelling),
            which in turn contains exactly one file.

    Returns:
        The file contents as read by ``pd.read_csv``.

    Raises:
        AssertionError: If the number of matching sub-directories is not one.
        ValueError: If the Ilumivu directory does not hold exactly one file.
    """
    # Substring match rather than a prefix match: the folder names look like
    # they all start with "Ilumivu", but some actually start with a
    # ZERO WIDTH SPACE.
    ilumivu_dirs = [
        d
        for d in subject_dir.iterdir()
        if d.is_dir() and ("Ilumivu" in d.name or "Illumivu" in d.name)
    ]
    # The message has to be a string expression; wrapping it in print() made
    # the AssertionError carry None instead of the text.
    assert len(ilumivu_dirs) == 1, (
        f"Expected one Ilumivu directory, found {len(ilumivu_dirs)} in {subject_dir}"
    )
    [ilumivu_dir] = ilumivu_dirs

    files = sorted(ilumivu_dir.iterdir())
    if len(files) != 1:
        raise ValueError(
            f"Expected exactly one file in {ilumivu_dir}, found {len(files)}"
        )
    return pd.read_csv(files[0])


def get_parent_ilumivu_df(dyad: str, dyads_df: pd.DataFrame) -> pd.DataFrame:
    """Load the Ilumivu export for the parent of a dyad.

    The parent's files are looked up under ``DEVICE_DATA_DIR / "<dyad>_P"``;
    finding the Ilumivu sub-directory (which has no standard name) is left to
    ``get_ilumivu_df``.

    ``dyads_df`` is accepted but not used here.

    Note carried over from the original docstring: Ilumivu data for the child
    is only available if the dyad is in the Active (AI) arm.
    """
    parent_dir = DEVICE_DATA_DIR / f"{dyad}_P"
    return get_ilumivu_df(parent_dir)


def datetime_from_survey_answer(time: str) -> datetime:
    """Survey answer example: '1:45PM/1649443501.804'

    The text between the first and second "/" (or up to the end of the string)
    is read as epoch seconds and converted with ``datetime.fromtimestamp``, so
    the result is a naive datetime in the local timezone of the machine
    running the notebook.

    Raises:
        IndexError: If the answer contains no "/".
        ValueError: If the part after the "/" is not a number.
    """
    timestamp = float(time.split("/")[1])
    return datetime.fromtimestamp(timestamp)


def tantrum_onsets_from_parent_ilumivu(
    df: pd.DataFrame,
) -> pd.Series:
    """Extract the start times of valid, non-overlapping tantrums.

    A tantrum is a ``TIME_OF_ONSET_OF_TANTRUM`` row whose next row (index
    label + 1) is an ``END_TIME_OF_TANTRUM`` row. It is kept only if:

    * the end row's ``instance_date`` is not earlier than the reported end
      time (the survey is expected to be submitted after the tantrum ends),
    * the duration is not negative and not longer than 30,000 seconds,
    * its [start, end] interval does not overlap any other kept tantrum
      (every tantrum involved in an overlap is dropped).

    Args:
        df: Parent Ilumivu export with ``question_code``, ``answer_code`` and
            ``instance_date`` columns and an integer index.

    Returns:
        Series of tantrum start datetimes. Also prints how many onset rows
        were discarded.
    """
    tantrum_start_indices = df[df["question_code"] == "TIME_OF_ONSET_OF_TANTRUM"].index
    tantrums = []
    for start_index in tantrum_start_indices:
        tantrum_start = datetime_from_survey_answer(
            str(df.loc[start_index, "answer_code"])
        )

        # Criteria that Kyle and Arjun discussed a while ago
        end_index = start_index + 1
        # No usable end answer: skip before touching the next row at all.
        # Reading its instance_date first raised KeyError when the onset was
        # the last row of the export.
        if (
            end_index not in df.index
            or df.loc[end_index, "question_code"] != "END_TIME_OF_TANTRUM"
        ):
            continue

        tantrum_end = datetime_from_survey_answer(
            str(df.loc[end_index, "answer_code"])
        )
        tantrum_end_input_date = datetime.fromisoformat(
            df.loc[end_index, "instance_date"]
        )

        # We expect the survey to be submitted after the tantrum ends
        if tantrum_end_input_date < tantrum_end:
            continue
        duration = tantrum_end - tantrum_start
        if duration < pd.Timedelta(0):
            continue
        if duration > pd.Timedelta(seconds=30_000):
            continue
        tantrums.append((tantrum_start, tantrum_end))

    # Remove tantrums whose intervals overlap. The overlap test is symmetric,
    # so each unordered pair only needs to be checked once.
    to_remove = set()
    for i, (start_i, end_i) in enumerate(tantrums):
        for j in range(i + 1, len(tantrums)):
            start_j, end_j = tantrums[j]
            if start_i < end_j and start_j < end_i:
                to_remove.add(i)
                to_remove.add(j)
    tantrums = [t for idx, t in enumerate(tantrums) if idx not in to_remove]

    tantrum_starts = [ts for (ts, _) in tantrums]
    print(
        f"Removed {len(tantrum_start_indices) - len(tantrum_starts)} invalid tantrums"
    )
    return pd.Series(tantrum_starts)


def has_tantrum_within_period_minutes(
    tantrum_starts: np.ndarray, activity_time: pd.DatetimeIndex, period_minutes: int
):
    """Tell whether any tantrum begins in ``[activity_time, activity_time + period_minutes)``.

    The window is closed on the left and open on the right. ``make_dyad_df``
    calls this once per epoch timestamp through ``Index.map``, so
    ``activity_time`` is a single timestamp at that call site.
    """
    window_end = activity_time + pd.Timedelta(minutes=period_minutes)
    not_before_window = tantrum_starts >= activity_time
    before_window_end = tantrum_starts < window_end
    return np.any(not_before_window & before_window_end)


def most_recent_hrs(
    hr_df: pd.DataFrame, time: datetime, lookback: pd.Timedelta
) -> pd.Series:
    """Return the heart rates recorded in the ``lookback`` window ending at ``time``.

    The window is ``[time - lookback, time)``: a sample stamped exactly at
    ``time`` is excluded, one stamped exactly at ``time - lookback`` is
    included.

    Requires ``hr_df`` to be sorted by ``ActivityTime``; both edges are found
    by binary search, so the result is wrong for unsorted input.

    Returns a ``HeartRate`` Series of variable (possibly zero) length.
    """
    activity_times = hr_df["ActivityTime"]
    window_start = time - lookback

    # Two binary searches instead of a full boolean mask over the column.
    first = activity_times.searchsorted(window_start)
    stop = activity_times.searchsorted(time)
    return hr_df["HeartRate"].iloc[first:stop]


def pad_or_truncate_series(s: pd.Series, length: int):
    """Convert a Series to a fixed-length numpy array, padding with NaNs if necessary.

    Shorter input is left-padded with NaN; longer input keeps its last
    ``length`` values.

    Args:
        s: Series of values convertible to float.
        length: Desired output length, >= 0.

    Returns:
        1-D float array with exactly ``length`` elements.

    Raises:
        ValueError: If ``length`` is negative.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")

    # Convert s to float type so padding with np.nan works correctly
    values = s.astype(float).to_numpy()
    missing = length - len(values)
    if missing > 0:
        return np.pad(values, (missing, 0), constant_values=np.nan)
    # Slice from an explicit start offset: values[-length:] would return the
    # whole array when length == 0.
    return values[len(values) - length :]

In [4]:
# Scratch check (disabled): rebuild the 15-minute stress rollup for dyad 001 on its own.
# NOTE: this filters with `>= 0`, whereas make_dyad_df filters with `!= -1`.
# stress_details_df = get_child_garmin_df("001", "StressDetails")
# # Rollup stress_details_df to 15-minute bins, averaging StressLevelValue (ignoring -1)
# stress_details_df["ActivityDate"] = pd.to_datetime(stress_details_df["ActivityDate"])
# stress_details_df = stress_details_df[stress_details_df["StressLevelValue"] >= 0]
# stress_details_summary_df = (
#     stress_details_df.set_index("ActivityDate")
#     .resample("15min")["StressLevelValue"]
#     .mean()
#     .to_frame()
#     .rename(columns={"StressLevelValue": "StressLevelValueAverage"})
# )
# stress_details_summary_df.sort_index()

In [5]:
# Missing Ilumivu data
DYADS_TO_SKIP = [49]

# Collect the per-dyad frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously accumulated row on each
# iteration.
dyad_frames = []
for dyad in tqdm(dyads_df.index):
    if dyad in DYADS_TO_SKIP:
        continue

    dyad_df = make_dyad_df(dyad)
    dyad_df["dyad"] = dyad
    dyad_frames.append(dyad_df)

# pd.concat raises on an empty list, so fall back to an empty frame when every
# dyad was skipped.
data_df = pd.concat(dyad_frames, ignore_index=True) if dyad_frames else pd.DataFrame()

  0%|          | 0/50 [00:00<?, ?it/s]

  sleep_details_df["ActivityDateTime"] = pd.to_datetime(
  stress_df["ActivityDateTime"] = pd.to_datetime(stress_df["ActivityDateTime"])


Removed 9 invalid tantrums
Index(['ActivityDateTime', 'AverageStressLevel', 'MaxStressLevel',
       'StressDurationInSeconds', 'RestStressDurationInSeconds',
       'ActivityStressDurationInSeconds', 'LowStressDurationInSeconds',
       'MediumStressDurationInSeconds', 'HighStressDurationInSeconds',
       'StressQualifier'],
      dtype='object')
Index(['ActivityDateTime', 'AverageStressLevel', 'MaxStressLevel',
       'StressDurationInSeconds', 'RestStressDurationInSeconds',
       'ActivityStressDurationInSeconds', 'LowStressDurationInSeconds',
       'MediumStressDurationInSeconds', 'HighStressDurationInSeconds',
       'StressQualifier'],
      dtype='object')
Index(['ActivityDateTime', 'AverageStressLevel', 'MaxStressLevel',
       'StressDurationInSeconds', 'RestStressDurationInSeconds',
       'ActivityStressDurationInSeconds', 'LowStressDurationInSeconds',
       'MediumStressDurationInSeconds', 'HighStressDurationInSeconds',
       'StressQualifier'],
      dtype='object')
I

  sleep_details_df["ActivityDateTime"] = pd.to_datetime(


KeyboardInterrupt: 

In [None]:
# Write the combined per-epoch table for all dyads next to the input data;
# index=False leaves the row index out of the CSV.
data_df.to_csv(DATA_DIR / "all_dyads.csv", index=False)