In [None]:
from datetime import datetime, time
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Directory layout of the study data, relative to the notebook's working directory.
DATA_DIR = Path("./pistachio_1_data")
INPUT_DATA_DIR = DATA_DIR / "nurse_fitabase"
DEVICE_DATA_DIR = INPUT_DATA_DIR / "SMRawDec27"

# Cohort table: merged on its "ID" column below.
cohort_df = pd.read_csv(INPUT_DATA_DIR / "cohort_analysis.csv")

# sheet_name=None returns every sheet as a dict; the workbook is expected to hold exactly one.
pdi_sheets = pd.read_excel(
    INPUT_DATA_DIR / "Artificial_PDI_12-27.xlsx", sheet_name=None
)
assert len(pdi_sheets) == 1
pdi_start_dates_df = pdi_sheets["PDI start dates"]

# Left merge keeps every cohort row even when it has no entry in the PDI sheet.
dyads_df = cohort_df.merge(pdi_start_dates_df, on="ID", how="left").set_index("ID")
dyads_df

In [None]:
# Time of day passed to summarize_sleep_details: sleep stages that start at or
# after this time are attributed to the following calendar date.
NIGHTTIME_CUTOFF = time(18, 0)
# Look-ahead windows (in minutes) for which make_dyad_df emits a
# `tantrum_within_{N}m` column.
TANTRUM_INTERVAL_MINUTES = [15, 30, 45, 60]


def make_dyad_df(dyad: int) -> pd.DataFrame:
    """Build the per-epoch feature table for one dyad.

    Starting from the child's Garmin "Epoch" rows, this adds:
      * activity seconds per intensity (from the "EpochLog" export),
      * mean/std/max/min of the heart-rate samples in the 15/30/45/60 minutes
        before each epoch timestamp,
      * the per-stage sleep totals whose SleepMorningDate equals the epoch's
        calendar date,
      * `tantrum_within_{N}m` flags derived from the parent's Ilumivu log,
      * every column of the module-level `dyads_df` row for this dyad.

    Args:
        dyad: Dyad ID; must be a label in `dyads_df.index`. It is zero-padded
            to three digits to form the on-disk directory prefix.

    Returns:
        One row per epoch, with "ActivityDateTime" as a regular column.
    """
    dyad_code = f"{dyad:03d}"

    ## Activity
    # Data every 15 minutes
    epoch_df = get_child_garmin_df(dyad_code, "Epoch")
    epoch_log_df = get_child_garmin_df(dyad_code, "EpochLog")
    # Add total activity seconds by intensity to each epoch
    epoch_df_ext = epoch_df.copy().set_index("ActivityDateTime")
    for intensity in ["SEDENTARY", "ACTIVE", "HIGHLY_ACTIVE"]:
        epoch_df_ext = add_activity_intensity_to_epoch_df(
            epoch_df_ext, epoch_log_df, intensity
        )
    # Parse the epoch timestamps once; they are reused for the heart-rate
    # windows, the sleep join key and the tantrum flags below.
    epoch_times = pd.to_datetime(epoch_df_ext.index)

    ## Sleep
    # Daily summaries of sleep (e.g., total REM for a night)
    # NOTE(review): sleep_df is loaded and parsed but not used further down.
    # It is kept so a missing/malformed Sleep CSV still fails here as before.
    sleep_df = get_child_garmin_df(dyad_code, "Sleep")
    sleep_df["CalendarDate"] = pd.to_datetime(
        sleep_df["CalendarDate"], format="%m/%d/%Y"
    )
    # Each detected sleep stage with its own row w/start time, duration
    sleep_details_df = get_child_garmin_df(dyad_code, "SleepDetails")
    # Summarize sleep details with a custom nighttime cutoff
    sleep_summary_df = summarize_sleep_details(
        sleep_details_df, nighttime_cutoff=NIGHTTIME_CUTOFF
    )

    ## Heart rate
    hr_df = get_child_garmin_df(dyad_code, "HeartRate")
    hr_df["ActivityTime"] = pd.to_datetime(
        hr_df["ActivityTime"], format="%m/%d/%Y %I:%M:%S %p"
    )
    # most_recent_hrs binary-searches ActivityTime and assumes it is sorted;
    # enforce that here instead of trusting the row order of the CSV.
    hr_df = hr_df.sort_values("ActivityTime", kind="stable").reset_index(drop=True)

    ## EMA logs
    parent_ilumivu_df = get_parent_ilumivu_df(dyad_code, dyads_df)
    tantrums_df = tantrum_onsets_from_parent_ilumivu(parent_ilumivu_df)

    ## Now combine all the dataframes
    combined_df = epoch_df_ext.copy()
    for interval in ["15m", "30m", "45m", "60m"]:
        lookback = pd.Timedelta(interval)
        # One variable-length Series of heart-rate samples per epoch
        recent_hrs_by_time = pd.Series(
            [
                most_recent_hrs(hr_df, epoch_time, lookback=lookback)
                for epoch_time in epoch_times
            ],
            index=epoch_df_ext.index,
        )
        combined_df[f"hr_moving_avg_{interval}"] = recent_hrs_by_time.map(np.mean)
        combined_df[f"hr_moving_std_{interval}"] = recent_hrs_by_time.map(np.std)
        combined_df[f"hr_moving_max_{interval}"] = recent_hrs_by_time.map(np.max)
        combined_df[f"hr_moving_min_{interval}"] = recent_hrs_by_time.map(np.min)

    # The sleep summary is indexed by datetime.date, so the join key must be
    # date objects as well.
    combined_df["SleepMorningDate"] = epoch_times.date
    combined_df = combined_df.join(sleep_summary_df, on="SleepMorningDate", how="left")

    tantrum_starts = tantrums_df.sort_values().to_numpy()
    for interval in TANTRUM_INTERVAL_MINUTES:
        combined_df[f"tantrum_within_{interval}m"] = epoch_times.map(
            lambda x: has_tantrum_within_period_minutes(tantrum_starts, x, interval)
        )

    # Broadcast the dyad-level attributes onto every epoch row
    for col in dyads_df.columns:
        combined_df[col] = dyads_df.loc[dyad, col]

    # The rename only has an effect if the index lost its "ActivityDateTime" name
    combined_df = combined_df.reset_index().rename(
        columns={"index": "ActivityDateTime"}
    )
    return combined_df


def get_child_garmin_df(dyad: str, csv_name: str) -> pd.DataFrame:
    """Read the one Garmin CSV of a given kind from the child's device directory.

    Args:
        dyad: Dyad code used as the directory prefix (the child's directory is
            `{dyad}_C` under DEVICE_DATA_DIR).
        csv_name: Export kind as it appears after "garmin" in the file name,
            e.g. "Epoch" or "HeartRate".

    Returns:
        The CSV contents as read by `pd.read_csv` with default options.

    Raises:
        ValueError: If zero or more than one file matches the pattern.
    """
    # Example name: pistachio003_c_garminActivity_20220325_20220721.csv
    # Need flanking underscores bc some names (e.g., garminActivity) are prefixes
    pattern = f"*_garmin{csv_name}_*.csv"
    garmin_dir = DEVICE_DATA_DIR / f"{dyad}_C" / "Garmin"
    matches = sorted(garmin_dir.glob(pattern))
    if len(matches) != 1:
        # Explicit message: a bare unpacking error would not say which dyad or
        # export kind was missing or duplicated.
        raise ValueError(
            f"Expected exactly one file matching {pattern!r} in {garmin_dir}, "
            f"found {len(matches)}"
        )
    return pd.read_csv(matches[0])


def summarize_sleep_details(
    sleep_details_df: pd.DataFrame, nighttime_cutoff: time
) -> pd.DataFrame:
    """Total sleep duration per stage, grouped by a cutoff-based "morning date".

    Each row's "ActivityDateTime" is parsed as a timestamp. Rows whose time of
    day is before `nighttime_cutoff` are assigned to their own calendar date;
    rows at or after the cutoff are assigned to the next calendar date.

    The input frame is not modified.

    Args:
        sleep_details_df: Frame with "ActivityDateTime", "SleepStage" and
            "Duration" columns.
        nighttime_cutoff: Time of day at which rows start counting toward the
            following date.

    Returns:
        Frame indexed by "SleepMorningDate" (datetime.date objects) with one
        column per sleep stage holding the summed "Duration"; date/stage
        combinations with no rows are filled with 0.
    """
    sleep_date_col = "SleepMorningDate"
    one_day = pd.Timedelta(days=1)

    def sleep_morning_date(dt):
        # Stages starting at or after the cutoff belong to the next morning
        if dt.time() < nighttime_cutoff:
            return dt.date()
        return dt.date() + one_day

    start_times = pd.to_datetime(sleep_details_df["ActivityDateTime"])
    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original columns and dtypes.
    dated_details = sleep_details_df.assign(
        **{sleep_date_col: start_times.apply(sleep_morning_date)}
    )
    sleep_summary = (
        dated_details.groupby([sleep_date_col, "SleepStage"])["Duration"]
        .sum()
        .unstack(fill_value=0)
    )
    return sleep_summary


def add_activity_intensity_to_epoch_df(
    epoch_df_ext: pd.DataFrame,
    epoch_log_df: pd.DataFrame,
    intensity: str,
) -> pd.DataFrame:
    """Add a column with the total active seconds of one intensity per epoch.

    Rows of `epoch_log_df` with the requested intensity are summed by
    "ActivityDateTime" and aligned to the index of `epoch_df_ext`. Epochs with
    no matching log row get 0, including the case where the intensity does not
    occur in the log at all.

    Args:
        epoch_df_ext: Epoch frame indexed by ActivityDateTime. It is modified
            in place (a column is added) and also returned.
        epoch_log_df: Frame with "ActivityDateTime", "Intensity" and
            "ActiveTimeInSeconds" columns.
        intensity: Intensity label to select, e.g. "ACTIVE".

    Returns:
        `epoch_df_ext` with the new `activity_seconds_{intensity.lower()}`
        column.
    """
    is_intensity = epoch_log_df["Intensity"] == intensity
    seconds_by_epoch = (
        epoch_log_df.loc[is_intensity]
        .groupby("ActivityDateTime")["ActiveTimeInSeconds"]
        .sum()
    )
    col_name = f"activity_seconds_{intensity.lower()}"
    # reindex never raises on absent labels (unlike a label-based .loc lookup);
    # epochs without log rows come back as NaN and are zero-filled.
    aligned_seconds = seconds_by_epoch.reindex(epoch_df_ext.index).fillna(0)
    # Assign positionally: the values are already in epoch_df_ext's row order.
    epoch_df_ext[col_name] = aligned_seconds.to_numpy()
    return epoch_df_ext


def get_ilumivu_df(subject_dir: Path) -> pd.DataFrame:
    """Read the single Ilumivu CSV stored under a subject's directory.

    The subject directory must contain exactly one sub-directory whose name
    contains "Ilumivu" or "Illumivu", and that sub-directory must contain
    exactly one entry, which is read as a CSV.

    Args:
        subject_dir: Directory of one subject.

    Returns:
        The CSV contents as read by `pd.read_csv` with default options.

    Raises:
        AssertionError: If the number of matching sub-directories is not one.
        ValueError: If the Ilumivu directory does not hold exactly one entry.
    """
    candidate_dirs = [
        # Match by substring rather than prefix: it looks like all names start
        # with "Ilumivu", but some actually start with a ZERO WIDTH SPACE.
        d
        for d in subject_dir.iterdir()
        if d.is_dir() and ("Ilumivu" in d.name or "Illumivu" in d.name)
    ]
    # The message must be a string; wrapping it in print() would leave the
    # AssertionError itself without any message.
    assert (
        len(candidate_dirs) == 1
    ), f"Expected one Ilumivu directory, found {len(candidate_dirs)}"
    [ilumivu_dir] = candidate_dirs

    entries = list(ilumivu_dir.iterdir())
    if len(entries) != 1:
        raise ValueError(
            f"Expected exactly one file in {ilumivu_dir}, found {len(entries)}"
        )
    return pd.read_csv(entries[0])


def get_parent_ilumivu_df(dyad: str, dyads_df: pd.DataFrame) -> pd.DataFrame:
    """Load the Ilumivu export found under the parent's device-data directory.

    Args:
        dyad: Dyad code used as the directory prefix (the parent's directory
            is `{dyad}_P` under DEVICE_DATA_DIR).
        dyads_df: Accepted but not used by this function.

    Returns:
        The frame returned by `get_ilumivu_df` for the parent's directory.
    """
    # The Ilumivu sub-directory has no standard name; get_ilumivu_df finds it
    # by looking for "Ilumivu"/"Illumivu" in the directory names.
    return get_ilumivu_df(DEVICE_DATA_DIR / f"{dyad}_P")


def datetime_from_survey_answer(time: str) -> datetime:
    """Convert a survey answer into a datetime.

    The answer is split on "/" and its second piece is read as a float number
    of seconds since the epoch, e.g. '1:45PM/1649443501.804'. The result comes
    from `datetime.fromtimestamp` without a tz argument, i.e. a naive datetime
    in the local timezone of the machine running the notebook.
    """
    pieces = time.split("/")
    epoch_seconds = float(pieces[1])
    return datetime.fromtimestamp(epoch_seconds)


def tantrum_onsets_from_parent_ilumivu(
    df: pd.DataFrame,
) -> pd.Series:
    """Extract tantrum onset times from a parent's Ilumivu survey frame.

    Rows whose "question_code" equals "TIME_OF_ONSET_OF_TANTRUM" are selected
    and their "answer_code" is converted with `datetime_from_survey_answer`.

    Args:
        df: Ilumivu frame with "question_code" and "answer_code" columns.

    Returns:
        Series of onset datetimes in row order with a default integer index.
        The dtype is always datetime64[ns], also when no onset was logged.
    """
    # Select by boolean mask so duplicate index labels cannot turn a single
    # answer into a multi-row lookup.
    is_onset = df["question_code"] == "TIME_OF_ONSET_OF_TANTRUM"
    tantrum_starts = [
        datetime_from_survey_answer(str(answer))
        for answer in df.loc[is_onset, "answer_code"]
    ]
    # Explicit dtype: an empty list would otherwise not be inferred as datetimes
    return pd.Series(tantrum_starts, dtype="datetime64[ns]")


def has_tantrum_within_period_minutes(
    tantrum_starts: np.ndarray, activity_time: pd.Timestamp, period_minutes: int
) -> np.bool_:
    """Tell whether any tantrum starts within a window after `activity_time`.

    The window is half-open: a start equal to `activity_time` counts, a start
    exactly `period_minutes` later does not.

    Args:
        tantrum_starts: Array of tantrum start times.
        activity_time: Start of the window (a single timestamp).
        period_minutes: Window length in minutes.

    Returns:
        True if at least one start falls in the window; False otherwise,
        including when `tantrum_starts` is empty.
    """
    window_end = activity_time + pd.Timedelta(minutes=period_minutes)
    in_window = (tantrum_starts >= activity_time) & (tantrum_starts < window_end)
    return np.any(in_window)


def most_recent_hrs(
    hr_df: pd.DataFrame, time: datetime, lookback: pd.Timedelta
) -> pd.Series:
    """Return the heart-rate samples recorded in the `lookback` window before `time`.

    The window is half-open, [time - lookback, time): a sample stamped exactly
    at the window start is included, one stamped exactly at `time` is not.

    Assumes hr_df is sorted by ActivityTime, since the window bounds are
    located with `searchsorted`.

    Returns a Series of variable length (empty if no sample is in the window).
    """
    sample_times = hr_df["ActivityTime"]
    window_start = time - lookback

    # Binary search for both window bounds; only valid on sorted timestamps
    first = sample_times.searchsorted(window_start)
    stop = sample_times.searchsorted(time)
    return hr_df["HeartRate"].iloc[first:stop]


def pad_or_truncate_series(s: pd.Series, length: int) -> np.ndarray:
    """Convert a Series to a fixed-length float numpy array.

    Shorter input is left-padded with NaN; longer input keeps only its last
    `length` values. A `length` of 0 yields an empty array.

    Args:
        s: Series whose values can be cast to float.
        length: Number of elements in the result.

    Returns:
        Float array with `length` elements.
    """
    # Convert to float so padding with np.nan works correctly
    values = s.astype(float).to_numpy()
    n_missing = length - len(values)
    if n_missing > 0:
        return np.pad(values, (n_missing, 0), constant_values=np.nan)
    # Slice from an explicit start offset: values[-length:] would return the
    # whole array when length == 0, because -0 is just 0.
    return values[len(values) - length :]

In [None]:
# Missing Ilumivu data
DYADS_TO_SKIP = [49]

# Collect one frame per dyad and concatenate once at the end; concatenating
# inside the loop would re-copy all previously accumulated rows each time.
dyad_frames = []
for dyad in tqdm(dyads_df.index):
    if dyad in DYADS_TO_SKIP:
        continue

    dyad_df = make_dyad_df(dyad)
    dyad_df["dyad"] = dyad
    dyad_frames.append(dyad_df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# every dyad was skipped.
data_df = pd.concat(dyad_frames, ignore_index=True) if dyad_frames else pd.DataFrame()

In [None]:
# Write the combined table for all processed dyads to DATA_DIR; index=False
# leaves the row index out of the CSV.
data_df.to_csv(DATA_DIR / "all_dyads.csv", index=False)