TODO

-   weekly performance, only show best model, weeks 0 to 20
-   run variations of prediction window (lookahead) + frequency, plot of sensitivity by frequency/lookahead
-   sensitivity + specificity + AUC+ROC
-   do weekly versions + trendline, not cumulative
-   for manuscript -- use terms e.g., "scheduled retraining", "closed loop"
-   why even do online learning -- capture "in situ" context, don't simply apply historical model to new cohort
-   experiment with oversampling recent data (or sample weighting, focus on the positive observations)
    -   look into regret minimization algorithms


In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Folder holding the study's CSV exports, resolved relative to the working directory.
DATA_DIR = Path("./pistachio_1_data")

# Load the combined per-dyad table that every later cell works from.
dyads_df = pd.read_csv(DATA_DIR.joinpath("all_dyads.csv"))

  dyads_df = pd.read_csv(DATA_DIR / "all_dyads.csv")


In [9]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Derive therapy-timing features and reduce the raw frame to model inputs.

    Steps, in order:

    1. Parse ``ActivityDateTime`` and ``Therapy Start`` and derive
       ``therapy_length_days``, ``day_of_week``, ``hour`` and ``therapy_week``.
    2. Drop the raw timestamp/metadata columns, the short-window heart-rate
       statistics and every column whose name starts with ``hr-prev``.
    3. Map columns whose only non-null values are ``"Y"``/``"N"`` to booleans.
    4. One-hot encode the remaining categorical columns (first level dropped).

    The input frame is left untouched; all derived columns are written to the
    returned frame only.

    Args:
        df: Raw table. Must contain ``ActivityDateTime``, ``Therapy Start`` and
            every column named in the drop list below.

    Returns:
        A new DataFrame ready to be split into features and labels.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    # Parse each timestamp column exactly once; every derived feature below
    # reuses these parsed series.
    activity_ts = pd.to_datetime(df["ActivityDateTime"])
    therapy_start = pd.to_datetime(df["Therapy Start"])
    days_in_therapy = (activity_ts - therapy_start).dt.days

    # Select features
    columns_to_drop = (
        [
            "ActivityDateTime",
            "DurationInSeconds",  # total active time
            "SleepMorningDate",
            "Diagnosis",
            "CDI start date",
            "PDI start date",
            "PDI end date",
            # "Medication ",
            "Type of medication",
            "Medication start date",
            "Week",
            "Therapy session",
            "Therapy Start",
            "Therapy End",
            "Education Status",
            "Parental Status",
            "Pre.ECBI",
            "Pre.ECBI.Prob",
            "Post.ECBI",
            "Post.ECBI.Prob",
            "QuitStudy",
            "Employment Status",
            "ParticipatingParent.Sex",
            "Parent-PhoneType",
            # Data that is only available as "real time data" (more battery use?) in Companion SDK
            # NOTE: Removing these actually improves model accuracy?
            "DistanceInMeters",
            "ActiveKilocalories",
            "METmins",
            "METavg",
            "activity_seconds_sedentary",
            "activity_seconds_active",
            "activity_seconds_highly_active",
        ]
        # moving window stats
        + [
            "hr_moving_avg_15m",
            "hr_moving_std_15m",
            "hr_moving_min_15m",
            "hr_moving_max_15m",
            "hr_moving_avg_30m",
            "hr_moving_std_30m",
            "hr_moving_min_30m",
            "hr_moving_max_30m",
            "hr_moving_avg_45m",
            "hr_moving_std_45m",
            "hr_moving_min_45m",
            "hr_moving_max_45m",
        ]
        # lagged heart-rate columns
        + [col for col in df.columns if col.startswith("hr-prev")]
    )

    # ``drop`` returns a new frame, so adding the derived columns afterwards
    # keeps the caller's frame unmodified while preserving the column order
    # (remaining originals first, derived features last).
    cleaned = df.drop(columns=columns_to_drop)
    cleaned["therapy_length_days"] = days_in_therapy
    cleaned["day_of_week"] = activity_ts.dt.dayofweek
    cleaned["hour"] = activity_ts.dt.hour
    cleaned["therapy_week"] = days_in_therapy // 7

    # Convert columns holding only 'Y'/'N' strings (nulls ignored) to
    # True/False; nulls are left as missing by ``map``.
    for col in cleaned.columns:
        if set(cleaned[col].dropna().unique()) == {"Y", "N"}:
            cleaned[col] = cleaned[col].map({"Y": True, "N": False})

    # Convert categorical columns to dummy variables
    return pd.get_dummies(cleaned, drop_first=True)


def prep_X_y(df: pd.DataFrame, response_column: str) -> tuple[pd.DataFrame, pd.Series]:
    """Split a cleaned frame into a feature matrix and an integer label vector.

    Args:
        df: Cleaned table containing all four ``tantrum_within_*`` label
            columns plus the ``Arm_Sham``, ``dyad`` and ``week`` columns.
        response_column: Name of the column to use as the label.

    Returns:
        ``(X, y)`` where ``X`` is ``df`` without the label and bookkeeping
        columns, and ``y`` is ``df[response_column]`` cast to ``int``.
    """
    # Every candidate label is removed from the features, not only the chosen
    # one, along with the columns that exist purely for indexing.
    non_feature_columns = (
        "tantrum_within_60m",
        "tantrum_within_45m",
        "tantrum_within_30m",
        "tantrum_within_15m",
        # Useful for indexing
        "Arm_Sham",
        "dyad",
        "week",
    )
    target = df[response_column].astype(int)
    features = df.drop(columns=list(non_feature_columns))
    return features, target

In [3]:
from sklearn.base import BaseEstimator


class HrModel(BaseEstimator):
    """Rule-based baseline that flags rows whose average heart rate is in a band.

    A row is predicted positive when the value in ``hr_column`` lies strictly
    between ``hr_lower`` and ``hr_upper``. Nothing is learned from data; the
    class exists so the rule can be used wherever a fitted estimator with
    ``predict``/``predict_proba`` is expected.

    Args:
        hr_column: Name of the column in ``X`` holding the heart-rate summary.
        hr_lower: Exclusive lower bound of the positive band.
        hr_upper: Exclusive upper bound of the positive band.

    The defaults reproduce the original hard-coded rule
    (``80 < hr_moving_avg_60m < 129``).
    """

    def __init__(self, hr_column="hr_moving_avg_60m", hr_lower=80, hr_upper=129):
        # Stored under the same names as the arguments so that
        # ``BaseEstimator.get_params``/``set_params``/``clone`` work.
        self.hr_column = hr_column
        self.hr_lower = hr_lower
        self.hr_upper = hr_upper

    def fit(self, X, y):
        """No-op: the rule has no learned state. Returns ``self``."""
        return self

    def predict(self, X):
        """Return a boolean mask, True where the heart rate is inside the band."""
        mean_hr = X[self.hr_column]
        return (mean_hr > self.hr_lower) & (mean_hr < self.hr_upper)

    def predict_proba(self, X):
        """Return hard 0/1 "probabilities" with shape ``(n_rows, 2)``.

        Column 1 is the positive-class indicator from ``predict``; column 0 is
        its complement.
        """
        positive = np.asarray(self.predict(X), dtype=float)
        return np.column_stack((1.0 - positive, positive))

In [4]:
cleaned_dyads_df = clean_data(dyads_df)

# Split on the "Arm_Sham" indicator column: flagged rows form the training
# set, the remaining rows form the test set.
df_train = cleaned_dyads_df.loc[cleaned_dyads_df["Arm_Sham"]]
df_test = cleaned_dyads_df.loc[~cleaned_dyads_df["Arm_Sham"]]

# Both splits use the 60-minute lookahead label as the response.
X_train, y_train = prep_X_y(df_train, "tantrum_within_60m")
X_test, y_test = prep_X_y(df_test, "tantrum_within_60m")

  pd.to_datetime(df["ActivityDateTime"]) - pd.to_datetime(df["Therapy Start"])
  pd.to_datetime(df["ActivityDateTime"]) - pd.to_datetime(df["Therapy Start"])


In [5]:
from flaml import AutoML
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import precision_score
from tqdm.auto import tqdm


# Week limit for the analysis (the notes above mention "weeks 0 to 20");
# NOTE(review): confirm how downstream cells apply it.
WEEK_CUTOFF = 20

# Settings for the FLAML AutoML search; both time values are in seconds.
automl_settings = dict(
    time_budget=60,  # seconds
    task="classification",
    metric="log_loss",
    estimator_list=["xgboost"],
    train_time_limit=10,  # seconds
    early_stop=True,
)
