In [4]:
from pathlib import Path

import numpy as np
import pandas as pd

from util import engineer_features, prep_X_y

DATA_DIR = Path("./pistachio_1_data")
dyads_df = pd.read_csv(DATA_DIR / "all_dyads.csv")

  dyads_df = pd.read_csv(DATA_DIR / "all_dyads.csv")


In [5]:
sorted_dyads_df = dyads_df.sort_values(
    by="ActivityDateTime", key=lambda x: pd.to_datetime(x)
)
cleaned_dyads_df = engineer_features(
    sorted_dyads_df,
    stress_lookback_days=0,
)
cleaned_dyads_df = pd.concat(list(cleaned_dyads_df.values()), axis=1)

df_train = cleaned_dyads_df[cleaned_dyads_df["Arm_Sham"]]
df_test = cleaned_dyads_df[~cleaned_dyads_df["Arm_Sham"]]

  pd.to_datetime(df["ActivityDateTime"]) - pd.to_datetime(df["Therapy Start"])


In [None]:
from flaml import AutoML
from sklearn.model_selection import TimeSeriesSplit

automl = AutoML()


automl_settings = {
    "time_budget": 15,  # seconds
    # "train_time_limit": 1,  # seconds
    "task": "classification",
    "metric": "log_loss",
    "estimator_list": ["xgboost"],
    # "split_type": time_series_split,
    "early_stop": True,
}


X_train, y_train = prep_X_y(df_train, "tantrum_within_60m")
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
print("Best config:", automl.best_config)

[flaml.automl.logger: 01-19 18:24:05] {1752} INFO - task = classification
[flaml.automl.logger: 01-19 18:24:05] {1763} INFO - Evaluation method: holdout
[flaml.automl.logger: 01-19 18:24:05] {1862} INFO - Minimizing error metric: log_loss
[flaml.automl.logger: 01-19 18:24:05] {1979} INFO - List of ML learners in AutoML Run: ['xgboost']
[flaml.automl.logger: 01-19 18:24:05] {2282} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 01-19 18:24:05] {2417} INFO - Estimated sufficient time budget=4246s. Estimated necessary time budget=4s.
[flaml.automl.logger: 01-19 18:24:05] {2466} INFO -  at 1.2s,	estimator xgboost's best error=0.0262,	best estimator xgboost's best error=0.0262
[flaml.automl.logger: 01-19 18:24:05] {2282} INFO - iteration 1, current learner xgboost
[flaml.automl.logger: 01-19 18:24:05] {2466} INFO -  at 1.2s,	estimator xgboost's best error=0.0262,	best estimator xgboost's best error=0.0262
[flaml.automl.logger: 01-19 18:24:05] {2282} INFO - iteration 2, cur

-   Task = predict tantrum within 60m
-   Test = 15m, 30m, 60m period


In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score

from hr_model import HrModel

positive_model = DummyClassifier(strategy="constant", constant=1).fit(X_train, y_train)
hr_model = y_hat_hr = HrModel()
model = automl

df_test_60m = df_test[df_test["ActivityDateTime"].dt.minute.isin([0])]
df_test_30m = df_test[df_test["ActivityDateTime"].dt.minute.isin([0, 30])]

for label, df in [
    ("15m", df_test),
    ("30m", df_test_30m),
    ("60m", df_test_60m),
]:
    print(f"Evaluating for prediction frequency: {label}")
    X_test, y_test = prep_X_y(df, "tantrum_within_60m")

    y_hat_positive = positive_model.predict(X_test)
    print(
        f"Positive Model (Control) ROC-AUC: {roc_auc_score(y_test, y_hat_positive):.4f}"
    )

    y_hat_hr = hr_model.predict_proba(X_test)
    print(f"HR Model ROC-AUC: {roc_auc_score(y_test, y_hat_hr[:, 1]):.4f}")

    y_hat = model.predict_proba(X_test)
    print(f"XGB ROC-AUC: {roc_auc_score(y_test, y_hat[:, 1]):.4f}")
    print()

Evaluating for prediction frequency: 15m
Positive Model (Control) ROC-AUC: 0.5000
HR Model ROC-AUC: 0.5560
XGB ROC-AUC: 0.6915

Evaluating for prediction frequency: 30m
Positive Model (Control) ROC-AUC: 0.5000
HR Model ROC-AUC: 0.5536
XGB ROC-AUC: 0.6880

Evaluating for prediction frequency: 60m
Positive Model (Control) ROC-AUC: 0.5000
HR Model ROC-AUC: 0.5617
XGB ROC-AUC: 0.6921



Now, fix frequency at 15m and test 15m, 30m, 60m lookahead windows


In [11]:
for window in ["15m", "30m", "60m"]:
    automl = AutoML()
    X_train, y_train = prep_X_y(df_train, f"tantrum_within_{window}")
    automl.fit(X_train=X_train, y_train=y_train, verbose=False, **automl_settings)

    print(f"Evaluating for lookahead window: {window}")
    X_test, y_test = prep_X_y(df_test, response_column=f"tantrum_within_{window}")
    print(f"Total positive samples: {y_test.sum()} out of {len(y_test)}")

    y_hat = model.predict_proba(X_test)
    y_hat_positive = positive_model.predict(X_test)
    y_hat_hr = hr_model.predict_proba(X_test)
    y_hat = model.predict_proba(X_test)

    print(
        f"Positive Model (Control) ROC-AUC: {roc_auc_score(y_test, y_hat_positive):.4f}"
    )
    print(f"HR Model ROC-AUC: {roc_auc_score(y_test, y_hat_hr[:, 1]):.4f}")
    print(f"XGB ROC-AUC: {roc_auc_score(y_test, y_hat[:, 1]):.4f}")
    print()


Evaluating for lookahead window: 15m
Total positive samples: 159 out of 255981
Positive Model (Control) ROC-AUC: 0.5000
HR Model ROC-AUC: 0.5260
XGB ROC-AUC: 0.6901

Evaluating for lookahead window: 30m
Total positive samples: 313 out of 255981
Positive Model (Control) ROC-AUC: 0.5000
HR Model ROC-AUC: 0.5249
XGB ROC-AUC: 0.6954

Evaluating for lookahead window: 60m
Total positive samples: 612 out of 255981
Positive Model (Control) ROC-AUC: 0.5000
HR Model ROC-AUC: 0.5220
XGB ROC-AUC: 0.6915

