In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import pandas as pd
from flaml import AutoML

from util import engineer_features, prep_X_y

DATA_DIR = Path("./pistachio_1_data")
dyads_df = pd.read_csv(DATA_DIR / "all_dyads.csv")

sorted_dyads_df = dyads_df.sort_values(
    by="ActivityDateTime", key=lambda x: pd.to_datetime(x)
)
cleaned_dyads_dfs = engineer_features(
    sorted_dyads_df,
    stress_lookback_days=0,
    sleep_days_to_keep=[1, 2],
)

feature_sets = {
    "index": True,
    "response": True,
    #
    "hr": True,
    "activity": True,
    "sleep": True,
    "stress": True,
    "overnight_hrv": True,
    "medical": True,
    "therapy": True,
    "child_demo": True,
    "parent_demo": True,
    "temporal": False,
}

df = pd.concat(
    [cleaned_dyads_dfs[key] for key, include in feature_sets.items() if include], axis=1
)
df_sham = df[df["Arm_Sham"]]
df_treat = df[~df["Arm_Sham"]]

  dyads_df = pd.read_csv(DATA_DIR / "all_dyads.csv")
  pd.to_datetime(df["ActivityDateTime"]) - pd.to_datetime(df["Therapy Start"])


In [2]:
df_train = df_sham
df_test = df_treat

In [3]:
import numpy as np
from sklearn.model_selection import KFold, PredefinedSplit

dyad_labels = df_train["dyad"]
kf = KFold(n_splits=5, shuffle=True, random_state=42)
folds = np.zeros(len(df_train), dtype=int)
for fold_idx, (_, val_idx) in enumerate(
    kf.split(np.zeros(len(dyad_labels)), dyad_labels)
):
    folds[val_idx] = fold_idx
cv = PredefinedSplit(folds)

In [None]:
import numpy as np

automl = AutoML()

groups = df_train["dyad"]
automl_settings = {
    "time_budget": 30,  # seconds
    "train_time_limit": 2,  # seconds
    "pred_time_limit": 1e-4,  # seconds
    "task": "classification",
    # "metric": "log_loss",
    "estimator_list": ["xgboost"],
    "early_stop": True,
    "split_type": cv,
    "retrain_full": False,
}


X_train, y_train = prep_X_y(df_train, "tantrum_within_60m")
automl.fit(
    X_train=X_train,
    y_train=y_train,
    **automl_settings,
)
print("Best config:", automl.best_config)

[flaml.automl.logger: 01-27 12:34:04] {1752} INFO - task = classification
[flaml.automl.logger: 01-27 12:34:04] {1763} INFO - Evaluation method: holdout
[flaml.automl.logger: 01-27 12:34:04] {1862} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 01-27 12:34:04] {1979} INFO - List of ML learners in AutoML Run: ['xgboost']
[flaml.automl.logger: 01-27 12:34:04] {2282} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 01-27 12:34:04] {2417} INFO - Estimated sufficient time budget=3683s. Estimated necessary time budget=4s.
[flaml.automl.logger: 01-27 12:34:04] {2466} INFO -  at 0.9s,	estimator xgboost's best error=0.3206,	best estimator xgboost's best error=0.3206
[flaml.automl.logger: 01-27 12:34:04] {2282} INFO - iteration 1, current learner xgboost
[flaml.automl.logger: 01-27 12:34:04] {2466} INFO -  at 1.0s,	estimator xgboost's best error=0.2880,	best estimator xgboost's best error=0.2880
[flaml.automl.logger: 01-27 12:34:04] {2282} INFO - iteration 2, cu

In [5]:
import shap
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# Fit the model if not already fitted

window = "60m"
X_train, y_train = prep_X_y(df_train, f"tantrum_within_{window}")
X_test, y_test = prep_X_y(df_test, response_column=f"tantrum_within_{window}")
model = XGBClassifier(**automl.best_config)
model.fit(X_train, y_train)

# Predict probabilities for the positive class
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Compute ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {roc_auc:.4f}")

ROC AUC: 0.6637


In [None]:
# Create SHAP explainer
explainer = shap.Explainer(model)
shap_values = explainer(X_test)
# Note: Bar plot does not accept "group_remaining_features" argument
shap.plots.bar(shap_values, max_display=15)

In [None]:
shap.plots.beeswarm(shap_values, max_display=15, group_remaining_features=False)

In [None]:
shap.plots.scatter(shap_values[:, "hr_moving_avg_10m"], color=shap_values)