In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import pandas as pd
from flaml import AutoML
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm
from xgboost import XGBClassifier

from util import engineer_features, prep_X_y

# Directory holding the input CSV, relative to the notebook's working directory.
DATA_DIR = Path("./pistachio_1_data")
dyads_df = pd.read_csv(DATA_DIR / "all_dyads.csv")

# Order rows chronologically. The sort key parses "ActivityDateTime" as
# datetimes for comparison only; the column itself is left as read from the CSV.
sorted_dyads_df = dyads_df.sort_values(by="ActivityDateTime", key=pd.to_datetime)

# `engineer_features` comes from the local `util` module. Later cells index its
# result by name (e.g. "index", "response", "stress") and concatenate the pieces
# column-wise, so it is presumably a mapping of name -> DataFrame — confirm in util.
cleaned_dyads_dfs = engineer_features(
    sorted_dyads_df, stress_lookback_days=0, sleep_days_to_keep=[1, 2]
)

In [None]:
import itertools

# Feature-set names grouped into broader "supersets". Every name listed here is
# used below as a key into `cleaned_dyads_dfs`.
supersets = {
    "watch": [
        "hr",
        "activity",
        "sleep",
        "stress",
        "overnight_hrv",
    ],
    "demographic": [
        "child_demo",
        "parent_demo",
    ],
    "medical": [
        "medical",
        "therapy",
    ],
}

# Suffix selecting the response column (`tantrum_within_<window>`). It is
# assigned once at cell level instead of on every loop iteration, so the name is
# always defined after this cell runs, independent of the loop body.
window = "30m"

# Maps ", ".join(<superset names>) -> ROC AUC on the held-out rows.
scores = {}

# Every non-empty combination of supersets. The upper bound follows the number
# of supersets so that adding or removing one keeps the enumeration complete.
combinations = [
    combination
    for r in range(1, len(supersets) + 1)
    for combination in itertools.combinations(supersets.keys(), r)
]

for superset_combination in tqdm(combinations):
    # Flatten the chosen supersets into one list of feature-set names.
    fs_subset = [
        fs for superset in superset_combination for fs in supersets[superset]
    ]

    # Column-wise join of the index and response frames with the chosen features.
    combined_df = pd.concat(
        [
            cleaned_dyads_dfs["index"],
            cleaned_dyads_dfs["response"],
        ]
        + [cleaned_dyads_dfs[fs] for fs in fs_subset],
        axis=1,
    )

    # Train on the rows where `Arm_Sham` is true and evaluate on the remaining
    # rows. `~` requires the column to be boolean — TODO confirm its dtype.
    df_train = combined_df[combined_df["Arm_Sham"]]
    df_test = combined_df[~combined_df["Arm_Sham"]]
    X_train, y_train = prep_X_y(df_train, f"tantrum_within_{window}")
    X_test, y_test = prep_X_y(df_test, f"tantrum_within_{window}")

    # Built inside the loop so every AutoML run receives its own settings
    # objects (in particular a fresh `estimator_list`).
    automl_settings = {
        "time_budget": 15,  # seconds
        "task": "classification",
        "metric": "log_loss",
        "estimator_list": ["xgboost"],
        "early_stop": True,
        "verbose": False,
    }
    automl = AutoML()
    automl.fit(X_train, y_train, **automl_settings)

    # Score with the probability of the second class column.
    y_pred_proba = automl.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, y_pred_proba)

    print(superset_combination, score)

    scores[", ".join(superset_combination)] = score

  0%|          | 0/7 [00:00<?, ?it/s]

('watch',) 0.6507570333451116
('demographic',) 0.6076651311622635
('medical',) 0.5581800286523741
('watch', 'demographic') 0.6369486903467712
('watch', 'medical') 0.6556223498915652
('demographic', 'medical') 0.5682957333194842
('watch', 'demographic', 'medical') 0.688656479966706


In [None]:
from pprint import pprint

# NOTE(review): `study` is not defined in any cell visible here. It is
# presumably an Optuna study created elsewhere — confirm that whatever defines
# it runs before this cell on a fresh kernel.


def param_diff(trial_a, trial_b):
    """Return the parameters whose values differ between two trials.

    Args:
        trial_a: Object exposing a ``params`` mapping.
        trial_b: Object exposing a ``params`` mapping.

    Returns:
        dict mapping each differing parameter name to the pair
        ``(value_in_trial_a, value_in_trial_b)``. A parameter that is missing
        from one trial is reported with ``None`` on that side.
    """
    params_a, params_b = trial_a.params, trial_b.params
    # trial_a's keys come first (in their own order), then keys only trial_b has.
    keys = list(params_a) + [k for k in params_b if k not in params_a]
    return {
        k: (params_a.get(k), params_b.get(k))
        for k in keys
        if params_a.get(k) != params_b.get(k)
    }


# Trials without a value cannot be ranked. Drop them before sorting so the sort
# key never has to compare None with a number.
scored_trials = [t for t in study.trials if t.value is not None]

# Collect up to 10 of the highest-valued trials with distinct parameter sets.
unique_params = set()
top_unique_trials = []
for t in sorted(scored_trials, key=lambda x: x.value, reverse=True):
    if len(top_unique_trials) == 10:
        break
    # Skip trials whose "hr" parameter is exactly False.
    if t.params["hr"] is False:
        continue

    # Sorted (name, value) pairs give an order-independent, hashable identity.
    params_tuple = tuple(sorted(t.params.items()))
    if params_tuple not in unique_params:
        unique_params.add(params_tuple)
        top_unique_trials.append(t)

for trial in top_unique_trials:
    print(f"AUC={trial.value:.4f}, Params={trial.params}")

print()
if top_unique_trials:
    print("Parameters of the best trial:")
    pprint(top_unique_trials[0].params)
    # Compare the best trial with up to five runners-up; slicing avoids an
    # IndexError when fewer than six unique trials were collected.
    for rank, other_trial in enumerate(top_unique_trials[1:6], start=2):
        diff = param_diff(top_unique_trials[0], other_trial)
        print(f"Difference between trial 1 and trial {rank}: {diff}")
else:
    print("No trials matched the selection criteria.")

In [None]:
import shap

# Use only the "stress" feature set for this model.
fs_subset = ["stress"]
frames = [cleaned_dyads_dfs["index"], cleaned_dyads_dfs["response"]]
frames += [cleaned_dyads_dfs[fs] for fs in fs_subset]
combined_df = pd.concat(frames, axis=1)

# Split rows on the `Arm_Sham` flag: flagged rows train, the rest test.
df_sham = combined_df[combined_df["Arm_Sham"]]
df_treat = combined_df[~combined_df["Arm_Sham"]]
df_train, df_test = df_sham, df_treat

# `window` is not assigned in this cell; it must already exist in the kernel
# namespace (an earlier cell sets it).
X_train, y_train = prep_X_y(df_train, f"tantrum_within_{window}")
X_test, y_test = prep_X_y(df_test, response_column=f"tantrum_within_{window}")

automl_settings = dict(
    time_budget=5,  # seconds
    task="classification",
    metric="log_loss",
    estimator_list=["xgboost"],
    early_stop=True,
    verbose=False,
)
automl = AutoML()
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

# FLAML is used here only to find a configuration: a fresh XGBClassifier is
# built from `automl.best_config` and refit on the training data.
model = XGBClassifier(**automl.best_config)
model.fit(X_train, y_train)

y_pred_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {roc_auc:.4f}")

# Explain the refit model on the test rows and show the top 15 features.
explainer = shap.Explainer(model)
shap_values = explainer(X_test)
# Note: Bar plot does not accept "group_remaining_features" argument
shap.plots.bar(shap_values, max_display=15)