## 1) Loading Data and Packages + Util

In [1]:
import os
from google.colab import files
import shutil

import math
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold, cross_val_score, RandomizedSearchCV, GridSearchCV, TunedThresholdClassifierCV, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, roc_curve, ConfusionMatrixDisplay, precision_recall_curve, average_precision_score, make_scorer, f1_score, precision_score, recall_score
from sklearn.metrics import get_scorer_names, balanced_accuracy_score
import matplotlib.pyplot as plt
from typing_extensions import final
import seaborn as sns
import random

from collections import OrderedDict

Load data from local folder

In [3]:
os.makedirs('/content/data', exist_ok=True)
destination_folder = '/content/data'
uploaded = files.upload()
for filename in uploaded.keys():
    shutil.move(filename, os.path.join(destination_folder, filename))

Saving all_fitbit_data_daily_w_visit.csv to all_fitbit_data_daily_w_visit.csv
Saving data_daily_w_visits.csv to data_daily_w_visits.csv
Saving adh.csv to adh.csv


Setup fitbit_data

In [4]:
fitbit_data = pd.read_csv('/content/data/data_daily_w_visits.csv')
all_fitbit_data = pd.read_csv('/content/data/all_fitbit_data_daily_w_visit.csv')
full_fb_data = pd.read_csv('/content/data/full_fb_data.csv')
adh_data = pd.read_csv('/content/data/adh.csv')

In [5]:
# Organizing columns
measure_features = ['calories', 'heart', 'steps']
survey_features = ['diet', 'medication', 'symptoms']
target_col = 'visit_day'
all_features = ['avgWeight_per_day',
       'calories', 'heart', 'steps', 'minutes_asleep', 'minutes_awake', 'temp/skin_nightlyRelative',
       'spo2_avg', 'spo2_min', 'spo2_max', 'hrv_dailyRmssd', 'hrv_deepRmssd',
       'br_breathingRate', 'out_of_range_zone_cal', 'out_of_range_zone_max_hr',
       'out_of_range_zone_min_hr', 'fat_burn_zone_cal', 'fat_burn_zone_max_hr',
       'fat_burn_zone_min_hr', 'cardio_zone_cal', 'cardio_zone_max_hr',
       'cardio_zone_min_hr', 'peak_zone_cal', 'peak_zone_max_hr',
       'peak_zone_min_hr', 'diet_response_0.0', 'diet_response_1.0',
       'diet_response_2.0', 'diet_response_nan', 'medication_response_0.0',
       'medication_response_1.0', 'medication_response_2.0',
       'medication_response_nan', 'symptoms_response_0.0',
       'symptoms_response_1.0', 'symptoms_response_2.0',
       'symptoms_response_nan']
one_h_feaatures = ['diet_response_0.0', 'diet_response_1.0',
       'diet_response_2.0', 'diet_response_nan', 'medication_response_0.0',
       'medication_response_1.0', 'medication_response_2.0',
       'medication_response_nan', 'symptoms_response_0.0',
       'symptoms_response_1.0', 'symptoms_response_2.0',
       'symptoms_response_nan']
adh_info = ['synced_fitbit', 'checked_weight',	'did_survey',	'fb_streak',
            'bt_streak',	'surv_streak', 'above_min_thresh',
            'percent_of_day_worn',	'above_thresh']

In [6]:
# Ensure date is datetime and sort
fitbit_data['date'] = pd.to_datetime(fitbit_data['date'])
fitbit_data = fitbit_data.sort_values(by=['fitbit_user_id', 'date'])
all_fitbit_data['date'] = pd.to_datetime(all_fitbit_data['date'])
all_fitbit_data = all_fitbit_data.sort_values(by=['fitbit_user_id', 'date'])
full_fb_data['date'] = pd.to_datetime(full_fb_data['date'])
full_fb_data = full_fb_data.sort_values(by=['fitbit_user_id', 'date'])

# Shift survey columns
for col in survey_features:
  fitbit_data[col] = fitbit_data.groupby('fitbit_user_id')[col].shift(-1)
  all_fitbit_data[col] = all_fitbit_data.groupby('fitbit_user_id')[col].shift(-1)
  full_fb_data[col] = full_fb_data.groupby('fitbit_user_id')[col].shift(-1)

# Set remaining nan visit columns to 0
fitbit_data[target_col] = fitbit_data[target_col].fillna(0)
all_fitbit_data[target_col] = all_fitbit_data[target_col].fillna(0)
full_fb_data[target_col] = full_fb_data[target_col].fillna(0)

In [10]:
full_fb_data['has_visit'] = 0

for user_id, group in full_fb_data.groupby('fitbit_user_id'):
    if group[target_col].sum() != 0:
        full_fb_data.loc[group.index, 'has_visit'] = 1

In [12]:
def zscore_userwise(df, features):
    df_z = df.copy()
    for col in features:
        means = df.groupby('fitbit_user_id')[col].transform('mean')
        stds  = df.groupby('fitbit_user_id')[col].transform('std')
        df_z[col + '_z'] = (df[col] - means) / stds
    return df_z

In [13]:
z_features = ['avgWeight_per_day', 'calories', 'heart', 'steps']

z_fb_data = zscore_userwise(full_fb_data, z_features)

In [15]:
def extract_days_before_visit(df, days_before=14):
    output_rows = []

    for user_id, group in df.groupby('fitbit_user_id'):
        group = group.reset_index(drop=True)
        visit_indices = group.index[group['visit_day'] == 1].tolist()

        if not visit_indices:
            continue  # Skip users with no visits

        first_visit_idx = visit_indices[0]
        visit_day = group.loc[first_visit_idx, 'days']

        # Slice window before visit
        visit_window = group[(group['days'] >= visit_day - days_before) & (group['days'] <= visit_day)].copy()

        # Add relative day to visit
        visit_window['days_to_visit'] = visit_window['days'] - visit_day
        visit_window['fitbit_user_id'] = user_id

        output_rows.append(visit_window)

    result_df = pd.concat(output_rows, axis=0).reset_index(drop=True)

    return result_df

In [24]:
def extract_days_no_visits(df, window_days=15, max_windows=3, seed=42):
    np.random.seed(seed)
    output_rows = []

    for user_id, group in df[df['has_visit'] == 0].groupby('fitbit_user_id'):
        group = group.reset_index(drop=True)

        min_day = group['days'].min()
        max_day = group['days'].max()

        possible_starts = list(range(min_day, max_day - window_days + 1))
        np.random.shuffle(possible_starts)

        windows_added = 0

        for start in possible_starts:
            end = start + window_days
            window = group[(group['days'] >= start) & (group['days'] < end)].copy()

            if len(window) == window_days:  # Complete window
                window['days_to_window'] = window['days'] - end + 1
                window['fitbit_user_id'] = user_id
                output_rows.append(window)
                windows_added += 1

            if windows_added >= max_windows:
                break

    return pd.concat(output_rows, axis=0).reset_index(drop=True)

In [25]:
two_week_visit = extract_days_before_visit(z_fb_data)
three_week_visit = extract_days_before_visit(z_fb_data, days_before=21)
two_week = extract_days_no_visits(z_fb_data)
three_week = extract_days_no_visits(z_fb_data, window_days=22)

In [26]:
two_week

Unnamed: 0.1,Unnamed: 0,date,study_group,fitbit_user_id,avgWeight_per_day,calories,heart,steps,diet,medication,...,surv_streak,above_min_thresh,percent_of_day_worn,above_thresh,has_visit,avgWeight_per_day_z,calories_z,heart_z,steps_z,days_to_window
0,135,2021-12-09,App Money,1,220.462000,2423.0,61.0,3149.0,,,...,0.0,0.0,0.675000,1,0,-1.030152,-0.843576,-1.189101,-0.909884,-14
1,136,2021-12-10,App Money,1,223.107544,2952.0,61.0,8310.0,2.0,1.0,...,0.0,0.0,0.684722,1,0,-0.405511,0.738222,-1.189101,1.229473,-13
2,137,2021-12-11,App Money,1,223.548468,2426.0,61.0,3424.0,2.0,2.0,...,1.0,0.0,0.631250,1,0,-0.301404,-0.834605,-1.189101,-0.795890,-12
3,138,2021-12-12,App Money,1,223.217775,3131.0,61.0,8669.0,,,...,2.0,0.0,0.616667,1,0,-0.379484,1.273462,-1.189101,1.378287,-11
4,139,2021-12-13,App Money,1,222.666620,2584.0,60.0,4924.0,2.0,2.0,...,0.0,0.0,0.672222,1,0,-0.509618,-0.362159,-1.751858,-0.174105,-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8905,44722,2024-12-09,No App,307,,2120.0,56.0,3160.0,,,...,,0.0,0.586111,1,0,,-0.879743,-1.576522,-1.127575,-4
8906,44723,2024-12-10,No App,307,,2851.0,56.0,4713.0,,,...,,0.0,0.600694,1,0,,0.376569,-1.576522,-0.558701,-3
8907,44724,2024-12-11,No App,307,201.502268,2168.0,57.0,3707.0,,,...,,0.0,0.578472,1,0,0.498515,-0.797249,-1.281920,-0.927205,-2
8908,44725,2024-12-12,No App,307,,3556.0,57.0,10924.0,,,...,,0.0,0.587500,1,0,,1.588196,-1.281920,1.716429,-1
