# Setup

In [37]:
import os
from google.colab import files
import shutil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

Load data from local folder

In [45]:
destination_folder = '/content/data'
uploaded = files.upload()
for filename in uploaded.keys():
    shutil.move(filename, os.path.join(destination_folder, filename))

Saving all_fitbit_data_daily_w_visit.csv to all_fitbit_data_daily_w_visit.csv


Setup fitbit_data

In [46]:
fitbit_data = pd.read_csv('/content/data/data_daily_w_visits.csv')

In [47]:
# Organizing columns
measure_cols = ['avgWeight_per_day', 'calories', 'heart', 'steps']
survey_cols = ['diet', 'medication', 'symptoms']
result_col = 'visit_day'

In [48]:
# Ensure date is datetime and sort
fitbit_data['date'] = pd.to_datetime(fitbit_data['date'])
fitbit_data = fitbit_data.sort_values(by=['fitbit_user_id', 'date'])

# Shift surveuy columns
for col in survey_cols:
  fitbit_data[col] = fitbit_data.groupby('fitbit_user_id')[col].shift(-1)

# User has visit -> all days up to visit are 1
for user_id, group in fitbit_data.groupby('fitbit_user_id'):
    if (group[result_col] == 1).any():
        # Set all rows for this user to 1 in the original dataframe
        fitbit_data.loc[fitbit_data['fitbit_user_id'] == user_id, result_col] = 1

# Set remaining nan visit columns to 0
fitbit_data[result_col] = fitbit_data[result_col].fillna(0)

In [None]:
fitbit_data

In [None]:
fitbit_data.info()

In [None]:
fitbit_data.describe()

In [None]:
fitbit_data[result_col]

In [None]:
fitbit_data[measure_cols + survey_cols + [result_col]].corr()

Data cleaning

In [22]:
# Remove rows that have too many missing values
def mark_day_for_removal(df, cols, max_nans, window):
    df = df.copy()
    df['remove'] = False

    for _, group in df.groupby('fitbit_user_id'):
        group = group.copy()

        for col in cols:
            n = len(group)
            for start in range(n):
                end = min(start + window, n)
                window_slice = group.iloc[start:end]

                # Count NaNs in this window for this column
                nan_count = window_slice[col].isna().sum()

                if nan_count >= max_nans:
                    # Mark rows with NaN in this column for removal
                    for idx in window_slice.index:
                        if pd.isna(group.at[idx, col]):
                            df.at[idx, 'remove'] = True

    return df

fitbit_data = mark_day_for_removal(fitbit_data, measure_cols+survey_cols, 3, 7)

# Impute columns
def impute_forward_fill(df, cols):
    df = df.copy()
    for _, group in df.groupby('fitbit_user_id'):
        group = group.copy()
        for col in cols:
            group[col] = group[col].ffill()  # forward fill
        df.loc[group.index, cols] = group[cols]
    return df

fb_data = impute_forward_fill(fitbit_data, measure_cols+survey_cols)

# Remove rows with missing values
def final_clean(df, cols):
    df = df.copy()

    # Remove rows marked True in the 'remove' column
    df = df[df['remove'] != True]

    # Remove rows that have any NaN in the specified columns
    df = df.dropna(subset=cols)

    return df

fb_data = final_clean(fb_data, measure_cols+survey_cols)

# Scale numerical columns
scaler = MinMaxScaler().fit(fb_data[measure_cols])
fb_data[measure_cols] = scaler.transform(fb_data[measure_cols])

Creating sequences

In [29]:
user_ids = fitbit_data['fitbit_user_id'].unique()
train_ids, test_ids = train_test_split(user_ids, test_size=0.2, random_state=42)

train_mask = fitbit_data['fitbit_user_id'].isin(train_ids)
test_mask  = fitbit_data['fitbit_user_id'].isin(test_ids)

sequence_len = 14
feature_cols = measure_cols + survey_cols

def make_sequences(df, user_ids, sequence_len):
    X, y = [], []

    for user_id in user_ids:
        group = df[df['fitbit_user_id'] == user_id].sort_values('date')
        features = group[feature_cols].values
        labels = group['visit_day'].values

        for i in range(len(group) - sequence_len + 1):
            X.append(features[i:i+sequence_len])
            y.append(labels[i + sequence_len - 1])  # label for last day in sequence

    return np.array(X), np.array(y)

# Use split user IDs
X_train, y_train = make_sequences(fitbit_data, train_ids, sequence_len)
X_test,  y_test  = make_sequences(fitbit_data, test_ids,  sequence_len)

In [33]:
X_train.shape

(33233, 14, 7)