## Abstract

First Data for Passive Data Collection using Smartwatches and GPS from the PREACT Study. 

## Introduction

Treatment personalization is widely discussed as a way to counteract insufficient response rates in psychotherapy. In the quest for criteria that allow informed treatment selection or adaptation, ambulatory assessment data (e.g., EMA, passive sensing) are a key component, as processes happening outside of therapy sessions can be captured in high temporal and/or spatial resolution.

PREACT is a multicenter prospective-longitudinal study investigating different predictors of non-response (e.g., EEG, fMRI) in around 500 patients undergoing cognitive behavioral therapy for internalizing disorders (https://forschungsgruppe5187.de/de).

## Methods
Patients can enroll for therapy-accompanying ambulatory assessment. They are provided with a customized study app and a state-of-the-art smartwatch collecting passive data like GPS and heart rate for up to 365 days. In parallel, three 14-day EMA phases (pre-, mid- and post-therapy) cover transdiagnostic (e.g., emotion regulation), contextual and therapy-related aspects.

Here, we present first results on data compliance and quality for the passive sensing data as well as EMA assessments.


In [1]:
import os
import glob
import pickle
import sys
# If your current working directory is the notebooks directory, use this:
notebook_dir = os.getcwd()  # current working directory
src_path = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))
sys.path.append(src_path)

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(parent_dir)

import pandas as pd
import datetime as dt
from datetime import date, datetime
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import regex as re


from server_config import datapath, proj_sheet,preprocessed_path, raw_path, redcap_path, preprocessed_path_freezed

# Today's date as a DDMMYYYY string, and today's date as a midnight Timestamp.
today = date.today().strftime("%d%m%Y")
today_day = pd.to_datetime('today').normalize()
# NOTE(review): this overrides the date string computed above, so the export
# folder built from `today` further down is always the one stamped 24032025.
# Presumably a deliberate pin to one export; confirm before running on newer data.
today = "24032025"

# Read the monitoring spreadsheet through its CSV export URL.
df_monitoring = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{proj_sheet}/export?format=csv")

In [2]:
# actual passive + ema_data
# Load every "epoch_part*.csv" file of the selected export folder and stack
# them into a single frame.
datapath1 = raw_path + f"/export_tiki_{today}/"
file_pattern = os.path.join(datapath1, "epoch_part*.csv")
# glob gives no ordering guarantee; sort so the row order is reproducible.
file_list = sorted(glob.glob(file_pattern))
if not file_list:
    # pd.concat on an empty sequence only says "No objects to concatenate";
    # report the pattern that was searched instead.
    raise FileNotFoundError(f"No export files match {file_pattern!r}")
df_complete = pd.concat(
    (pd.read_csv(f, encoding="latin-1", low_memory=False) for f in file_list),
    ignore_index=True,
)

In [3]:
### 1.1 Import epoch level passive + GPS data

# Customer identifier: the text before the first "@", cut to four characters.
customer_prefix = df_complete["customer"].str.split("@").str.get(0)
df_complete["customer"] = customer_prefix.str[:4]

# Interpret both interval boundaries as milliseconds since the Unix epoch.
for ts_col in ("startTimestamp", "endTimestamp"):
    df_complete[ts_col] = pd.to_datetime(df_complete[ts_col], unit='ms')

In [4]:
## Timezone offset adjustment

# Missing offsets count as 0 ms; the filled offset is then converted to a
# timedelta so it can be added to the timestamps. Both helper columns are
# kept on the frame here and dropped in a later cell.
df_complete['timezoneOffset_filled'] = df_complete['timezoneOffset'].fillna(0)
df_complete['timezoneOffset_timedelta'] = pd.to_timedelta(
    df_complete['timezoneOffset_filled'], unit='ms'
)

# Shift both interval boundaries by the per-row offset.
for ts_col in ('startTimestamp', 'endTimestamp'):
    df_complete[ts_col] = df_complete[ts_col] + df_complete['timezoneOffset_timedelta']

# Interval length, computed from the shifted boundaries.
df_complete["start_end"] = df_complete["endTimestamp"] - df_complete["startTimestamp"]

In [5]:
# Interval length as float seconds instead of a Timedelta.
df_complete['start_end'] = df_complete['start_end'].dt.total_seconds()

# Midnight-normalized day and hour of day of each interval start.
interval_start = df_complete["startTimestamp"]
df_complete["startTimestamp_day"] = interval_start.dt.normalize()
df_complete["startTimestamp_hour"] = interval_start.dt.hour

# The offset helper columns have served their purpose.
df_complete.drop(columns=['timezoneOffset_filled', 'timezoneOffset_timedelta'], inplace=True)


In [6]:
# Load the previously saved passive-data backup (feather file in the
# preprocessed folder); the new export is appended to it further down.
backup_path = preprocessed_path + "/backup_data_passive_actual.feather"

df_backup = pd.read_feather(backup_path)

In [7]:
# Latest interval start already contained in the backup.
latest_timestamp = df_backup['startTimestamp'].max()

# Keep only rows of the new export that start strictly after that timestamp.
# NOTE(review): the cutoff is one global value, not one per customer, so rows
# of any participant that are older than the newest row in the backup are
# dropped here — confirm that late-arriving data cannot occur.
df_complete_filtered = df_complete[df_complete['startTimestamp'] > latest_timestamp]

### 

In [8]:
# Export metadata columns that are not carried into the merged data set.
unused_export_cols = [
    'valueType', 'createdAt', 'source',
    'trustworthiness', 'medicalGrade', 'generation',
]
df_complete_filtered = df_complete_filtered.drop(columns=unused_export_cols)


## Monitoring data

In [9]:
# Work on a copy of the downloaded sheet and map its column headers to the
# snake_case names used in the rest of the notebook.
df_monitoring = df_monitoring.copy()
df_monitoring.rename(columns = {"Pseudonym": "customer", "EMA_ID": "ema_id", "Status": "status",
                                "Studienversion":"study_version", "FOR_ID":"for_id", 
                           "Start EMA Baseline": "ema_base_start", "Ende EMA Baseline": "ema_base_end", 
                           "Freischaltung/ Start EMA T20": "ema_t20_start","Ende EMA T20":"ema_t20_end", 
                                "Freischaltung/ Start EMA Post":"ema_post_start",
                               "Ende EMA Post":"ema_post_end", "T20=Post":"t20_post" }, inplace=True)

# Take an explicit copy of the column subset: the assignments below would
# otherwise write into a slice of the full sheet (SettingWithCopyWarning).
df_monitoring = df_monitoring[['for_id', 'ema_id', 'customer', 'study_version', 'status',
       't20_post', 'ema_base_start', 'ema_base_end', 'ema_t20_start', 'ema_t20_end',
       'ema_post_start', 'ema_post_end']].copy()

# Same four-character customer key as used for the passive data.
df_monitoring["customer"] = df_monitoring["customer"].str[:4]
df_monitoring["for_id"] = df_monitoring.for_id.str.strip()

# Only the baseline window is parsed to datetime here (day-first dates); the
# T20 and post columns keep whatever type the sheet delivered.
df_monitoring["ema_base_start"] = pd.to_datetime(df_monitoring["ema_base_start"], dayfirst=True)
df_monitoring["ema_base_end"] = pd.to_datetime(df_monitoring["ema_base_end"], dayfirst=True)

# Reduced view that is merged onto the passive data.
df_monitoring_short = df_monitoring[["customer", "for_id","ema_id","status", "study_version", "ema_base_start","ema_base_end"]]


## Passive data

In [10]:
# Right merge: every participant of the monitoring sheet is kept, even when
# the new export holds no passive rows for them (their passive columns are
# then NaN); passive rows of customers missing from the sheet are dropped.
df_complete_filtered= df_complete_filtered.merge(df_monitoring_short, on="customer", how="right")

In [11]:
object_cols = ["booleanValue", "stringValue","customer", "type", "status", "study_version"]


def _to_bool(value):
    """Turn the -99 missing marker into False, anything else into bool(value)."""
    if value == -99:
        return False
    return bool(value)


# -99 marks missing entries in all listed columns.
for name in object_cols:
    df_complete_filtered[name] = df_complete_filtered[name].fillna(-99)

# booleanValue becomes a plain boolean; missing entries count as False.
df_complete_filtered['booleanValue'] = df_complete_filtered['booleanValue'].apply(_to_bool)

# The remaining columns are stored with pandas' StringDtype.
for name in ('stringValue', 'status', 'study_version', 'customer', 'type'):
    df_complete_filtered[name] = df_complete_filtered[name].astype('string')


  df_complete_filtered[col] = df_complete_filtered[col].fillna(-99)


In [12]:
# Data group name -> list of export `type` values assigned to that group.
# Add more groups as needed.
data_type_groups = dict(
    GPS=["Latitude"],
    Activity=["Steps"],
    Sleep=["SleepBinary"],
    Heart_Rate=["HeartRate"],
)

## EMA data

#### 1. Load and match relevant data from separate .csv files

In [13]:
# Read the five EMA tables from the export folder, in this fixed order.
session, answers, choice, questions, questionnaire = (
    pd.read_csv(datapath1 + csv_name, low_memory=False)
    for csv_name in (
        "questionnaireSession.csv",
        "answers.csv",
        "choice.csv",
        "questions.csv",
        "questionnaires.csv",
    )
)


In [14]:
# session data
# Four-character customer key, then harmonized column names.
session["user"] = session["user"].str[:4]
session = session.rename(columns={
    "user": "customer",
    "completedAt": "quest_complete",
    "createdAt": "quest_create",
    "expirationTimestamp": "quest_expir",
})
# Creation and completion times are given in milliseconds since the epoch.
for ts_col in ("quest_create", "quest_complete"):
    session[ts_col] = pd.to_datetime(session[ts_col], unit='ms')

df_sess = session[["customer", "sessionRun", "quest_create", "quest_complete", "study"]]

In [15]:
# answer data
# Four-character customer key, as for the other tables.
answers["user"] = answers["user"].str[:4]
# Explicit copy of the column subset so the assignment below does not write
# into a slice of the original table (SettingWithCopyWarning).
answers = answers[["user", "questionnaire", "study", "question","element", "createdAt"]].copy()
# Answer time is given in milliseconds since the epoch.
answers["createdAt"] = pd.to_datetime(answers["createdAt"], unit='ms')
answers = answers.rename(columns={"user":"customer", "createdAt": "quest_create"})

In [16]:
# item description data
# Select and rename in one expression; an in-place rename on the column
# subset would operate on a slice and trigger SettingWithCopyWarning.
choice = choice[["element", "choice_id", "text", "question"]].rename(
    columns={"text": "choice_text"}
)

In [17]:
# question description data
# Select and rename in one expression; an in-place rename on the column
# subset would operate on a slice and trigger SettingWithCopyWarning.
questions = questions[["id", "title"]].rename(
    columns={"id": "question", "title": "quest_title"}
)

In [18]:
# questionnaire description data
# Select and rename in one expression; an in-place rename on the column
# subset would operate on a slice and trigger SettingWithCopyWarning.
questionnaire = questionnaire[["id", "name"]].rename(
    columns={"id": "questionnaire", "name": "questionnaire_name"}
)

In [19]:
# Attach choice text, question title and questionnaire name to every answer.
# All three are inner joins, so answers without a match are dropped.
answer_merged = (
    answers
    .merge(choice, on=["question", "element"])
    .merge(questions, on="question")
    .merge(questionnaire, on="questionnaire")
)
# Midnight-normalized day on which the answer was given.
answer_merged["quest_create_day"] = answer_merged["quest_create"].dt.normalize()

In [20]:
answer_merged

Unnamed: 0,customer,questionnaire,study,question,element,quest_create,choice_id,choice_text,quest_title,questionnaire_name,quest_create_day
0,APbN,56,25,315,1707.0,2023-04-27 08:07:14.748,1,1,panas_selfassurance,TIKI_1A_E1,2023-04-27
1,APbN,56,25,316,1720.0,2023-04-27 08:07:15.748,7,7,panas_joviality2,TIKI_1A_E1,2023-04-27
2,APbN,56,25,317,1726.0,2023-04-27 08:07:16.645,6,6,panas_fatigue,TIKI_1A_E1,2023-04-27
3,APbN,56,25,318,1734.0,2023-04-27 08:07:17.559,7,7,panas_joviality1,TIKI_1A_E1,2023-04-27
4,APbN,56,25,319,1740.0,2023-04-27 08:07:18.516,6,6,panas_fear1,TIKI_1A_E1,2023-04-27
...,...,...,...,...,...,...,...,...,...,...,...
1152831,M7TE,123,38,358,1968.0,2025-03-23 22:51:06.659,4,4,ta_behavioral_2,TIKI_8_E2_S2,2025-03-23
1152832,M7TE,123,38,359,1976.0,2025-03-23 22:51:10.600,5,5,ta_kognitiv,TIKI_8_E2_S2,2025-03-23
1152833,M7TE,123,38,360,1983.0,2025-03-23 22:51:14.085,5,5,ta_kognitiv_2,TIKI_8_E2_S2,2025-03-23
1152834,M7TE,123,38,357,1961.0,2025-03-23 22:51:16.327,4,4,ta_behavioral,TIKI_8_E2_S2,2025-03-23


In [21]:
# Add the monitoring columns to every answer (inner join on customer:
# answers of customers missing from the monitoring sheet are dropped).
answer_merged = pd.merge(answer_merged, df_monitoring, on = "customer")

#### 2. Calculate EMA coverage

In [22]:
# Add the monitoring columns to every questionnaire session (inner join on
# customer: sessions of customers missing from the sheet are dropped).
df_sess = pd.merge(df_sess, df_monitoring, on = "customer")

In [23]:
# Fix the column order of the session table: session fields first, then the
# monitoring fields.
df_sess = df_sess[['customer', 'sessionRun', 'quest_create', 'quest_complete', 'study',
       'for_id', 'ema_id', 'study_version', 'status', 't20_post',
       'ema_base_start', 'ema_base_end','ema_t20_start', 'ema_t20_end',
       'ema_post_start', 'ema_post_end']]

In [24]:
# Independent copy, then day (normalized to midnight) and hour of day for
# both the creation and the completion time of each session.
df_sess = df_sess.copy()
created_at = df_sess["quest_create"]
completed_at = df_sess["quest_complete"]

df_sess["quest_create_day"] = created_at.dt.normalize()
df_sess["quest_complete_day"] = completed_at.dt.normalize()

df_sess["quest_create_hour"] = created_at.dt.hour
df_sess["quest_complete_hour"] = completed_at.dt.hour

In [25]:
# Split the sessions into three independent frames by study id:
# 24/25 -> df_sess1, 33/34 -> df_sess2, 38/39 -> df_sess3.
df_sess1 = df_sess[df_sess["study"].isin([24, 25])].copy()
df_sess2 = df_sess[df_sess["study"].isin([33, 34])].copy()
df_sess3 = df_sess[df_sess["study"].isin([38, 39])].copy()

In [26]:
# Completion day of each first-phase session, in days since the baseline start.
df_sess1['quest_complete_relative1'] = (df_sess1['quest_complete_day'] - df_sess1['ema_base_start']).dt.days


# count number of EMA sessions per customer in first phase
# NOTE(review): rows are counted by non-null quest_create (session creation);
# quest_complete is not checked, so these may not be *completed* beeps only.
sess_count1 = df_sess1.dropna(subset=["quest_create"]).groupby("customer")["quest_create"].size()\
.reset_index()
sess_count1 = sess_count1.rename(columns = {"quest_create":"nquest_EMA1"})

# count number of EMA sessions per customer in second phase
sess_count2 = df_sess2.dropna(subset=["quest_create"]).groupby("customer")["quest_create"].size()\
.reset_index()
sess_count2 = sess_count2.rename(columns = {"quest_create":"nquest_EMA2"})

# count number of EMA sessions per customer in third phase
sess_count3 = df_sess3.dropna(subset=["quest_create"]).groupby("customer")["quest_create"].size()\
.reset_index()
sess_count3 = sess_count3.rename(columns = {"quest_create":"nquest_EMA3"})

In [27]:
# Attach the three per-customer session counts; customers without sessions
# in a phase get NaN for that phase's count (left joins).
for phase_counts in (sess_count1, sess_count2, sess_count3):
    df_sess = df_sess.merge(phase_counts, on=['customer'], how='left')

#### 3. Calculate auxiliary variables

In [28]:
# Independent copy of the merged answers; all auxiliary variables below are
# added to this frame.
df_ema_content = answer_merged.copy()

In [29]:
import pandas as pd
import numpy as np

# Assuming df_ema_content is your DataFrame and is already loaded

### 1. Date and Time Manipulations

# Weekday name and calendar day (floored to midnight) of each answer.
df_ema_content['weekday'] = df_ema_content['quest_create'].dt.day_name()
df_ema_content['createdAt_day'] = df_ema_content['quest_create'].dt.floor('D')

# Parse the three phase start dates as day-first dates; entries that cannot
# be parsed become NaT (errors='coerce').
date_cols = ['ema_base_start', 'ema_t20_start', 'ema_post_start']
for col in date_cols:
    df_ema_content[col] = pd.to_datetime(df_ema_content[col], dayfirst=True, errors='coerce')

# **Additions Start Here**

### 1a. Calculate "Season"

def get_season(month):
    """Return the season name for a month number.

    3-5 -> 'Spring', 6-8 -> 'Summer', 9-11 -> 'Fall'; every other value
    (including 12, 1, 2 and NaN) -> 'Winter'.
    """
    season_months = (
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    for season, months in season_months:
        if month in months:
            return season
    return 'Winter'

# Season of each answer, derived from the month of its creation time.
df_ema_content['season'] = df_ema_content['quest_create'].dt.month.apply(get_season)

### 1b. Calculate "Time of Day"

def get_time_of_day(hour):
    """Return the time-of-day label for an hour of the day.

    [5, 8) -> 'Early Morning', [8, 12) -> 'Morning', [12, 17) -> 'Afternoon',
    [17, 21) -> 'Evening'; everything else (including NaN) -> 'Night'.
    """
    # Anything outside 05:00-20:59 is night; this also catches NaN, for
    # which every comparison is False.
    if not (5 <= hour < 21):
        return 'Night'
    if hour < 8:
        return 'Early Morning'
    if hour < 12:
        return 'Morning'
    if hour < 17:
        return 'Afternoon'
    return 'Evening'

# Time-of-day label of each answer, derived from the hour of its creation time.
df_ema_content['time_of_day'] = df_ema_content['quest_create'].dt.hour.apply(get_time_of_day)

# **Additions End Here**

### 2. Study Mapping and String Manipulation

# Study id -> assessment phase code (0, 1, 2). Study ids not listed here
# map to NaN in 'assess'.
study_mapping = {
    24: 0,
    25: 0,
    33: 1,
    34: 1,
    38: 2,
    39: 2
}
df_ema_content['assess'] = df_ema_content['study'].map(study_mapping)
# Strip the literal "_morning" from question titles.
df_ema_content['quest_title'] = df_ema_content['quest_title'].str.replace('_morning', '', regex=False)

### 3. Weekend Indicator

# 1 for Saturday/Sunday, 0 otherwise.
df_ema_content['weekend'] = df_ema_content['weekday'].isin(['Saturday', 'Sunday']).astype(int)

### 4. Extract Questionnaire Number

# First run of digits in the questionnaire name, stored as float
# (NaN when the name contains no digits).
df_ema_content['quest_nr'] = df_ema_content['questionnaire_name'].str.extract(r'(\d+)')
df_ema_content['quest_nr'] = df_ema_content['quest_nr'].astype(float)

### 5. Count of Unique Questionnaires per Day

# Number of distinct questionnaire names per study, customer and day.
df_ema_content['n_quest'] = df_ema_content.groupby(
    ['study', 'customer', 'createdAt_day']
)['questionnaire_name'].transform('nunique')

### 6. Create Unique Day Identifier

# "<YYYYMMDD>_<quest_nr>"; because quest_nr is a float the suffix is rendered
# with a decimal point (e.g. "1.0"), or "unknown" when quest_nr is missing.
df_ema_content['quest_nr_str'] = df_ema_content['quest_nr'].fillna('unknown').astype(str)
df_ema_content['unique_day_id'] = df_ema_content['createdAt_day'].dt.strftime('%Y%m%d') + '_' + df_ema_content['quest_nr_str']

### 7. Compute Relative Start and End Dates per Phase and Customer

# First and last day with answers for each (customer, phase) pair.
# Rows with a missing 'assess' are left out by the groupby.
phase_dates = df_ema_content.groupby(['customer', 'assess']).agg(
    ema_relative_start=('createdAt_day', 'min'),
    ema_relative_end=('createdAt_day', 'max')
).reset_index()

# Pivot the data to get phase-specific columns (one row per customer)
phase_dates_pivot = phase_dates.pivot(
    index='customer', 
    columns='assess', 
    values=['ema_relative_start', 'ema_relative_end']
)

# Flatten MultiIndex columns into names such as "ema_relative_start_phase0"
phase_dates_pivot.columns = [f"{col[0]}_phase{int(col[1])}" for col in phase_dates_pivot.columns]
phase_dates_pivot = phase_dates_pivot.reset_index()

# Merge the phase-specific dates back into the main DataFrame, so every row
# carries the start/end of all phases of its customer
df_ema_content = df_ema_content.merge(
    phase_dates_pivot, on='customer', how='left'
)

### 8. Calculate Absolute and Relative Day Indices

# Phase code in 'assess' -> column holding the first observed day of that phase.
assess_to_start_col = {
    0: 'ema_relative_start_phase0',
    1: 'ema_relative_start_phase1',
    2: 'ema_relative_start_phase2'
}

# For every row pick the start day of the row's own phase. This is done
# column-wise with one boolean mask per phase rather than with a row-wise
# DataFrame.apply, which is very slow on a frame with this many rows.
# Rows whose 'assess' is missing, or whose phase column does not exist
# (no customer has data in that phase), stay NaT.
phase_start = pd.Series(
    pd.NaT, index=df_ema_content.index, dtype=df_ema_content['createdAt_day'].dtype
)
for phase_code, start_col in assess_to_start_col.items():
    if start_col in df_ema_content.columns:
        phase_start = phase_start.mask(
            df_ema_content['assess'] == phase_code, df_ema_content[start_col]
        )
df_ema_content['ema_relative_start'] = phase_start

# Absolute day index: calendar days since the phase start, 1-based.
df_ema_content['absolute_day_index'] = (
    df_ema_content['createdAt_day'] - df_ema_content['ema_relative_start']
).dt.days + 1

# Relative day index: dense rank of the day among the days with answers of
# the same customer and phase (days without answers are not counted).
# NOTE(review): astype(int) raises if any row has a missing 'assess', because
# its rank is NaN.
df_ema_content['relative_day_index'] = df_ema_content.groupby(
    ['customer', 'assess']
)['createdAt_day'].rank(method='dense').astype(int)

### 9. Remove Entries with Absolute Day Index > 16

# Define the maximum allowed absolute day index
max_allowed_days = 16

# Filter the DataFrame to keep only entries with absolute_day_index <= 16.
# Rows with a missing absolute_day_index are dropped as well, because a
# comparison with NaN is False.
df_ema_content = df_ema_content[df_ema_content['absolute_day_index'] <= max_allowed_days]

# Optionally, reset the index after filtering
df_ema_content.reset_index(drop=True, inplace=True)

### 10. Check for High Absolute Day Indices (Post-Filtering)

# Verify that no entries have absolute_day_index > 16
# NOTE(review): this runs directly after the filter above, so the selection
# is necessarily empty and the warning branch cannot be reached.
high_indices = df_ema_content[df_ema_content['absolute_day_index'] > max_allowed_days]

if not high_indices.empty:
    print("Warning: Some entries still have unexpectedly high absolute day indices:")
    print("Customers with high absolute day indices:")
    print(high_indices['customer'].unique())
else:
    print("All entries have absolute_day_index <= 16.")

### 11. Calculate Questionnaire Counter

# One row per (customer, phase, unique_day_id), numbered 1, 2, 3, ... within
# each customer and phase in the current row order of the frame.
df_unique = df_ema_content.drop_duplicates(subset=['customer', 'assess', 'unique_day_id']).copy()
df_unique['questionnaire_counter'] = df_unique.groupby(['customer', 'assess']).cumcount() + 1

# Copy the counter back onto every answer row of the same questionnaire/day.
df_ema_content = df_ema_content.merge(
    df_unique[['customer', 'assess', 'unique_day_id', 'questionnaire_counter']],
    on=['customer', 'assess', 'unique_day_id'],
    how='left'
)

### 12. Handle Missing Data

# Label missing phases as 'unknown' and blank the absolute day index where no
# phase start is known.
# NOTE(review): the <= 16 filter in step 9 already removed every row with a
# missing absolute_day_index, so both statements presumably have nothing
# left to act on — confirm.
df_ema_content['assess'] = df_ema_content['assess'].fillna('unknown')
df_ema_content['absolute_day_index'] = df_ema_content['absolute_day_index'].where(
    df_ema_content['ema_relative_start'].notna(), np.nan
)

# **Optional: View the Updated DataFrame**
# print(df_ema_content.head())


All entries have absolute_day_index <= 16.


In [30]:
# Rows recorded under customer 'UfMn' in study 25 after 2024-02-08 belong to
# the wrong individual and are removed.
is_customer = df_ema_content['customer'].eq('UfMn')
is_study = df_ema_content['study'].eq(25)
is_after_cutoff = df_ema_content['quest_create'].gt('2024-02-08')
filter_criteria = is_customer & is_study & is_after_cutoff

# Keep everything that does not match all three conditions.
df_ema_content = df_ema_content[~filter_criteria]

In [31]:
# One row per customer with the observed start/end day of each EMA phase.
phase_window_cols = [
    f'ema_relative_{edge}_phase{phase}'
    for phase in (0, 1, 2)
    for edge in ('start', 'end')
]
df_ema_base = df_ema_content[["customer", *phase_window_cols]].drop_duplicates()

In [32]:
# Attach the per-customer EMA phase windows to the new passive rows
# (left join: passive rows of customers without EMA data keep NaT windows).
df_complete_ema = df_complete_filtered.merge(df_ema_base, on = "customer", how="left")

In [33]:
# Append the new rows to the backup to obtain the full passive data set.
# NOTE(review): df_complete_ema stems from a right merge with the monitoring
# sheet, so it can hold one placeholder row (no passive values) per
# participant without new data; those rows are appended here on every run —
# confirm this is intended.
df_complete_ema_final = pd.concat([df_backup, df_complete_ema], ignore_index=True)

In [34]:
df_complete_ema_final.loc[df_complete_ema_final.type.isin(["HeartRate"])].head()

Unnamed: 0,customer,type,startTimestamp,endTimestamp,doubleValue,longValue,booleanValue,dateValue,stringValue,userReliability,...,Heart_Rate_actual_days_with_data,Heart_Rate_data_coverage_per,ema_relative_start_phase0,ema_relative_end_phase0,ema_relative_start_phase1,ema_relative_end_phase1,ema_relative_start_phase2,ema_relative_end_phase2,for_id,ema_id
3,4MLe,HeartRate,2023-05-17 18:58:01,2023-05-17 18:58:38,,74.0,False,,-99,,...,343.0,72.362869,NaT,NaT,NaT,NaT,NaT,NaT,,
22,4MLe,HeartRate,2023-05-17 19:18:03,2023-05-17 19:18:42,,64.0,False,,-99,,...,343.0,72.362869,NaT,NaT,NaT,NaT,NaT,NaT,,
23,4MLe,HeartRate,2023-05-17 19:28:00,2023-05-17 19:29:00,,69.0,False,,-99,,...,343.0,72.362869,NaT,NaT,NaT,NaT,NaT,NaT,,
30,4MLe,HeartRate,2023-05-17 19:38:27,2023-05-17 19:39:30,,72.0,False,,-99,,...,343.0,72.362869,NaT,NaT,NaT,NaT,NaT,NaT,,
40,4MLe,HeartRate,2023-05-17 19:48:20,2023-05-17 19:49:15,,40.0,False,,-99,,...,343.0,72.362869,NaT,NaT,NaT,NaT,NaT,NaT,,


In [39]:
# Columns needed for the availability computation: identifiers, the day/hour
# of each interval, the four value columns and the baseline start date.
keep_cols = ['customer', 'type', 'startTimestamp_day', 'startTimestamp_hour']
keep_cols += ['doubleValue', 'longValue', 'booleanValue', 'stringValue']
keep_cols += ['ema_base_start']
df = df_complete_ema_final.loc[:, keep_cols]

In [40]:
# --- Step 3: Define the type lists you had before ---
# Each list names the export types whose measurement is stored in the value
# column of the same kind (doubleValue / longValue / stringValue /
# booleanValue); the availability computation uses them to pick the column.
double_value_types = (
    "Steps ActiveBurnedCalories SPO2 ElevationGain Latitude Rmssd"
).split()
long_value_types = (
    "HeartRate ActivityTypeDetail2 ActivityTypeDetail1 ActivityType "
    "FloorsClimbed RespirationRateSleep"
).split()
string_value_types = ["RawECGVoltage"]
boolean_value_types = (
    "WalkBinary SleepAwakeBinary SleepLightBinary SleepBinary "
    "SleepStateBinary SleepDeepBinary BikeBinary ActiveBinary RunBinary "
    "SleepInBedBinary AtrialFibrillationDetection SleepREMBinary"
).split()

# Every type considered, in the order double, long, string, boolean.
all_types = [
    *double_value_types,
    *long_value_types,
    *string_value_types,
    *boolean_value_types,
]


# daily_agg_final now has columns:
# [type, date, available_binary, available_hours, customer]


In [41]:

# Filter only needed types. The explicit copy makes the filtered frame
# independent, so the dtype assignments below do not write into a slice
# (SettingWithCopyWarning).
df = df[df['type'].isin(all_types)].copy()
# Categorical dtypes for the two key columns.
df['type'] = df['type'].astype('category')
df['customer'] = df['customer'].astype('category')



In [42]:
import gc

def compute_availability_metrics(sub):
    """
    Return daily data-availability metrics for a single participant.

    Parameters
    ----------
    sub : pandas.DataFrame
        Rows of one participant with the columns 'customer', 'type',
        'startTimestamp_day', 'startTimestamp_hour', 'doubleValue',
        'longValue', 'stringValue', 'booleanValue' and 'ema_base_start'.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'type', 'startTimestamp_day', 'available_binary' (1 if any
        hour of the day has data, else 0), 'available_hours' (number of
        hours with data) and 'customer'. When both a minimum
        'ema_base_start' and a maximum 'startTimestamp_day' exist, there is
        one row per type present in `sub` and per day of that range, with
        zeros for days without data; otherwise only the observed days are
        returned. An empty `sub` yields an empty frame with these columns.

    Notes
    -----
    Reads the module-level lists double_value_types, long_value_types,
    string_value_types and boolean_value_types to decide which value column
    holds the measurement of each type.
    """
    result_columns = [
        'type', 'startTimestamp_day', 'available_binary', 'available_hours', 'customer'
    ]
    if sub.empty:
        # No rows means no participant id to read; return an empty result.
        return pd.DataFrame(columns=result_columns)

    # 1) Mark rows that truly have data: a row counts when the value column
    #    belonging to its type is non-null. Computed column-wise; a row-wise
    #    apply is far slower on large frames. `matched` keeps the first-match
    #    precedence double > long > string > boolean; types in none of the
    #    lists never count as data.
    value_column_by_kind = (
        (double_value_types, 'doubleValue'),
        (long_value_types, 'longValue'),
        (string_value_types, 'stringValue'),
        (boolean_value_types, 'booleanValue'),
    )
    has_data = np.zeros(len(sub), dtype=bool)
    matched = np.zeros(len(sub), dtype=bool)
    for type_names, value_col in value_column_by_kind:
        of_kind = sub['type'].isin(type_names).to_numpy() & ~matched
        has_data |= of_kind & sub[value_col].notna().to_numpy()
        matched |= of_kind
    # assign() returns a new frame, so the caller's frame stays untouched.
    sub = sub.assign(has_data=has_data)

    # 2) Aggregate at hourly level
    hourly = (
        sub.groupby(['type', 'startTimestamp_day', 'startTimestamp_hour'], observed=True)['has_data']
           .any()  # True if at least one row has data in that hour
           .reset_index(name='has_data_in_hour')
    )

    # 3) Aggregate at daily level
    daily_agg = (
        hourly.groupby(['type', 'startTimestamp_day'], observed=True)
              .agg(
                  available_binary=('has_data_in_hour', 'any'),  # True if any hour had data
                  available_hours=('has_data_in_hour', 'sum')    # count of hours with data
              )
              .reset_index()
    )
    daily_agg['available_binary'] = daily_agg['available_binary'].astype(int)
    daily_agg['available_hours'] = daily_agg['available_hours'].astype(int)

    # ----------- GENERATE FULL DATE RANGE FOR THIS PARTICIPANT -----------
    # The range runs from the earliest 'ema_base_start' in the subset to the
    # latest day that actually appears in the data.
    min_date = sub['ema_base_start'].min()
    max_date = sub['startTimestamp_day'].max()

    if pd.isnull(min_date) or pd.isnull(max_date):
        # No valid range: return the observed days only.
        daily_agg['customer'] = sub['customer'].iloc[0]
        return daily_agg

    all_dates = pd.date_range(start=min_date, end=max_date, freq='D')

    # Types present for this participant
    unique_types = sub['type'].unique()

    # Cartesian product of [types] x [dates]
    all_combos = pd.MultiIndex.from_product(
        [unique_types, all_dates],
        names=['type', 'startTimestamp_day']
    )
    all_days_df = all_combos.to_frame(index=False)

    # Left merge onto the full grid: days outside the range are dropped,
    # days inside the range without data get NaN and are filled with 0.
    daily_agg_full = pd.merge(
        all_days_df,
        daily_agg,
        on=['type', 'startTimestamp_day'],
        how='left'
    )

    for metric in ('available_binary', 'available_hours'):
        daily_agg_full[metric] = daily_agg_full[metric].fillna(0).astype(int)

    # Assign the participant ID
    daily_agg_full['customer'] = sub['customer'].iloc[0]

    return daily_agg_full

# ------------------- Process in chunks by participant -------------------
results = []
all_customers = df['customer'].cat.categories  # or df['customer'].unique()

# A single groupby pass hands out each participant's rows; filtering the
# whole frame once per customer would rescan all rows every time.
# observed=True skips categories without rows, and groups come in category
# order, i.e. the same order as `all_customers`.
for cust, sub_df in df.groupby('customer', observed=True):
    # Pass an independent copy so the helper never works on a slice of `df`.
    sub_res = compute_availability_metrics(sub_df.copy())
    results.append(sub_res)

    # Cleanup: release the participant subset before the next one is built.
    del sub_df
    gc.collect()

if results:
    daily_agg_final = pd.concat(results, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty result frame.
    daily_agg_final = pd.DataFrame(
        columns=['type', 'startTimestamp_day', 'available_binary', 'available_hours', 'customer']
    )


NameError: name 'gc' is not defined

In [None]:
daily_agg_final.available_binary.unique()

In [None]:
df_complete_ema_final.startTimestamp.min()

In [None]:
df_complete_ema_final.startTimestamp.max()

In [None]:
# Export types considered for adherence, in a fixed order.
type_list_adherence = """
    Steps ActiveBurnedCalories HeartRate WalkBinary
    ActivityTypeDetail2 ActivityTypeDetail1 ActivityType
    RawECGVoltage SPO2
    SleepAwakeBinary SleepLightBinary SleepBinary SleepStateBinary SleepDeepBinary
    BikeBinary ActiveBinary RunBinary
    ElevationGain FloorsClimbed SleepInBedBinary Latitude
    AtrialFibrillationDetection RespirationRateSleep Rmssd SleepREMBinary
""".split()

In [None]:

# Deep memory footprint of the combined frame, in bytes and in binary
# (1024-based) mega-, giga- and terabytes.
memory_usage_bytes = df_complete_ema_final.memory_usage(deep=True).sum()
memory_usage_mb = memory_usage_bytes / (1024 ** 2)
memory_usage_gb = memory_usage_bytes / (1024 ** 3)
memory_usage_tb = memory_usage_bytes / (1024 ** 4)

print(f"Memory usage: {memory_usage_bytes} bytes")
for amount, unit in (
    (memory_usage_mb, "MB"),
    (memory_usage_gb, "GB"),
    (memory_usage_tb, "TB"),
):
    print(f"Memory usage: {amount:.2f} {unit}")

In [None]:
# Write the combined passive data as feather to both the raw and the
# preprocessed folder (raw first).
backup_path = raw_path + "/backup_data_passive_actual.feather"
preprocessed_path_final = preprocessed_path + "/backup_data_passive_actual.feather"
for feather_target in (backup_path, preprocessed_path_final):
    df_complete_ema_final.to_feather(feather_target)

# Pickle the session, monitoring and EMA content frames into the
# preprocessed folder.
pickle_exports = (
    ('/ema_adherence_data.pkl', df_sess),
    ('/monitoring_data.pkl', df_monitoring),
    ('/ema_content.pkl', df_ema_content),
)
for pickle_name, frame in pickle_exports:
    with open(preprocessed_path + pickle_name, 'wb') as file:
        pickle.dump(frame, file)

In [None]:
# CSV copies of the session, monitoring and EMA content frames, written to
# the preprocessed folder without the index column.
df_sess_csv_path = preprocessed_path + '/ema_adherence_data.csv'
df_monitoring_csv_path = preprocessed_path + '/monitoring_data.csv'
df_ema_content_csv_path = preprocessed_path + '/ema_content.csv'

csv_exports = (
    (df_sess, df_sess_csv_path),
    (df_monitoring, df_monitoring_csv_path),
    (df_ema_content, df_ema_content_csv_path),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)


In [None]:
df_ema_content

## Redcap data

In [None]:
# Load the two REDCap exports. The file names carry a fixed export time
# stamp (2025-01-07) and must be edited by hand for a newer export.
df_redcap = pd.read_csv(redcap_path + "FOR5187_DATA_2025-01-07_1511.csv", low_memory=False)
df_redcap_zert = pd.read_csv(redcap_path + "ZERTIFIZIERUNGFOR518_DATA_2025-01-07_1518.csv", low_memory=False)

In [None]:
# Restrict the second REDCap export to the identifier, event and
# sociodemographic/clinical columns used below.
zert_columns = [
    'for_id', 'redcap_event_name', 'basic_documentation_sheet_timestamp',
    'age', 'gender', 'scid_cv_prim_cat',
    'marital_status', 'partnership', 'graduation', 'profession',
    'ema_start_date', 'years_of_education', 'employability', 'ses',
    'ema_smartphone', 'ema_sleep', 'ema_watch',
    'prior_treatment', 'ema_special_event', 'psychotropic', 'somatic_problems',
]
df_redcap_zert = df_redcap_zert.loc[:, zert_columns]

In [None]:
# Restrict the main REDCap export to the identifier, event and
# sociodemographic/clinical columns used below.
main_columns = [
    'for_id', 'redcap_event_name', 'basic_documentation_sheet_timestamp',
    'age', 'gender', 'scid_cv_prim_cat',
    'marital_status', 'partnership', 'graduation', 'profession',
    'ema_start_date', 'years_of_education', 'employability', 'ses',
    'ema_smartphone', 'ema_sleep', 'ema_watch',
    'prior_treatment', 'ema_special_event', 'psychotropic', 'somatic_problems',
]
df_redcap = df_redcap.loc[:, main_columns]

In [None]:
# Stack the two REDCap exports (same columns) into one frame with a fresh index.
df_redcap = pd.concat([df_redcap, df_redcap_zert],ignore_index=True)
#df_redcap = pd.merge(df_redcap, df_t20,on='for_id', suffixes=('_base', '_t20'), how="left")

In [None]:
# Collapse the REDCap rows to one row per for_id
df_redcap_merged = (
    df_redcap
    .groupby('for_id', as_index=False)
    .agg({
        'ema_watch': 'max',  # largest value within the group
        **{col: 'first' for col in df_redcap.columns 
           if col not in ['for_id', 'ema_watch', 'redcap_event_name']}  # first value of every other column
    })
)

# Optionally drop the 'redcap_event_name' column
# NOTE(review): 'redcap_event_name' is excluded from the aggregation above,
# so it should not be present in the result and this branch looks
# unreachable — confirm.
if 'redcap_event_name' in df_redcap_merged.columns:
    df_redcap_merged = df_redcap_merged.drop(columns=['redcap_event_name'])

In [None]:
# Numeric REDCap codes -> English labels. Where the codes are consecutive,
# the labels are listed in code order and numbered with enumerate().

gender_mapping = dict(enumerate(
    ['male', 'female', 'diverse', 'no gender', 'not specified'],
    start=1,
))

scid_cv_cat_mapping = dict(enumerate(
    [
        'Depressive Disorder',
        'Specific Phobia',
        'Social Anxiety Disorder',
        'Agoraphobia and/or Panic Disorder',
        'Generalized Anxiety Disorder',
        'Obsessive-Compulsive Disorder',
        'Post-Traumatic Stress Disorder',
    ],
    start=1,
))

marital_status_mapping = dict(enumerate(
    [
        'single',
        'married/registered partnership',
        'divorced',
        'separated',
        'widowed',
        'other',
    ],
    start=1,
))

employability_mapping = dict(enumerate([
    'employable',
    'unemployable (on sick leave)',
    'on disability pension',
    'on retirement pension',
    'other',
]))

# Binary version: only code 0 counts as employable.
employability_mapping_simple = {
    code: ('yes' if code == 0 else 'no') for code in range(5)
}

graduation_mapping = dict(enumerate([
    'still in school',
    'no school degree',
    'elementary school degree or equivalent',
    'middle school degree or equivalent',
    'high school diploma/university entrance qualification',
    'other',
]))

profession_mapping = dict(enumerate([
    'still in training or studies',
    'no training degree',
    'vocational training, including technical school',
    'university or college degree',
    'other',
]))

prior_treatment_mapping = dict(enumerate([
    'no prior treatment',
    'outpatient psychotherapy',
    'inpatient or partial inpatient treatment/psychotherapy',
    'both',
    'yes',
]))

# Coarser version of the prior-treatment codes.
prior_treatment_mapping_simple = dict(enumerate([
    'no prior treatment',
    'prior psychotherapy',
    'prior inpatient',
    'prior inpatient',
    'prior psychotherapy',
]))

psychotropic_medication_mapping = dict(enumerate(['no', 'yes']))
somatic_mapping = dict(enumerate(['no', 'yes']))

ema_smartphone_mapping = {1: 'iPhone', 0: 'Android'}

ema_special_event_mapping = dict(enumerate(['usual', 'special event']))
def categorize_age(age):
    """Return the age-group code for an age in years.

    0: 18-24, 1: 25-34, 2: 35-44, 3: 45-54, 4: 55-64, 5: 65 and older.

    Missing ages (None/NaN) and ages below 18 fit none of the groups and
    return NaN; previously they fell through to group 5. Group bounds are
    half-open, so fractional ages such as 24.5 land in the group of their
    completed years instead of falling through to group 5 as well.
    """
    if pd.isna(age) or age < 18:
        return float('nan')
    # Exclusive upper bound of groups 0..4, in order.
    for code, upper_bound in enumerate((25, 35, 45, 55, 65)):
        if age < upper_bound:
            return code
    return 5
    

In [None]:
# Apply mappings
# Each triple is (new label column, coded source column, code -> label map).
# The columns are created in the listed order; the age group is inserted
# between the two batches.
label_specs_before_age = (
    ('gender_description', 'gender', gender_mapping),
    ('scid_cv_description', 'scid_cv_prim_cat', scid_cv_cat_mapping),
    ('marital_status_description', 'marital_status', marital_status_mapping),
    ('employability_description', 'employability', employability_mapping),
    ('employability_description_simple', 'employability', employability_mapping_simple),
    ('prior_treatment_description_simple', 'prior_treatment', prior_treatment_mapping_simple),
    ('graduation_description', 'graduation', graduation_mapping),
    ('profession_description', 'profession', profession_mapping),
    ('prior_treatment_description', 'prior_treatment', prior_treatment_mapping),
    ('ema_smartphone_description', 'ema_smartphone', ema_smartphone_mapping),
    ('ema_special_event_description', 'ema_special_event', ema_special_event_mapping),
)
label_specs_after_age = (
    ('somatic_description', 'somatic_problems', somatic_mapping),
    ('psychotropic_description', 'psychotropic', psychotropic_medication_mapping),
)

for label_col, code_col, code_map in label_specs_before_age:
    df_redcap_merged[label_col] = df_redcap_merged[code_col].map(code_map)

df_redcap_merged['age_description'] = df_redcap_merged['age'].apply(categorize_age)

for label_col, code_col, code_map in label_specs_after_age:
    df_redcap_merged[label_col] = df_redcap_merged[code_col].map(code_map)



In [None]:
# Strip whitespace from for_id, then attach the collapsed REDCap data to the
# (for_id, customer) pairs of the monitoring sheet. Left join: participants
# without REDCap data keep NaN in all REDCap columns.
df_monitoring["for_id"] = df_monitoring.for_id.str.strip()
df_forid = df_monitoring[["for_id","customer"]]
df_redcap = pd.merge(df_forid, df_redcap_merged, on="for_id", how="left")

In [None]:
# Keep only participants that have an EMA start date in REDCap.
valid_df = df_redcap.dropna(subset=['ema_start_date'])


In [None]:
valid_df

In [None]:
# Pickle the filtered REDCap frame into the "freezed" preprocessed folder.
with open(preprocessed_path_freezed + f'/redcap_data.pkl', 'wb') as file:
    pickle.dump(valid_df, file)