## Abstract

First Data for Passive Data Collection using Smartwatches and GPS from the PREACT Study. 

## Introduction

Treatment personalization is highly discussed to counteract insufficient response rates in psychotherapy. In the quest for criteria allowing informed selection or adaptation, ambulatory assessment data (i.e. EMA, passive sensing)are a key component, as processes happening outside of therapy sessions can be depicted in high temporal and/or spatial resolution.

PREACT is a multicenter prospective-longitudinal study investigating different predictors of non-response (i.e. EEG, fMRI) in around 500 patients undergoing cognitive behavioral therapy for internalizing disorders (https://forschungsgruppe5187.de/de). 

## Methods
Patients can enroll for therapy-accompanying ambulatory assessment. They are provided with a customized study app and a state-of-the-art smartwatch collecting passive data like GPS and heart rate for up to 365 days. In parallel, three 14-day EMA phases (pre-, mid- and post-therapy) cover transdiagnostic (i.e. emotion regulation), contextual and therapy-related aspects.  

Here, we present first results on data compliance and quality for the passive sensing data as well as EMA assessments.


In [2]:
import os
import glob
import pickle
import sys
import re
# If your current working directory is the notebooks directory, use this:
notebook_dir = os.getcwd()  # current working directory
src_path = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))
sys.path.append(src_path)

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(parent_dir)

import pandas as pd
import datetime as dt
from datetime import date, datetime
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from server_config import datapath, proj_sheet,preprocessed_path, raw_path

today = date.today().strftime("%d%m%Y")
today_day = pd.to_datetime('today').normalize()


df_monitoring = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{proj_sheet}/export?format=csv")


In [3]:
df_monitoring = df_monitoring.copy()
df_monitoring.rename(columns = {"Pseudonym": "customer", "EMA_ID": "ema_id", "Status": "status",
                                "Studienversion":"study_version", "FOR_ID":"for_id", 
                           "Start EMA Baseline": "ema_base_start", "Ende EMA Baseline": "ema_base_end", 
                           "Freischaltung/ Start EMA T20": "ema_t20_start","Ende EMA T20":"ema_t20_end", 
                               "Termin 1. Gespräch": "first_call_date", "Freischaltung/ Start EMA Post":"ema_post_start",
                               "Ende EMA Post":"ema_post_end", "T20=Post":"t20_post" }, inplace=True)

df_monitoring["customer"] = df_monitoring["customer"].str[:4]
df_monitoring["for_id"] = df_monitoring.for_id.str.strip()

df_monitoring["ema_base_start"] = pd.to_datetime(df_monitoring["ema_base_start"], dayfirst=True)
df_monitoring["ema_base_end"] = pd.to_datetime(df_monitoring["ema_base_end"], dayfirst=True)

df_monitoring_short = df_monitoring[["customer", "status", "study_version", "ema_base_start","ema_base_end"]]

### Backup Data 1

In [4]:
# small backup passive data
#file_pattern_back_1 = os.path.join(datapath, 'raw/tiki_backup_files/export_tiki_27052024/"epoch_part*.csv"')
datapath_back = os.path.join(raw_path, "tiki_backup_files/export_tiki_21052024/")
file_pattern_back_1 = os.path.join(datapath_back,"epoch_part*.csv")  # Adjust the path and extension if needed

backup_files = glob.glob(file_pattern_back_1)
file_list = glob.glob(file_pattern_back_1)
file_list.sort()
df_backup_small = pd.concat((pd.read_csv(f, encoding="latin-1", low_memory=False) for f in file_list), ignore_index=True)


ValueError: No objects to concatenate

In [None]:
df_backup_small["start_end"] = df_backup_small["endTimestamp"] - df_backup_small["startTimestamp"]
df_backup_small["startTimestamp"] = pd.to_datetime(df_backup_small["startTimestamp"],unit='ms')
df_backup_small["endTimestamp"] = pd.to_datetime(df_backup_small["endTimestamp"],unit='ms')

In [None]:
df_backup_small["customer"] = df_backup_small.customer.str.split("@").str.get(0)
df_backup_small["customer"] = df_backup_small["customer"].str[:4]

df_backup_small['startTimestamp'] = df_backup_small['startTimestamp'] + pd.to_timedelta(df_backup_small['timezoneOffset'], unit='m')
df_backup_small['endTimestamp'] = df_backup_small['endTimestamp'] + pd.to_timedelta(df_backup_small['timezoneOffset'], unit='m')

df_backup_small["startTimestamp_day"] = df_backup_small.startTimestamp.dt.normalize()
df_backup_small["startTimestamp_hour"] = df_backup_small.startTimestamp.dt.hour

In [None]:

# Define the pattern for big backup passive data files
file_pattern_back_2 = os.path.join(raw_path, 'tiki_backup_files/tiki_backup_*.csv')

# Use glob to find all matching files
big_backup_files = glob.glob(file_pattern_back_2)

# Define the dtype for columns that are known to be problematic
dtype_spec = {
    'startTimestamp': 'str',  # Load as string initially
    'endTimestamp': 'str'     # Load as string initially
}

# Create a list to hold all the dataframes
all_dfs = []

# Loop over the files and read them with date parsing
for filename in big_backup_files:
    df = pd.read_csv(
        filename,
        dtype=dtype_spec,  # Load timestamps as strings first
        low_memory=False  # Ensure proper memory handling
    )
    
    # Convert the timestamp columns to datetime, ensuring proper parsing of ISO 8601 format
    df['starttimestamp'] = pd.to_datetime(df['startTimestamp'], format='%Y-%m-%dT%H:%M:%S%z', errors='coerce')
    df['endtimestamp'] = pd.to_datetime(df['endTimestamp'], format='%Y-%m-%dT%H:%M:%S%z', errors='coerce')

    
    # Additional processing (e.g., adding date and index columns)
    match = re.match(r'tiki_backup_(\d{4}-\d{2}-\d{2})_(\d+)\.csv', os.path.basename(filename))
    if match:
        date_part = match.group(1)  # Extract the date
        index_part = int(match.group(2))  # Extract the index
        # Add the date and index as new columns to the dataframe
        df['date'] = date_part
        df['index'] = index_part

    all_dfs.append(df)

# Concatenate all dataframes
df_backup_big = pd.concat(all_dfs, ignore_index=True)

# Sort the merged dataframe by 'date' and 'index'

# Optionally, drop the 'date' and 'index' columns if they are no longer needed
df_backup_big.drop(columns=['date', 'index'], inplace=True)


In [None]:
# Convert the 'startTimestamp' and 'endTimestamp' columns to datetime objects
df_backup_big['startTimestamp'] = pd.to_datetime(df_backup_big['startTimestamp'], errors='coerce')
df_backup_big['endTimestamp'] = pd.to_datetime(df_backup_big['endTimestamp'], errors='coerce')
# Convert the 'startTimestamp' and 'endTimestamp' columns to datetime objects
df_backup_big['startTimestamp'] = pd.to_datetime(df_backup_big['startTimestamp'], errors='coerce')
df_backup_big['endTimestamp'] = pd.to_datetime(df_backup_big['endTimestamp'], errors='coerce')

# Adjust for timezone offset
df_backup_big['startTimestamp'] = df_backup_big['startTimestamp'] + pd.to_timedelta(df_backup_big['timezoneOffset'], unit='m')
df_backup_big['endTimestamp'] = df_backup_big['endTimestamp'] + pd.to_timedelta(df_backup_big['timezoneOffset'], unit='m')


# Format the datetime objects to the desired format
df_backup_big['startTimestamp'] = df_backup_big['startTimestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
df_backup_big['endTimestamp'] = df_backup_big['endTimestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')

df_backup_big['startTimestamp'] = pd.to_datetime(df_backup_big['startTimestamp'], errors='coerce')
df_backup_big['endTimestamp'] = pd.to_datetime(df_backup_big['endTimestamp'], errors='coerce')

df_backup_big["startTimestamp_day"] = df_backup_big.startTimestamp.dt.normalize()
df_backup_big["startTimestamp_hour"] = df_backup_big.startTimestamp.dt.hour

### Merge 2 dataframes

In [None]:
latest_timestamp_big = df_backup_big['startTimestamp'].max()

# Filter the second dataframe to include only entries after the latest timestamp
df_backup_small_filtered = df_backup_small[df_backup_small['startTimestamp'] > latest_timestamp_big]

# Concatenate the first dataframe with the filtered second dataframe
result_df_final = pd.concat([df_backup_big, df_backup_small_filtered], ignore_index=True)

In [None]:
# Sort the original DataFrames by startTimestamp
df_backup_big_sorted = df_backup_big.sort_values(by='startTimestamp')
df_backup_small_filtered_sorted = df_backup_small_filtered.sort_values(by='startTimestamp')

# Concatenate the sorted DataFrames
result_df_final = pd.concat([df_backup_big_sorted, df_backup_small_filtered_sorted], ignore_index=True)

# Optional: Sort the final DataFrame again to ensure everything is in order
result_df_final = result_df_final.sort_values(by='startTimestamp').reset_index(drop=True)


In [None]:
result_df_final["customer"] = result_df_final.customer.str.split("@").str.get(0)
result_df_final["customer"] = result_df_final["customer"].str[:4]

In [None]:
print("Minimum timestamp:", result_df_final['startTimestamp'].min())
print("Maximum timestamp:", result_df_final['startTimestamp'].max())


In [None]:
b

### Calculate data coverage

In [None]:
result_df_final_merged = result_df_final.merge(df_monitoring_short, on="customer", how="right")

In [None]:
print("Minimum timestamp:", result_df_final_merged['startTimestamp'].min())
print("Maximum timestamp:", result_df_final_merged['startTimestamp'].max())

In [None]:
print("Minimum timestamp:", result_df_final_merged['startTimestamp'].min())
print("Maximum timestamp:", result_df_final_merged['startTimestamp'].max())


In [None]:
def calculate_data_coverage(df, today_day, data_type_groups):
    """
    Calculate the data coverage percentage for each customer, for each specified group of data types.

    :param df: DataFrame containing customer data
    :param today_day: The current date for calculating potential coverage
    :param data_type_groups: A dictionary where keys are group names and values are lists of data types
    :param status_col: Column name for the status
    :param study_version_col: Column name for the study version
    :param ema_base_end_col: Column name for the EMA base end date
    :return: DataFrame with additional columns for data coverage percentages
    """

    # Ensure the date columns are datetime objects
    df['startTimestamp_day'] = pd.to_datetime(df['startTimestamp_day'])
    df['ema_base_end'] = pd.to_datetime(df['ema_base_end'])

    # Find the earliest 'startTimestamp_day' for each customer
    earliest_timestamp_per_customer = df.groupby('customer')['startTimestamp_day'].min()

    # Map the earliest timestamp back to the original DataFrame
    df['earliest_start_day'] = df['customer'].map(earliest_timestamp_per_customer)

    # Calculate potential days of coverage from the earliest start day to today
    df['potential_days_coverage'] = (today_day - df['earliest_start_day']).dt.days

    # Define the condition for adjusting potential days coverage
    condition = (
        (df['status'] == 'Abgeschlossen') & 
        (df['study_version'].isin(['Kurz', 'Kurz (Wechsel/Abbruch)']))
    )

    # Adjust potential days of coverage based on the condition
    df['potential_days_coverage'] = np.where(
        condition,
        (df['ema_base_end'] - df['earliest_start_day']).dt.days,
        df['potential_days_coverage']
    )

    for group_name, data_types in data_type_groups.items():
        # Filter for the current group of data types
        df_type_group = df[df['type'].isin(data_types)]
        
        # Count unique days with data for each customer for the current data types
        actual_days = df_type_group.groupby('customer')['startTimestamp_day'].nunique()

        # Map the actual number of days back to the DataFrame
        df[f'{group_name}_actual_days_with_data'] = df['customer'].map(actual_days).fillna(0)

        # Calculate data coverage percentage for the current data types
        df[f'{group_name}_data_coverage_per'] = (df[f'{group_name}_actual_days_with_data'] / df['potential_days_coverage']) * 100

    # Drop intermediary columns if necessary
    df.drop(columns=['earliest_start_day'], inplace=True)

    return df

In [None]:
data_type_groups = {
    'GPS': ["Latitude"],
    # Add more groups as needed
    'Activity': ["Steps"],
    'Sleep': ["SleepBinary"],
    'Heart_Rate': ["HeartRate"]
    # Add more groups as needed
}

In [None]:
result_df_final_merged = calculate_data_coverage(result_df_final_merged, today_day, data_type_groups)


In [None]:
result_df_final_merged = result_df_final_merged.drop(columns=['valueType', 'createdAt', 'source', 'trustworthiness', 'medicalGrade', 'generation'])

In [None]:
object_cols = ["booleanValue", "stringValue", "status", "study_version", "customer", "type"] 

In [None]:
# Fill NaN values with -99 for the specified columns
for col in object_cols:
    result_df_final_merged[col] = result_df_final_merged[col].fillna(-99)

# Convert "booleanValue" to boolean
result_df_final_merged['booleanValue'] = result_df_final_merged['booleanValue'].apply(lambda x: bool(x) if x != -99 else False)

# Convert "stringValue", "status", "study_version" to string using StringDtype
result_df_final_merged['stringValue'] = result_df_final_merged['stringValue'].astype('string')
result_df_final_merged['status'] = result_df_final_merged['status'].astype('string')
result_df_final_merged['study_version'] = result_df_final_merged['study_version'].astype('string')
result_df_final_merged['customer'] = result_df_final_merged['customer'].astype('string')
result_df_final_merged['type'] = result_df_final_merged['type'].astype('string')


In [None]:
# Calculate memory usage in bytes
memory_usage_bytes = result_df_final.memory_usage(deep=True).sum()

# Convert to megabytes
memory_usage_mb = memory_usage_bytes / (1024 ** 2)

# Convert to gigabytes
memory_usage_gb = memory_usage_bytes / (1024 ** 3)

# Convert to terabytes
memory_usage_tb = memory_usage_bytes / (1024 ** 4)

print(f"Memory usage: {memory_usage_bytes} bytes")
print(f"Memory usage: {memory_usage_mb:.2f} MB")
print(f"Memory usage: {memory_usage_gb:.2f} GB")
print(f"Memory usage: {memory_usage_tb:.2f} TB")


In [None]:
backup_path = preprocessed_path + "backup_data_passive_general.feather"

In [None]:
# Save to HDF5 format
result_df_final_merged.to_feather(backup_path)

In [None]:
result_df_final_merged.startTimestamp.min()

In [None]:
result_df_final_merged.startTimestamp.max()

In [None]:
result_df_final_merged.shape

In [None]:
result_df_final_merged.head()