## Abstract

First Data for Passive Data Collection using Smartwatches and GPS from the PREACT Study. 

## Introduction

Treatment personalization is highly discussed to counteract insufficient response rates in psychotherapy. In the quest for criteria allowing informed selection or adaptation, ambulatory assessment data (i.e., EMA and passive sensing) are a key component, as processes happening outside of therapy sessions can be depicted in high temporal and/or spatial resolution.

PREACT is a multicenter prospective-longitudinal study investigating different predictors of non-response (e.g., EEG, fMRI) in around 500 patients undergoing cognitive behavioral therapy for internalizing disorders (https://forschungsgruppe5187.de/de). 

## Methods
Patients can enroll for therapy-accompanying ambulatory assessment. They are provided with a customized study app and a state-of-the-art smartwatch collecting passive data like GPS and heart rate for up to 365 days. In parallel, three 14-day EMA phases (pre-, mid- and post-therapy) cover transdiagnostic (e.g., emotion regulation), contextual and therapy-related aspects.  

Here, we present first results on data compliance and quality for the passive sensing data as well as EMA assessments.


In [17]:
import os
import glob
import pickle
import sys
# If your current working directory is the notebooks directory, use this:
notebook_dir = os.getcwd()  # current working directory
src_path = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))
sys.path.append(src_path)

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(parent_dir)

import pandas as pd
import datetime as dt
from datetime import date, datetime
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from config import datapath, proj_sheet,preprocessed_path

# Date stamp (ddmmYYYY) that selects the export folder to load.
today = date.today().strftime("%d%m%Y")
today_day = pd.to_datetime('today').normalize()

# Pin the export to a fixed snapshot; this deliberately overrides the
# current-date stamp computed above.
today = "29072024"

# actual passive + ema_data
datapath1 = datapath + f"raw/export_tiki_{today}/"
file_pattern = os.path.join(datapath1, "epoch_part*.csv")
file_list = sorted(glob.glob(file_pattern))
if not file_list:
    # pd.concat would otherwise fail without saying which path was empty
    raise ValueError(f"No objects to concatenate: no files match {file_pattern!r}")
df_complete = pd.concat(
    (pd.read_csv(f, encoding="latin-1", low_memory=False) for f in file_list),
    ignore_index=True,
)

# small backup passive data
# (alternative snapshot folder: raw/tiki_backup_files/export_tiki_27052024)
# The file pattern is passed as its own path component. The previous version
# embedded literal double quotes in the pattern, so glob never matched a file.
file_pattern_back_1 = os.path.join(
    datapath, 'raw/tiki_backup_files/export_tiki_16072024', 'epoch_part*.csv'
)  # Adjust the path and extension if needed

backup_files = glob.glob(file_pattern_back_1)
file_list = sorted(backup_files)
if not file_list:
    raise ValueError(f"No objects to concatenate: no files match {file_pattern_back_1!r}")
df_backup_small = pd.concat(
    (pd.read_csv(f, encoding="latin-1", low_memory=False) for f in file_list),
    ignore_index=True,
)


# big backup passive data
file_pattern_back_2 = os.path.join(datapath, 'raw/tiki_backup_files/tiki_backup_*.csv')  # Adjust the path and extension if needed
big_backup_files = glob.glob(file_pattern_back_2)
if not big_backup_files:
    raise ValueError(f"No objects to concatenate: no files match {file_pattern_back_2!r}")

dataframes = []
for file in big_backup_files:
    df_backup = pd.read_csv(file, encoding="latin-1", low_memory=False)  # Adjust read_csv parameters as needed
    # The file name is split on "_": part 2 is parsed as a date and the
    # leading part of part 3 (before the first ".") as an integer suffix.
    filename_parts = os.path.basename(file).split('_')
    date_str = filename_parts[2]
    time_suffix = int(filename_parts[3].split('.')[0])  # Convert the suffix to an integer
    # Named file_date so the imported datetime.date class is not shadowed
    file_date = pd.to_datetime(date_str)
    df_backup['file_date'] = file_date
    df_backup['time_suffix'] = time_suffix
    dataframes.append(df_backup)

df_backup_big = pd.concat(dataframes, ignore_index=True)
# Order rows by the date and suffix taken from the file names
df_backup_big = df_backup_big.sort_values(by=['file_date', 'time_suffix'])

# Monitoring sheet, fetched as CSV export of the project Google Sheet
df_monitoring = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{proj_sheet}/export?format=csv")


ValueError: No objects to concatenate

### Monitoring data

In [3]:
# Map the monitoring sheet's original headers onto the snake_case column
# names used in the rest of this notebook.
monitoring_column_names = {
    "Pseudonym": "customer",
    "EMA_ID": "ema_id",
    "Status": "status",
    "Studienversion": "study_version",
    "FOR_ID": "for_id",
    "Start EMA Baseline": "ema_base_start",
    "Ende EMA Baseline": "ema_base_end",
    "Freischaltung/ Start EMA T20": "ema_t20_start",
    "Ende EMA T20": "ema_t20_end",
    "Termin 1. Gespräch": "first_call_date",
    "Freischaltung/ Start EMA Post": "ema_post_start",
    "Ende EMA Post": "ema_post_end",
    "T20=Post": "t20_post",
}
# rename() returns a new frame, so the originally loaded object is not modified
df_monitoring = df_monitoring.rename(columns=monitoring_column_names)

In [4]:
# Participant key: only the first four characters of the pseudonym are kept
customer_prefix = df_monitoring["customer"].str[:4]
df_monitoring["customer"] = customer_prefix
# Drop leading/trailing whitespace from the for_id values
df_monitoring["for_id"] = df_monitoring["for_id"].str.strip()

In [5]:
# Parse the baseline start/end columns as day-first dates
for baseline_col in ("ema_base_start", "ema_base_end"):
    df_monitoring[baseline_col] = pd.to_datetime(df_monitoring[baseline_col], dayfirst=True)

In [6]:
# Keep only the monitoring columns used downstream, in this order
monitoring_columns = [
    'for_id', 'ema_id', 'customer', 'study_version', 'status',
    't20_post', 'ema_base_start', 'ema_base_end',
    'first_call_date', 'ema_t20_start', 'ema_t20_end',
    'ema_post_start', 'ema_post_end',
]
df_monitoring = df_monitoring[monitoring_columns]

### Passive data

In [None]:
### 1.1 Import epoch level passive + GPS data

# Participant key: the part of the account name before "@", cut to four characters
df_complete["customer"] = df_complete.customer.str.split("@").str.get(0)
df_complete["customer"] = df_complete["customer"].str[:4]

# Epoch length, computed on the raw values before they are converted below
df_complete["start_end"] = df_complete["endTimestamp"] - df_complete["startTimestamp"]
df_complete["startTimestamp"] = pd.to_datetime(df_complete["startTimestamp"], unit='ms')
df_complete["endTimestamp"] = pd.to_datetime(df_complete["endTimestamp"], unit='ms')
df_complete["createdAt"] = pd.to_datetime(df_complete["createdAt"], unit='ms')


## Timezone offset

# Shift every timestamp by timezoneOffset (in minutes). Rows with a missing
# offset keep their original timestamp: the missing offset becomes a zero
# shift. The earlier version tested createdAt/endTimestamp themselves for NaN
# instead of timezoneOffset, so a missing offset turned them into NaT.
timezone_shift = pd.to_timedelta(df_complete['timezoneOffset'], unit='m').fillna(pd.Timedelta(0))
for ts_col in ('startTimestamp', 'createdAt', 'endTimestamp'):
    # A missing timestamp stays NaT after the addition
    df_complete[ts_col] = df_complete[ts_col] + timezone_shift


# Calendar day (midnight) and hour of day for the start and creation times
df_complete["startTimestamp_day"] = df_complete.startTimestamp.dt.normalize()
df_complete["createdAt_day"] = df_complete.createdAt.dt.normalize()

df_complete["startTimestamp_hour"] = df_complete.startTimestamp.dt.hour
df_complete["createdAt_hour"] = df_complete.createdAt.dt.hour



In [7]:
# Convert the 'startTimestamp' and 'endTimestamp' columns to datetime objects;
# values that cannot be parsed become NaT (errors='coerce').
# The conversion used to be repeated verbatim; one pass is enough.
df_backup_big['startTimestamp'] = pd.to_datetime(df_backup_big['startTimestamp'], errors='coerce')
df_backup_big['endTimestamp'] = pd.to_datetime(df_backup_big['endTimestamp'], errors='coerce')

# Debug: Print data types after conversion
print("\nData types after conversion to datetime:")
print(df_backup_big.dtypes)

# Adjust for timezone offset (in minutes). A missing offset is treated as a
# zero shift so the row keeps its timestamp instead of becoming NaT, matching
# the "keep original timestamp" handling described for df_complete.
backup_timezone_shift = pd.to_timedelta(df_backup_big['timezoneOffset'], unit='m').fillna(pd.Timedelta(0))
df_backup_big['startTimestamp'] = df_backup_big['startTimestamp'] + backup_timezone_shift
df_backup_big['endTimestamp'] = df_backup_big['endTimestamp'] + backup_timezone_shift


Data types after conversion to datetime:
customer                                     object
source                                       object
type                                         object
startTimestamp            datetime64[ns, UTC+01:00]
endTimestamp              datetime64[ns, UTC+01:00]
doubleValue                                 float64
longValue                                   float64
booleanValue                                 object
dateValue                                   float64
stringValue                                  object
generation                                   object
trustworthiness                              object
medicalGrade                                 object
userReliability                             float64
chronologicalExactness                      float64
timezoneOffset                              float64
file_date                            datetime64[ns]
time_suffix                                   int64
dtype: object


NameError: name 'df_backup_complete' is not defined

In [8]:

# Round-trip both timestamp columns through a 'YYYY-mm-dd HH:MM:SS' string and
# parse the string again (unparseable values become NaT).
for ts_col in ('startTimestamp', 'endTimestamp'):
    formatted = df_backup_big[ts_col].dt.strftime('%Y-%m-%d %H:%M:%S')
    df_backup_big[ts_col] = pd.to_datetime(formatted, errors='coerce')

### Merge all 3 dataframes

In [15]:
# Newest start time contained in the big backup
latest_timestamp_big = df_backup_big['startTimestamp'].max()

# Rows of the small backup that start after everything in the big backup.
# NOTE(review): no visible cell converts df_backup_small['startTimestamp'] to
# datetime before this comparison - confirm the dtypes are comparable.
is_newer_than_big = df_backup_small['startTimestamp'] > latest_timestamp_big
df_backup_small_filtered = df_backup_small.loc[is_newer_than_big]

# Big backup first, then the newer small-backup rows, with a fresh index
result_df = pd.concat([df_backup_big, df_backup_small_filtered], ignore_index=True)

Index(['customer', 'source', 'type', 'startTimestamp', 'endTimestamp',
       'doubleValue', 'longValue', 'booleanValue', 'dateValue', 'stringValue',
       'generation', 'trustworthiness', 'medicalGrade', 'userReliability',
       'chronologicalExactness', 'timezoneOffset', 'file_date', 'time_suffix'],
      dtype='object')

In [16]:
# Second merge step: extend the combined backups (result_df) with the rows of
# the current export (df_complete) that start after everything merged so far.
# The earlier version filtered df_backup_small again, never used the filtered
# frame, and rebuilt result_df from the same two backups, so the current
# export was never merged.
latest_timestamp_big = result_df['startTimestamp'].max()

# Keep only current-export rows that are newer than the merged backups.
# NOTE(review): assumes df_complete['startTimestamp'] has already been
# converted to datetime by the passive-data preprocessing cell - confirm
# the cell execution order.
df_complete_filtered = df_complete[df_complete['startTimestamp'] > latest_timestamp_big]

# Append the newer rows to the merged backups and rebuild the index
result_df = pd.concat([result_df, df_complete_filtered], ignore_index=True)

Index(['customer', 'source', 'startTimestamp', 'endTimestamp', 'type',
       'valueType', 'doubleValue', 'longValue', 'booleanValue', 'dateValue',
       'stringValue', 'generation', 'trustworthiness', 'medicalGrade',
       'userReliability', 'chronologicalExactness', 'timezoneOffset',
       'createdAt'],
      dtype='object')

In [None]:
# Generic "append only newer rows" pattern.
# NOTE(review): df1 and df2 are not defined anywhere in the visible notebook;
# this cell looks like a template for the merge cells above - confirm whether
# it is still needed.
latest_timestamp = df1['timestamp'].max()

# Rows of the second frame that are newer than anything in the first
newer_rows = df2['timestamp'] > latest_timestamp
df2_filtered = df2.loc[newer_rows]

# First frame followed by the newer rows, with a fresh index
result = pd.concat([df1, df2_filtered], ignore_index=True)

### Extract GPS data

In [None]:
# GPS rows are the ones whose type is "Latitude" or "Longitude"
gps_columns = [
    "customer", "startTimestamp", "endTimestamp", "type", "doubleValue",
    'createdAt', 'startTimestamp_day', 'createdAt_day',
    'startTimestamp_hour', 'createdAt_hour',
]
is_gps_row = df_complete["type"].isin(["Latitude", "Longitude"])
df_loc_complete = df_complete.loc[is_gps_row, gps_columns]
# Attach the monitoring information; GPS rows without a monitoring entry are kept
df_loc_complete = df_loc_complete.merge(df_monitoring, on="customer", how="left")

In [None]:
# First day with GPS data, per customer
earliest_timestamp_per_customer = (
    df_loc_complete.groupby('customer')['startTimestamp_day'].min()
)

# Broadcast that first day onto every row of the customer
df_loc_complete['earliest_start_day'] = df_loc_complete['customer'].map(earliest_timestamp_per_customer)

# Whole days elapsed since the customer's first day (0 on the first day itself)
elapsed_since_first_day = df_loc_complete['startTimestamp_day'] - df_loc_complete['earliest_start_day']
df_loc_complete['relative_day'] = elapsed_since_first_day.dt.days



In [None]:
# Default denominator: days between the customer's first GPS day and today
days_until_today = (today_day - df_loc_complete['earliest_start_day']).dt.days
df_loc_complete['potential_days_coverage'] = days_until_today

# Rows with status 'Abgeschlossen' and one of the two 'Kurz' study versions
# use the end of the baseline EMA phase as reference instead of today.
has_finished_status = df_loc_complete['status'] == 'Abgeschlossen'
is_short_version = df_loc_complete['study_version'].isin(['Kurz', 'Kurz (Wechsel/Abbruch)'])
condition = has_finished_status & is_short_version

days_until_baseline_end = (df_loc_complete['ema_base_end'] - df_loc_complete['earliest_start_day']).dt.days
df_loc_complete['potential_days_coverage'] = np.where(
    condition,
    days_until_baseline_end,
    df_loc_complete['potential_days_coverage'],
)

# Number of distinct days with GPS data per customer, broadcast to every row
actual_days = df_loc_complete.groupby('customer')['startTimestamp_day'].nunique()
df_loc_complete['actual_days_with_data'] = df_loc_complete['customer'].map(actual_days)

# Coverage in percent.
# NOTE(review): the denominator is a plain day difference while the numerator
# counts distinct days including the first one, so the value can exceed 100
# and the denominator is 0 when the first day equals the reference day -
# confirm whether "+ 1" is intended.
coverage_ratio = df_loc_complete['actual_days_with_data'] / df_loc_complete['potential_days_coverage']
df_loc_complete['data_coverage_per'] = coverage_ratio * 100



In [None]:
# Everything that is not a GPS coordinate row counts as passive sensing data
passive_columns = [
    "customer", "startTimestamp", "endTimestamp", "start_end", "type",
    "doubleValue", 'longValue', 'booleanValue', 'dateValue',
    'stringValue', "timezoneOffset", 'createdAt', 'startTimestamp_day', 'createdAt_day',
    'startTimestamp_hour', 'createdAt_hour',
]
is_gps_type = df_complete["type"].isin(["Latitude", "Longitude"])
df_pd_complete = df_complete.loc[~is_gps_type, passive_columns]
# Attach the monitoring information; rows without a monitoring entry are kept
df_pd_complete = df_pd_complete.merge(df_monitoring, on="customer", how="left")

In [None]:
# First day with passive data, per customer
earliest_timestamp_per_customer = (
    df_pd_complete.groupby('customer')['startTimestamp_day'].min()
)

# Broadcast that first day onto every row of the customer
df_pd_complete['earliest_start_day'] = df_pd_complete['customer'].map(earliest_timestamp_per_customer)

# Whole days elapsed since the customer's first day (0 on the first day itself)
elapsed_since_first_day = df_pd_complete['startTimestamp_day'] - df_pd_complete['earliest_start_day']
df_pd_complete['relative_day'] = elapsed_since_first_day.dt.days



In [None]:
# Default denominator: days between the customer's first data day and today
days_until_today = (today_day - df_pd_complete['earliest_start_day']).dt.days
df_pd_complete['potential_days_coverage'] = days_until_today

# Rows with status 'Abgeschlossen' and one of the two 'Kurz' study versions
# use the end of the baseline EMA phase as reference instead of today.
has_finished_status = df_pd_complete['status'] == 'Abgeschlossen'
is_short_version = df_pd_complete['study_version'].isin(['Kurz', 'Kurz (Wechsel/Abbruch)'])
condition = has_finished_status & is_short_version

days_until_baseline_end = (df_pd_complete['ema_base_end'] - df_pd_complete['earliest_start_day']).dt.days
df_pd_complete['potential_days_coverage'] = np.where(
    condition,
    days_until_baseline_end,
    df_pd_complete['potential_days_coverage'],
)

# Number of distinct days with data per customer, broadcast to every row
actual_days = df_pd_complete.groupby('customer')['startTimestamp_day'].nunique()
df_pd_complete['actual_days_with_data'] = df_pd_complete['customer'].map(actual_days)

# Coverage in percent.
# NOTE(review): the denominator is a plain day difference while the numerator
# counts distinct days including the first one, so the value can exceed 100
# and the denominator is 0 when the first day equals the reference day -
# confirm whether "+ 1" is intended.
coverage_ratio = df_pd_complete['actual_days_with_data'] / df_pd_complete['potential_days_coverage']
df_pd_complete['data_coverage_per'] = coverage_ratio * 100



In [None]:

# Persist the preprocessed passive and GPS frames as pickles
passive_pickle_targets = (
    ('/passive_data.pkl', df_pd_complete),
    ('/gps_data.pkl', df_loc_complete),
)
for pickle_name, frame in passive_pickle_targets:
    with open(preprocessed_path + pickle_name, 'wb') as file:
        pickle.dump(frame, file)

### EMA data

In [None]:
# Read the five EMA export tables from the export folder
ema_file_names = (
    "questionnaireSession.csv",
    "answers.csv",
    "choice.csv",
    "questions.csv",
    "questionnaires.csv",
)
ema_tables = {
    file_name: pd.read_csv(datapath1 + file_name, low_memory=False)
    for file_name in ema_file_names
}
session = ema_tables["questionnaireSession.csv"]
answers = ema_tables["answers.csv"]
choice = ema_tables["choice.csv"]
questions = ema_tables["questions.csv"]
questionnaire = ema_tables["questionnaires.csv"]



In [None]:
# Session table: shorten the user name to the four-character participant key
# and switch to the notebook's column names.
session["user"] = session["user"].str[:4]
session_column_names = {
    "user": "customer",
    "completedAt": "quest_complete",
    "createdAt": "quest_create",
    "expirationTimestamp": "quest_expir",
}
session = session.rename(columns=session_column_names)

# Creation and completion times are given in milliseconds
for session_ts in ("quest_create", "quest_complete"):
    session[session_ts] = pd.to_datetime(session[session_ts], unit='ms')

session_columns = ["customer", "sessionRun", "quest_create", "quest_complete", "study"]
df_sess = session[session_columns]

In [None]:
# answer data
# Shorten the user name to the four-character participant key
answers["user"] = answers["user"].str[:4]
# Take an explicit copy of the column subset: assigning to a column of a
# plain slice is the chained-assignment pattern pandas warns about
# (SettingWithCopyWarning).
answers = answers[["user", "questionnaireSession", "questionnaire", "study",
                   "question", "order", "element", "createdAt"]].copy()
# createdAt is given in milliseconds
answers["createdAt"] = pd.to_datetime(answers["createdAt"], unit='ms')
answers = answers.rename(columns={"user": "customer",
                                  "questionnaireSession": "session_unique",
                                  "createdAt": "quest_create"})

In [None]:
# Answer-option table: keep the key columns plus the option text, which is
# renamed to choice_text
choice_columns = ["element", "choice_id", "text", "question"]
choice = choice[choice_columns].rename(columns={"text": "choice_text"})

In [None]:
# Question table: id becomes the merge key "question", title becomes quest_title
question_column_names = {"id": "question", "title": "quest_title"}
questions = questions[["id", "title"]].rename(columns=question_column_names)

In [None]:
# Questionnaire table: id becomes the merge key "questionnaire", name becomes
# questionnaire_name
questionnaire_column_names = {"id": "questionnaire", "name": "questionnaire_name"}
questionnaire = questionnaire[["id", "name"]].rename(columns=questionnaire_column_names)

In [None]:
# Enrich every answer with its option text, question title and questionnaire
# name (default inner joins).
answer_merged = (
    answers
    .merge(choice, on=["question", "element"])
    .merge(questions, on="question")
    .merge(questionnaire, on="questionnaire")
)
# NOTE(review): the column is called quest_complete_day but is derived from
# quest_create - confirm that this is intended.
answer_merged["quest_complete_day"] = answer_merged["quest_create"].dt.normalize()

In [None]:
# Add the monitoring information; only customers present in both frames remain
answer_merged = answer_merged.merge(df_monitoring, on="customer")

In [None]:
# Add the monitoring information; only customers present in both frames remain
df_sess = df_sess.merge(df_monitoring, on="customer")

In [None]:
# Keep the session columns and the monitoring columns needed for the EMA phases
session_output_columns = [
    'customer', 'sessionRun', 'quest_create', 'quest_complete', 'study',
    'for_id', 'ema_id', 'study_version', 'status', 't20_post',
    'ema_base_start', 'ema_base_end', 'ema_t20_start', 'ema_t20_end',
    'ema_post_start', 'ema_post_end',
]
df_sess = df_sess[session_output_columns]

In [None]:
# Work on an independent copy before adding derived columns
df_sess = df_sess.copy()
session_timestamps = ("quest_create", "quest_complete")

# Calendar day (midnight) of creation and completion
for ts_col in session_timestamps:
    df_sess[f"{ts_col}_day"] = df_sess[ts_col].dt.normalize()

# Hour of day of creation and completion
for ts_col in session_timestamps:
    df_sess[f"{ts_col}_hour"] = df_sess[ts_col].dt.hour

In [None]:
# Split the sessions by study id into three independent frames.
# NOTE(review): ids 24/25, 33/34 and 38/39 presumably correspond to the
# first, second and third EMA phase - confirm against the study setup.
df_sess1 = df_sess[df_sess["study"].isin([24, 25])].copy()

df_sess2 = df_sess[df_sess["study"].isin([33, 34])].copy()

df_sess3 = df_sess[df_sess["study"].isin([38, 39])].copy()

In [None]:
# Completion day relative to the start of the baseline EMA phase
days_since_baseline_start = df_sess1['quest_complete_day'] - df_sess1['ema_base_start']
df_sess1['quest_complete_relative1'] = days_since_baseline_start.dt.days


# Completed sessions per customer in df_sess1 (rows without a completion
# time are dropped before counting)
sess_count1 = (
    df_sess1.dropna(subset=["quest_complete"])
    .groupby("customer")["quest_complete"]
    .size()
    .reset_index(name="nquest_EMA1")
)

# Completed sessions per customer in df_sess2
sess_count2 = (
    df_sess2.dropna(subset=["quest_complete"])
    .groupby("customer")["quest_complete"]
    .size()
    .reset_index(name="nquest_EMA2")
)

# Completed sessions per customer in df_sess3
sess_count3 = (
    df_sess3.dropna(subset=["quest_complete"])
    .groupby("customer")["quest_complete"]
    .size()
    .reset_index(name="nquest_EMA3")
)

In [None]:
#daily_counts = df_sess1.groupby(['customer', 'quest_complete_day','quest_complete_relative1']).size().reset_index(name='daily_entries_sum')

In [None]:
# Attach the three per-customer completion counts; customers without a count
# in a phase get NaN there (left join)
for phase_counts in (sess_count1, sess_count2, sess_count3):
    df_sess = df_sess.merge(phase_counts, on=['customer'], how='left')

In [None]:

# Persist the session, monitoring and answer-content frames as pickles
ema_pickle_targets = (
    ('/ema_data.pkl', df_sess),
    ('/monitoring_data.pkl', df_monitoring),
    ('/ema_content.pkl', answer_merged),
)
for pickle_name, frame in ema_pickle_targets:
    with open(preprocessed_path + pickle_name, 'wb') as file:
        pickle.dump(frame, file)