## Abstract

First Data for Passive Data Collection using Smartwatches and GPS from the PREACT Study. 

## Introduction

Treatment personalization is highly discussed to counteract insufficient response rates in psychotherapy. In the quest for criteria allowing informed selection or adaptation, ambulatory assessment data (i.e. EMA, passive sensing)are a key component, as processes happening outside of therapy sessions can be depicted in high temporal and/or spatial resolution.

PREACT is a multicenter prospective-longitudinal study investigating different predictors of non-response (i.e. EEG, fMRI) in around 500 patients undergoing cognitive behavioral therapy for internalizing disorders (https://forschungsgruppe5187.de/de). 

## Methods
Patients can enroll for therapy-accompanying ambulatory assessment. They are provided with a customized study app and a state-of-the-art smartwatch collecting passive data like GPS and heart rate for up to 365 days. In parallel, three 14-day EMA phases (pre-, mid- and post-therapy) cover transdiagnostic (i.e. emotion regulation), contextual and therapy-related aspects.  

Here, we present first results on data compliance and quality for the passive sensing data as well as EMA assessments.


In [48]:
import os
import glob
import pickle
import sys
# If your current working directory is the notebooks directory, use this:
notebook_dir = os.getcwd()  # current working directory
src_path = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))
sys.path.append(src_path)

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(parent_dir)

import pandas as pd
import datetime as dt
from datetime import date, datetime
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import regex as re


from config import datapath, proj_sheet,preprocessed_path, raw_path, redcap_path

today = date.today().strftime("%d%m%Y")
today_day = pd.to_datetime('today').normalize()

today = "19082024"

df_monitoring = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{proj_sheet}/export?format=csv")

In [2]:
# actual passive + ema_data
datapath1 = raw_path + f"export_tiki_{today}/"
file_pattern = os.path.join(datapath1, "epoch_part*.csv")
file_list = glob.glob(file_pattern)
file_list.sort()
df_complete = pd.concat((pd.read_csv(f, encoding="latin-1", low_memory=False) for f in file_list), ignore_index=True)

In [3]:
### 1.1 Import epoch level passive + GPS data

df_complete["customer"] = df_complete.customer.str.split("@").str.get(0)
df_complete["customer"] = df_complete["customer"].str[:4]

df_complete["start_end"] = df_complete["endTimestamp"] - df_complete["startTimestamp"]
df_complete["startTimestamp"] = pd.to_datetime(df_complete["startTimestamp"],unit='ms')
df_complete["endTimestamp"] = pd.to_datetime(df_complete["endTimestamp"],unit='ms')

## Timezone offset 

# Handle NaN in timezoneOffset by skipping or replacing with a default
# Here, we skip adjustments for NaN offsets by using np.where to check for NaN
df_complete['startTimestamp'] = np.where(df_complete['timezoneOffset'].isna(),
                               df_complete['startTimestamp'],  # If NaN, keep original timestamp
                               df_complete['startTimestamp'] + pd.to_timedelta(df_complete['timezoneOffset'], unit='m'))  # Else, apply offset

df_complete['endTimestamp'] = np.where(df_complete['endTimestamp'].isna(),
                               df_complete['endTimestamp'],  # If NaN, keep original timestamp
                               df_complete['endTimestamp'] + pd.to_timedelta(df_complete['timezoneOffset'], unit='m'))  # Else, apply offset


df_complete["startTimestamp_day"] = df_complete.startTimestamp.dt.normalize()
df_complete["startTimestamp_hour"] = df_complete.startTimestamp.dt.hour

KeyboardInterrupt: 

In [None]:
# Merge with backup data
backup_path = preprocessed_path + "backup_data_passive.feather"
df_backup = pd.read_feather(backup_path)

In [None]:
latest_timestamp = df_backup['startTimestamp'].max()

# Filter the second dataframe to include only entries after the latest timestamp
df_complete_filtered = df_complete[df_complete['startTimestamp'] > latest_timestamp]

### 

In [None]:
df_complete_filtered = df_complete_filtered.drop(columns=['valueType', 'createdAt', 'source', 
                                                              'trustworthiness', 'medicalGrade', 'generation'])


## Monitoring data

In [49]:
df_monitoring = df_monitoring.copy()
df_monitoring.rename(columns = {"Pseudonym": "customer", "EMA_ID": "ema_id", "Status": "status",
                                "Studienversion":"study_version", "FOR_ID":"for_id", 
                           "Start EMA Baseline": "ema_base_start", "Ende EMA Baseline": "ema_base_end", 
                           "Freischaltung/ Start EMA T20": "ema_t20_start","Ende EMA T20":"ema_t20_end", 
                               "Termin 1. Gespräch": "first_call_date", "Freischaltung/ Start EMA Post":"ema_post_start",
                               "Ende EMA Post":"ema_post_end", "T20=Post":"t20_post" }, inplace=True)

df_monitoring["customer"] = df_monitoring["customer"].str[:4]
df_monitoring["for_id"] = df_monitoring.for_id.str.strip()

df_monitoring["ema_base_start"] = pd.to_datetime(df_monitoring["ema_base_start"], dayfirst=True)
df_monitoring["ema_base_end"] = pd.to_datetime(df_monitoring["ema_base_end"], dayfirst=True)

df_monitoring_short = df_monitoring[["customer", "status", "study_version", "ema_base_start","ema_base_end"]]


## Passive data

In [None]:
df_complete_filtered= df_complete_filtered.merge(df_monitoring_short, on="customer", how="right")

In [None]:
object_cols = ["booleanValue", "stringValue","customer", "type", "status", "study_version"] 

# Fill NaN values with -99 for the specified columns
for col in object_cols:
    df_complete_filtered[col] = df_complete_filtered[col].fillna(-99)

# Convert "booleanValue" to boolean
df_complete_filtered['booleanValue'] = df_complete_filtered['booleanValue'].apply(lambda x: bool(x) if x != -99 else False)

# Convert "stringValue", "status", "study_version" to string using StringDtype
df_complete_filtered['stringValue'] = df_complete_filtered['stringValue'].astype('string')
df_complete_filtered['status'] = df_complete_filtered['status'].astype('string')
df_complete_filtered['study_version'] = df_complete_filtered['study_version'].astype('string')
df_complete_filtered['customer'] = df_complete_filtered['customer'].astype('string')
df_complete_filtered['type'] = df_complete_filtered['type'].astype('string')


In [None]:
def calculate_data_coverage(df, today_day, data_type_groups):
    """
    Calculate the data coverage percentage for each customer, for each specified group of data types.

    :param df: DataFrame containing customer data
    :param today_day: The current date for calculating potential coverage
    :param data_type_groups: A dictionary where keys are group names and values are lists of data types
    :param status_col: Column name for the status
    :param study_version_col: Column name for the study version
    :param ema_base_end_col: Column name for the EMA base end date
    :return: DataFrame with additional columns for data coverage percentages
    """

    # Ensure the date columns are datetime objects
    df['startTimestamp_day'] = pd.to_datetime(df['startTimestamp_day'])

    df['ema_base_end'] = pd.to_datetime(df['ema_base_end'])

    # Find the earliest 'startTimestamp_day' for each customer
    earliest_timestamp_per_customer = df.groupby('customer')['startTimestamp_day'].min()

    # Map the earliest timestamp back to the original DataFrame
    df['earliest_start_day'] = df['customer'].map(earliest_timestamp_per_customer)

    # Calculate potential days of coverage from the earliest start day to today
    df['potential_days_coverage'] = (today_day - df['earliest_start_day']).dt.days

    # Define the condition for adjusting potential days coverage
    condition = (
        (df['status'] == 'Abgeschlossen') & 
        (df['study_version'].isin(['Kurz', 'Kurz (Wechsel/Abbruch)']))
    )

    # Adjust potential days of coverage based on the condition
    df['potential_days_coverage'] = np.where(
        condition,
        (df['ema_base_end'] - df['earliest_start_day']).dt.days,
        df['potential_days_coverage']
    )

    for group_name, data_types in data_type_groups.items():
        # Filter for the current group of data types
        df_type_group = df[df['type'].isin(data_types)]
        
        # Count unique days with data for each customer for the current data types
        actual_days = df_type_group.groupby('customer')['startTimestamp_day'].nunique()

        # Map the actual number of days back to the DataFrame
        df[f'{group_name}_actual_days_with_data'] = df['customer'].map(actual_days).fillna(0)

        # Calculate data coverage percentage for the current data types
        df[f'{group_name}_data_coverage_per'] = (df[f'{group_name}_actual_days_with_data'] / df['potential_days_coverage']) * 100

    # Drop intermediary columns if necessary
    df.drop(columns=['earliest_start_day'], inplace=True)

    return df

In [None]:
data_type_groups = {
    'GPS': ["Latitude"],
    # Add more groups as needed
    'Activity': ["Steps"],
    'Sleep': ["SleepBinary"],
    'Heart_Rate': ["HeartRate"]
    # Add more groups as needed
}

In [None]:
df_complete_filtered_cov = calculate_data_coverage(df_complete_filtered, today_day, data_type_groups)


## EMA data

In [40]:
# load data
session = pd.read_csv(datapath1 + "questionnaireSession.csv",low_memory=False)
answers = pd.read_csv(datapath1 + "answers.csv", low_memory=False)
choice = pd.read_csv(datapath1 + "choice.csv",low_memory=False)
questions = pd.read_csv(datapath1 + "questions.csv",low_memory=False)
questionnaire = pd.read_csv(datapath1 + "questionnaires.csv", low_memory=False)


In [41]:
# session data
session["user"] = session["user"].str[:4]
session.rename(columns = {"user":"customer","completedAt": "quest_complete", "createdAt": "quest_create", "expirationTimestamp": "quest_expir"}, inplace=True)
session["quest_create"] = (pd.to_datetime(session["quest_create"],unit='ms'))
session["quest_complete"] = (pd.to_datetime(session["quest_complete"],unit='ms'))

df_sess = session[["customer", "sessionRun", "quest_create", "quest_complete", "study"]]

In [42]:
# answer data 
answers["user"] = answers["user"].str[:4]
answers = answers[["user", "questionnaireSession", "questionnaire", "study", 
                   "question", "order","element", "createdAt"]]
answers["createdAt"] = (pd.to_datetime(answers["createdAt"],unit='ms'))
answers.rename(columns={"user":"customer","questionnaireSession":"session_unique", "createdAt": "quest_create"}, inplace=True)

In [43]:
# item description data
choice = choice[["element", "choice_id", "text", "question"]]
choice.rename(columns={"text":"choice_text"}, inplace=True)

In [44]:
# question description data
questions = questions[["id", "title"]]
questions.rename(columns={"id":"question","title":"quest_title"}, inplace=True)

In [45]:
questionnaire = questionnaire[["id", "name"]]
questionnaire.rename(columns={"id":"questionnaire","name":"questionnaire_name"}, inplace=True)

In [46]:
answer_merged = pd.merge(answers, choice, on= ["question","element"])
answer_merged = pd.merge(answer_merged, questions, on= "question")
answer_merged = pd.merge(answer_merged, questionnaire, on= "questionnaire")
answer_merged["quest_complete_day"] = answer_merged.quest_create.dt.normalize()

In [50]:
answer_merged = pd.merge(answer_merged, df_monitoring, on = "customer")

In [51]:
df_sess = pd.merge(df_sess, df_monitoring, on = "customer")

In [52]:
df_sess = df_sess[['customer', 'sessionRun', 'quest_create', 'quest_complete', 'study',
       'for_id', 'ema_id', 'study_version', 'status', 't20_post',
       'ema_base_start', 'ema_base_end','ema_t20_start', 'ema_t20_end',
       'ema_post_start', 'ema_post_end']]

In [53]:
df_sess = df_sess.copy()
df_sess["quest_create_day"] = df_sess.quest_create.dt.normalize()
df_sess["quest_complete_day"] = df_sess.quest_complete.dt.normalize()

df_sess["quest_create_hour"] = df_sess.quest_create.dt.hour
df_sess["quest_complete_hour"] = df_sess.quest_complete.dt.hour

In [54]:
# count number of completed EMA beeps in first phase
df_sess1 = df_sess.loc[df_sess.study.isin([24,25])]
df_sess1 = df_sess1.copy()

df_sess2 = df_sess.loc[df_sess.study.isin([33,34])]
df_sess2 = df_sess2.copy()

df_sess3 = df_sess.loc[df_sess.study.isin([38,39])]
df_sess3 = df_sess3.copy()

In [55]:
df_sess1['quest_complete_relative1'] = (df_sess1['quest_complete_day'] - df_sess1['ema_base_start']).dt.days


sess_count1 = df_sess1.dropna(subset=["quest_complete"]).groupby("customer")["quest_complete"].size()\
.reset_index()
sess_count1 = sess_count1.rename(columns = {"quest_complete":"nquest_EMA1"})

# count number of completed EMA beeps in second phase
sess_count2 = df_sess2.dropna(subset=["quest_complete"]).groupby("customer")["quest_complete"].size()\
.reset_index()
sess_count2 = sess_count2.rename(columns = {"quest_complete":"nquest_EMA2"})

# count number of completed EMA beeps in second phase
sess_count3 = df_sess3.dropna(subset=["quest_complete"]).groupby("customer")["quest_complete"].size()\
.reset_index()
sess_count3 = sess_count3.rename(columns = {"quest_complete":"nquest_EMA3"})

In [56]:
df_sess = df_sess.merge(sess_count1, on=['customer'], how='left')
df_sess = df_sess.merge(sess_count2, on=['customer'], how='left')
df_sess = df_sess.merge(sess_count3, on=['customer'], how='left')

In [86]:
df_ema_content = answer_merged.copy()

In [87]:

df_ema_content['weekday'] = df_ema_content['quest_create'].dt.day_name()
df_ema_content['createdAt_day'] = df_ema_content.quest_create.dt.normalize()
# Convert columns to datetime
df_ema_content['createdAt_day'] = pd.to_datetime(df_ema_content['createdAt_day'])
df_ema_content['ema_base_start'] = pd.to_datetime(df_ema_content['ema_base_start'], dayfirst=True)
df_ema_content['ema_t20_start'] = pd.to_datetime(df_ema_content['ema_t20_start'], dayfirst=True)
df_ema_content['ema_post_start'] = pd.to_datetime(df_ema_content['ema_post_start'], dayfirst=True)


study_mapping = {
    24: 0,
    25: 0,
    33: 1,
    34: 1,
    38: 2,
    39: 2
}

# Apply the mapping to the 'study' column
df_ema_content['assess'] = df_ema_content['study'].map(study_mapping)
# Replace '_morning' with '' in the 'quest_title' column as we don't need to differenciate
df_ema_content['quest_title'] = df_ema_content['quest_title'].str.replace('_morning', '', regex=False)

df_ema_content['weekend'] = [1 if day in ['Saturday', 'Sunday'] else 0 for day in df_ema_content['weekday']]

df_ema_content['quest_nr'] = df_ema_content['questionnaire_name'].apply(lambda x: int(re.search(r'\d+', x).group()) \
                                               if re.search(r'\d+', x) else None)

df_ema_content["n_quest"] = df_ema_content.groupby(["study", "customer", "createdAt_day"])["questionnaire_name"].transform("nunique")

# Create unique day ID
# Create a unique day identifier directly without creating extra columns
df_ema_content['unique_day_id'] = df_ema_content['createdAt_day'].dt.strftime('%Y%m%d') + '_' + df_ema_content['quest_nr'].astype(str)

df_ema_content['ema_relative_start'] = df_ema_content.groupby(['customer', 'assess'])['createdAt_day'].transform('min')
df_ema_content['ema_relative_end'] = df_ema_content.groupby(['customer', 'assess'])['createdAt_day'].transform('max')

# Calculate ema_relative_start and ema_relative_end for each phase and customer
phase_0 = df_ema_content[df_ema_content['assess'] == 0].groupby(['customer'])['createdAt_day'].agg(ema_relative_start_phase0='min', ema_relative_end_phase0='max').reset_index()
phase_1 = df_ema_content[df_ema_content['assess'] == 1].groupby(['customer'])['createdAt_day'].agg(ema_relative_start_phase1='min', ema_relative_end_phase1='max').reset_index()
phase_2 = df_ema_content[df_ema_content['assess'] == 2].groupby(['customer'])['createdAt_day'].agg(ema_relative_start_phase2='min', ema_relative_end_phase2='max').reset_index()

# Merge these values back into the original DataFrame
df_ema_content = df_ema_content.merge(phase_0, on='customer', how='left')
df_ema_content = df_ema_content.merge(phase_1, on='customer', how='left')
df_ema_content = df_ema_content.merge(phase_2, on='customer', how='left')

# Map the correct start date for each phase
df_ema_content['ema_relative_start'] = df_ema_content.apply(
    lambda row: row['ema_relative_start_phase0'] if row['assess'] == 0 else (
                row['ema_relative_start_phase1'] if row['assess'] == 1 else 
                row['ema_relative_start_phase2']), axis=1)

# Now calculate the Absolute Day Index based on the correct start date
df_ema_content['absolute_day_index'] = (
    df_ema_content['createdAt_day'] - df_ema_content['ema_relative_start']
).dt.days + 1

high_indices = df_ema_content[df_ema_content['absolute_day_index'] > 180]
if not high_indices.empty:
    print("Warning: Some entries have unexpectedly high absolute day indices:")
    print("Customers with high absolute day indices:")
    print(high_indices['customer'].unique())

# Calculate the Relative Day Index by counting unique days since ema_relative_start
df_ema_content['relative_day_index'] = df_ema_content.groupby(['customer', 'assess'])['createdAt_day'].rank(method='dense').astype(int)

# Drop duplicate rows based on 'customer', 'assess', and 'createdAt_day' to ensure only one counter per unique day
df_unique = df_ema_content.drop_duplicates(subset=['customer', 'assess', 'unique_day_id']).copy()

# Use .loc to avoid SettingWithCopyWarning
df_unique['questionnaire_counter'] = df_unique.groupby(['customer', 'assess']).cumcount() + 1
df_unique['questionnaire_counter'] = df_unique.questionnaire_counter.astype(int)


# Merge this back into the original dataframe to retain other data if needed
df_ema_content = pd.merge(df_ema_content, df_unique[['customer', 'assess', 'unique_day_id', 'questionnaire_counter']],
                              on=['customer', 'assess', 'unique_day_id'], how='left')

Customers with high absolute day indices:
['UfMn']


In [88]:
df_test =df_ema_content.loc[df_ema_content.customer =='UfMn' ]

In [90]:
filter_criteria = (df_ema_content['customer'] == 'UfMn') & \
                  (df_ema_content['study'] == 25) & \
                  (df_ema_content['quest_create'] > '2024-02-08')

# Drop the entries that match the criteria
df_ema_content = df_ema_content[~filter_criteria]

In [92]:
df_ema_base = df_ema_content[["customer", 'ema_relative_start_phase0', 'ema_relative_end_phase0', 
                             'ema_relative_start_phase1', 'ema_relative_end_phase1',
                              'ema_relative_start_phase2', 'ema_relative_end_phase2']]
df_ema_base = df_ema_base.drop_duplicates()

In [None]:
#df_complete_ema = df_complete_filtered_cov.merge(df_ema_base, on = "customer", how="left")

In [None]:
df_complete_ema = pd.concat([df_backup, df_complete_ema], ignore_index=True)

In [None]:
#df_complete_ema = df_complete_final.merge(df_ema_base, on = "customer", how="left")

In [None]:

# Calculate memory usage in bytes
memory_usage_bytes = df_complete_ema.memory_usage(deep=True).sum()

# Convert to megabytes
memory_usage_mb = memory_usage_bytes / (1024 ** 2)

# Convert to gigabytes
memory_usage_gb = memory_usage_bytes / (1024 ** 3)

# Convert to terabytes
memory_usage_tb = memory_usage_bytes / (1024 ** 4)

print(f"Memory usage: {memory_usage_bytes} bytes")
print(f"Memory usage: {memory_usage_mb:.2f} MB")
print(f"Memory usage: {memory_usage_gb:.2f} GB")
print(f"Memory usage: {memory_usage_tb:.2f} TB")

## Redcap data

In [138]:
df_redcap_zert = pd.read_csv(redcap_path + "/ZERTIFIZIERUNGFOR518-LeonaExport_DATA_2024-08-21_0954.csv", low_memory=False)
df_redcap = pd.read_csv(redcap_path + f"/FOR5187-LeonaExport_DATA_2024-08-21_1024.csv", low_memory=False)

In [139]:
bsi_scales = {
    'bsi_somatization': ['bsi_2', 'bsi_7', 'bsi_23', 'bsi_29', 'bsi_30', 'bsi_33', 'bsi_37'],
    'bsi_compulsivity': ['bsi_5', 'bsi_15', 'bsi_26', 'bsi_27', 'bsi_32', 'bsi_36'],
    'bsi_insecurity': ['bsi_20', 'bsi_21', 'bsi_22', 'bsi_42'],
    'bsi_depression': ['bsi_9', 'bsi_16', 'bsi_17', 'bsi_18', 'bsi_35', 'bsi_50'],
    'bsi_anxiety': ['bsi_1', 'bsi_12', 'bsi_19', 'bsi_38', 'bsi_45', 'bsi_49'],
    'bsi_aggression': ['bsi_6', 'bsi_13', 'bsi_40', 'bsi_41', 'bsi_46'],
    'bsi_phobia': ['bsi_8', 'bsi_28', 'bsi_31', 'bsi_43', 'bsi_47'],
    'bsi_paranoia': ['bsi_4', 'bsi_10', 'bsi_24', 'bsi_48', 'bsi_51'],
    'bsi_psychotizism': ['bsi_3', 'bsi_14', 'bsi_34', 'bsi_44', 'bsi_53'],
    'bsi_additional': ['bsi_11', 'bsi_25', 'bsi_39', 'bsi_52']
}

# Check for missing columns and calculate each scale
for scale, columns in bsi_scales.items():
    missing_cols = [col for col in columns if col not in df_redcap_zert.columns]
    if missing_cols:
        print(f"Missing columns in DataFrame for {scale}:", missing_cols)
    else:
        # Sum the specified columns and create a new column for each scale
        df_redcap_zert[scale] = df_redcap_zert[columns].sum(axis=1)

In [140]:
# Calculate BSI-GS in Certification dataset
bsi_columns = [f'bsi_{i}' for i in range(1, 54)]
df_redcap_zert['bsi_gs'] = df_redcap_zert[bsi_columns].sum(axis=1)
df_redcap_zert['bsi_gsi'] = df_redcap_zert["bsi_gs"]/53

In [141]:
df_redcap_zert = df_redcap_zert[['for_id', 'redcap_event_name','ema_start_date','ema_smartphone', 'ema_special_event', 'age', 'gender', 'marital_status', 'partnership', 'graduation','profession','years_of_education',
 'employability', 'ses', 'somatic_problems','scid_cv_prim_cat', 'bsi_somatization',
 'bsi_compulsivity','bsi_insecurity','bsi_depression','bsi_anxiety','bsi_aggression','bsi_phobia','bsi_paranoia',
 'bsi_psychotizism','bsi_additional','bsi_gs','bsi_gsi', 'prior_treatment', 'redcap_event_name']] #ema_wear_exp

In [142]:
# Check for different assessment times 
df_redcap_base = df_redcap.loc[df_redcap.redcap_event_name == 'v1_baseline_arm_1']
df_redcap_t20 = df_redcap.loc[df_redcap.redcap_event_name == 'v3_t20_arm_1']

df_redcap_zert_base = df_redcap.loc[df_redcap.redcap_event_name == 'v1_baseline_arm_1']
df_redcap_zert_t20 = df_redcap_zert.loc[df_redcap.redcap_event_name == 'v3_t20_arm_1']

# BSI at T20 
bsi_list = ['for_id', 'bsi_gsi']
df_redcap_t20 = df_redcap_t20[bsi_list]
df_redcap_zert_t20 = df_redcap_zert_t20[bsi_list]
df_t20 = pd.concat([df_redcap_t20, df_redcap_zert_t20],ignore_index=True)

In [143]:
df_redcap = pd.concat([df_redcap_base, df_redcap_zert_base],ignore_index=True)
df_redcap = pd.merge(df_redcap, df_t20,on='for_id', suffixes=('_base', '_t20'), how="left")

In [144]:
df_redcap = df_redcap[['for_id', 'ema_start_date','ema_smartphone',
 'ema_wear_exp', 'ema_special_event', 'age', 'gender', 'marital_status', 'partnership', 'graduation','profession','years_of_education',
 'employability','ses', 'somatic_problems','scid_cv_prim_cat','bsi_somatization', 'prior_treatment',
 'bsi_compulsivity','bsi_insecurity','bsi_depression','bsi_anxiety','bsi_aggression','bsi_phobia','bsi_paranoia',
 'bsi_psychotizism','bsi_additional','bsi_gs','bsi_gsi_base', 'bsi_gsi_t20']] 

In [145]:
df_redcap = df_redcap.drop_duplicates()

In [146]:
gender_mapping = {
    1: 'male',
    2: 'female',
    3: 'diverse',
    4: 'no gender',
    5: 'not specified'
}

scid_cv_cat_mapping = {
    1: 'Depressive Disorder',
    2: 'Specific Phobia',
    3: 'Social Anxiety Disorder',
    4: 'Agoraphobia and/or Panic Disorder',
    5: 'Generalized Anxiety Disorder',
    6: 'Obsessive-Compulsive Disorder',
    7: 'Post-Traumatic Stress Disorder'
}

marital_status_mapping = {
    1: 'single',
    2: 'married/registered partnership',
    3: 'divorced',
    4: 'separated',
    5: 'widowed',
    6: 'other'
}

employability_mapping = {
    0: 'employable',
    1: 'unemployable (on sick leave)',
    2: 'on disability pension',
    3: 'on retirement pension',
    4: 'other'
}

graduation_mapping = {
    0: 'still in school',
    1: 'no school degree',
    2: 'elementary school degree or equivalent',
    3: 'middle school degree or equivalent',
    4: 'high school diploma/university entrance qualification',
    5: 'other'
}

profession_mapping = {
    0: 'still in training or studies',
    1: 'no training degree',
    2: 'vocational training, including technical school',
    3: 'university or college degree',
    4: 'other'
}

prior_treatment_mapping = {
    0: 'no prior treatment',
    1: 'outpatient psychotherapy',
    2: 'inpatient or partial inpatient treatment/psychotherapy',
    3: 'both',
    4: 'yes'
}

ema_smartphone_mapping = {
    1: 'iPhone',
    0: 'Android'
}

ema_special_event_mapping = {
    0: 'usual',
    1: 'special event'
}
def categorize_age(age):
    if 18 <= age <= 24:
        return 0
    elif 25 <= age <= 34:
        return 1
    elif 35 <= age <= 44:
        return 2
    elif 45 <= age <= 54:
        return 3
    elif 55 <= age <= 64:
        return 4
    else:
        return 5
    

In [147]:
# Apply mappings
df_redcap['gender_description'] = df_redcap['gender'].map(gender_mapping)
df_redcap['scid_cv_description'] = df_redcap['scid_cv_prim_cat'].map(scid_cv_cat_mapping)
df_redcap['marital_status_description'] = df_redcap['marital_status'].map(marital_status_mapping)
df_redcap['employability_description'] = df_redcap['employability'].map(employability_mapping)
df_redcap['graduation_description'] = df_redcap['graduation'].map(graduation_mapping)
df_redcap['profession_description'] = df_redcap['profession'].map(profession_mapping)
df_redcap['prior_treatment_description'] = df_redcap['prior_treatment'].map(prior_treatment_mapping)
df_redcap['ema_smartphone_description'] = df_redcap['ema_smartphone'].map(ema_smartphone_mapping)
df_redcap['ema_special_event_description'] = df_redcap['ema_special_event'].map(ema_special_event_mapping)
df_redcap['age_description'] = df_redcap['age'].apply(categorize_age)



In [148]:
df_monitoring["for_id"] = df_monitoring.for_id.str.strip()
df_forid = df_monitoring[["for_id","customer"]]
df_redcap = pd.merge(df_forid, df_redcap, on="for_id", how="left")

In [149]:
valid_df = df_redcap.dropna(subset=['ema_start_date'])


## Export

In [38]:
backup_path = preprocessed_path + "backup_data_passive.feather"
df_complete_ema.to_feather(backup_path)

with open(preprocessed_path + f'/ema_data.pkl', 'wb') as file:
    pickle.dump(df_sess, file)
    
with open(preprocessed_path + f'/monitoring_data.pkl', 'wb') as file:
    pickle.dump(df_monitoring, file)

NameError: name 'df_complete_ema' is not defined

In [150]:
with open(preprocessed_path + f'/redcap_data.pkl', 'wb') as file:
    pickle.dump(valid_df, file)

In [93]:
    
with open(preprocessed_path + f'/ema_content.pkl', 'wb') as file:
    pickle.dump(df_ema_content, file)