In [1]:
import os
import sys
import regex as re

import glob
import pickle
from IPython.display import Markdown
from config import datapath

# Make the sibling `library` directory importable. The path is resolved as
# <current working directory>/../library, so this only works when the notebook
# is run with the notebooks directory as the working directory.
library_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'library'))
sys.path.append(library_path)

from gps_features import haversine, db2, identify_home

import pandas as pd
import numpy as np
import datetime as dt

from sklearn.cluster import DBSCAN
import statistics 

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns 
sns.set_context("notebook", rc={"axes.labelsize": 14, "xtick.labelsize": 14, "ytick.labelsize": 14})
sns.set_style("whitegrid", {'axes.grid': True})
%matplotlib inline

In [2]:
# Date stamps for this run: `today` as a DDMMYYYY string, `today_day` as a
# midnight-normalized pandas Timestamp.
today = dt.date.today().strftime("%d%m%Y")
today_day = pd.to_datetime('today').normalize()
#today = "30042024"


def _load_pickle(filename):
    """Load and return the pickled object stored as ``filename`` under ``datapath``.

    The path is built with ``os.path.join`` so it is correct whether or not
    ``datapath`` ends with a path separator.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; only load files produced by this project's own pipeline.
    with open(os.path.join(datapath, filename), 'rb') as file:
        return pickle.load(file)


df_active = _load_pickle('ema_data.pkl')
df_ema = _load_pickle('ema_content.pkl')
df_gps = _load_pickle('gps_data.pkl')
df_passive = _load_pickle('passive_data.pkl')
df_monitoring = _load_pickle('monitoring_data.pkl')

In [3]:
# Minimum value of `n_quest` (distinct questionnaires per study, customer and
# day) for a day to be kept when the assessment phases are filtered.
min_num_daily = 4
# Presumably the minimum number of days with data required per customer.
# NOTE(review): this constant is not referenced in the cells visible here;
# confirm whether it should be applied to `n_days_4`.
min_days_data = 10

In [4]:
# Keep only this subset of columns of the EMA content frame.
ema_columns = [
    'customer',
    'study',
    'createdAt',
    'choice_id',
    'choice_text',
    'quest_title',
    'questionnaire_name',
    'ema_start_date',
    'status',
    'study_version',
]
df_ema = df_ema[ema_columns]

In [5]:
# Work on an explicit copy before adding columns (presumably to avoid pandas'
# SettingWithCopyWarning on the column-subset frame).
df_ema = df_ema.copy()

# Calendar features of the response timestamp: weekday name and the
# timestamp truncated to midnight.
df_ema['weekday'] = df_ema['createdAt'].dt.day_name()
df_ema['createdAt_day'] = df_ema['createdAt'].dt.normalize()


def _first_number(name):
    """Return the first run of digits in ``name`` as an int, or None if there is none."""
    # Search once and reuse the match instead of evaluating the regex twice per row.
    match = re.search(r'\d+', name)
    return int(match.group()) if match else None


# Questionnaire number: the first integer found in the questionnaire name.
df_ema['quest_nr'] = df_ema['questionnaire_name'].apply(_first_number)

# Number of distinct questionnaire names per study, customer and calendar day.
df_ema["n_quest"] = (
    df_ema.groupby(["study", "customer", "createdAt_day"])["questionnaire_name"]
    .transform("nunique")
)


### 1. Include only patients with finished assessments and enough data

In [6]:
# Keep only rows whose status is one of the values accepted for this analysis.
accepted_statuses = [
    "Abgeschlossen",
    "Post_Erhebung_1",
    "Erhebung_2_aktiv",
    "Post_Erhebung_2",
]
df_ema = df_ema[df_ema["status"].isin(accepted_statuses)]

In [7]:
# Split the EMA responses by study id into the two assessment phases.
df_ema1 = df_ema[df_ema["study"].isin([24, 25])]  # first assessment phase
df_ema2 = df_ema[df_ema["study"].isin([33, 34])]  # second assessment phase

In [8]:
# Second phase: keep only rows from days with at least `min_num_daily`
# distinct questionnaires. `.copy()` makes the filtered frame independent of
# its parent so the column assignment below does not raise
# SettingWithCopyWarning.
df_ema2 = df_ema2.loc[df_ema2["n_quest"] >= min_num_daily].copy()

# Number of distinct remaining (qualifying) days per customer.
df_ema2["n_days_4"] = df_ema2.groupby("customer")["createdAt_day"].transform("nunique")

# Customers with at least one qualifying day in the second phase.
# NOTE(review): `n_days_4` is computed but never compared with a minimum
# number of days (e.g. `min_days_data`) - confirm whether that filter is missing.
df_ema2_customers = df_ema2.customer.unique().tolist()

In [9]:
# First phase: keep only rows from days with at least `min_num_daily`
# distinct questionnaires. `.copy()` makes the filtered frame independent of
# its parent so the column assignment below does not raise
# SettingWithCopyWarning.
df_ema1 = df_ema1.loc[df_ema1["n_quest"] >= min_num_daily].copy()

# Number of distinct remaining (qualifying) days per customer.
df_ema1["n_days_4"] = df_ema1.groupby("customer")["createdAt_day"].transform("nunique")

# Customers with at least one qualifying day in the first phase.
# NOTE(review): `n_days_4` is computed but never compared with a minimum
# number of days (e.g. `min_days_data`) - confirm whether that filter is missing.
df_ema1_customers = df_ema1.customer.unique().tolist()

In [10]:
# Keep every row of df_ema that belongs to a customer appearing in BOTH the
# second-phase and the first-phase customer lists.
in_phase2 = df_ema["customer"].isin(df_ema2_customers)
in_phase1 = df_ema["customer"].isin(df_ema1_customers)
df_ema_red = df_ema[in_phase2 & in_phase1]

In [12]:
# Display the reduced EMA frame (customers present in both phase customer lists).
df_ema_red

Unnamed: 0,customer,study,createdAt,choice_id,choice_text,quest_title,questionnaire_name,ema_start_date,status,study_version,weekday,createdAt_day,quest_nr,n_quest
2,f1J2,25,2023-08-23 07:40:23.960,6,6,panas_selfassurance,TIKI_1A_E1,2023-08-22,Erhebung_2_aktiv,Lang,Wednesday,2023-08-23,1,8
3,hIhW,33,2024-03-02 08:32:47.883,6,6,panas_selfassurance,TIKI_1A_E1,2023-09-07,Post_Erhebung_2,Lang,Saturday,2024-03-02,1,6
6,f1J2,33,2024-04-30 07:34:10.709,6,6,panas_selfassurance,TIKI_1A_E1,2023-08-22,Erhebung_2_aktiv,Lang,Tuesday,2024-04-30,1,6
11,hIhW,25,2023-09-08 07:30:24.633,5,5,panas_selfassurance,TIKI_1A_E1,2023-09-07,Post_Erhebung_2,Lang,Friday,2023-09-08,1,7
24,0xWn,33,2024-02-20 08:54:28.760,5,5,panas_selfassurance,TIKI_1A_E1,2023-07-27,Post_Erhebung_2,Lang,Tuesday,2024-02-20,1,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516553,0xWn,33,2024-03-04 22:23:03.650,4,4,ta_kognitiv_2,TIKI_8E_E1_S2,2023-07-27,Post_Erhebung_2,Lang,Monday,2024-03-04,8,6
516554,0xWn,33,2024-03-04 22:23:06.803,4,4,ta_behavioral,TIKI_8E_E1_S2,2023-07-27,Post_Erhebung_2,Lang,Monday,2024-03-04,8,6
516555,94FE,33,2024-03-08 22:27:23.923,3,3,ta_behavioral,TIKI_8E_E1_S2,2023-06-15,Post_Erhebung_1,Lang,Friday,2024-03-08,8,7
516556,0xWn,33,2024-03-04 22:23:08.850,2,0,physical_health,TIKI_8E_E1_S2,2023-07-27,Post_Erhebung_2,Lang,Monday,2024-03-04,8,6


### 2. Calculate descriptives

In [None]:
# Display the reduced EMA frame again as the starting point for this section.
df_ema_red

In [None]:
# NOTE(review): `answer_merged` is not created in any cell visible in this
# part of the notebook - confirm where it is defined before running this cell.
# Restrict to question ids 340 and 352 (the meaning of these ids is not
# documented here).
answer_merged = answer_merged[answer_merged["question"].isin([340, 352])]

In [None]:
# Reduce df_ema to five columns.
# NOTE(review): "questionnaire" and "question" are not among the columns kept
# by the earlier column-subset cell, so this selection raises KeyError when the
# notebook is run top to bottom. It would also drop the derived columns
# (weekday, createdAt_day, quest_nr, n_quest). Confirm the intended frame and
# column names.
df_ema = df_ema[["customer", "questionnaire", "study", "question", "createdAt", ]]

In [None]:
# Sort by customer and response timestamp.
answer_merged = answer_merged.sort_values(by=['customer', 'createdAt'])


def _calendar_day_index(timestamps):
    """Return the 1-based calendar-day number of each timestamp relative to the earliest one."""
    # Truncate to midnight first. Subtracting raw timestamps counts elapsed
    # 24-hour periods, so a response on the next calendar day but less than
    # 24 hours after the first response would still be labelled day 1.
    days = timestamps.dt.normalize()
    return (days - days.min()).dt.days + 1


# Day of assessment per customer; the day of the customer's first response is day 1.
answer_merged['assessment_day'] = (
    answer_merged.groupby('customer')['createdAt'].transform(_calendar_day_index)
)


In [None]:
# Assessment identifier of the form "<assessment_day>_<quest_nr>".
# Built with vectorized string concatenation instead of a row-wise
# apply(axis=1), which is slow and breaks on an empty frame.
# NOTE(review): in the visible cells `quest_nr` is only created on df_ema;
# confirm that `answer_merged` carries this column.
answer_merged['assessment_id'] = (
    answer_merged['assessment_day'].astype(str)
    + "_"
    + answer_merged['quest_nr'].astype(str)
)
