# SAA 2024: GPS Data Preprocessing and Reliability in PREACT

This notebook preprocesses GPS data in accordance with Mueller et al. (2021) and performs analyses by splitting the data into two weekly segments.

1. **Load Data**: Load necessary data from pickle files.
2. **Preprocess Data**: Filter and transform the data for analysis.
3. **Split Data**: Divide the data into two weekly parts relative to each participant's EMA start date (the `ema_start_date` column).
4. **Analyze Data**: Perform analyses separately for the two parts.
5. **Calculate Internal Consistency**: Evaluate the internal consistency of features between the first and second weeks.

In [1]:
import os
import sys

import glob
import pickle
from IPython.display import Markdown
from config import datapath

# Make the project's helper modules importable: resolve the `library`
# directory that sits next to the current working directory and append it to
# sys.path. This only resolves correctly when the working directory is the
# notebooks directory.
library_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'library'))
sys.path.append(library_path)  # presumably where `gps_features` (imported next) lives — confirm

from gps_features import haversine, apply_clustering, identify_home,calculate_metrics, calculate_transition_time, calculate_intraclass_coefficient, calculate_retest_reliability

import pandas as pd
import numpy as np
import datetime as dt
import regex as re
import pingouin as pg  # Ensure pingouin is installed

from sklearn.cluster import DBSCAN
import statistics 
import scipy.stats as stats
from scipy.stats import pearsonr
from math import radians, cos, sin, asin, sqrt, log


import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns 
sns.set_context("notebook", rc={"axes.labelsize": 14, "xtick.labelsize": 14, "ytick.labelsize": 14})
sns.set_style("whitegrid", {'axes.grid': True})
%matplotlib inline

In [2]:
# Date stamps.
# `today` is hard-coded: the value previously derived from dt.date.today()
# (formatted as %d%m%Y) was overwritten on the very next statement, so that
# dead assignment has been dropped.
today_day = pd.to_datetime('today').normalize()
today = "21052024"

# Load the pickled input frames from `datapath` (plain string concatenation,
# so `datapath` must already end with a path separator).
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files produced by a trusted export.
with open(datapath + 'ema_data.pkl', 'rb') as file:
    df_active = pickle.load(file)

with open(datapath + 'ema_content.pkl', 'rb') as file:
    df_ema = pickle.load(file)

with open(datapath + 'gps_data.pkl', 'rb') as file:
    df_gps = pickle.load(file)

with open(datapath + 'passive_data.pkl', 'rb') as file:
    df_passive = pickle.load(file)

with open(datapath + 'monitoring_data.pkl', 'rb') as file:
    df_monitoring = pickle.load(file)

### Configurations

In [3]:
# Minimum data requirements (first group of thresholds).
min_hour_daily = 8
# Presumably the GPS / passive-data minimum number of days — TODO confirm.
# Kept under its own name because `min_days_data` is reassigned in the EMA
# section below and would otherwise silently lose this value.
min_days_data_gps = 12

# Stationary filtering
max_distance = 150
speed_limit = 1.4  # Max allowed speed in m/s

# DBSCAN
kms_per_radian = 6371.0088  # mean radius of the earth in km (not the equatorial radius)
epsilon = 0.03 / kms_per_radian  # 0.03 km converted to radians
min_samples = 10

# Kmeans
DKmeans = 500

# Home feature (night)
min_nights_obs = 4
min_f_home = 0.5

# EMA
min_num_daily = 4
min_days_data_ema = 10

# Backward-compatible alias: the original cell assigned `min_days_data` twice
# (12, then 10), so every later cell has always seen 10. Keep that effective
# value under the old name.
min_days_data = min_days_data_ema

## EMA data

In [4]:
# Restrict the EMA frame to the columns used in the rest of the notebook.
ema_columns = [
    'customer',
    'study',
    'createdAt',
    'choice_id',
    'choice_text',
    'quest_title',
    'questionnaire_name',
    'ema_start_date',
    'status',
    'study_version',
]
df_ema = df_ema[ema_columns]

In [5]:
def _questionnaire_number(name):
    """Return the first run of digits in `name` as an int, or None if there is none."""
    found = re.search(r'\d+', name)
    return int(found.group()) if found else None


# Work on an explicit copy so the column assignments below do not target a view.
df_ema = df_ema.copy()

# Calendar features derived from the response timestamp.
created_at = df_ema['createdAt']
df_ema['weekday'] = created_at.dt.day_name()
df_ema['createdAt_day'] = created_at.dt.normalize()

# Number embedded in the questionnaire name.
df_ema['quest_nr'] = df_ema['questionnaire_name'].apply(_questionnaire_number)

# Distinct questionnaires answered per study, participant and day.
day_keys = ["study", "customer", "createdAt_day"]
df_ema["n_quest"] = df_ema.groupby(day_keys)["questionnaire_name"].transform("nunique")

# Identifier combining the calendar day (YYYYMMDD) and the questionnaire number.
day_stamp = df_ema['createdAt_day'].dt.strftime('%Y%m%d')
df_ema['unique_day_id'] = day_stamp + '_' + df_ema['quest_nr'].astype(str)

# Map study ids onto an assessment index (24/25 -> 0, 33/34 -> 1).
study_mapping = {24: 0, 25: 0, 33: 1, 34: 1}
df_ema['assess'] = df_ema['study'].map(study_mapping)

# Strip the '_morning' suffix from item titles; morning items do not need to
# be differentiated from the others.
df_ema['quest_title'] = df_ema['quest_title'].str.replace('_morning', '', regex=False)

### Include only patients with finished assessments and enough data


In [6]:
# Status values that are kept (see the section heading: finished assessments only).
finished_statuses = ["Abgeschlossen", "Post_Erhebung_1", "Erhebung_2_aktiv", "Post_Erhebung_2"]
first_phase_studies = [24, 25]  # first assessment phase

has_finished_status = df_ema["status"].isin(finished_statuses)
in_first_phase = df_ema["study"].isin(first_phase_studies)
df_ema = df_ema.loc[has_finished_status & in_first_phase]

In [7]:
# Keep only rows from days on which at least `min_num_daily` distinct
# questionnaires were answered. `.copy()` makes the result an independent
# frame, so the column assignment below does not raise SettingWithCopyWarning.
df_ema = df_ema.loc[df_ema["n_quest"] >= min_num_daily].copy()

# Number of such qualifying days per participant.
df_ema["n_days_4"] = df_ema.groupby("customer")["createdAt_day"].transform("nunique")

# Keep only participants with at least `min_days_data` qualifying days; copy
# again so later cells can add columns safely.
df_ema = df_ema.loc[df_ema["n_days_4"] >= min_days_data].copy()

In [8]:
# Define the time boundaries for the first and second week
df_ema['first_week_end'] = df_ema['ema_start_date'] + pd.Timedelta(days=8)
df_ema['second_week_end'] = df_ema['ema_start_date'] + pd.Timedelta(days=15)

# Filter data for the first and second week
first_week_df = df_ema[(df_ema['createdAt_day'] >= df_ema['ema_start_date']) & 
                       (df_ema['createdAt_day'] < df_ema['first_week_end'])]

second_week_df = df_ema[(df_ema['createdAt_day'] >= df_ema['first_week_end'])]

In [9]:
# Pivot the table as specified
df_piv_first = first_week_df.pivot_table(
    index=["customer", "unique_day_id"],
    columns="quest_title",
    values="choice_text",
    aggfunc='first'  # Using 'first' since each entry should theoretically be unique per group
)

# The columns are now a single level Index with just the quest_title values since 'values' is not a list anymore
df_piv_first.columns = [col for col in df_piv_first.columns.values]

# Reset the index to turn the MultiIndex into columns
df_piv_first = df_piv_first.reset_index()

In [10]:
# Pivot the table as specified
df_piv_second = second_week_df.pivot_table(
    index=["customer", "unique_day_id"],
    columns="quest_title",
    values="choice_text",
    aggfunc='first'  # Using 'first' since each entry should theoretically be unique per group
)

# The columns are now a single level Index with just the quest_title values since 'values' is not a list anymore
df_piv_second.columns = [col for col in df_piv_second.columns.values]

# Reset the index to turn the MultiIndex into columns
df_piv_second = df_piv_second.reset_index()

In [11]:
# Item columns whose responses are converted to numeric values in the next
# cells (listed in alphabetical order, grouped by prefix).
columns_to_convert = [
    'ecg_control',
    # emotion regulation
    'er_acceptance', 'er_control', 'er_distraction', 'er_intensity',
    'er_reappraisal', 'er_relaxation', 'er_rumination', 'er_suppression',
    # events
    'event_general', 'event_social1', 'event_social2', 'event_social3',
    # PANAS items
    'panas_attentiveness', 'panas_fatigue',
    'panas_fear1', 'panas_fear2',
    'panas_guilt1', 'panas_guilt2',
    'panas_hostility1', 'panas_hostility2',
    'panas_joviality1', 'panas_joviality2',
    'panas_loneliness',
    'panas_sadness1', 'panas_sadness2',
    'panas_selfassurance',
    'panas_serenity1', 'panas_serenity2',
    'panas_shyness',
    # remaining items
    'physical_health', 'situation2',
    'ta_behavioral', 'ta_behavioral_2', 'ta_kognitiv', 'ta_kognitiv_2',
]

In [12]:
# Coerce the first-week item responses to numbers; values that cannot be
# parsed become NaN (errors='coerce').
first_week_numeric = df_piv_first[columns_to_convert].apply(pd.to_numeric, errors='coerce')
df_piv_first[columns_to_convert] = first_week_numeric

In [13]:
# Coerce the second-week item responses to numbers; values that cannot be
# parsed become NaN (errors='coerce').
second_week_numeric = df_piv_second[columns_to_convert].apply(pd.to_numeric, errors='coerce')
df_piv_second[columns_to_convert] = second_week_numeric

In [14]:
# Items averaged into the positive-affect (PA) score.
pa_scale = [
    'panas_attentiveness',
    'panas_joviality1',
    'panas_joviality2',
    'panas_selfassurance',
    'panas_serenity1',
    'panas_serenity2',
]

# Items averaged into the negative-affect (NA) score.
na_scale = [
    'panas_fatigue',
    'panas_fear1',
    'panas_fear2',
    'panas_guilt1',
    'panas_guilt2',
    'panas_hostility1',
    'panas_hostility2',
    'panas_loneliness',
    'panas_sadness1',
    'panas_sadness2',
    'panas_shyness',
]

In [15]:

def _weekly_scale_icc(first_week, second_week, scale_items, label):
    """Compute week-1 vs. week-2 ICCs for one scale score.

    Parameters
    ----------
    first_week, second_week : pandas.DataFrame
        Wide frames with a 'customer' column and one numeric column per item.
    scale_items : list of str
        Item columns that make up the scale.
    label : str
        Short scale name; used to build the column names
        ``mean_<label>_first``, ``mean_<label>_second`` and ``mean_<label>``.

    Returns
    -------
    tuple
        ``(mean_first, mean_second, merged, melted, icc)`` where ``icc`` is
        the table returned by ``pingouin.intraclass_corr``.
    """
    first_col = f'mean_{label}_first'
    second_col = f'mean_{label}_second'
    rating_col = f'mean_{label}'

    # Per participant: mean of every item over the week, then the mean of
    # those item means.
    mean_first = (first_week.groupby('customer')[scale_items]
                  .mean().mean(axis=1).reset_index(name=first_col))
    mean_second = (second_week.groupby('customer')[scale_items]
                   .mean().mean(axis=1).reset_index(name=second_col))

    # Default inner merge: only participants present in both weeks remain.
    merged = mean_first.merge(mean_second, on='customer')

    # Long format with one row per participant and week; the week acts as
    # the "rater" in the ICC computation.
    melted = pd.melt(merged, id_vars='customer',
                     value_vars=[first_col, second_col],
                     var_name='week', value_name=rating_col)

    icc = pg.intraclass_corr(data=melted, targets='customer',
                             raters='week', ratings=rating_col)
    return mean_first, mean_second, merged, melted, icc


# `pa_scale` and `na_scale` are the item lists defined in the preceding cell.

# Positive affect
mean_pa_first, mean_pa_second, merged_pa, pa_melted, icc_pa = _weekly_scale_icc(
    df_piv_first, df_piv_second, pa_scale, 'pa')
print("ICC results for PA:")
print(icc_pa)

# Negative affect
mean_na_first, mean_na_second, merged_na, na_melted, icc_na = _weekly_scale_icc(
    df_piv_first, df_piv_second, na_scale, 'na')
print("ICC results for NA:")
print(icc_na)

ICC results for PA:
    Type              Description       ICC          F  df1  df2  \
0   ICC1   Single raters absolute  0.858505  13.134790  110  111   
1   ICC2     Single random raters  0.858646  13.321869  110  110   
2   ICC3      Single fixed raters  0.860353  13.321869  110  110   
3  ICC1k  Average raters absolute  0.923866  13.134790  110  111   
4  ICC2k    Average random raters  0.923948  13.321869  110  110   
5  ICC3k     Average fixed raters  0.924935  13.321869  110  110   

           pval         CI95%  
0  3.150269e-34    [0.8, 0.9]  
1  2.802954e-34    [0.8, 0.9]  
2  2.802954e-34    [0.8, 0.9]  
3  3.150269e-34  [0.89, 0.95]  
4  2.802954e-34  [0.89, 0.95]  
5  2.802954e-34  [0.89, 0.95]  
ICC results for NA:
    Type              Description       ICC          F  df1  df2  \
0   ICC1   Single raters absolute  0.849272  12.268962  110  111   
1   ICC2     Single random raters  0.849193  12.184168  110  110   
2   ICC3      Single fixed raters  0.848303  12.184168 

In [16]:
# Retest reliability of the NA scale score between the first and second week.
# `merged_na` holds the per-participant columns mean_na_first / mean_na_second;
# the helper presumably pairs '<feature>_first' with '<feature>_second'
# (it is defined in gps_features — confirm there).
features = ["mean_na"]
correlation = calculate_retest_reliability(merged_na, features)

# Print the results. The loop variable gets its own name so the results
# mapping in `correlation` is not overwritten while iterating over it.
for feature, value in correlation.items():
    print(f"Correlation for {feature} between first and second week: {value:.2f}")

Correlation for mean_na between first and second week: 0.88


In [17]:
# Retest reliability of the PA scale score between the first and second week.
# `merged_pa` holds the per-participant columns mean_pa_first / mean_pa_second;
# the helper presumably pairs '<feature>_first' with '<feature>_second'
# (it is defined in gps_features — confirm there).
features = ["mean_pa"]
correlation = calculate_retest_reliability(merged_pa, features)

# Print the results. The loop variable gets its own name so the results
# mapping in `correlation` is not overwritten while iterating over it.
for feature, value in correlation.items():
    print(f"Correlation for {feature} between first and second week: {value:.2f}")

Correlation for mean_pa between first and second week: 0.85


### Correlate aggregates for emotion regulation

In [None]:
# Emotion-regulation items that are aggregated per participant and week.
er_columns = [
    'er_acceptance',
    'er_control',
    'er_distraction',
    'er_intensity',
    'er_reappraisal',
    'er_relaxation',
    'er_rumination',
    'er_suppression',
]

In [22]:
# Per-participant mean of every emotion-regulation item, separately per week.
er_first_by_customer = df_piv_first.groupby('customer')[er_columns]
er_second_by_customer = df_piv_second.groupby('customer')[er_columns]
mean_er_first = er_first_by_customer.mean().reset_index()
mean_er_second = er_second_by_customer.mean().reset_index()

# Join both weeks on the participant; item columns get a '_first' / '_second'
# suffix. The default inner join keeps only participants present in both weeks.
merged_er = mean_er_first.merge(mean_er_second, on='customer', suffixes=('_first', '_second'))


### Compute retest reliability using ICC and correlation

In [25]:
# Item names without the '_first' / '_second' week suffixes.
# (pingouin is already imported as `pg` in the import cell at the top.)
features = ['er_acceptance','er_control', 'er_distraction', 'er_intensity', 'er_reappraisal',
       'er_relaxation', 'er_rumination', 'er_suppression']

# ICC2 value per feature; NaN whenever it could not be computed.
icc_results = {}

# Calculate the ICC for each first-week / second-week column pair.
for feature in features:
    feature_first = feature + '_first'
    feature_second = feature + '_second'

    if feature_first not in merged_er.columns or feature_second not in merged_er.columns:
        print(f"Feature columns for {feature} are missing in the dataframe.")
        # Record the failure like the other error branches do, so every
        # feature has an entry in `icc_results`.
        icc_results[feature] = np.nan
        continue

    # Drop rows with NaN values in the relevant columns.
    clean_data = merged_er[[feature_first, feature_second]].dropna()

    if len(clean_data) < 5:
        print(f"Not enough non-missing values for feature {feature}.")
        icc_results[feature] = np.nan
        continue

    # Wide frame for pingouin: the row index of `merged_er` serves as the
    # subject identifier, the two weeks as the two "raters".
    data = pd.DataFrame({
        'subject': clean_data.index,
        'first': clean_data[feature_first],
        'second': clean_data[feature_second]
    })

    # Melt to long format: one row per subject and week.
    data_long = data.melt(id_vars='subject', var_name='rater', value_name='rating')

    icc = pg.intraclass_corr(data=data_long, targets='subject', raters='rater', ratings='rating', nan_policy='omit')
    print(f"ICC result for feature {feature}:")
    print(icc)

    # Keep the single-measurement ICC2 row from the result table.
    icc2_values = icc.loc[icc['Type'] == 'ICC2', 'ICC']
    if icc2_values.empty:
        print(f"ICC2 not found for feature {feature}")
        icc_results[feature] = np.nan
    else:
        icc_results[feature] = icc2_values.iloc[0]

ICC result for feature er_acceptance:
    Type              Description       ICC         F  df1  df2          pval  \
0   ICC1   Single raters absolute  0.777285  7.980090  110  111  2.451632e-24   
1   ICC2     Single random raters  0.778376  8.349136  110  110  5.013216e-25   
2   ICC3      Single fixed raters  0.786076  8.349136  110  110  5.013216e-25   
3  ICC1k  Average raters absolute  0.874688  7.980090  110  111  2.451632e-24   
4  ICC2k    Average random raters  0.875378  8.349136  110  110  5.013216e-25   
5  ICC3k     Average fixed raters  0.880227  8.349136  110  110  5.013216e-25   

          CI95%  
0  [0.69, 0.84]  
1  [0.69, 0.84]  
2   [0.7, 0.85]  
3  [0.82, 0.91]  
4  [0.82, 0.92]  
5  [0.83, 0.92]  
ICC result for feature er_control:
    Type              Description       ICC          F  df1  df2  \
0   ICC1   Single raters absolute  0.889008  17.019375  110  111   
1   ICC2     Single random raters  0.888955  16.874200  110  110   
2   ICC3      Single fixed ra

In [26]:
# Retest reliability of the emotion-regulation items between the first and
# second week. `features` is the item list defined in the ICC cell above and
# `merged_er` holds the '<item>_first' / '<item>_second' columns; the helper
# presumably correlates each such pair (defined in gps_features — confirm).

correlation = calculate_retest_reliability(merged_er, features)

# Print the results. The loop variable gets its own name so the results
# mapping in `correlation` is not overwritten while iterating over it.
for feature, value in correlation.items():
    print(f"Correlation for {feature} between first and second week: {value:.2f}")

Correlation for er_acceptance between first and second week: 0.84
Correlation for er_control between first and second week: 0.91
Correlation for er_distraction between first and second week: 0.82
Correlation for er_intensity between first and second week: 0.71
Correlation for er_reappraisal between first and second week: 0.87
Correlation for er_relaxation between first and second week: 0.89
Correlation for er_rumination between first and second week: 0.80
Correlation for er_suppression between first and second week: 0.90
