## FOR SP6 (PREACT-digital): Emotion regulation EMA data aggregation

*Date: 2025-07-15*

This notebook aggregates the 8 emotion regulation items for the Baseline EMA data of PREACT-digital.

Aggregation level: Person-specific (intraindividual) mean and standard deviation values for each item over the entire baseline period (14 days).

**8 Items:**
* `er_intensity`
* `er_control`
* `er_relaxation`
* `er_rumination`
* `er_reappraisal`
* `er_distraction`
* `er_suppression`
* `er_acceptance`

1. **Import data** 
2. **Preprocessing**: rename variables, filter for baseline data and transform 
3. **Aggregation**: one row per subject (for_id) and one column per item: intraindividual mean (im) and standard deviation (isd) for each item
4. **Export data**: as .csv file

In [1]:
# import packages
import os
from pyprojroot import here # define relative paths to the project root (working directory)
import sys 
import pickle
import pandas as pd
import numpy as np

# Resolve the project root once; pyprojroot locates it via the invisible '.here'
# marker file in the project root working directory.
root = here()

# Make the modules in 'src/' importable. Reuse `root` instead of resolving the
# project root a second time, and only append the entry if it is missing so that
# re-running this cell does not grow sys.path with duplicates.
src_path = root / "src"
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# import relative paths
from server_config import preprocessed_path

### 1. Import Data

In [2]:
# Load the EMA content frame (presumably written by an earlier preprocessing step).
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# must only ever be pointed at files produced by the project's own pipeline.
# The path is built with os.path.join (as in the export cell) so it also works
# when `preprocessed_path` is a path-like object or already ends with a separator.
ema_pickle_path = os.path.join(preprocessed_path, 'ema_content.pkl')
with open(ema_pickle_path, 'rb') as file:
    df_ema = pickle.load(file)
    

### 2. Preprocess Data

In [3]:
# preview the first rows of the freshly loaded EMA data frame (long format, before renaming)
df_ema.head()

Unnamed: 0,customer,questionnaire,study,question,element,quest_create,order,questionnaireSession,choice_id,choice_text,...,ema_relative_start_phase0,ema_relative_start_phase1,ema_relative_start_phase2,ema_relative_end_phase0,ema_relative_end_phase1,ema_relative_end_phase2,ema_relative_start,absolute_day_index,relative_day_index,questionnaire_counter
0,4MLe,105,24,315,1709.0,2023-05-17 18:24:55.163,0.0,7611,3,3,...,2023-05-17,2023-10-27,2024-08-15,2023-06-01,2023-11-11,2024-08-30,2023-05-17,1,1,1
1,4MLe,105,24,316,1717.0,2023-05-17 18:24:58.692,0.0,7611,4,4,...,2023-05-17,2023-10-27,2024-08-15,2023-06-01,2023-11-11,2024-08-30,2023-05-17,1,1,1
2,4MLe,105,24,317,1725.0,2023-05-17 18:25:02.916,0.0,7611,5,5,...,2023-05-17,2023-10-27,2024-08-15,2023-06-01,2023-11-11,2024-08-30,2023-05-17,1,1,1
3,4MLe,105,24,318,1730.0,2023-05-17 18:25:06.688,0.0,7611,3,3,...,2023-05-17,2023-10-27,2024-08-15,2023-06-01,2023-11-11,2024-08-30,2023-05-17,1,1,1
4,4MLe,105,24,319,1736.0,2023-05-17 18:25:13.488,0.0,7611,2,2,...,2023-05-17,2023-10-27,2024-08-15,2023-06-01,2023-11-11,2024-08-30,2023-05-17,1,1,1


In [40]:
# list all column names of the loaded EMA data frame (needed for the rename mapping below)
df_ema.columns

Index(['customer', 'questionnaire', 'study', 'question', 'element',
       'quest_create', 'order', 'questionnaireSession', 'choice_id',
       'choice_text', 'quest_title', 'questionnaire_name', 'quest_create_day',
       'for_id', 'ema_id', 'study_version', 'status', 't20_post',
       'ema_base_start', 'ema_base_end', 'ema_t20_start', 'ema_t20_end',
       'ema_post_start', 'ema_post_end', 'weekday', 'createdAt_day', 'season',
       'time_of_day', 'assess', 'weekend', 'quest_nr', 'n_quest',
       'quest_nr_str', 'unique_day_id', 'ema_relative_start_phase0',
       'ema_relative_start_phase1', 'ema_relative_start_phase2',
       'ema_relative_end_phase0', 'ema_relative_end_phase1',
       'ema_relative_end_phase2', 'ema_relative_start', 'absolute_day_index',
       'relative_day_index', 'questionnaire_counter'],
      dtype='object')

In [41]:
#'quest_complete' in df_ema.columns

In [42]:
# Step 1: Rename relevant columns (in line with quarto documentation) using a mapping dictionary
rename_map = {
    'customer': 'id',
    'choice_id': 'response',
    'quest_title': 'item',
    'quest_create': 'timestamp_item_completion',
    # NOTE(review): 'quest_expir' is not among the columns listed for this export;
    # DataFrame.rename silently ignores mapping keys that do not exist, so this
    # entry currently has no effect.
    'quest_expir': 'timestamp_beep_expiration',
    'quest_create_day': 'date',
    'ema_base_start': 'ema_burst_start',
    'ema_base_end': 'ema_burst_end',
    'assess': 'measurement_burst',
    'unique_day_id': 'beep_per_person_id',
    'quest_nr': 'nr_beep_daily',
    'n_quest': 'n_beeps_completed',
}

# Step 2: keep only relevant columns (names refer to the columns AFTER renaming)
keep_columns = [
    'id', 'for_id', 'timestamp_item_completion',
    'measurement_burst', 'response', 'item',
    'beep_per_person_id', 'date', 'nr_beep_daily', 'n_beeps_completed',
    'ema_burst_start', 'ema_burst_end', 'ema_relative_start_phase0',
    'ema_relative_start_phase1', 'ema_relative_start_phase2',
    'ema_relative_end_phase0', 'ema_relative_end_phase1', 'ema_relative_end_phase2',
    'absolute_day_index', 'relative_day_index',
]

# Rename without inplace mutation and take an explicit copy of the column subset,
# so the resulting frame is independent of the originally loaded object.
df_ema = df_ema.rename(columns=rename_map)[keep_columns].copy()



In [43]:
# preview the data frame after renaming and column selection
df_ema.head()

Unnamed: 0,id,for_id,timestamp_item_completion,measurement_burst,response,item,beep_per_person_id,date,nr_beep_daily,n_beeps_completed,ema_burst_start,ema_burst_end,ema_relative_start_phase0,ema_relative_start_phase1,ema_relative_start_phase2,ema_relative_end_phase0,ema_relative_end_phase1,ema_relative_end_phase2,absolute_day_index,relative_day_index
0,4MLe,FOR11905,2023-05-17 18:24:55.163,0,3,panas_selfassurance,20230517_8.0,2023-05-17,8.0,1,2023-05-17,2023-05-31,2023-05-17,2023-10-27,2024-08-15,2023-06-01,2023-11-11,2024-08-30,1,1
1,4MLe,FOR11905,2023-05-17 18:24:58.692,0,4,panas_joviality2,20230517_8.0,2023-05-17,8.0,1,2023-05-17,2023-05-31,2023-05-17,2023-10-27,2024-08-15,2023-06-01,2023-11-11,2024-08-30,1,1
2,4MLe,FOR11905,2023-05-17 18:25:02.916,0,5,panas_fatigue,20230517_8.0,2023-05-17,8.0,1,2023-05-17,2023-05-31,2023-05-17,2023-10-27,2024-08-15,2023-06-01,2023-11-11,2024-08-30,1,1
3,4MLe,FOR11905,2023-05-17 18:25:06.688,0,3,panas_joviality1,20230517_8.0,2023-05-17,8.0,1,2023-05-17,2023-05-31,2023-05-17,2023-10-27,2024-08-15,2023-06-01,2023-11-11,2024-08-30,1,1
4,4MLe,FOR11905,2023-05-17 18:25:13.488,0,2,panas_fear1,20230517_8.0,2023-05-17,8.0,1,2023-05-17,2023-05-31,2023-05-17,2023-10-27,2024-08-15,2023-06-01,2023-11-11,2024-08-30,1,1


In [44]:
# print all unique 'item' values in df_ema that start with the prefix 'er_'.
# A prefix match is used (consistent with the filter applied before the aggregation);
# a substring match would also pick up any item name that merely contains 'er_'.
# na=False treats missing item names as non-matching.
er_item_mask = df_ema['item'].str.startswith('er_', na=False)
items_with_er = df_ema.loc[er_item_mask, 'item'].unique()
print(items_with_er)


['er_intensity' 'er_control' 'er_distraction' 'er_reappraisal'
 'er_rumination' 'er_relaxation' 'er_suppression' 'er_acceptance']


In [45]:
# list the distinct measurement burst codes present in the data
df_ema['measurement_burst'].unique().tolist()

[0, 1, 2]

In [46]:
# baseline data = rows of measurement burst 0; take a copy so that later column
# assignments on the subset do not touch df_ema
is_baseline = df_ema['measurement_burst'].eq(0)
df_ema_burst0 = df_ema.loc[is_baseline].copy()

# number of distinct FOR IDs that have baseline data
df_ema_burst0['for_id'].nunique()


397

In [47]:
#print(sorted(df_ema_burst0['for_id'].unique()))

##### Sanity check:

##### N = 397 FOR IDs (last data export: 20250708)

##### N = 393 IDs (last data export: 20250708)

**STILL TO DO**

In [58]:
# add column 'number of beeps completed'

# add column '% (compliance)'

#df_ema_burst0['unqiue_d'].unique().tolist()


### 3. Aggregation

[aggregation level: one row per subject with one Mean + SD per Item]

In [49]:
# cast the 'item' column to pandas' dedicated string dtype
item_as_string = df_ema_burst0['item'].astype('string')
df_ema_burst0['item'] = item_as_string

# display the resulting dtype of the column
df_ema_burst0['item'].dtype

string[python]

In [50]:
# keep only the baseline rows whose item name starts with 'er_' (emotion regulation items)
is_er_item = df_ema_burst0['item'].str.startswith('er_')
df_er_items = df_ema_burst0.loc[is_er_item]

# show which items remain after filtering
print(df_er_items['item'].unique())


<StringArray>
[  'er_intensity',     'er_control', 'er_distraction', 'er_reappraisal',
  'er_rumination',  'er_relaxation', 'er_suppression',  'er_acceptance']
Length: 8, dtype: string


In [51]:
# responses grouped per subject (for_id) and item
responses_by_subject_item = df_er_items.groupby(['for_id', 'item'])['response']

# intraindividual mean and standard deviation per group, returned in long format
# (one row per subject-item combination)
agg_df = responses_by_subject_item.agg(mean='mean', std='std').reset_index()

agg_df.head()

Unnamed: 0,for_id,item,mean,std
0,FOR11001,er_acceptance,5.0,1.195229
1,FOR11001,er_control,3.206522,1.895849
2,FOR11001,er_distraction,4.891304,1.296288
3,FOR11001,er_intensity,4.086957,1.426994
4,FOR11001,er_reappraisal,2.978261,1.300704


In [52]:
# step 1: reshape long -> wide, giving one row per subject; the remaining value
# columns ('mean', 'std') are spread over the items, which yields two-level columns
agg_wide = agg_df.pivot(index='for_id', columns='item')

# step 2: collapse the two column levels (statistic, item) into flat names
# such as 'mean_er_control'
flat_column_names = [f"{statistic}_{item_name}" for statistic, item_name in agg_wide.columns]
agg_wide.columns = flat_column_names

# step 3: turn the 'for_id' index back into a regular column
agg_wide = agg_wide.reset_index()

# visual inspection
# NOTE(review): in the stored output FOR11003 and FOR11005 show identical values in
# every column; check whether the same participant data is linked to two FOR IDs.
agg_wide.head()


Unnamed: 0,for_id,mean_er_acceptance,mean_er_control,mean_er_distraction,mean_er_intensity,mean_er_reappraisal,mean_er_relaxation,mean_er_rumination,mean_er_suppression,std_er_acceptance,std_er_control,std_er_distraction,std_er_intensity,std_er_reappraisal,std_er_relaxation,std_er_rumination,std_er_suppression
0,FOR11001,5.0,3.206522,4.891304,4.086957,2.978261,3.695652,5.304348,4.836957,1.195229,1.895849,1.296288,1.426994,1.300704,1.364697,1.427663,1.521113
1,FOR11003,1.0,1.947368,1.157895,1.894737,1.0,1.0,1.105263,1.0,0.0,1.31122,0.688247,1.286457,0.0,0.0,0.458831,0.0
2,FOR11005,1.0,1.947368,1.157895,1.894737,1.0,1.0,1.105263,1.0,0.0,1.31122,0.688247,1.286457,0.0,0.0,0.458831,0.0
3,FOR11010,5.125,3.208333,3.791667,4.875,1.791667,2.875,3.875,4.5,1.2619,1.587428,1.744037,0.946963,1.350657,1.776966,1.623536,1.668115
4,FOR11011,3.866667,3.2,3.866667,4.266667,3.333333,1.0,4.066667,3.8,1.641718,1.082326,1.125463,1.222799,1.290994,0.0,1.334523,1.473577


In [53]:
# sanity check: sample size, i.e. number of distinct FOR IDs in the wide table
# (should equal the number of FOR IDs with baseline data counted after filtering)
agg_wide['for_id'].nunique()

397

In [54]:
# sanity check: data type of one aggregated column (spot check on 'mean_er_control')
agg_wide['mean_er_control'].dtype

dtype('float64')

In [55]:
# rename variables: '<stat>_<item>' -> '<short item name>_<im|isd>'

# short stem used in the final column names for each emotion regulation item
er_short_names = {
    'er_acceptance': 'accept',
    'er_control': 'contr',
    'er_distraction': 'distr',
    'er_intensity': 'intens',
    'er_reappraisal': 'reappr',
    'er_relaxation': 'relax',
    'er_rumination': 'rumin',
    'er_suppression': 'suppr',
}

# suffix per statistic: intraindividual mean (im) / standard deviation (isd)
stat_suffixes = {'mean': 'im', 'std': 'isd'}

# build the full old-name -> new-name mapping from the two small tables above
rename_er_map = {
    f"{stat}_{item_name}": f"{short_name}_{suffix}"
    for item_name, short_name in er_short_names.items()
    for stat, suffix in stat_suffixes.items()
}

agg_wide = agg_wide.rename(columns=rename_er_map)

agg_wide.head(10)

Unnamed: 0,for_id,accept_im,contr_im,distr_im,intens_im,reappr_im,relax_im,rumin_im,suppr_im,accept_isd,contr_isd,distr_isd,intens_isd,reappr_isd,relax_isd,rumin_isd,suppr_isd
0,FOR11001,5.0,3.206522,4.891304,4.086957,2.978261,3.695652,5.304348,4.836957,1.195229,1.895849,1.296288,1.426994,1.300704,1.364697,1.427663,1.521113
1,FOR11003,1.0,1.947368,1.157895,1.894737,1.0,1.0,1.105263,1.0,0.0,1.31122,0.688247,1.286457,0.0,0.0,0.458831,0.0
2,FOR11005,1.0,1.947368,1.157895,1.894737,1.0,1.0,1.105263,1.0,0.0,1.31122,0.688247,1.286457,0.0,0.0,0.458831,0.0
3,FOR11010,5.125,3.208333,3.791667,4.875,1.791667,2.875,3.875,4.5,1.2619,1.587428,1.744037,0.946963,1.350657,1.776966,1.623536,1.668115
4,FOR11011,3.866667,3.2,3.866667,4.266667,3.333333,1.0,4.066667,3.8,1.641718,1.082326,1.125463,1.222799,1.290994,0.0,1.334523,1.473577
5,FOR11012,4.176471,4.352941,4.705882,5.0,3.823529,3.117647,4.647059,3.882353,1.776066,1.538716,1.829095,1.658312,1.074436,1.900077,2.089892,2.117851
6,FOR11015,4.741935,3.83871,4.354839,5.419355,4.83871,3.903226,5.032258,4.967742,1.23741,1.416493,1.45025,1.088552,1.485413,1.350428,1.425573,1.816294
7,FOR11016,2.194444,3.805556,4.75,4.638889,5.972222,1.861111,5.361111,4.888889,1.848852,1.600347,2.418677,1.376388,1.443925,1.290687,1.457057,1.996823
8,FOR11018,4.032967,3.626374,3.912088,3.824176,4.076923,3.769231,3.824176,4.175824,1.905842,1.953051,1.93016,1.975367,1.962033,1.966819,1.969734,2.030837
9,FOR11019,4.009615,5.298077,3.75,2.644231,2.865385,3.769231,2.298077,2.625,2.383174,2.052135,2.428592,1.879466,2.234147,2.277926,1.800107,2.290493


In [56]:
# sanity check: any NaNs at all?
# Printed explicitly, because a notebook cell only displays the value of its last
# expression and this result would otherwise be discarded.
print("Any NaN in agg_wide:", agg_wide.isna().any().any())

# sanity check: how many NaN per column?
# A standard deviation is NaN when a subject has fewer than two responses for an
# item, because pandas computes the sample standard deviation (ddof=1) by default.
agg_wide.isna().sum()

for_id        0
accept_im     1
contr_im      1
distr_im      1
intens_im     0
reappr_im     1
relax_im      1
rumin_im      1
suppr_im      1
accept_isd    4
contr_isd     4
distr_isd     4
intens_isd    4
reappr_isd    4
relax_isd     4
rumin_isd     4
suppr_isd     4
dtype: int64

### 4. Export data

In [97]:
# make sure the output directory exists before writing the export file
# (`os` is already imported in the import cell at the top of the notebook)
os.makedirs(preprocessed_path, exist_ok=True)

In [98]:
# write the aggregated table (one row per subject) to 'er_ema.csv' in the
# preprocessed data folder; the row index is not written
er_ema_csv_path = os.path.join(preprocessed_path, 'er_ema.csv')
agg_wide.to_csv(er_ema_csv_path, index=False)


In [99]:
# Full path to the CSV file
#csv_file = os.path.join(preprocessed_path, 'er_ema.csv')

#df = pd.read_csv(csv_file)
#df.head(10)

Unnamed: 0,for_id,accept_im,contr_im,distr_im,intens_im,reappr_im,relax_im,rumin_im,suppr_im,accept_isd,contr_isd,distr_isd,intens_isd,reappr_isd,relax_isd,rumin_isd,suppr_isd
0,FOR11001,5.0,3.206522,4.891304,4.086957,2.978261,3.695652,5.304348,4.836957,1.195229,1.895849,1.296288,1.426994,1.300704,1.364697,1.427663,1.521113
1,FOR11003,1.0,1.947368,1.157895,1.894737,1.0,1.0,1.105263,1.0,0.0,1.31122,0.688247,1.286457,0.0,0.0,0.458831,0.0
2,FOR11005,1.0,1.947368,1.157895,1.894737,1.0,1.0,1.105263,1.0,0.0,1.31122,0.688247,1.286457,0.0,0.0,0.458831,0.0
3,FOR11010,5.125,3.208333,3.791667,4.875,1.791667,2.875,3.875,4.5,1.2619,1.587428,1.744037,0.946963,1.350657,1.776966,1.623536,1.668115
4,FOR11011,3.866667,3.2,3.866667,4.266667,3.333333,1.0,4.066667,3.8,1.641718,1.082326,1.125463,1.222799,1.290994,0.0,1.334523,1.473577
5,FOR11012,4.176471,4.352941,4.705882,5.0,3.823529,3.117647,4.647059,3.882353,1.776066,1.538716,1.829095,1.658312,1.074436,1.900077,2.089892,2.117851
6,FOR11015,4.741935,3.83871,4.354839,5.419355,4.83871,3.903226,5.032258,4.967742,1.23741,1.416493,1.45025,1.088552,1.485413,1.350428,1.425573,1.816294
7,FOR11016,2.194444,3.805556,4.75,4.638889,5.972222,1.861111,5.361111,4.888889,1.848852,1.600347,2.418677,1.376388,1.443925,1.290687,1.457057,1.996823
8,FOR11018,4.032967,3.626374,3.912088,3.824176,4.076923,3.769231,3.824176,4.175824,1.905842,1.953051,1.93016,1.975367,1.962033,1.966819,1.969734,2.030837
9,FOR11019,4.009615,5.298077,3.75,2.644231,2.865385,3.769231,2.298077,2.625,2.383174,2.052135,2.428592,1.879466,2.234147,2.277926,1.800107,2.290493


In [22]:
# create a mapping dictionary for affect
#affect_map = {
#    'panas_attentiveness': 'attentive',
#    'panas_joviality1': 'cheerful',
#    'panas_joviality2': 'happy',
#    'panas_selfassurance': 'self-confident',
#   'panas_serenity1': 'relaxed',
#    'panas_serenity2': 'calm',
#    'panas_fatigue': 'fatigue',
#    'panas_fear1': 'anxious',
#    'panas_fear2': 'nervous',
#    'panas_guilt1': 'ashamed',
#    'panas_guilt2': 'dissatisfied_myself',
#    'panas_hostility1': 'irritable',
#    'panas_hostility2': 'angry',
#    'panas_loneliness': 'lonely',
#    'panas_sadness1': 'downcast',
#    'panas_sadness2': 'sad',
#    'panas_shyness': 'shy'    
#}

# rename affect items
#df_ema.loc[:, 'item']  = df_ema['item'].replace(affect_map)