# Bare minimum 

## imports

In [1]:
import pandas as pd
from tqdm import tqdm
import random
from matplotlib import pyplot as plt 
import numpy as np
from datetime import timedelta
import squarify
import openpyxl 
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import pandas as pd


# Add this line to remove column width limit

## Data Ingestion

In [2]:
df_ema = pd.read_parquet('../data_ema/csv_processed.parquet')
df_questionnaires = pd.read_excel('../data_questionnaire/2023006_COBRA_20230811.xlsx')
df_id_mapping = pd.read_excel('../data_ema/COBRA ID Mapping for Roman.xlsx')
pd.set_option('display.max_columns', None)

#### Joining the two dataframes

In [3]:
# Look up each EMA row's study ID ('Subject_ID_Roman') through its Ethica ID.
# Left join: EMA rows whose Ethica ID is absent from the mapping sheet are kept
# and receive a missing study ID.
ethica_to_subject = df_id_mapping[['ID - Ethica EMA', 'Subject_ID_Roman']]
df_ema = pd.merge(df_ema, ethica_to_subject, how='left', on='ID - Ethica EMA')

In [None]:
df_questionnaires.head()

In [None]:
questionnaires_unique = df_questionnaires['SubjectID'].unique()
len(questionnaires_unique)

In [None]:
ema_unique = df_ema['Subject_ID_Roman'].unique()
len(ema_unique)

In [None]:
# IDs that occur in only one of the two sources (questionnaires vs. EMA)
questionnaire_ids = set(questionnaires_unique)
ema_ids = set(ema_unique)
unique_to_questionnaires = questionnaire_ids.difference(ema_ids)
unique_to_ema = ema_ids.difference(questionnaire_ids)

# Report both one-sided differences
print("Values unique to df_questionnaires:")
print(unique_to_questionnaires)

print("\nValues unique to df_ema:")
print(unique_to_ema)

In [8]:
# Move the subject ID to the left-most position of the dataframe:
# pop() takes the column out of the frame and hands it back ...
subject_id_roman = df_ema.pop('Subject_ID_Roman')

# ... so that it can be re-inserted as the very first column
df_ema.insert(0, 'Subject_ID_Roman', subject_id_roman)


##### Merge with the questionnaires dataframe

In [9]:
# Second join against the ID mapping. df_ema already carries a
# 'Subject_ID_Roman' column at this point, so pandas resolves the name clash
# with its default suffixes: the existing column becomes 'Subject_ID_Roman_x'
# and the newly joined one 'Subject_ID_Roman_y'. Later cells select
# 'Subject_ID_Roman_x', so this join (and its default suffixes) is load-bearing.
# NOTE(review): the heading announces a merge with the questionnaires dataframe,
# but the right-hand table here is df_id_mapping - confirm the intent.
df_ema = df_ema.merge(df_id_mapping[['ID - Ethica EMA', 'Subject_ID_Roman']], 
                    left_on='ID - Ethica EMA', 
                    right_on='ID - Ethica EMA', 
                    how='left')

#### Create initial index to join together with the independent variables

In [10]:
df_ema['initial_index_targets'] = range(len(df_ema))

## Filter out a single participant (ignore this section if you want to work with all participants)

In [11]:
# unique_values = df_ema['Subject_ID_Roman_x'].unique()

# # Randomly choose one value from the unique values
# chosen_value = random.choice(unique_values)

# # Filter out all rows with the chosen value
# single_participant = df_ema[df_ema['Subject_ID_Roman_x'] == chosen_value]

# # The filtered_df will now contain all rows with a different value in the "Subject_ID_Roman_x" column.
# single_participant.head(20)

# df_ema=single_participant.copy()

#### Add new outcome features to the dataframe

Enable the cell below as well to run everything

In [12]:
# The three timestamp columns arrive as text whose last whitespace-separated
# token is the time zone. For each column: cut that last token off, then parse
# the remainder as a (naive) datetime.
# NOTE(review): the zone token is discarded without any conversion - this
# presumably assumes all rows share one zone; confirm.
for stamp_column in ('Scheduled Time', 'Issued Time', 'Response Time'):
    zone_free_text = df_ema[stamp_column].str.split().str[:-1].str.join(' ')
    df_ema[stamp_column] = pd.to_datetime(zone_free_text, format="%Y-%m-%d %H:%M:%S")

In [None]:
df_ema.head()

#### Correct two time stamp errors:

Those two rows have minor errors:
1) In the row with index 2902, the date in the response time column is 2022-04-29 21:37:05 instead of 2022-04-28 21:37:05. Therefore, we recalculate the response time here to account for this. Recalculated, this should be 2028 seconds.
2) In the row with index 27418, the response delay should actually be 373 seconds; therefore, the time stamp should be 15:55:28.

--> Therefore I replace those values in the response time column

In [None]:
selected_rows = df_ema.loc[[2902, 27418]]
selected_rows.head()


In [15]:
# Manual corrections for the two rows inspected above (index labels 2902 and 27418).
# NOTE(review): the explanatory text above gives 2022-04-28 21:37:05 and a
# recalculated duration of 2028 seconds for row 2902, whereas the values written
# here are 2022-04-20 and 2628 - confirm which pair is correct.
df_ema.at[2902, 'Response Time'] = '2022-04-20 21:37:05'  
df_ema.at[2902, 'Duration (seconds) from scheduled to completion time'] = 2628  # Update Duration (seconds)
# NOTE(review): the text above describes the second fix as a response-time
# stamp, yet the column written here is 'Issued Time' - confirm the intended column.
df_ema.at[27418, 'Issued Time'] = '2022-09-02 15:55:28'  


#### Back to before

In [16]:
df_ema['Issued_minus_scheduled_time'] = (df_ema['Issued Time'] - df_ema['Scheduled Time']).dt.total_seconds()
filtered_df = df_ema[df_ema['Issued_minus_scheduled_time'] > 1.0]


In [None]:
df_ema.head(220).tail(10)

Enable cell below to run everything

In [None]:
non_zero_count = (df_ema['Issued_minus_scheduled_time'] > 1).sum()
non_zero_count

In [None]:
# Tally how many prompts were issued roughly 0, 10, 20, 30 and 40 minutes after
# their scheduled time. Every window reaches from 1.5 s before to 10 s after the
# nominal offset, with both ends included.
issue_lag = df_ema['Issued_minus_scheduled_time']

# Window around 0 s: -1.5 to 10
count_around_0 = int(issue_lag.between(-1.5, 10).sum())

# Window around 600 s: 598.5 to 610
count_around_600 = int(issue_lag.between(598.5, 610).sum())

# Window around 1200 s: 1198.5 to 1210
count_around_1200 = int(issue_lag.between(1198.5, 1210).sum())

# Window around 1800 s: 1798.5 to 1810
count_around_1800 = int(issue_lag.between(1798.5, 1810).sum())

# Window around 2400 s: 2398.5 to 2410
count_around_2400 = int(issue_lag.between(2398.5, 2410).sum())

print("Number of rows with values around 0 mins:", count_around_0)
print("Number of rows with values around 10 mins:", count_around_600)
print("Number of rows with values around 20 mins:", count_around_1200)
print("Number of rows with values around 30 mins:", count_around_1800)
print("Number of rows with values around 40 mins:", count_around_2400)



In [None]:
len(df_ema)

In [None]:
# Bar chart: per calendar day (of the scheduled time), the number of prompts
# that were issued more than 598.5 s after their scheduled time.
df_ema['Scheduled Time'] = pd.to_datetime(df_ema['Scheduled Time'])
issued_late = df_ema['Issued_minus_scheduled_time'] > 598.5
filtered_rows = df_ema[issued_late]
scheduled_day = filtered_rows['Scheduled Time'].dt.date
daily_counts = filtered_rows.groupby(scheduled_day).size()

plt.figure(figsize=(10, 6))
daily_counts.plot(kind='bar', color='skyblue')
plt.title('Frequency of Occurrences per Day')
plt.ylabel('Number of surveys per day')
plt.xlabel('Date')

# Thin out the x-axis: label only every tick_step-th day so the dates stay readable
x_labels = daily_counts.index
tick_step = max(1, len(x_labels) // 5)
plt.xticks(range(0, len(x_labels), tick_step), x_labels[::tick_step], rotation=45, ha='right')

plt.show()


In [None]:
# Bar chart: number of scheduled surveys per calendar day, over all rows
df_ema['Scheduled Time'] = pd.to_datetime(df_ema['Scheduled Time'])
scheduled_day = df_ema['Scheduled Time'].dt.date
daily_counts = df_ema.groupby(scheduled_day).size()

plt.figure(figsize=(10, 6))
daily_counts.plot(kind='bar', color='skyblue')
plt.title('Frequency of Surveys per Day')
plt.ylabel('Number of surveys per day')
plt.xlabel('Date')

# Label only every tick_step-th day so that the date labels do not overlap
x_labels = daily_counts.index
tick_step = max(1, len(x_labels) // 5)
plt.xticks(range(0, len(x_labels), tick_step), x_labels[::tick_step], rotation=45, ha='right')

plt.show()


In [23]:
# Re-apply the datetime conversion to 'Scheduled Time' (the column is converted
# the same way in earlier cells as well)
df_ema['Scheduled Time'] = pd.to_datetime(df_ema['Scheduled Time'])

# Rows whose prompt was issued more than 598.5 seconds after the scheduled time
filtered_rows = df_ema[df_ema['Issued_minus_scheduled_time'] > 598.5]


In [24]:
# Make sure 'Scheduled Time' is a datetime so the rows can be ordered chronologically
df_ema['Scheduled Time'] = pd.to_datetime(df_ema['Scheduled Time'])

# Prompts that were issued more than 598.5 seconds after their scheduled time
filtered_rows = df_ema[df_ema['Issued_minus_scheduled_time'] > 598.5]

# The 42 most recently scheduled of those prompts
recent_rows = filtered_rows.sort_values(by='Scheduled Time', ascending=False).head(42)

# Distinct (participant ID, Ethica ID) pairs among them. The pairs are taken
# row-wise so every participant ID stays next to its own Ethica ID; combining
# two independently de-duplicated columns fails when their lengths differ and
# can mis-pair IDs when they do not.
unique_ids_df = (
    recent_rows[['Subject_ID_Roman_x', 'ID - Ethica EMA']]
    .drop_duplicates()
    .rename(columns={'Subject_ID_Roman_x': 'Participant_ID', 'ID - Ethica EMA': 'Ethica_ID'})
    .reset_index(drop=True)
)

# Individual ID arrays, kept under their established names
unique_participant_ids = unique_ids_df['Participant_ID'].unique()
unique_ethica_ids = unique_ids_df['Ethica_ID'].unique()


In [None]:
unique_ids_df.head(3)

In [26]:
unique_ids_df.to_csv(f'../outputs/for_ethica/recent_ethica_and_participant_IDs.csv', index=False)

#### Only for ethica (Unique IDs that have a difference in issued and scheduled time)

#### Back to normal code

In [27]:
def subtract_duration(row):
    """Return the issued-to-completion duration in seconds for one row.

    The scheduled-to-completion duration and the issued-minus-scheduled lag are
    both read from ``row`` and coerced to numbers; values that cannot be parsed
    count as missing.

    Returns:
        The difference of the two values as a float, or the string ``'NA'``
        when either value is missing.

    NOTE(review): ``'NA'`` is a plain string, so a column built from this
    function is not recognised as missing by ``isna()`` until it is converted
    with ``pd.to_numeric(..., errors='coerce')``.
    """
    scheduled_to_done = pd.to_numeric(
        row['Duration (seconds) from scheduled to completion time'], errors='coerce')
    issue_lag = pd.to_numeric(row['Issued_minus_scheduled_time'], errors='coerce')

    # Guard clause: without both numbers there is nothing to subtract
    if np.isnan(scheduled_to_done) or np.isnan(issue_lag):
        return 'NA'

    return float(scheduled_to_done) - float(issue_lag)

df_ema['Duration (seconds) from issued to completion time'] = df_ema.apply(subtract_duration, axis=1)


#### Just to check whether there are any inconsistencies in the completion time column, specifically completion times of surveys that have not been completed (compliance = 0)

In [None]:
import pandas as pd

# Assuming you already have df_ema DataFrame

# Get the unique participant IDs from the 'Subject_ID_Roman_x' column
unique_participant_ids = df_ema['Subject_ID_Roman_x'].unique()

# A prompt is non-compliant (0) when its scheduled-to-completion duration holds
# the status text "Expired" or "Canceled"; every other row is compliant (1).
# The rule only looks at the row itself, so it is evaluated for the whole frame
# in one step. That avoids writing into per-participant slices and also works
# for rows whose participant ID is missing, where a per-participant pass would
# find no rows to index.
not_completed = df_ema['Duration (seconds) from scheduled to completion time'].isin(["Expired", "Canceled"])
df_ema['compliance_test'] = (~not_completed).astype(int)


In [None]:
import pandas as pd

# Count the rows where compliance_test is 0 and
# 'Duration (seconds) from issued to completion time' is not missing.
# NOTE(review): that column was filled by subtract_duration, which writes the
# string 'NA' for missing values. isna() does not treat that string as missing,
# so the second condition is True for every row until the column is converted
# with pd.to_numeric further down - the count equals the number of
# compliance_test == 0 rows.
count_rows = df_ema[(df_ema['compliance_test'] == 0) & (~df_ema['Duration (seconds) from issued to completion time'].isna())].shape[0]

print("Number of rows satisfying the conditions:", count_rows)


In [30]:
filtered_df = df_ema[(df_ema['compliance_test'] == 0) 
                     & (~df_ema['Duration (seconds) from issued to completion time'].isna()) 
                     & (df_ema['Duration (seconds) from first response to completion time'] != 'Unknown')]


In [None]:
# Time between the prompt being issued and the response, as a timedelta.
# assign() builds a new frame instead of writing a column into filtered_df,
# which is a filtered selection of df_ema (pandas flags that kind of write with
# SettingWithCopyWarning).
filtered_df = filtered_df.assign(
    issue_response_diff=filtered_df['Response Time'] - filtered_df['Issued Time']
)


In [None]:
filtered_df['issue_response_diff'].describe()

In [None]:
filtered_df.head()

In [None]:
plt.hist(filtered_df['issue_response_diff'].dt.total_seconds(), bins = 60)
plt.show()

In [None]:
filtered_df_subset = filtered_df[filtered_df['issue_response_diff'].dt.total_seconds() <= 3590]

# Plot histogram
plt.hist(filtered_df_subset['issue_response_diff'].dt.total_seconds(), bins=20, color='skyblue', edgecolor='black')

# Add labels and title
plt.xlabel('Issue-Response Time Difference (seconds)')
plt.ylabel('Frequency')
plt.title('Histogram of Issue-Response Time Difference (Excluding >3590 seconds)')

# Show plot
plt.show()


In [None]:
len(filtered_df_subset)

In [None]:
filtered_df_subset.head(5)

In [None]:
filtered_df.head(1000).tail(5)

#### Calculate response delay

In [39]:

# First, convert the columns to numeric and handle conversion errors with NaN
df_ema['Duration (seconds) from first response to completion time'] = pd.to_numeric(
    df_ema['Duration (seconds) from first response to completion time'], errors='coerce')
df_ema['Duration (seconds) from issued to completion time'] = pd.to_numeric(
    df_ema['Duration (seconds) from issued to completion time'], errors='coerce')

# Then, calculate the 'response delay' column by subtracting the two columns
df_ema['response delay'] = df_ema['Duration (seconds) from issued to completion time'] - df_ema['Duration (seconds) from first response to completion time']

# Now, the 'response delay' column will contain the time differences, and NaN for non-numeric values


In [40]:
response_delay = df_ema['response delay']
df_ema = df_ema.drop('response delay', axis=1)
position = 8  # Since Python uses 0-based indexing, the 9th position corresponds to index 8
df_ema.insert(position, 'response delay', response_delay)


#### Make a response_delay_column without the effects of the reminders (everything after 10 minutes is NaN)

In [41]:
# Response delay without the reminder effect: delays of at most 598 s are kept,
# everything else (longer delays as well as missing delays) becomes NaN.
# Series.where already blanks every row that fails the condition, so no second
# pass over the rows above 598 s is needed.
df_ema['response_delay_10min'] = df_ema['response delay'].where(df_ema['response delay'] <= 598)

In [None]:
df_ema.head(5)

In [None]:
import pandas as pd

# Tally the non-numeric entries of the scheduled-to-completion column
status_column = df_ema['Duration (seconds) from scheduled to completion time']

# Prompts that expired
count_expired = int((status_column == 'Expired').sum())
print('Expired:', count_expired)

# Prompts that were canceled
count_cancelled = int((status_column == 'Canceled').sum())
print('Cancelled:', count_cancelled)

# Missing entries. A real missing value (NaN/None) never compares equal to the
# text 'NaN', so it has to be detected with isna(); a literal 'NaN' string is
# still counted as well.
count_NaN = int((status_column.isna() | (status_column == 'NaN')).sum())
print('NaN:', count_NaN)




#### Visualisations of response delay

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Histogram of the response delay over the first hour.
# Rows without a response delay are left out.
filtered_df = df_ema.dropna(subset=['response delay'])

# 10-second bins covering 0 to 3600 seconds
bins = list(range(0, 3601, 10))
observed_delays = filtered_df['response delay']
plt.hist(observed_delays, bins=bins, edgecolor='k')

# One x tick every 300 seconds
plt.xticks(range(0, 3601, 300))

# Title and axis labels
plt.title('Response Delay Distribution')
plt.xlabel('Response Delay (seconds)')
plt.ylabel('Frequency')

# Render the figure
plt.show()


### Calculate next-prompt compliance

#### Make compliance feature

In [None]:
import pandas as pd

# Assuming you already have df_ema DataFrame

# Get the unique participant IDs from the 'Subject_ID_Roman_x' column
unique_participant_ids = df_ema['Subject_ID_Roman_x'].unique()

# Compliance flag: 0 when the scheduled-to-completion duration holds the status
# text "Expired" or "Canceled", 1 for every other row.
# The rule depends on the row alone, so it is computed for the whole frame at
# once. This avoids writing into per-participant slices and also covers rows
# whose participant ID is missing, for which a per-participant pass would find
# no rows to index.
not_completed = df_ema['Duration (seconds) from scheduled to completion time'].isin(["Expired", "Canceled"])
df_ema['compliance'] = (~not_completed).astype(int)


In [None]:
df_ema['compliance'].value_counts()

In [None]:
df_ema.head(37360).tail(5)

#### Testing above for single participants (Next-prompt compliance)

In [None]:
# Get the unique values in the "Subject_ID_Roman_x" column
unique_values = df_ema['Subject_ID_Roman_x'].unique()

# Randomly choose one value from the unique values
chosen_value = random.choice(unique_values)

# Filter out all rows with the chosen value
single_participant = df_ema[df_ema['Subject_ID_Roman_x'] == chosen_value]

# The filtered_df will now contain all rows with a different value in the "Subject_ID_Roman_x" column.
single_participant.head(5)



#### Build targets

In [None]:
# Binary target: 1 when the response delay is at most 120 seconds, 0 when it is
# longer. For a missing delay both comparisons are False, so the lambda yields None.
df_ema['2-min receptivity'] = df_ema['response delay'].apply(lambda x: 1 if x <= 120 else (0 if x > 120 else None))
# Class balance, including the rows without a value
df_ema['2-min receptivity'].value_counts(dropna=False)


In [None]:
# Binary target: 1 when the response delay is at most 300 seconds, 0 when it is
# longer. For a missing delay both comparisons are False, so the lambda yields None.
df_ema['5-min receptivity'] = df_ema['response delay'].apply(lambda x: 1 if x <= 300 else (0 if x > 300 else None))
# Class balance, including the rows without a value
df_ema['5-min receptivity'].value_counts(dropna=False)

In [None]:
# Binary target: 1 when the response delay is at most 600 seconds, 0 when it is
# longer. For a missing delay both comparisons are False, so the lambda yields None.
# NOTE(review): the cut-off is 600 s here, while 'response_delay_10min' is built
# with a 598 s cut-off - confirm that the difference is intended.
df_ema['10-min receptivity'] = df_ema['response delay'].apply(lambda x: 1 if x <= 600 else (0 if x > 600 else None))
# Class balance, including the rows without a value
df_ema['10-min receptivity'].value_counts(dropna=False)

In [None]:
# Binary target: 1 when the response delay is at most 1200 seconds, 0 when it is
# longer. For a missing delay both comparisons are False, so the lambda yields None.
df_ema['20-min receptivity'] = df_ema['response delay'].apply(lambda x: 1 if x <= 1200 else (0 if x > 1200 else None))
# Class balance, including the rows without a value
df_ema['20-min receptivity'].value_counts(dropna=False)

In [None]:
def convert_and_ignore(value):
    """Return ``value`` as a float, or ``None`` when that is not possible.

    The literal text ``'NaN'`` is treated as "no value" and is not parsed.
    Anything ``float()`` rejects (other text, ``None``, ...) also yields ``None``.
    """
    if value in ['NaN']:
        return None
    try:
        converted = float(value)
    except (ValueError, TypeError):
        return None
    return converted

# Normalise the response delay to floats; anything unparsable becomes missing
df_ema['response delay'] = df_ema['response delay'].apply(convert_and_ignore)

# Only rows that actually have a response delay take part in the tally
filtered_df = df_ema.dropna(subset=['response delay'])

# Seven consecutive ten-minute windows from 0 to 4200 seconds
intervals = [(lower, lower + 600) for lower in range(0, 4200, 600)]

# Responses per window (lower edge included, upper edge excluded), keyed by a
# printable label
observed_delays = filtered_df['response delay']
counters = {
    f"{start}-{end} seconds": ((observed_delays >= start) & (observed_delays < end)).sum()
    for start, end in intervals
}

# Report the tally window by window
for interval, count in counters.items():
    print(f"{interval}: {count} responses")

In [54]:
negative_values_mask = df_ema['response delay'] < 0
df_negative_values = df_ema[negative_values_mask]

In [None]:
df_negative_values.head()

In [None]:
df_ema['response delay'].describe()

### Completion time

#### Visualisations of completion time

In [57]:
negative_values_mask = df_ema['Duration (seconds) from first response to completion time'] < 0
df_negative_values = df_ema[negative_values_mask]


In [None]:
df_negative_values.head()

There seem to be 2 rows with negative completion time values. Therefore, we will drop these two rows (20731 and 35192) from the dataset.

In [59]:
# Drop every row whose completion time is negative (rows 20731 and 35192 in the
# data described above). Selecting the rows by condition rather than by
# hard-coded index labels keeps the cell working when those labels are absent,
# e.g. after re-exporting the data or when working on a single participant.
# Rows with a missing completion time are kept.
negative_completion = df_ema['Duration (seconds) from first response to completion time'] < 0
df_ema = df_ema.drop(index=df_ema.index[negative_completion])

In [None]:
df_ema['Duration (seconds) from first response to completion time'].describe()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Histogram of the completion time (first response to completion) up to 900 seconds.
# Rows without a completion time are excluded.
filtered_df = df_ema.dropna(subset=['Duration (seconds) from first response to completion time'])

# Histogram of the completion times with fixed bin edges
bins = list(range(0, 901, 10))  # 10-second bins from 0 to 900 seconds
plt.hist(filtered_df['Duration (seconds) from first response to completion time'], bins=bins, edgecolor='k')
# One x tick per minute
plt.xticks(range(0, 901, 60))

# Set labels and title
plt.xlabel('Completion time (seconds)')
plt.ylabel('Frequency')
plt.title('Completion Time Distribution')

# Show the plot
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Close-up histogram of the completion time (first response to completion) for
# the first 30 seconds. Rows without a completion time are excluded.
filtered_df = df_ema.dropna(subset=['Duration (seconds) from first response to completion time'])

# Histogram of the completion times with fixed bin edges
bins = list(range(0, 31, 1))  # 1-second bins from 0 to 30 seconds
plt.hist(filtered_df['Duration (seconds) from first response to completion time'], bins=bins, edgecolor='k')
# One x tick per second
plt.xticks(range(0, 31, 1))

# Set labels and title
plt.xlabel('Completion time (seconds)')
plt.ylabel('Frequency')
plt.title('Completion Time Distribution')

# Show the plot
plt.show()


In [None]:
def convert_and_ignore(value):
    """Return ``value`` as a float, or ``None`` when that is not possible.

    The literal text ``'NaN'`` is treated as "no value" and is not parsed.
    Anything ``float()`` rejects (other text, ``None``, ...) also yields ``None``.
    """
    if value in ['NaN']:
        return None
    try:
        converted = float(value)
    except (ValueError, TypeError):
        return None
    return converted

# Normalise the completion time to floats; anything unparsable becomes missing
df_ema['Duration (seconds) from first response to completion time'] = df_ema['Duration (seconds) from first response to completion time'].apply(convert_and_ignore)

# Only rows that actually have a completion time take part in the tally
filtered_df = df_ema.dropna(subset=['Duration (seconds) from first response to completion time'])

# Twelve consecutive one-minute windows from 0 to 720 seconds. The edges are
# generated so that every window appears exactly once and in ascending order.
intervals = [(lower, lower + 60) for lower in range(0, 720, 60)]

# Responses per window (lower edge included, upper edge excluded), keyed by a
# printable label
completion_times = filtered_df['Duration (seconds) from first response to completion time']
counters = {
    f"{start}-{end} seconds": ((completion_times >= start) & (completion_times < end)).sum()
    for start, end in intervals
}

# Print the counts for each interval
for interval, count in counters.items():
    print(f"{interval}: {count} responses")

In [None]:
df_ema['Duration (seconds) from first response to completion time'].describe()

#### Because responses that are quicker than 2 seconds are technically impossible, we remove any responses that are lower than 2 seconds

In [None]:
# Number of rows whose completion time is 2 seconds or less.
# NOTE(review): this count includes durations of exactly 2 seconds (<= 2),
# whereas the filter applied further down only removes durations below 2
# seconds (< 2) - confirm which boundary is intended.
num_rows_duration_2_or_less = (df_ema['Duration (seconds) from first response to completion time'] <= 2).sum()
print("Number of rows with duration of 2 or less:", num_rows_duration_2_or_less)


In [None]:
len(df_ema)

In [None]:

# Keep every row whose completion time is not below 2 seconds. Rows with a
# missing completion time also stay, because the comparison is False for them.
df_ema = df_ema[~(df_ema['Duration (seconds) from first response to completion time'] < 2)]

# Report the size of the frame that was just filtered. `filtered_df` is an
# older subset built in a previous cell and does not reflect this step.
print("Number of rows in filtered DataFrame:", len(df_ema))


#### Rename some columns

In [68]:
df_ema = df_ema.rename(columns={'Duration (seconds) from first response to completion time': 'completion_time'})


In [None]:
df_ema.head(1)

## Exports

Which rows from this dataframe need to be preserved for later?

In [70]:
df_ema['Duration (seconds) from scheduled to completion time'] = pd.to_numeric(df_ema['Duration (seconds) from scheduled to completion time'], errors='coerce')


In [71]:
df_ema.to_parquet(f'../outputs/targets.parquet', index=False)

In [None]:
df_ema.head()

In [None]:
df_ema['Subject_ID_Roman_x'].nunique()

##### Below is the output of the 2 participants that did not have 54 surveys

In [None]:
import pandas as pd

# Number of rows (surveys) per participant ID in df_ema.
# NOTE(review): the counts are taken after the rows with negative or sub-2-second
# completion times were removed in earlier cells, so a participant who lost a
# row there also ends up below 54 - confirm that this is intended.
participant_counts = df_ema['Subject_ID_Roman_x'].value_counts()

# Get the participant IDs without exactly 54 surveys
participants_without_54_surveys = participant_counts[participant_counts != 54].index

# Create a new DataFrame with the participant IDs and their survey counts
output_df = pd.DataFrame({
    'Subject_ID_Roman_x': participants_without_54_surveys,
    'Survey_Count': participant_counts[participants_without_54_surveys]
})

# Save the table twice: once into the current working directory and once into
# the shared outputs folder
output_df.to_csv('participants_without_54_surveys.csv', index=False)
output_df.to_csv(f'../outputs/participants_without_54_surveys_for_se.csv', index=False)

# Create a new DataFrame with only the rows corresponding to the participant IDs without exactly 54 surveys
df_ema_filtered = df_ema[df_ema['Subject_ID_Roman_x'].isin(participants_without_54_surveys)]

# Sort the filtered DataFrame by participant ID
df_ema_filtered_sorted = df_ema_filtered.sort_values('Subject_ID_Roman_x')

# Display the counts
print(f"Participants with exactly 54 surveys: {(participant_counts == 54).sum()}")
print(f"Participants without exactly 54 surveys: {len(participants_without_54_surveys)}")
print(output_df)  # Display the survey counts for participants without exactly 54 surveys


In [None]:
output_df.head()

In [76]:
# Number of survey rows per participant ID
participant_counts = df_ema['Subject_ID_Roman_x'].value_counts()

# IDs of the participants whose row count is exactly 54
has_all_54 = participant_counts == 54
participants_with_54_surveys = participant_counts.index[has_all_54.to_numpy()].tolist()

# Restrict the EMA frame to the rows of those participants
from_complete_participant = df_ema['Subject_ID_Roman_x'].isin(participants_with_54_surveys)
df_ema = df_ema[from_complete_participant]


In [None]:
df_ema['Subject_ID_Roman_x'].nunique()

In [None]:
# Get all unique participant IDs from output_df, i.e. the participants that do
# not have exactly 54 surveys
unique_participant_ids = output_df['Subject_ID_Roman_x'].unique()

# Print the unique participant IDs
print("Unique Participant IDs in df_filtered_not_54_surveys:")
print(unique_participant_ids)

# Print the count of unique participant IDs
print("Number of Unique Participant IDs:", len(unique_participant_ids))

In [79]:
df_ema_filtered_sorted = df_ema_filtered.sort_values(['Subject_ID_Roman_x', 'Issued Time'])

In [80]:
df_ema_filtered_sorted_reindexed = df_ema_filtered_sorted.reset_index(drop=True)

In [None]:
df_ema_filtered_sorted_reindexed.head(3)

In [None]:
df_ema.head(1)

# Merge the file together with the independent variables file

## Data Ingestion

In [83]:
df_questionnaires = pd.read_excel('../data_questionnaire/2023006_COBRA_20230811.xlsx')
df_id_mapping = pd.read_excel('../data_ema/COBRA ID Mapping for Roman.xlsx')

#df_InterVarValue = pd.read_csv('../data/mcd_mongo_exports/InterventionVariableWithValue.csv')
#df_PartVarValue = pd.read_csv('../data/mcd_mongo_exports/ParticipantVariableWithValue.csv')

#### Joining the two dataframes

In [84]:
# Join the ID mapping onto the filtered EMA frame once more. After the earlier
# suffixing merge df_ema holds 'Subject_ID_Roman_x' and 'Subject_ID_Roman_y' but
# no plain 'Subject_ID_Roman' column, so the joined column arrives under its own
# name without a suffix. Left join: EMA rows without a mapping entry are kept.
df_ema = df_ema.merge(df_id_mapping[['ID - Ethica EMA', 'Subject_ID_Roman']], 
                    left_on='ID - Ethica EMA', 
                    right_on='ID - Ethica EMA', 
                    how='left')



In [None]:
df_ema['Subject_ID_Roman_x'].nunique()

In [None]:
len(df_ema)

In [None]:
df_questionnaires.head(2)

In [None]:
questionnaires_unique = df_questionnaires['SubjectID'].unique()
len(questionnaires_unique)

In [None]:
# Participant IDs currently present in the EMA frame. They are recomputed here
# because df_ema has been filtered since `ema_unique` was first built; comparing
# against the old array would report participants that are no longer in the data.
ema_unique = df_ema['Subject_ID_Roman'].unique()

# Find values unique to each dataframe
unique_to_questionnaires = set(questionnaires_unique) - set(ema_unique)
unique_to_ema = set(ema_unique) - set(questionnaires_unique)

# Print values unique to each dataframe
print("Values unique to df_questionnaires:")
print(unique_to_questionnaires)

print("\nValues unique to df_ema:")
print(unique_to_ema)

In [90]:
# Move the subject ID to the left-most position of the dataframe:
# pop() takes the column out of the frame and hands it back ...
subject_id_roman = df_ema.pop('Subject_ID_Roman')

# ... so that it can be re-inserted as the very first column
df_ema.insert(0, 'Subject_ID_Roman', subject_id_roman)


##### Merge with the questionnaires dataframe

In [91]:
# Join the ID mapping again. df_ema already has a plain 'Subject_ID_Roman'
# column here, so the name clash is resolved with the explicit suffixes: the
# existing column becomes 'Subject_ID_Roman_left' and the newly joined one
# 'Subject_ID_Roman_right'. Later cells group by 'Subject_ID_Roman_left'.
# NOTE(review): the heading announces a merge with the questionnaires dataframe,
# but the right-hand table is df_id_mapping - confirm the intent.
df_ema = df_ema.merge(
    df_id_mapping[['ID - Ethica EMA', 'Subject_ID_Roman']],
    left_on='ID - Ethica EMA',
    right_on='ID - Ethica EMA',
    how='left',
    suffixes=('_left', '_right')
)

In [None]:
df_ema.head(220).tail(2)

#### Create an index for each row for joining with the dependent variable df later on

In [93]:
df_ema['initial_index_ema'] = range(len(df_ema))

In [None]:
df_ema.head(3)

In [95]:
df_ema['Issued_minus_scheduled_time'] = (df_ema['Issued Time'] - df_ema['Scheduled Time']).dt.total_seconds()
filtered_df = df_ema[df_ema['Issued_minus_scheduled_time'] > 1.0]

In [None]:
df_ema.head(220).tail(2)

In [None]:
non_zero_count = (df_ema['Issued_minus_scheduled_time'] > 1).sum()
non_zero_count

# Feature engineering

#### Building time of the day feature

Labelling:
1: 8-10.30am
2: 10.30am-1pm
3: 1-3.30pm
4: 3.30pm-6pm
5: 6-8.30pm
6: 8.30-12am

In [98]:

# Bucket the scheduled time, expressed in minutes since midnight, into 24
# labelled bins of roughly one hour each (labels 0-23).
# pd.cut closes each bin on the right by default, so a prompt scheduled exactly
# on the hour (e.g. 09:00 = 540 min) receives the label of the preceding bin (8);
# include_lowest=True only adds the value 0 to the first bin.
# NOTE(review): three edges are shifted off the full hour (8.1 h, 13.2 h and
# 19.2 h); the reason is not visible here - confirm they are intentional.
df_ema['time of the day'] = pd.cut(df_ema['Scheduled Time'].dt.hour*60 + df_ema['Scheduled Time'].dt.minute, 
                                   bins=np.array([0,1,2,3,4,5,6,7,8.1,9,10,11,12,13.2,14,15,16,17,18,19.2,20,21,22,23,24])*60,
                                   labels=[0, 1, 2, 3, 4, 5, 6, 7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23],
                                   include_lowest=True)



In [99]:
# Extract hour and minute from the timestamp
df_ema['hour'] = df_ema['Scheduled Time'].dt.hour
df_ema['minute'] = df_ema['Scheduled Time'].dt.minute

# Combine hour and minute to represent time in minutes
df_ema['time_in_minutes'] = df_ema['hour'] * 60 + df_ema['minute']

In [100]:
import pandas as pd

# Seven day-part categories from the scheduled time in minutes since midnight
# (bins are closed on the right):
#   0: 00:00-07:59   1: 08:00-10:29   2: 10:30-12:59   3: 13:00-15:29
#   4: 15:30-17:59   5: 18:00-19:59   6: 20:00-23:59
# NOTE(review): the labelling text above describes category 5 as 6-8.30pm and
# category 6 as 8.30pm-12am, which would put the edge at 1229 instead of 1199 -
# confirm which one is intended.
df_ema['time_of_the_day_categories'] = pd.cut(
    df_ema['Scheduled Time'].dt.hour * 60 + df_ema['Scheduled Time'].dt.minute,
    bins=[-1, 479, 629,779, 929, 1079, 1199, 1439],  # The upper bounds represent the end of each time category
    labels=[0, 1, 2, 3, 4, 5, 6],  # The assigned values for each time category
    include_lowest=True
)


In [101]:
import pandas as pd

# Same day-part binning of the scheduled time (minutes since midnight), but with
# the bins ending at 1079 (17:59): categories 0-4 only. Times from 18:00 onwards
# fall outside every bin and are therefore left without a category (NaN).
df_ema['time_of_the_day_categories_dummy_no_last_p'] = pd.cut(
    df_ema['Scheduled Time'].dt.hour * 60 + df_ema['Scheduled Time'].dt.minute,
    bins=[-1, 479, 629,779, 929, 1079],  # The upper bounds represent the end of each time category
    labels=[0, 1, 2, 3, 4],  # The assigned values for each time category
    include_lowest=True
)


In [102]:
import pandas as pd

# Same binning of the *scheduled* time (minutes since midnight) as the main
# time-of-day categories, but the bins stop at 1199 (19:59): anything scheduled
# later falls outside every bin and therefore becomes NaN.
df_ema['time_of_the_day_categories_dummy'] = pd.cut(
    df_ema['Scheduled Time'].dt.hour * 60 + df_ema['Scheduled Time'].dt.minute,
    bins=[-1, 479, 629,779, 929, 1079, 1199],  # Right-closed edges: each value is the last minute of its category
    labels=[0, 1, 2, 3, 4, 5],  # The assigned values for each time category
    include_lowest=True
)


In [103]:
# Human-readable names for the seven time-of-day categories, in category order 0..6.
time_of_day_names = [
    'time_of_day_nighttime',
    'time_of_day_8_to_930',
    'time_of_day_1030_to_12',
    'time_of_day_13_to_1430',
    'time_of_day_1530_to_17',
    'time_of_day_18_to_1930',
    'time_of_day_2030_to_2130',
]

# One-hot encode the category column and relabel the resulting columns positionally.
dummy_df = pd.get_dummies(df_ema['time_of_the_day_categories'], prefix='time_of_day')
dummy_df = dummy_df.set_axis(time_of_day_names, axis=1)

# Append the indicator columns to the main frame.
df_ema = pd.concat([df_ema, dummy_df], axis=1)

In [None]:
result = df_ema.groupby(['Subject_ID_Roman_left', 'time_of_the_day_categories']).size().reset_index(name='count')

In [None]:
# Participants with at least one prompt in the night-time bin (category 0).
is_nighttime_prompt = df_ema['time_of_the_day_categories'] == 0
users_with_value_0 = df_ema.loc[is_nighttime_prompt, 'Subject_ID_Roman_left'].unique()

# How many distinct participants that is.
num_users_with_value_0 = len(users_with_value_0)

print("Number of users with at least one occurrence of value 0:", num_users_with_value_0)


In [None]:
df_ema['Subject_ID_Roman_left'].nunique()

In [None]:
result = df_ema.groupby(['Subject_ID_Roman_left', 'time_of_the_day_categories']).size().reset_index(name='count')

In [None]:
df_ema['time_of_the_day_categories'].value_counts()

In [None]:
df_ema['time_of_the_day_categories'].value_counts()

In [None]:
df_ema['Subject_ID_Roman_left'].nunique()

In [None]:
# Participant x time-of-day-category table of prompt counts (0 where a participant
# has no prompt in a category).
category_counts = df_ema.groupby(['Subject_ID_Roman_left', 'time_of_the_day_categories']).size().unstack(fill_value=0)

# For each daytime category (1-6), the participants with exactly 9 prompts in it
subjects_category_1 = category_counts[category_counts[1] == 9].index.tolist()
subjects_category_2 = category_counts[category_counts[2] == 9].index.tolist()
subjects_category_3 = category_counts[category_counts[3] == 9].index.tolist()
subjects_category_4 = category_counts[category_counts[4] == 9].index.tolist()
subjects_category_5 = category_counts[category_counts[5] == 9].index.tolist()
subjects_category_6 = category_counts[category_counts[6] == 9].index.tolist()

# True only if every participant has exactly 9 prompts in every daytime category
# (not used further in this cell)
are_counts_equal_9 = all(category_counts[i].eq(9).all() for i in range(1, 7))

# Intersection: participants with exactly 9 prompts in *all* six daytime categories
all_subjects = set(subjects_category_1) & set(subjects_category_2) & set(subjects_category_3) & set(subjects_category_4) & set(subjects_category_5) & set(subjects_category_6)

# List form of the complete participants; later cells filter df_ema with it
combined_subjects = list(all_subjects)

# Everyone else: participants missing the 9-prompt quota in at least one category
excluded_subjects = category_counts.index.difference(combined_subjects)
excluded_counts = category_counts.loc[excluded_subjects]

# Print the survey counts for excluded participants
print("Survey counts for participants who do not have exactly 9 surveys in each category:")
print(excluded_counts)


In [None]:
df_ema['Subject_ID_Roman_left'].nunique()

In [None]:
df_ema['Subject_ID_Roman_left'].nunique()

In [None]:
len(combined_subjects)

In [115]:
# Participants that appear in every per-category "complete" list.
per_category_subjects = [
    subjects_category_1,
    subjects_category_2,
    subjects_category_3,
    subjects_category_4,
    subjects_category_5,
    subjects_category_6,
]
all_subjects = set.intersection(*(set(ids) for ids in per_category_subjects))

# Everyone in df_ema who is missing from at least one of those lists.
not_in_all_categories = set(df_ema['Subject_ID_Roman_left'].unique()) - all_subjects

# Same IDs as a plain list.
not_in_all_categories_list = list(not_in_all_categories)


In [None]:
len(not_in_all_categories_list)

In [117]:
# Persist the excluded participant IDs (one per row) for later reference.
# Note: this rebinds `not_in_all_categories_list` from a list to a DataFrame.
not_in_all_categories_list = pd.DataFrame(not_in_all_categories_list)
not_in_all_categories_list.to_csv(f'../outputs/participant_dropouts_and_timezone_issues.csv', index=False)

In [None]:
df_ema['Subject_ID_Roman_left'].nunique()

There are 12 participants who do not have exactly 9 surveys in each category. We drop them now, so 631 - 12 = 619 participants remain.

In [119]:
# Keep only the rows of participants listed in `combined_subjects`
# (presumably those with exactly 9 prompts in every daytime category -- depends on cell order).
df_ema = df_ema[df_ema['Subject_ID_Roman_left'].isin(combined_subjects)]

In [None]:
import matplotlib.pyplot as plt

# Histogram of scheduled prompt times (minutes since midnight, one bin per minute),
# with the time-of-day category boundaries overlaid.
plt.figure(figsize=(10, 6))
upper_minute = int(df_ema['time_in_minutes'].max()) + 60
bins = range(0, upper_minute, 1)
plt.hist(df_ema['time_in_minutes'], bins=bins, edgecolor='black', alpha=0.7)

# Hourly x-axis ticks labelled in 12-hour clock notation.
hour_ticks = range(0, upper_minute, 60)
hour_labels = ['12a', '1a', '2a', '3a', '4a', '5a', '6a', '7a', '8a', '9a', '10a', '11a', '12p', '1p', '2p', '3p', '4p', '5p', '6p', '7p', '8p', '9p', '10p', '11p', '12a']
# Fix: the ticks/labels were computed but never applied to the axis.
# Slice the labels so both sequences always have the same length.
plt.xticks(list(hour_ticks), hour_labels[:len(hour_ticks)])

# Add vertical lines for each time category boundary (first minute of categories 1..6)
category_boundaries = [480, 630, 780, 930, 1080, 1200]
for i, boundary in enumerate(category_boundaries):
    plt.axvline(boundary, color='red', linestyle='--', linewidth=1)
    # Category number just right of its boundary, near the top of the plot
    plt.text(boundary + 5, plt.ylim()[1] * 0.95, str(i + 1), color='red', fontweight='bold')

# Labeling the plot
plt.title('Frequency Plot of Prompt Timings with Category Boundaries')
plt.xlabel('Time of the day')
plt.ylabel('Number of Prompt')

# Show the plot
plt.show()


In [None]:
# Number of prompts per time-of-day category.
time_of_day_counts = df_ema['time_of_the_day_categories'].value_counts()

# Clock ranges for each category. These follow the bin edges used to build
# 'time_of_the_day_categories' ([-1, 479, 629, 779, 929, 1079, 1199, 1439] minutes):
# category 0 ends at 07:59, category 5 at 19:59, and category 6 starts at 20:00.
category_explanations = {
    0: '12am - 7:59am',
    1: '8am - 10:29am',
    2: '10:30am - 12:59pm',
    3: '1pm - 3:29pm',
    4: '3:30pm - 5:59pm',
    5: '6pm - 7:59pm',
    6: '8pm - 11:59pm'
}

# Sort the counts based on the category
sorted_counts = time_of_day_counts.sort_index()

# Display the counts and their corresponding categories with explanations
print("Time of the Day Categories:")
for category, count in sorted_counts.items():
    explanation = category_explanations[category]
    print(f"Category {category}: {explanation} - Count: {count}")


#### Build sleep quality feature

In [122]:
# Start 'sleep_quality' as a copy of the raw sleep-quality VAS answer.
df_ema['sleep_quality'] = df_ema['[1_VAS]  How would you rate your overall sleep quality?&nb']

# Convert 'Issued Time' to datetime if it's not already
df_ema['Issued Time'] = pd.to_datetime(df_ema['Issued Time'])

# Sort chronologically so the fill below runs in time order
df_ema = df_ema.sort_values(by='Issued Time')

# Iterate over unique participant IDs
for participant_id in df_ema['Subject_ID_Roman_x'].unique():
    # Select rows for the current participant
    participant_df = df_ema[df_ema['Subject_ID_Roman_x'] == participant_id].copy()  # Make a copy to avoid SettingWithCopyWarning
    
    # Within each calendar date of this participant, spread the recorded value to
    # every prompt of that date: forward-fill first, then back-fill the prompts
    # that precede the first recorded value.
    participant_df['sleep_quality'] = participant_df.groupby(participant_df['Issued Time'].dt.date)['sleep_quality'].transform(lambda x: x.ffill().bfill())
    
    # Write the filled values back; this relies on the index labels of df_ema
    df_ema.loc[participant_df.index, 'sleep_quality'] = participant_df['sleep_quality']


In [None]:
df_ema.head()

In [None]:
df_ema['sleep_quality'].value_counts()

In [None]:
df_ema['sleep_quality'].isna().sum()

In [None]:
df_ema['sleep_quality'].notna().sum()

In [None]:
df_ema.head()

#### Build number of study survey feature

In [128]:
df_ema['Issued Time'] = pd.to_datetime(df_ema['Issued Time'])  # Convert to datetime if not already

# Initialise the per-participant running survey number; rows that no loop
# iteration touches keep the value 0.
df_ema['number_of_survey'] = 0

# Iterate through each unique participant ID
for participant_id in df_ema['Subject_ID_Roman_x'].unique():
    # Filter rows for the current participant
    participant_rows = df_ema[df_ema['Subject_ID_Roman_x'] == participant_id]
    
    # Order this participant's prompts chronologically by 'Issued Time'
    sorted_rows = participant_rows.sort_values(by='Issued Time')
    
    # Number the prompts 1..n in that order; the write-back relies on index labels
    df_ema.loc[sorted_rows.index, 'number_of_survey'] = range(1, len(sorted_rows) + 1)


In [None]:
# Distinct survey numbers per participant.
user_category_counts = df_ema.groupby('Subject_ID_Roman_x')['number_of_survey'].nunique()

# Every participant should have all 54 survey numbers.
all_categories_present = bool((user_category_counts == 54).all())

print(
    "Every user has exactly one of all 54 categories."
    if all_categories_present
    else "Not every user has exactly one of all 54 categories."
)


In [None]:
df_ema['number_of_survey'].value_counts()

#### Build day in study/days of monitoring feature

In [None]:
# Count rows with non-normal time stamps (NaT values)
non_normal_count = df_ema['Issued Time'].isna().sum()

print(f"Number of rows with non-normal time stamps: {non_normal_count}")

In [None]:
# Count rows with non-normal time stamps (NaT values)
non_normal_count = df_ema['Scheduled Time'].isna().sum()

print(f"Number of rows with non-normal time stamps: {non_normal_count}")

In [None]:
df_ema['Issued Time'] = pd.to_datetime(df_ema['Issued Time'])

# Number each participant's calendar dates 1, 2, 3, ... in chronological order and
# store that number on every prompt issued on that date. Rows without a valid
# 'Issued Time' are left untouched (NaN).
for _, data in tqdm(df_ema.groupby('Subject_ID_Roman_x')):
    # Calendar dates of this participant's prompts, ignoring missing timestamps
    issued_dates = data['Issued Time'].dropna().dt.date
    if issued_dates.empty:
        continue

    date_label_mapping = {date: i + 1 for i, date in enumerate(sorted(issued_dates.unique()))}

    # Write straight into df_ema instead of first adding a column to the group
    # slice (which triggers chained-assignment warnings and copies needlessly).
    df_ema.loc[issued_dates.index, 'day_of_monitoring'] = issued_dates.map(date_label_mapping)


In [None]:
df_ema['day_of_monitoring'].value_counts()

In [None]:
# Participant x day-of-monitoring table of prompt counts (0 where a participant
# has no prompt on a given day).
category_counts = df_ema.groupby(['Subject_ID_Roman_left', 'day_of_monitoring']).size().unstack(fill_value=0)

# For each monitoring day 1-9, the participants whose prompt count is NOT exactly 6
subjects_category_1 = category_counts[category_counts[1] != 6].index.tolist()
subjects_category_2 = category_counts[category_counts[2] != 6].index.tolist()
subjects_category_3 = category_counts[category_counts[3] != 6].index.tolist()
subjects_category_4 = category_counts[category_counts[4] != 6].index.tolist()
subjects_category_5 = category_counts[category_counts[5] != 6].index.tolist()
subjects_category_6 = category_counts[category_counts[6] != 6].index.tolist()
subjects_category_7 = category_counts[category_counts[7] != 6].index.tolist()
subjects_category_8 = category_counts[category_counts[8] != 6].index.tolist()
subjects_category_9 = category_counts[category_counts[9] != 6].index.tolist()

# Union: participants who deviate from 6 prompts on at least one of the nine days
all_subjects = set(subjects_category_1) | set(subjects_category_2) | set(subjects_category_3) | set(subjects_category_4) | set(subjects_category_5) | set(subjects_category_6) | set(subjects_category_7) | set(subjects_category_8) | set(subjects_category_9)

# List form. Note: in THIS cell `combined_subjects` holds the incomplete participants.
combined_subjects = list(all_subjects)

# Collect the per-day prompt counts of each incomplete participant
survey_counts = {}
for subject in combined_subjects:
    survey_counts[subject] = {}
    for category in range(1, 10):
        if subject in category_counts.index:
            survey_counts[subject][category] = category_counts.loc[subject, category]
        else:
            survey_counts[subject][category] = 0

# Print the survey counts for each participant in each category
for subject, counts in survey_counts.items():
    print(f"Subject ID: {subject}")
    for category, count in counts.items():
        print(f"Category {category}: {count} surveys")
    print()


In [136]:
# Participant x day-of-monitoring table of prompt counts.
category_counts = df_ema.groupby(['Subject_ID_Roman_left', 'day_of_monitoring']).size().unstack(fill_value=0)
# For each monitoring day 1-9, the participants with exactly 6 prompts on that day
subjects_category_1 = category_counts[category_counts[1] == 6].index.tolist()
subjects_category_2 = category_counts[category_counts[2] == 6].index.tolist()
subjects_category_3 = category_counts[category_counts[3] == 6].index.tolist()
subjects_category_4 = category_counts[category_counts[4] == 6].index.tolist()
subjects_category_5 = category_counts[category_counts[5] == 6].index.tolist()
subjects_category_6 = category_counts[category_counts[6] == 6].index.tolist()
subjects_category_7 = category_counts[category_counts[7] == 6].index.tolist()
subjects_category_8 = category_counts[category_counts[8] == 6].index.tolist()
subjects_category_9 = category_counts[category_counts[9] == 6].index.tolist()
# True only if every participant has exactly 6 prompts on each of days 1-9
# (the variable name still says "9" but the check is against 6)
are_counts_equal_9 = all(category_counts[i].eq(6).all() for i in range(1, 10))

In [137]:
# Participants present in every one of the nine per-day lists.
all_subjects = set(subjects_category_1).intersection(
    subjects_category_2,
    subjects_category_3,
    subjects_category_4,
    subjects_category_5,
    subjects_category_6,
    subjects_category_7,
    subjects_category_8,
    subjects_category_9,
)
# Same participants as a list, used for filtering later.
combined_subjects = list(all_subjects)

In [None]:
len(combined_subjects)

In [139]:
# Participants present in every one of the nine per-day lists.
per_day_subjects = [
    subjects_category_1,
    subjects_category_2,
    subjects_category_3,
    subjects_category_4,
    subjects_category_5,
    subjects_category_6,
    subjects_category_7,
    subjects_category_8,
    subjects_category_9,
]
all_subjects = set.intersection(*(set(ids) for ids in per_day_subjects))

# Everyone in df_ema who is missing from at least one of those lists.
not_in_all_categories = set(df_ema['Subject_ID_Roman_left'].unique()) - all_subjects

# Same IDs as a plain list.
not_in_all_categories_list = list(not_in_all_categories)

In [None]:
print(not_in_all_categories)

In [None]:
len(not_in_all_categories_list)

In [142]:
# Persist the excluded participant IDs (one per row). The DataFrame is built only
# for the export: `not_in_all_categories_list` stays a plain list, because the
# following cell passes it to `Series.isin`, which would otherwise compare
# against the DataFrame's column labels instead of the IDs.
pd.DataFrame(not_in_all_categories_list).to_csv('../outputs/participant_not_all_surveys_issued_or_10_days.csv', index=False)

In [143]:
# Rows of the participants that are about to be excluded.
# Fix: use the ID set itself. `not_in_all_categories_list` was rebound to a
# DataFrame in the export cell, and `Series.isin(DataFrame)` iterates the frame's
# column labels, so the old filter never matched a participant ID.
excluded_ids = [subject_id for subject_id in not_in_all_categories if pd.notna(subject_id)]
df_ema_filtered = df_ema[df_ema['Subject_ID_Roman_x'].isin(excluded_ids)]

# Sort the filtered DataFrame by Participant ID and Issued Time
df_ema_filtered_sorted = df_ema_filtered.sort_values(['Subject_ID_Roman_x', 'Issued Time'])

# Reset the index of the sorted DataFrame
df_ema_filtered_sorted_reindexed = df_ema_filtered_sorted.reset_index(drop=True)

# Row-level boolean mask: True for every row of a participant that has at least
# one missing 'Issued Time'. Built from a boolean Series so the result is always
# boolean and can safely be negated.
mask = (
    df_ema_filtered_sorted_reindexed['Issued Time']
    .isna()
    .groupby(df_ema_filtered_sorted_reindexed['Subject_ID_Roman_x'])
    .transform('any')
)

# Split the excluded participants by reason
df_filtered_missing_issued_time = df_ema_filtered_sorted_reindexed[mask]
df_filtered_other_issue = df_ema_filtered_sorted_reindexed[~mask]


Of the 619 users, 37 did not have exactly 6 surveys on each of the 9 monitoring days, so 619 - 37 = 582 users remain.

In [144]:
# Keep only the rows of participants listed in `combined_subjects`.
# NOTE(review): `combined_subjects` is rebound several times above (once to the
# *incomplete* participants); this relies on the cell that builds the
# "exactly 6 prompts on each day" intersection having run last -- confirm.
df_ema = df_ema[df_ema['Subject_ID_Roman_left'].isin(combined_subjects)]


In [None]:
len(df_ema)

In [None]:
df_ema['day_of_monitoring'].value_counts()

In [None]:
df_ema['Subject_ID_Roman_x'].nunique()

#### Build day of the week feature

In [148]:
df_ema['day_of_the_week'] = df_ema['Issued Time'].dt.dayofweek + 1

In [None]:
df_ema['day_of_the_week'].value_counts()

In [None]:
df_ema['day_of_the_week'].isna().sum()

In [None]:
df_ema['day_of_the_week'].notna().sum()

In [None]:
df_ema.head(2)

In [153]:
# Weekday code (1 = Monday ... 7 = Sunday, i.e. dt.dayofweek + 1) -> indicator column name.
weekday_column_names = {
    1: 'day_of_week_monday',
    2: 'day_of_week_tuesday',
    3: 'day_of_week_wednesday',
    4: 'day_of_week_thursday',
    5: 'day_of_week_friday',
    6: 'day_of_week_saturday',
    7: 'day_of_week_sunday',
}

# Declare all seven codes as categories so get_dummies always emits seven columns
# in a fixed order, even if a weekday is absent from the data. Renaming by
# position would otherwise fail, or depend on which days happen to occur.
weekday_codes = df_ema['day_of_the_week'].astype(
    pd.CategoricalDtype(categories=list(weekday_column_names))
)
day_of_week_dummies = pd.get_dummies(weekday_codes)

# Rename each column through the explicit code -> name mapping
day_of_week_dummies.columns = [weekday_column_names[code] for code in day_of_week_dummies.columns]

# Concatenate the dummy variables with the original DataFrame
df_ema = pd.concat([df_ema, day_of_week_dummies], axis=1)


In [154]:
# Start from the 1-7 weekday code (1 = Monday, since it is dt.dayofweek + 1).
df_ema['day_of_the_week_dummy'] = df_ema['day_of_the_week']

# Blank out Monday (code 1) in 'day_of_the_week_dummy'
# (presumably so Monday acts as the reference level -- confirm).
df_ema.loc[df_ema['day_of_the_week_dummy'] == 1, 'day_of_the_week_dummy'] = pd.NA


In [None]:
df_ema['day_of_the_week_dummy'].value_counts()

##### Day of Week Binary (weekday vs. weekend)

In [156]:
# Weekend flag: 0 for Monday-Friday (codes 1-5), 1 for Saturday/Sunday (codes 6-7),
# NaN when the weekday code is missing or outside 1-7.
weekday_code = df_ema['day_of_the_week']
df_ema['day_of_the_week_binary'] = np.select(
    [weekday_code.isin([1, 2, 3, 4, 5]), weekday_code.isin([6, 7])],
    [0, 1],
    default=np.nan,
)


In [None]:
df_ema['day_of_the_week_binary'].value_counts()

In [None]:
df_ema.head(2)

#### Build incentives feature

Labels incentives:

1: Incentive category - 30$ (had at least 1 survey on a date before the 20th of September 2022)

0: Incentive category - 20$  (Person joined the study on or after the 20th of September 2022)

In [159]:
df_ema['incentive_category'] = (df_ema['Scheduled Time'] < '2022-09-20').astype(int)


In [160]:
# Rows whose scheduled time is before the incentive change on 2022-09-20
mask = df_ema['Scheduled Time'] < '2022-09-20'

# Participants with at least one such row
participants_with_condition = df_ema.loc[mask, 'Subject_ID_Roman_x'].unique()

# A participant belongs to the 30$ group (1) as soon as ANY of their prompts was
# scheduled before the cutoff; everyone else is in the 20$ group (0).
# Computed in one vectorised step and assigned as a whole column, instead of a
# per-participant loop followed by a chained `fillna(..., inplace=True)`.
# Missing IDs are dropped from the lookup so rows without a participant ID keep
# their own row-level flag, exactly as before.
early_ids = pd.Series(participants_with_condition).dropna()
df_ema['incentive_category'] = (
    mask | df_ema['Subject_ID_Roman_x'].isin(early_ids)
).astype(int)



In [None]:
df_ema['incentive_category'].value_counts()

#### Build joining date feature

In [162]:

# 'joining day' = weekday name of each participant's earliest scheduled prompt.
df_ema['Scheduled Time'] = pd.to_datetime(df_ema['Scheduled Time'])  # Convert to datetime if not already in datetime format

# Earliest 'Scheduled Time' for each participant
oldest_times = df_ema.groupby('Subject_ID_Roman_x')['Scheduled Time'].min()

# Look up each row's participant-level earliest time in one vectorised step
# (replaces a loop that rescanned the whole frame once per participant).
first_scheduled = pd.to_datetime(df_ema['Subject_ID_Roman_x'].map(oldest_times))

# Rows without a usable participant-level minimum (missing ID, or no valid
# scheduled time at all) fall back to the weekday of their own scheduled time.
df_ema['joining day'] = first_scheduled.dt.day_name().fillna(
    df_ema['Scheduled Time'].dt.day_name()
)

In [None]:
# Assuming 'df_ema' is your DataFrame and 'Subject_ID_Roman_x' contains participant IDs

unique_participant_day_counts = df_ema.groupby(['Subject_ID_Roman_x', 'joining day']).size().reset_index(name='Count')

# Count the unique occurrences of each 'joining day'
unique_day_counts = unique_participant_day_counts['joining day'].value_counts()

unique_day_counts


In [None]:
# Count the frequencies of each day
day_counts = df_ema['joining day'].value_counts()

# Plotting the bar chart
plt.figure(figsize=(8, 6))
day_counts.plot(kind='bar', color='skyblue')
plt.title('Frequency of Joining Days')
plt.xlabel('Day of the Week')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # Rotate x labels for better readability
plt.tight_layout()
plt.show()

#### Building survey length feature

In [165]:
# First and last column (inclusive, in the frame's column order) of the
# answer block that is counted.
start_column = '[2_MAQ_1] eating/drinking (other than water)'
end_column = '[17_MAQ_6] I wanted to please others who wanted me to eat it'

# "survey length" = number of non-missing values per row across that column range.
# Note: the next cell recomputes this column with the 'src' column excluded.
df_ema['survey length'] = df_ema.loc[:, start_column:end_column].count(axis=1)


In [166]:
# Number of non-missing answers per row between start_column and end_column,
# ignoring the 'src' column. Dropping the column once and counting along the
# rows gives the same result as the previous row-by-row `apply`, without
# building a Python-level Series for every row.
df_ema['survey length'] = (
    df_ema.loc[:, start_column:end_column].drop(columns='src').count(axis=1)
)


Survey_length - 4 is done to center the survey length feature

In [167]:
df_ema['survey length'] = df_ema['survey length'] - 4

In [None]:
df_ema['survey length'].describe()

#### Build "what have you been doing since last prompt?" - feature as fidelity-check, aka. First_question_length?

In [169]:
# First and last option column (inclusive, in the frame's column order) of question 2.
# Note: this rebinds start_column / end_column used for 'survey length' above.
start_column = '[2_MAQ_1] eating/drinking (other than water)'
end_column = '[2_MAQ_13] praying/meditation'

# "First_question_length" = number of non-missing values per row across that column range
df_ema['First_question_length'] = df_ema.loc[:, start_column:end_column].count(axis=1)


In [170]:
# Integer flag for the "nothing" option of question 2: True -> 1, False -> 0,
# and anything else (missing answer, unmapped value) -> 0.
# Built as a single expression and assigned as a whole column; the previous
# `df_ema['nothing'].fillna(0, inplace=True)` was an in-place call on a column
# selection, which pandas deprecates and which does nothing under Copy-on-Write.
df_ema['nothing'] = (
    df_ema['[2_MAQ_6] nothing\r']
    .map({True: 1, False: 0})
    .fillna(0)
    .astype(int)
)


In [None]:
df_ema['nothing'].value_counts()

In [172]:
df_try=df_ema.copy()

In [173]:
df_not = df_ema[df_ema['[2_MAQ_6] nothing\r'].notna()].copy()

In [None]:
df_not.head(2)

In [None]:
df_not['First_question_length'].value_counts()

In [None]:
df_ema.head(2)

In [None]:
df_ema['First_question_length'].value_counts()

#### Build Social setting feature

In [None]:
df_ema.head(2)

In [None]:
# The "who are you with" question exists in three variants (25, 60 and 58).
# These counts check whether a prompt ever carries answers from more than one variant.

# Option columns of each question variant
columns_25 = [ '[25_MAQ_8] other', '[25_MAQ_7] pet\r',
              '[25_MAQ_6] no one\r', '[25_MAQ_5] colleagues/fellow students', '[25_MAQ_4] friends\r',
              '[25_MAQ_3] other family members\r', '[25_MAQ_2] your children\r', '[25_MAQ_1] spouse/partner\r']

columns_60 = [ '[60_MAQ_8] other', '[60_MAQ_7] pet\r',
              '[60_MAQ_6] no one\r', '[60_MAQ_5] colleagues\r', '[60_MAQ_4] friends\r',
              '[60_MAQ_3] other family members\r', '[60_MAQ_2] your children\r', '[60_MAQ_1] spouse/partner\r']

columns_58 = [ '[58_MAQ_8] other', '[58_MAQ_7] pet\r',
              '[58_MAQ_6] no one\r', '[58_MAQ_5] colleagues\r', '[58_MAQ_4] friends\r',
              '[58_MAQ_3] other family members\r', '[58_MAQ_2] your children\r', '[58_MAQ_1] spouse/partner\r']

# Rows with exactly one non-missing option column within a variant
count_only_25 = (df_ema[columns_25].notna().sum(axis=1) == 1).sum()
count_only_60 = (df_ema[columns_60].notna().sum(axis=1) == 1).sum()
count_only_58 = (df_ema[columns_58].notna().sum(axis=1) == 1).sum()

# Per-row flag: does the prompt carry any answer for this variant?
answered_25 = df_ema[columns_25].notna().any(axis=1)
answered_60 = df_ema[columns_60].notna().any(axis=1)
answered_58 = df_ema[columns_58].notna().any(axis=1)

# Rows answered in two variants at once.
# Fix: the previous `df[cols_a].notna() & df[cols_b].notna()` aligned the two
# frames on their (different) column labels, so no cell ever overlapped and the
# counts were always 0. Combining row-level Series compares the variants properly.
count_in_25_and_60 = (answered_25 & answered_60).sum()
count_in_25_and_58 = (answered_25 & answered_58).sum()
count_in_60_and_58 = (answered_60 & answered_58).sum()

# Rows answered in all three variants
count_in_all = (answered_25 & answered_60 & answered_58).sum()

# Display the counts
print(f'Count of rows with values in only one column - 25: {count_only_25}')
print(f'Count of rows with values in only one column - 60: {count_only_60}')
print(f'Count of rows with values in only one column - 58: {count_only_58}')

print(f'Count of rows with values in two columns (25 and 60): {count_in_25_and_60}')
print(f'Count of rows with values in two columns (25 and 58): {count_in_25_and_58}')
print(f'Count of rows with values in two columns (60 and 58): {count_in_60_and_58}')

print(f'Count of rows with values in all three columns: {count_in_all}')


In [180]:
# Merge the three variants (25, 60, 58) of each social-setting option into one
# column: take the value from question 25, fall back to 60, then to 58.
# Each entry is (target column, question-25 column, question-60 column, question-58 column).
social_setting_sources = [
    ('Social Setting: no one\r',
     '[25_MAQ_6] no one\r', '[60_MAQ_6] no one\r', '[58_MAQ_6] no one\r'),
    ('Social Setting: colleagues/fellow students',
     '[25_MAQ_5] colleagues/fellow students', '[60_MAQ_5] colleagues\r', '[58_MAQ_5] colleagues\r'),
    ('Social Setting: friends\r',
     '[25_MAQ_4] friends\r', '[60_MAQ_4] friends\r', '[58_MAQ_4] friends\r'),
    ('Social Setting: other family members\r',
     '[25_MAQ_3] other family members\r', '[60_MAQ_3] other family members\r', '[58_MAQ_3] other family members\r'),
    ('Social Setting: your children\r',
     '[25_MAQ_2] your children\r', '[60_MAQ_2] your children\r', '[58_MAQ_2] your children\r'),
    ('Social Setting: spouse/partner\r',
     '[25_MAQ_1] spouse/partner\r', '[60_MAQ_1] spouse/partner\r', '[58_MAQ_1] spouse/partner\r'),
    ('Social Setting: other',
     '[25_MAQ_8] other', '[60_MAQ_8] other', '[58_MAQ_8] other'),
    ('Social Setting: pet\r',
     '[25_MAQ_7] pet\r', '[60_MAQ_7] pet\r', '[58_MAQ_7] pet\r'),
]

for target_column, from_q25, from_q60, from_q58 in social_setting_sources:
    merged = df_ema[from_q25].combine_first(df_ema[from_q60])
    df_ema[target_column] = merged.combine_first(df_ema[from_q58])


In [None]:
# Number of rows with at least one recorded value in the merged social-setting columns
new_columns = ['Social Setting: no one\r', 'Social Setting: colleagues/fellow students', 
               'Social Setting: friends\r', 'Social Setting: other family members\r', 
               'Social Setting: your children\r', 'Social Setting: spouse/partner\r', 
               'Social Setting: other', 'Social Setting: pet\r']

# Fix: test for presence (`notna`) rather than truthiness. A plain `.any(axis=1)`
# only counts rows with a truthy value and skips rows whose recorded answers are
# all False.
rows_with_values = df_ema[new_columns].notna().any(axis=1).sum()

print(f"Number of record with social settings data: {rows_with_values}")


In [None]:

# Describe the newly created columns
new_columns_description = df_ema[['Social Setting: no one\r', 'Social Setting: colleagues/fellow students', 
                                   'Social Setting: friends\r', 'Social Setting: other family members\r', 
                                   'Social Setting: your children\r', 'Social Setting: spouse/partner\r', 'Social Setting: other', 'Social Setting: pet\r']].describe()
new_columns_description.head(10)



As we can see above, there are several rows that have overlapping values

In [None]:
# For the "pet" option: in how many of the three question variants does each row have a value?
pet_columns = ['[25_MAQ_7] pet\r', '[60_MAQ_7] pet\r', '[58_MAQ_7] pet\r']
same_columns_count = df_ema[pet_columns].notna().sum(axis=1)

# Rows with a value in exactly one, exactly two, or all three variants
one_column_count = (same_columns_count == 1).sum()
two_columns_count = (same_columns_count == 2).sum()
all_columns_count = (same_columns_count == 3).sum()

print(f"Rows with values in the same columns: {all_columns_count}")
print(f"Rows with values in only one column: {one_column_count}")
print(f"Rows with values in two columns: {two_columns_count}")


In [None]:
df_ema['[60_MAQ_7] pet\r'].value_counts()

In [None]:
df_ema['[58_MAQ_7] pet\r'].value_counts()

In [None]:
df_ema['[25_MAQ_7] pet\r'].value_counts()

In [None]:
df_ema.head(5)

Finding:
1. Values only ever appear in one of those 3 questions (the question is never asked more than once per prompt, which avoids repetition or contradiction and is good).
2. Only around 3000 rows have any values here, so there is quite a lot of missing data --> Would multiple imputation be an option?


#### Build Location features

In [None]:
# Rows where BOTH versions of the location question carry a value
has_location_29 = df_ema['[29_SAQ]  Where are you right now?'].notna()
has_location_42 = df_ema['[42_SAQ]  Where are you right now?'].notna()
rows_with_values_in_both = has_location_29 & has_location_42

# How many rows that is
count_rows_with_both_values = rows_with_values_in_both.sum()

# Display the count
print(f'Count of rows with values in both columns: {count_rows_with_both_values}')


In [None]:
rows_with_values_in_both = df_ema[['[29_SAQ]  Where are you right now?', '[42_SAQ]  Where are you right now?']].notna().all(axis=1)

# Create a new DataFrame with only the rows meeting the criteria
df_both_values = df_ema[rows_with_values_in_both].copy()

# Display the new DataFrame
print(df_both_values[['[29_SAQ]  Where are you right now?', '[42_SAQ]  Where are you right now?']])


--> There seem to be 23 rows which have values in both columns ("[29_SAQ]  Where are you right now?" and "[42_SAQ]  Where are you right now?"), but in almost all of them the two values are identical. Merging both columns into a single column (`Location_Current_Ordinal` below, taking question 29 first) is therefore safe, and the result can be used as the self-reported location feature.

Legend labelling - Location_Current_Ordinal:
1: in my home
2: at my work
3: somewhere on my commute to work or home
4: indoors elsewhere
5: outdoors around my home
6: outdoors in parks or green spaces
7: outdoors elsewhere
8: other (please describe)

In [190]:
# The two versions of the "Where are you right now?" question
column_29_SAQ = '[29_SAQ]  Where are you right now?'
column_42_SAQ = '[42_SAQ]  Where are you right now?'

# Merge them into 'Location_Current_Ordinal': take question 29 where present,
# otherwise fall back to question 42
df_ema['Location_Current_Ordinal'] = df_ema[column_29_SAQ].combine_first(df_ema[column_42_SAQ])


In [None]:

label_mapping = {
    1: 'in my home',
    2: 'at my work',
    3: 'somewhere on my commute to work or home',
    4: 'indoors elsewhere',
    5: 'outdoors around my home',
    6: 'outdoors in parks or green spaces',
    7: 'outdoors elsewhere',
    8: 'other (please describe)'
}

print(df_ema['Location_Current_Ordinal'].value_counts().rename(label_mapping))

In [192]:

# Location code -> name of the indicator column created for it
category_mapping = {
    1: 'location_current_home',
    2: 'location_current_work',
    3: 'location_current_commute',
    4: 'location_current_indoors_elsewhere',
    5: 'location_current_home_outside',
    6: 'location_current_green_spaces',
    7: 'location_current_outdoors_elsewhere',
    8: 'location_current_other'
}

# One-hot encode the mapped location. `dummy_na=True` adds one extra indicator
# column for missing locations, and that column's label is NaN rather than a string.
# NOTE(review): a NaN column label is awkward to select and export -- consider
# renaming it if the missing-location indicator is needed downstream.
dummy_columns = pd.get_dummies(df_ema['Location_Current_Ordinal'].map(category_mapping), dummy_na=True)

# Concatenate dummy variables to the original DataFrame
df_ema = pd.concat([df_ema, dummy_columns], axis=1)


In [None]:
df_ema['Location_Current_Ordinal'].isna().sum()

In [None]:

# Number of rows with a recorded location. The message previously said
# "social settings", copied over from the social-setting section.
print("Number of records with location data:", df_ema['Location_Current_Ordinal'].count())

#### Exclude all df_ema['time_of_the_day_categories'] = nighttime values as they mess up the auto-correlation analysis

In [195]:
df_ema = df_ema[df_ema['time_of_the_day_categories'] != 0 ] 

#### Export features df

In [196]:
df_ema['Duration (seconds) from scheduled to completion time'] = pd.to_numeric(df_ema['Duration (seconds) from scheduled to completion time'], errors='coerce')

In [197]:
df_ema.to_parquet(f'../outputs/targetsEMA_features.parquet', index=False)

#### Export clean

In [None]:
df_ema.head(2)

In [199]:
# Columns carried into the "clean" export: identifiers, timing/compliance fields,
# the engineered features built above, and the four momentary-state VAS items.
# NOTE(review): the location indicators listed here omit 'location_current_work' and
# 'location_current_outdoors_elsewhere' -- confirm whether that is intentional.
columns_to_keep = ['Subject_ID_Roman_x', 'initial_index_ema', 'Issued Time', 'response delay', 'response_delay_10min', 'compliance', 'completion_time',  'time of the day', 'hour', 'minute', 'time_in_minutes', 'time_of_the_day_categories', 'time_of_the_day_categories_dummy', 'time_of_the_day_categories_dummy_no_last_p' , 'sleep_quality', 'number_of_survey', 'day_of_monitoring', 'day_of_the_week', 'day_of_the_week_dummy', 'incentive_category', 'joining day', 'survey length', 'First_question_length', 'Social Setting: no one\r', 'Social Setting: colleagues/fellow students', 'Social Setting: friends\r', 'Social Setting: other family members\r', 'Social Setting: your children\r', 'Social Setting: spouse/partner\r', 'Social Setting: other', 'Social Setting: pet\r', '[36_VAS] How stressed do you feel right now?', '[37_VAS]  How hungry do you feel right now?', '[38_VAS] How tired do you feel right now?', '[39_VAS] How happy do you feel right now?','Location_Current_Ordinal', 'location_current_commute', 'location_current_green_spaces', 'location_current_home', 'location_current_home_outside', 'location_current_indoors_elsewhere', 'location_current_other', 'time_of_day_nighttime', 'time_of_day_8_to_930', 'time_of_day_1030_to_12', 'time_of_day_13_to_1430','time_of_day_1530_to_17','time_of_day_18_to_1930','time_of_day_2030_to_2130', 'day_of_week_monday', 'day_of_week_tuesday', 'day_of_week_wednesday', 'day_of_week_thursday', 'day_of_week_friday', 'day_of_week_saturday','day_of_week_sunday', 'day_of_the_week_binary', 'nothing']
# Take an explicit copy so later edits do not act on a view of the wider frame
df_ema = df_ema[columns_to_keep].copy()


In [None]:
df_ema.head(2)

In [201]:
# Give the four VAS items short names for the rest of the analysis.
vas_short_names = {
    '[36_VAS] How stressed do you feel right now?': 'stress',
    '[37_VAS]  How hungry do you feel right now?': 'hunger',
    '[38_VAS] How tired do you feel right now?': 'fatigue',
    '[39_VAS] How happy do you feel right now?': 'affect',
}
df_ema = df_ema.rename(columns=vas_short_names)


In [None]:
df_ema.head(2)

In [None]:
# List every column label that occurs more than once in df_ema.
is_repeated_label = df_ema.columns.duplicated()
duplicate_columns = df_ema.columns[is_repeated_label]
print("Duplicate column names:", duplicate_columns)


In [204]:
# Write the reduced ("clean") feature table (row index not stored).
df_ema.to_parquet('../outputs/targetsEMA_features_clean.parquet', index=False)

In [None]:
df_ema.head()

# Questionnaire df

## Data Ingestion

In [206]:
# Display every column and row of a DataFrame in notebook output.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Reload the two questionnaire workbooks and the participant ID mapping.
df_questionnaire_sarah = pd.read_excel('../data_questionnaire/COBRA Baseline qn.xlsx')
df_questionnaires = pd.read_excel('../data_questionnaire/2023006_COBRA_20230811.xlsx')
df_id_mapping = pd.read_excel('../data_ema/COBRA ID Mapping for Roman.xlsx')
# df_targets = pd.read_parquet('../outputs/targets.parquet')


In [207]:
# Set display options to show all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [None]:
df_questionnaires.head()

In [None]:
df_questionnaire_sarah.head()

In [210]:
# Right join: keep every row of the baseline questionnaire (df_questionnaire_sarah)
# and attach the matching rows of df_questionnaires by subject ID.
df_questionnaire_all = df_questionnaires.merge(
    df_questionnaire_sarah,
    how='right',
    left_on='SubjectID',
    right_on='Subject_ID',
)


In [None]:
df_ema.head()

In [212]:
# Inner join: keep only EMA rows whose subject ID also appears in the questionnaire data.
df_ema = df_ema.merge(
    df_questionnaire_all,
    how='inner',
    left_on='Subject_ID_Roman_x',
    right_on='SubjectID',
)


In [None]:
df_ema.head(30000).tail(5)

#### Build personality feature

Legend: 
a1_personality = is reserved
a2 = is generally trusting
a3 = tends to be lazy
a4 = is relaxed, handles stress well
a5 = has few artistic skills
a6 = is outgoing, sociable
a7 = tends to find faults with others
a8 = does a thorough job
a9 = gets nervous easily
a10 = has an active imagination



In [None]:
# Two-item trait scores: the mean of one directly keyed item and one
# reverse-keyed item, where the reversal is computed as 5 - response.
# NOTE(review): on a 1-5 response scale the usual reversal is 6 - response —
# confirm how the personality items are coded.
trait_items = {
    # trait: (directly keyed item, reverse-keyed item)
    'extraversion': ('a6_personality', 'a1_personality'),
    'agreeableness': ('a2_personality', 'a7_personality'),
    'conscientiousness': ('a8_personality', 'a3_personality'),
    'neuroticism': ('a9_personality', 'a4_personality'),
    'openness': ('a10_personality', 'a5_personality'),
}
for trait, (keyed_item, reversed_item) in trait_items.items():
    df_ema[trait] = (df_ema[keyed_item] + (5 - df_ema[reversed_item])) / 2

df_ema.head()


In [None]:
df_ema['a1_personality'].value_counts()

In [None]:
df_ema['extraversion'].value_counts()

In [None]:
df_ema['conscientiousness'].value_counts()

In [None]:
df_ema['agreeableness'].value_counts()

In [None]:
df_ema['neuroticism'].value_counts()

In [None]:
df_ema['openness'].value_counts()

In [None]:

# Describe the newly created columns
new_columns_description = df_ema[['openness', 'neuroticism', 'conscientiousness', 'agreeableness', 'extraversion']].describe()
new_columns_description.head(10)



#### Build screen time feature

In [222]:
# Convert each reported screen time (hours + minutes) into total minutes.
# Items e1-e5 feed the weekday columns and e6-e10 the weekend columns,
# each covering the same five device types in the same order.
device_types = ['tv', 'tv_connected', 'computer', 'smartphone', 'tablet']
item_number = 0
for day_type in ('weekday', 'weekend'):
    for device in device_types:
        item_number += 1
        hours = df_ema[f'e{item_number}_screen_hours']
        minutes = df_ema[f'e{item_number}_screen_mins']
        df_ema[f'screen_time_{day_type}_{device}_mins'] = (hours * 60) + minutes


##### Average daily screen time per device (weighted average of weekdays and weekend days)

In [223]:
# Average daily screen time per device: the weekday value counts for 5 days and
# the weekend value for 2 days, divided by the 7 days of the week.
# Every device uses the same 5:2 weighting.
for device in ['tv', 'tv_connected', 'computer', 'smartphone', 'tablet']:
    weekday_mins = df_ema[f'screen_time_weekday_{device}_mins']
    weekend_mins = df_ema[f'screen_time_weekend_{device}_mins']
    df_ema[f'screen_time_average_{device}_mins'] = (weekday_mins * 5 + weekend_mins * 2) / 7







##### Overall daily screentime (weekend vs. weekdays)

In [224]:
# Total daily screen time over the five device types, computed separately for
# weekend days and weekdays (despite the "_average" suffix these are sums).
device_types = ['tv', 'tv_connected', 'computer', 'smartphone', 'tablet']
for day_type in ('weekend', 'weekday'):
    total_mins = df_ema[f'screen_time_{day_type}_{device_types[0]}_mins']
    for device in device_types[1:]:
        total_mins = total_mins + df_ema[f'screen_time_{day_type}_{device}_mins']
    df_ema[f'screen_time_{day_type}_average'] = total_mins


##### Overall daily screen time (all devices together and average weekend +weekdays)

In [225]:
# Overall daily screen time: 2 weekend days and 5 weekdays averaged over the 7-day week.
weekend_part = df_ema['screen_time_weekend_average'] * 2
weekday_part = df_ema['screen_time_weekday_average'] * 5
df_ema['screen_time_average'] = (weekend_part + weekday_part) / 7

In [None]:
df_ema['screen_time_average_tv_mins'].describe()

In [None]:
df_ema['screen_time_average_tv_connected_mins'].describe()

In [None]:
df_ema['screen_time_average_computer_mins'].describe()

In [None]:
df_ema['screen_time_average_smartphone_mins'].describe()

In [None]:
df_ema['screen_time_average_tablet_mins'].describe()

In [None]:
df_ema['screen_time_weekend_average'].describe()

In [None]:
df_ema['screen_time_weekday_average'].describe()

In [None]:
df_ema['screen_time_average'].describe()

In [234]:
# Replace questionnaire item codes in df_ema with descriptive column names.
questionnaire_column_names = {
    'Int_Age': 'age',
    'a2': 'ethnicity',
    'b1': 'alcohol_consumption',
    'c2a': 'smoking_occasionally_binary',
    'c3': 'smoking_daily_binary',
    'c7': 'smoking_amount',
    'i2': 'education_level',
    'i3': 'familiarity_digital_devices',
    'i4': 'employment_status',
    'i41': 'work_hours_per_week_(only_employed)',
    'i43': 'night_shifts_binary',
    'i5': 'household_income',
    'i6': 'owning_home_binary',
    'i7': 'accommodation_type',
}
df_ema = df_ema.rename(columns=questionnaire_column_names)


In [None]:
df_ema["age"].describe()

Age - 20 is done to center the age distribution

In [236]:
# Shift age by 20 years so the variable is expressed relative to age 20.
df_ema["age"] = df_ema["age"].sub(20)

In [None]:
df_ema["age"].describe()

#### Make employment status variable binary (employed / not employed)

In [None]:
df_ema['employment_status'].value_counts()

In [239]:
# Binary employment indicator: 1 for employment_status codes 1-3, 0 for everything else.
# Codes 4-9 must stay 0; assigning 1 to them as well would make the indicator
# constant for every coded respondent.
# NOTE(review): codes 1-3 are presumed to be the "employed" categories and 4-9 the
# "not employed" ones — confirm against the questionnaire codebook.
df_ema['employment_status_binary'] = df_ema['employment_status'].isin([1, 2, 3]).astype(int)


In [None]:
df_ema.head()

In [None]:
df_ema.head()

#### Make smoking_binary feature combining occasional and daily smoking

In [242]:
# smoking_binary is 1 when either the daily or the occasional smoking item is
# coded 1, and 2 for all other rows.
smokes_daily = df_ema['smoking_daily_binary'] == 1
smokes_occasionally = df_ema['smoking_occasionally_binary'] == 1
default_code = pd.Series(2, index=df_ema.index)
df_ema['smoking_binary'] = default_code.mask(smokes_daily | smokes_occasionally, 1)


### Get additional dummy variables

In [243]:
# Recode the 1/2-coded items to 0/1 (1 -> 0, 2 -> 1); any other value is left as is.
# NOTE(review): after this recode a value of 0 corresponds to the original code 1
# (e.g. smoking_binary == 0 marks smokers) — confirm this direction is intended.
recode_to_dummy = {1: 0, 2: 1}
for binary_col in ['Gender', 'night_shifts_binary', 'owning_home_binary',
                   'smoking_daily_binary', 'smoking_binary']:
    df_ema[binary_col] = df_ema[binary_col].replace(recode_to_dummy)


#### Ethnicity

In [244]:
# One-hot encode 'ethnicity' and name each dummy column after its label.
ethnicity_labels = {
    1: 'Chinese',
    2: 'Malay',
    3: 'Indian',
    4: 'Mixed Chinese/Indian',
    5: 'Mixed Chinese/Malay',
    6: 'Mixed Indian/Malay',
    7: 'Others'
}

df_encoded = pd.get_dummies(df_ema['ethnicity'], prefix='ethnicity')


def _ethnicity_column_name(dummy_name):
    """Map a get_dummies name such as 'ethnicity_3' or 'ethnicity_3.0' to 'ethnicity_Indian'."""
    raw_code = dummy_name.split('_', 1)[1]
    try:
        # Codes stored as floats produce names like 'ethnicity_3.0', which a
        # plain int() cannot parse; go through float() first.
        code = int(float(raw_code))
    except ValueError:
        return 'ethnicity_Unknown'
    return f'ethnicity_{ethnicity_labels.get(code, "Unknown")}'


df_encoded.columns = [_ethnicity_column_name(col) for col in df_encoded.columns]

# Drop dummy columns left over from an earlier run of this cell so that
# re-running it does not create duplicate column names.
stale_dummies = [col for col in dict.fromkeys(df_encoded.columns) if col in df_ema.columns]
df_ema = pd.concat([df_ema.drop(columns=stale_dummies), df_encoded], axis=1)


#### Income: Replace "prefer not to say" options with NA

In [245]:
# Codes 888 and 999 are the "prefer not to say" answers; treat them as missing.
# np.nan (rather than pd.NA) keeps the column in a numeric dtype.
df_ema['household_income'] = df_ema['household_income'].replace([888, 999], np.nan)

## Exports

In [None]:
df_ema.head()

In [247]:
# Keep only the first occurrence of every column label.
# Selecting by a list of the unique labels would return *all* columns that
# carry a repeated label, so a positional boolean mask is used instead.
is_first_occurrence = ~df_ema.columns.duplicated()
unique_columns = df_ema.columns[is_first_occurrence]
df_ema = df_ema.loc[:, is_first_occurrence]

In [248]:

# Names of ethnicity dummy columns.
# NOTE(review): this list is not referenced by the de-duplication code that
# follows it in this cell — confirm whether it is still needed.
duplicate_ethnicity_columns = ['ethnicity_Chinese', 'ethnicity_Malay', 'ethnicity_Indian',
                               'ethnicity_Mixed Chinese/Malay', 'ethnicity_Mixed Indian/Malay', 'ethnicity_Others']

def deduplicate_columns(df, suffix='_dup'):
    """Rename repeated column labels of *df* in place so that every label is unique.

    The first occurrence of a label keeps its name. The second occurrence gets
    ``suffix`` appended, and any further occurrences get ``suffix`` plus a
    running number (``a``, ``a_dup``, ``a_dup2``, ...). Candidate names that
    are already in use are skipped.

    Args:
        df: DataFrame whose ``columns`` are rewritten in place.
        suffix: Text appended to repeated labels.

    Returns:
        None. ``df.columns`` is modified in place.
    """
    taken = set(df.columns)
    occurrences = {}
    new_labels = []
    for label in df.columns:
        seen_before = occurrences.get(label, 0)
        occurrences[label] = seen_before + 1
        if seen_before == 0:
            new_labels.append(label)
            continue
        number = seen_before
        candidate = f'{label}{suffix}' if number == 1 else f'{label}{suffix}{number}'
        # Never reuse a name that already exists or was just handed out.
        while candidate in taken:
            number += 1
            candidate = f'{label}{suffix}{number}'
        taken.add(candidate)
        new_labels.append(candidate)
    df.columns = new_labels

deduplicate_columns(df_ema, suffix='_dup')


In [249]:
# Write the combined EMA + questionnaire feature table (row index not stored).
df_ema.to_parquet('../outputs/DependentEMA_questionnaire_features.parquet', index=False)

#### Export clean

In [250]:
selected_columns = ['Subject_ID_Roman_x', 'Issued Time', 'response delay', 'response_delay_10min', 'compliance', 'completion_time',  'time_of_the_day_categories', 'time_of_the_day_categories_dummy', 'time_of_the_day_categories_dummy_no_last_p', 'sleep_quality', 'stress', 'hunger', 'fatigue', 'affect', 'number_of_survey', 'day_of_monitoring', 'day_of_the_week',  'day_of_the_week_dummy', 'incentive_category', 'survey length', 'First_question_length', 'Location_Current_Ordinal', 'location_current_commute', 'location_current_green_spaces', 'location_current_home', 'location_current_home_outside', 'location_current_indoors_elsewhere', 'location_current_other', 'time_of_day_8_to_930', 'time_of_day_1030_to_12', 'time_of_day_13_to_1430', 'time_of_day_1530_to_17', 'time_of_day_18_to_1930', 'time_of_day_2030_to_2130', 'day_of_week_monday', 'day_of_week_tuesday', 'day_of_week_wednesday', 'day_of_week_thursday', 'day_of_week_friday', 'day_of_week_saturday', 'day_of_week_sunday', 'day_of_the_week_binary',  'age', 'Gender', 'ethnicity', 'education_level', 'familiarity_digital_devices', 'employment_status',  'night_shifts_binary', 'household_income', 'alcohol_consumption', 'extraversion', 'agreeableness', 'conscientiousness',  'neuroticism', 'openness', 'screen_time_average_smartphone_mins', 'screen_time_average', 'smoking_binary', 'employment_status_binary', 'nothing']
# Take an explicit copy: the in-place rename and the feature columns assigned
# afterwards should modify an independent DataFrame, not a slice of the old one
# (which triggers SettingWithCopyWarning).
df_ema = df_ema[selected_columns].copy()

In [251]:
# Normalise column names: first the explicit renames (which also replace the
# spaces in two labels with underscores), then lower-case every label.
explicit_renames = {
    'Subject_ID_Roman_x': 'subject_id_roman_x',
    'Issued Time': 'issued_time',
    'survey length': 'survey_length',
}
df_ema = df_ema.rename(columns=explicit_renames)
df_ema.columns = df_ema.columns.str.lower()

#### Make inter-prompt interval feature (in mins)

In [253]:
# Inter-prompt interval: minutes between a prompt and the same participant's next prompt.
# The shift is done within each participant so that a participant's last prompt is
# not paired with the first prompt of whoever follows in the table (its IPI is NaN).
df_ema['next_issued_time'] = df_ema.groupby('subject_id_roman_x')['issued_time'].shift(-1)
df_ema['ipi'] = (df_ema['next_issued_time'] - df_ema['issued_time']).dt.total_seconds() / 60

#### Make binary feature: responded within 10 minutes (yes = 1 / no = 0)

In [254]:
# 1 when a value is present in 'response_delay_10min', else 0.
# notna().astype(int) never yields missing values, so no fillna is needed
# (a chained fillna(..., inplace=True) on a column selection is also not
# guaranteed to write back to df_ema).
df_ema['response_ten_binary'] = df_ema['response_delay_10min'].notna().astype(int)


In [None]:
df_ema['response_ten_binary'].value_counts()

#### Make next-prompt compliance feature

In [256]:
# Compliance of the same participant's next prompt. Shifting within each
# participant leaves the value missing for a participant's last prompt instead
# of borrowing the first prompt of the next participant in the table.
df_ema['next_prompt_compliance'] = df_ema.groupby('subject_id_roman_x')['compliance'].shift(-1)

In [None]:
df_ema['next_prompt_compliance'].value_counts()

#### Make completion_time per survey item outcome

In [258]:
df_ema['survey_length_n'] = df_ema['survey_length'] + 4

In [None]:
df_ema['survey_length_n'].value_counts()

In [260]:
# Surveys with a usable length and a recorded completion time.
# .copy() makes valid_surveys independent of df_ema, so adding a column to it
# later does not raise SettingWithCopyWarning.
has_valid_timing = (
    (df_ema['survey_length_n'] != 0)
    & df_ema['survey_length_n'].notna()
    & df_ema['completion_time'].notna()
)
valid_surveys = df_ema[has_valid_timing].copy()

In [None]:
len(valid_surveys)

In [None]:
valid_surveys['survey_length_n'].isna().sum()

In [None]:
valid_surveys['ct_per_survey_item'] = valid_surveys['completion_time'] / (valid_surveys['survey_length_n'] - 1)


In [264]:
inf_rows = valid_surveys[valid_surveys['ct_per_survey_item'] == float('inf')]

In [None]:
len(inf_rows)

In [None]:
inf_rows['subject_id_roman_x'].nunique()

In [None]:
inf_rows.head(2)

In [268]:
# Completion time per survey item = completion_time / (survey_length_n - 1).
# The value is only kept where a completion time exists and the divisor is
# positive; every other row is NaN. Guarding the divisor itself (rather than
# survey_length_n != 0) prevents a divisor of 0 from producing inf.
item_divisor = df_ema['survey_length_n'] - 1
valid_surveys_indices = (item_divisor > 0) & df_ema['completion_time'].notna()
df_ema['ct_per_survey_item'] = (df_ema['completion_time'] / item_divisor).where(valid_surveys_indices)


In [None]:
df_ema['ct_per_survey_item'].describe()

In [None]:
len(df_ema)

In [None]:
df_ema['subject_id_roman_x'].nunique()

#### Make careless response feature

In [272]:
# Careless-response flags: 1 when ct_per_survey_item is at or below ('le') or
# strictly below ('lt') the cut-off, else 0. Rows with a missing
# ct_per_survey_item are flagged 0.
careless_cutoffs = [
    # (column name, comparison, cut-off)
    ('careless_response', 'le', 2),
    ('careless_response_one', 'le', 1),
    ('careless_response_three', 'le', 3),
    ('careless_response_five', 'le', 5),
    ('careless_response_sub_two', 'lt', 2),
    ('careless_response_sub_one_point_five', 'lt', 1.5),
    ('careless_response_one_point_five', 'le', 1.5),
]
seconds_per_item = df_ema['ct_per_survey_item']
for flag_name, comparison, cutoff in careless_cutoffs:
    is_fast = getattr(seconds_per_item, comparison)(cutoff)
    df_ema[flag_name] = np.where(is_fast, 1, 0)

In [None]:
df_ema['careless_response'].value_counts()

In [None]:
df_ema['careless_response_one_point_five'].value_counts()

In [None]:
df_ema['careless_response_sub_two'].value_counts()

In [None]:
df_ema['careless_response_one'].value_counts()

#### Make fast-response feature

In [283]:

# Fast-response flags. For each window:
#   1   -> ct_per_survey_item lies inside the window,
#   NaN -> ct_per_survey_item is below the window's lower bound,
#   0   -> everything else (including rows with a missing ct_per_survey_item).
fast_windows = [
    # (column name, lower bound, upper bound, upper bound inclusive?)
    ('fast_response_one_three', 1, 3, True),
    ('fast_response_one_two', 1, 2, True),
    ('fast_response_one_sub_two', 1, 2, False),
    ('fast_response_point_five_two', 0.5, 2, True),
    ('fast_response_point_five_three', 0.5, 3, True),
    ('fast_response_point_five_sub_two', 0.5, 2, False),
    ('fast_response_two_four', 2, 4, True),
    ('fast_response_one_one_point_five', 1, 1.5, True),
]
seconds_per_item = df_ema['ct_per_survey_item']
for flag_name, lower, upper, upper_inclusive in fast_windows:
    if upper_inclusive:
        below_upper = seconds_per_item <= upper
    else:
        below_upper = seconds_per_item < upper
    in_window = (seconds_per_item >= lower) & below_upper
    window_flag = np.where(in_window, 1, 0)
    df_ema[flag_name] = np.where(seconds_per_item < lower, np.nan, window_flag)

In [None]:
df_ema['fast_response_one_one_point_five'].value_counts()

In [None]:

df_ema['fast_response_point_five_sub_two'].value_counts()

In [None]:
df_ema['fast_response_point_five_three'].value_counts()

In [None]:
df_ema['fast_response_point_five_two'].value_counts()

In [None]:
df_ema['fast_response_one_two'].value_counts()

In [None]:
df_ema['fast_response_one_three'].value_counts()

In [None]:
df_ema['fast_response_one_sub_two'].value_counts()

In [None]:
df_ema['fast_response_two_four'].value_counts()

#### Make minimal engagement response

In [299]:
# 1 when 'first_question_length' is exactly 0, else 0 (missing lengths give 0).
is_zero_length = df_ema['first_question_length'] == 0
df_ema['length_zero_binary'] = is_zero_length.astype(int)


In [None]:
df_ema['length_zero_binary'].value_counts()

In [301]:
# Zero-length flag (1/0), set to NaN where ct_per_survey_item is below 1.
zero_length_flag = (df_ema['first_question_length'] == 0).astype(int)
too_fast = df_ema['ct_per_survey_item'] < 1
df_ema['length_zero_no_careless_one'] = np.where(too_fast, np.nan, zero_length_flag)

In [None]:
df_ema['length_zero_no_careless_one'].value_counts()

In [303]:
# Zero-length flag (1/0), set to NaN where ct_per_survey_item is below 2.
zero_length_flag = (df_ema['first_question_length'] == 0).astype(int)
too_fast = df_ema['ct_per_survey_item'] < 2
df_ema['length_zero_no_careless_two'] = np.where(too_fast, np.nan, zero_length_flag)

In [None]:
df_ema['length_zero_no_careless_two'].value_counts()

In [305]:
# Zero-length flag (1/0), set to NaN where ct_per_survey_item is below 1.5.
zero_length_flag = (df_ema['first_question_length'] == 0).astype(int)
too_fast = df_ema['ct_per_survey_item'] < 1.5
df_ema['length_zero_no_careless_one_point_five'] = np.where(too_fast, np.nan, zero_length_flag)

In [None]:
df_ema['length_zero_no_careless_one_point_five'].value_counts()

In [307]:
# Minimal-engagement flags based on 'nothing' == 1, optionally combined with a
# first-question length of 0 (the "_nulllength" variants). In every variant the
# flag is replaced by NaN where ct_per_survey_item is below the named cut-off.
seconds_per_item = df_ema['ct_per_survey_item']
reported_nothing = df_ema['nothing'] == 1
zero_length = df_ema['first_question_length'] == 0
base_conditions = {
    '': reported_nothing,
    '_nulllength': reported_nothing | zero_length,
}
careless_cutoffs = [('one', 1), ('one_point_five', 1.5), ('two', 2)]
for name_suffix, is_minimal in base_conditions.items():
    for cutoff_label, cutoff in careless_cutoffs:
        minimal_flag = np.where(is_minimal, 1, 0)
        column_name = f'nothing_no_careless_{cutoff_label}{name_suffix}'
        df_ema[column_name] = np.where(seconds_per_item < cutoff, np.nan, minimal_flag)
)

In [None]:
df_ema['nothing_no_careless_one'].value_counts()

In [None]:
df_ema['nothing_no_careless_one_point_five'].value_counts()

In [None]:
df_ema['nothing_no_careless_two'].value_counts()

In [None]:
df_ema['nothing_no_careless_one_nulllength'].value_counts()

In [None]:
df_ema['nothing_no_careless_two_nulllength'].value_counts()

In [None]:
df_ema['nothing_no_careless_one_point_five_nulllength'].value_counts()

In [None]:
df_ema.head()

In [None]:
# Histogram of the completion time per survey item (missing values removed).
data_to_plot = df_ema['ct_per_survey_item'].dropna()

# Only values between 0 and 4.6 are binned: 23 bins of width 0.2.
hist_range = (0, 4.6)
plt.hist(data_to_plot, bins=23, range=hist_range, color='blue', edgecolor='black')

# Label the axes with the column that is actually plotted.
plt.xlabel('ct_per_survey_item (seconds)')
plt.ylabel('Frequency')
plt.title('Histogram of ct_per_survey_item')

plt.show()


In [None]:
df_ema['subject_id_roman_x'].nunique()

#### Make completion time per survey item intervals

In [322]:
# Windowed copies of ct_per_survey_item: values below the lower bound or above
# 4.5 are set to missing, values inside the window (bounds included) are kept.
# Series.where() marks the removed values as NaN, so the columns stay float.
upper_bound = 4.5
lower_bounds = {
    'one': 1,
    'one_point_five': 1.5,
    'two': 2,
    'two_point_five': 2.5,
    'three': 3,
    'three_point_five': 3.5,
    'four': 4,
}
seconds_per_item = df_ema['ct_per_survey_item']
for label, lower_bound in lower_bounds.items():
    inside_window = seconds_per_item.between(lower_bound, upper_bound)
    df_ema[f'ct_per_survey_item_{label}_median'] = seconds_per_item.where(inside_window)


In [None]:
df_ema['ct_per_survey_item_one_median'].describe()

In [None]:
df_ema['ct_per_survey_item_one_point_five_median'].describe()

In [None]:
df_ema['ct_per_survey_item_two_median'].describe()

In [None]:
df_ema['ct_per_survey_item_two_point_five_median'].describe()

In [None]:
df_ema['ct_per_survey_item_three_median'].describe()

In [None]:
df_ema['ct_per_survey_item_three_point_five_median'].describe()

In [None]:
df_ema['ct_per_survey_item_four_median'].describe()

#### Make longstring feature

In [336]:
# longstring is 1.0 when fatigue, stress, hunger and affect all hold the same
# value in a row, and 0.0 otherwise. A comparison with a missing rating is
# False, so rows with missing ratings get 0.0.
reference_rating = df_ema['fatigue']
same_values_mask = (
    (reference_rating == df_ema['stress'])
    & (reference_rating == df_ema['hunger'])
    & (reference_rating == df_ema['affect'])
)
df_ema['longstring'] = same_values_mask.astype(float)

In [None]:
df_ema['longstring'].value_counts()

In [None]:
df_ema.head(5)

In [339]:
df_no_longstrings = df_ema[df_ema['longstring'] == 0]

In [None]:
df_no_longstrings['ct_per_survey_item'].describe()

In [None]:
df_no_longstrings['survey_length_n'].describe()

In [None]:
df_no_longstrings['first_question_length'].describe()

In [None]:
df_no_longstrings['completion_time'].describe()

In [None]:
df_no_longstrings.head(1)

In [345]:
df_longstrings = df_ema[df_ema['longstring'] == 1]

In [None]:
df_longstrings.head(225).tail(1)

In [None]:
df_longstrings["stress"].value_counts()

#### Make Longstring_careless_one feature

In [348]:
# Cast both flags to int, then combine them: 1 when a response is a longstring
# or is flagged by careless_response_one (bitwise OR of the two 0/1 columns).
for flag_col in ('longstring', 'careless_response_one'):
    df_ema[flag_col] = df_ema[flag_col].astype(int)

either_flag = df_ema['longstring'] | df_ema['careless_response_one']
df_ema['longstring_careless_one'] = either_flag.astype(int)


In [None]:
df_ema['longstring_careless_one'].value_counts()

#### Longstring values per participant

In [351]:
# Number of longstring responses per participant.
participant_counts = df_ema[df_ema['longstring'] == 1].groupby('subject_id_roman_x').size()

# Attach each participant's count to all of their rows; participants without
# any longstring response get 0. The result is assigned back explicitly because
# a chained fillna(..., inplace=True) is not guaranteed to update df_ema.
df_ema['longstrings_per_participant'] = (
    df_ema['subject_id_roman_x'].map(participant_counts).fillna(0)
)


In [None]:
df_ema['longstrings_per_participant'].describe()

#### Flag longstring responses from participants with at least 2, 3, 4, 5, 6 or 7 longstring instances

In [353]:
# longstring_<label> is 1 for a longstring response whose participant has at
# least <min_count> longstring responses in total, and 0 for every other row.
min_counts = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7}
is_longstring = df_ema['longstring'] == 1
for label, min_count in min_counts.items():
    enough_instances = df_ema['longstrings_per_participant'] >= min_count
    df_ema[f'longstring_{label}'] = (is_longstring & enough_instances).astype(int)
    # Show the 0/1 distribution of the flag that was just created.
    print(df_ema[f'longstring_{label}'].value_counts())

#### Make longstring & limit completion time feature

In [365]:
# 1 for longstring responses whose stress rating (and therefore, for a
# longstring, every rating) is not 3; 0 for all other rows.
is_longstring_not_three = (df_ema['longstring'] == 1) & (df_ema['stress'] != 3)
df_ema['longstring_no_three'] = is_longstring_not_three.astype(int)


In [366]:
# 1 for longstring responses whose ct_per_survey_item is at or below the named
# limit, 0 otherwise (rows with a missing ct_per_survey_item get 0).
time_limits = {
    'six_point_eight': 6.8,
    'four_point_five': 4.5,
    'three': 3,
    'two': 2,
}
for label, max_seconds in time_limits.items():
    within_limit = (df_ema['longstring'] == 1) & (df_ema['ct_per_survey_item'] <= max_seconds)
    df_ema[f'longstring_comp_ti_{label}'] = within_limit.astype(int)


In [None]:
df_ema['longstring_comp_ti_six_point_eight'].value_counts()

In [None]:
df_ema['longstring_comp_ti_four_point_five'].value_counts()

In [None]:
df_ema['longstring_comp_ti_three'].value_counts()

In [None]:
df_ema['longstring_comp_ti_two'].value_counts()

In [None]:
df_ema['longstring_no_three'].value_counts()

In [None]:
df_ema['longstring'].value_counts()

In [None]:
df_ema['ct_per_survey_item'].describe()

In [377]:
df_new = df_ema.copy()

In [378]:
df_new = df_new[df_new['longstring'] == 1]

#### Combine longstring and non-compliance

In [379]:
# 1 when the response is a longstring or the prompt was not complied with, else 0.
is_longstring = df_ema['longstring'].eq(1)
is_non_compliant = df_ema['compliance'].eq(0)
df_ema['long_no_comp'] = (is_longstring | is_non_compliant).astype(int)

In [None]:
df_ema['long_no_comp'].value_counts()

In [None]:
df_ema['longstring'].value_counts()

In [None]:
df_ema['compliance'].value_counts()

#### Make next-prompt careless feature

In [383]:
# "Next prompt" features: the value each source column takes at the same
# participant's following prompt. The shift is performed within each participant,
# so a participant's last prompt gets a missing value instead of the value
# from the first prompt of the next participant in the table.
next_prompt_sources = {
    # new column: source column
    'next_prompt_long_no_comp': 'long_no_comp',
    'next_prompt_careless': 'careless_response',
    'next_prompt_careless_one': 'careless_response_one',
    'next_prompt_careless_one_point_five': 'careless_response_one_point_five',
    'next_prompt_fast_one_two': 'fast_response_one_two',
    'next_prompt_longstring': 'longstring',
    'next_prompt_longstring_two': 'longstring_two',
    'next_prompt_longstring_three': 'longstring_three',
    'next_prompt_longstring_four': 'longstring_four',
    'next_prompt_longstring_five': 'longstring_five',
    'next_prompt_longstring_six': 'longstring_six',
    'next_prompt_longstring_careless_one': 'longstring_careless_one',
    'next_prompt_longstring_comp_ti_six_point_eight': 'longstring_comp_ti_six_point_eight',
    'next_prompt_longstring_comp_ti_four_point_five': 'longstring_comp_ti_four_point_five',
    'next_prompt_longstring_comp_ti_three': 'longstring_comp_ti_three',
    'next_prompt_longstring_comp_ti_two': 'longstring_comp_ti_two',
    'next_prompt_longstring_no_three': 'longstring_no_three',
    'next_prompt_ct_per_survey_item': 'ct_per_survey_item',
    'next_prompt_completion_time': 'completion_time',
}
for target_col, source_col in next_prompt_sources.items():
    df_ema[target_col] = df_ema.groupby('subject_id_roman_x')[source_col].shift(-1)

In [None]:
df_ema['next_prompt_ct_per_survey_item'].isna().sum()

In [None]:
df_ema['ct_per_survey_item'].isna().sum()

In [None]:
df_ema['next_prompt_compliance'].isna().sum()

In [None]:
df_ema['next_prompt_compliance'].value_counts()

In [None]:
df_ema['compliance'].value_counts()

In [None]:
df_ema.head(102).tail(0)

In [408]:
df_t = df_ema[df_ema['completion_time'].isna()]

In [None]:
len(df_ema)

In [None]:
df_t.head()

#### Make next-prompt careless or non-compliant feature

In [411]:
# 1 when the next prompt is flagged careless or was not complied with, else 0
# (rows whose next-prompt values are missing get 0).
next_is_careless = df_ema['next_prompt_careless'] == 1
next_is_missed = df_ema['next_prompt_compliance'] == 0
df_ema['next_prompt_invalid'] = (next_is_careless | next_is_missed).astype(int)

In [None]:
df_ema['next_prompt_invalid'].value_counts()

In [None]:
df_ema.head()

In [414]:
# Work on a copy whose index holds the time-of-day category of each prompt.
# set_index() consumes the column, so a backup is taken first and the column is
# restored from it afterwards; the index itself is left unnamed.
df = df_ema.copy()
df['original_time_of_the_day'] = df['time_of_the_day_categories']
df = df.set_index('time_of_the_day_categories').rename_axis(None)
df['time_of_the_day_categories'] = df['original_time_of_the_day'].copy()

In [None]:
df.head(7)


In [None]:
valid_surveys['ct_per_survey_item'].describe()

In [None]:
df['ct_per_survey_item'].describe()

In [None]:
df.head()

In [None]:
df['ct_per_survey_item'].describe()

#### New Autocorrelation Feature

In [424]:
def process_data(df, compliance_col='compliance', out_col='obscomp_steps'):
    """Add a column counting the missed prompts that precede each completed prompt.

    Rows are read in their existing order. For each row:

    * if the compliance value is 1, the output is the number of consecutive
      non-compliant rows seen since the previous compliant row (or since the
      counter was last reset), and the counter restarts at 0;
    * if the compliance value is 0, the output is NaN and the counter grows by 1.

    The counter is also reset whenever ``time_of_the_day_categories`` does not
    increase from one row to the next, which is treated as the start of a new day.

    Args:
        df: DataFrame with a ``time_of_the_day_categories`` column and the
            column named by ``compliance_col``. Modified in place.
        compliance_col: Name of the 0/1 column to evaluate.
        out_col: Name of the column that receives the result.

    Returns:
        None. The result is written to ``df[out_col]``.

    Raises:
        ValueError: If a compliance value is neither 0 nor 1 (this includes NaN).
    """
    prev_time_of_the_day_category = -1
    missed_in_a_row = 0
    steps = []

    # An explicit loop is used instead of DataFrame.apply with a stateful
    # closure: the running counter must advance exactly once per row and in row
    # order, which apply() does not promise for functions with side effects.
    for category, compliance in zip(df['time_of_the_day_categories'], df[compliance_col]):
        # reset counter if next day
        if category <= prev_time_of_the_day_category:
            missed_in_a_row = 0
        prev_time_of_the_day_category = category

        if compliance == 1:
            steps.append(missed_in_a_row)
            missed_in_a_row = 0
        elif compliance == 0:
            # non-compliant prompts get NaN and extend the current run
            steps.append(np.nan)
            missed_in_a_row += 1
        else:
            raise ValueError(f"Unknown compliance value: {compliance}")

    df[out_col] = steps
process_data(df)


In [425]:
def process_data(df, compliance_col='response_ten_binary', out_col='obscomp_steps_ten'):
    """Add a column holding the number of missed prompts before each completed prompt.

    This variant defaults to the ``response_ten_binary`` indicator and writes to
    ``obscomp_steps_ten``. Rows are processed in their current order. A running
    counter of consecutive rows with indicator 0 is kept; it is written to a row
    with indicator 1 and then reset. The counter is also reset whenever
    ``time_of_the_day_categories`` does not increase from one row to the next,
    which is how the start of a new day is detected.

    The loop is explicit (instead of a stateful function passed to
    ``DataFrame.apply``) so the result never depends on how often or in which
    order pandas invokes a callback.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``time_of_the_day_categories`` and ``compliance_col``.
        It is modified in place: ``out_col`` is added or overwritten.
    compliance_col : str
        Column with 1 for a counted response and 0 otherwise.
    out_col : str
        Name of the column that receives the result. Rows with indicator 1 get
        the count of directly preceding 0-rows; rows with indicator 0 get NaN.

    Raises
    ------
    ValueError
        If ``compliance_col`` contains anything other than 0 or 1 (NaN included).
    """
    slots = df['time_of_the_day_categories'].tolist()
    flags = df[compliance_col].tolist()

    steps = []
    previous_slot = -1
    missed_in_a_row = 0
    for slot, flag in zip(slots, flags):
        # A slot that is not later than the previous one marks the next day.
        if slot <= previous_slot:
            missed_in_a_row = 0
        previous_slot = slot

        if flag == 1:
            steps.append(missed_in_a_row)
            missed_in_a_row = 0
        elif flag == 0:
            # Rows with indicator 0 carry no value themselves; they only feed the counter.
            missed_in_a_row += 1
            steps.append(np.nan)
        else:
            raise ValueError(f"Unknown compliance value: {flag}")

    # Assigning a list is positional, so a non-unique index cannot cause misalignment.
    df[out_col] = steps
process_data(df)


In [None]:
df['obscomp_steps'].value_counts()

In [None]:
len(df)

In [None]:
df['obscomp_steps_ten'].value_counts()

The Autocorrelation calculation of the code below is correct (without iterating over participants). Iterating over participants is not necessary as every last survey per day anyway receives the value NA, so every day over all participants can be regarded as being calculated independently

Debugging obscomp steps below -> Delete again afterwards

In [None]:
df['obscomp_steps'].value_counts()

In [None]:
df['obscomp_steps_ten'].value_counts()

In the autocorrelation logic, the value of obscomp_steps for the first survey of the day always has to be 0, because there cannot be any missed surveys before it on that day. Therefore, the next cell sets obscomp_steps to 0 for every first survey of the day.


In [431]:
df.loc[df['time_of_the_day_categories'] == 1, 'obscomp_steps'] = 0

(Caveat: I think i would need to also calculate the obscomp-steps feature for cases where the count is 6, because I have this also in the logic for the case of response delay and completion time!)

Do not change anything above this cell as this will also affect the response_delay and completion time dataset!!

##### Impute sleep quality median for all sleep quality nas if the participant has at least 3 valid entries

In [None]:
df['sleep_quality'].isna().sum()

In [None]:
len(df)

In [None]:
df.head(6)

In [None]:
df['sleep_quality'].describe()

#### Median Imputation

In [436]:
# Participants qualify for imputation when they have at least three first-of-day
# prompts (time_of_the_day_categories == 1) with a non-missing sleep_quality value.
first_prompt_with_sleep = df[(df['time_of_the_day_categories'] == 1) & df['sleep_quality'].notna()]
first_prompt_counts = first_prompt_with_sleep['subject_id_roman_x'].value_counts()
valid_participants = first_prompt_counts[first_prompt_counts >= 3].index

# Keep every row (all prompts) belonging to a qualifying participant
keep_rows = df['subject_id_roman_x'].isin(valid_participants)
df_filtered = df[keep_rows]



In [437]:
df_filtered_removed_parts_sleep = df[~df['subject_id_roman_x'].isin(valid_participants)]

In [None]:
df_filtered_removed_parts_sleep['subject_id_roman_x'].nunique()

In [None]:
unique_participant_ids = df_filtered_removed_parts_sleep['subject_id_roman_x'].unique()

# Print the unique participant IDs
print("Unique Participant IDs in df_filtered_parts_sleep:")
print(unique_participant_ids)

# Print the count of unique participant IDs
print("Number of Unique Participant IDs:", len(unique_participant_ids))

In [440]:
unique_participant_ids = pd.DataFrame(unique_participant_ids)
unique_participant_ids.to_csv(f'../outputs/participants_sub_3_first_prompt_responses.csv', index=False)

In [None]:
len(df_filtered)

In [None]:
df_filtered['subject_id_roman_x'].nunique()

In [None]:
df_filtered['sleep_quality'].isna().sum()

7 participants have fewer than 3 non-missing sleep quality values (in the first survey of the day) and were thus excluded from the imputation process and from the dataset (582-7 = 575 participants)

In [444]:
df = df_filtered.copy()

###### This participant has duplicate rows, therefore one set of duplicates is removed here

In [None]:
# Get the count of rows for each unique participant ID
id_counts = df['subject_id_roman_x'].value_counts()

# Check if any participant ID has a count different from 54
for participant_id, count in id_counts.items():
    if count != 54:
        print(f"Participant ID {participant_id} has {count} rows instead of 54.")


In [446]:
# df_duplicate = df[df['subject_id_roman_x'] == '2023006-00015A']

In [447]:
# Identify the participant ID with duplicate rows
duplicate_id = '2023006-00015A'

# Create a boolean mask to identify rows to keep:
#   - every row of all other participants, and
#   - for the duplicate participant, only rows at even positions (0, 2, 4, ...) within
#     that participant's rows in the current row order.
# NOTE(review): this removes the duplicates only if each duplicate row directly follows
# its original (interleaved order) - confirm the row order before relying on this.
mask = (df['subject_id_roman_x'] != duplicate_id) | (df.groupby('subject_id_roman_x').cumcount() % 2 == 0)

# Apply the mask to the dataframe to exclude every second row for the duplicate participant
df = df[mask]


In [None]:
df['subject_id_roman_x'].nunique()

In [None]:
len(df)

In [None]:

# Sanity check: count the participants that satisfy the imputation criterion
# (at least three first-of-day prompts with a non-missing sleep_quality value).
# Note that this cell rebinds df_cleaned and df_filtered.
df_cleaned = df.dropna(subset=['sleep_quality'])

# Filter rows where time_of_the_day_categories is 1
df_filtered = df_cleaned[df_cleaned['time_of_the_day_categories'] == 1]

# Group by participant ID and count occurrences
participant_counts = df_filtered.groupby('subject_id_roman_x').size()

# Filter participants with at least three rows
# (the variable name still says "two"; the threshold actually applied is >= 3)
participants_with_two_rows = participant_counts[participant_counts >= 3]

# Display the result
print("Number of unique participants with at least three rows where time_of_the_day_categories is 1 and sleep_quality is not NA:",
      len(participants_with_two_rows))

In [451]:
df['sleep_quality_median'] = df.groupby('subject_id_roman_x')['sleep_quality'].transform('median')

In [None]:
df['sleep_quality_median'].describe()

In [453]:
df.reset_index(drop=True, inplace=True)


In [454]:
df['sleep_quality'] = df['sleep_quality'].fillna(df['sleep_quality_median'])


In [None]:
df.head(100000).tail(2)

In [456]:
df = df.reset_index(drop=True)

In [None]:
df['sleep_quality'].describe()

In [None]:
df['sleep_quality'].isna().sum()

In [459]:
df_test = df[df['sleep_quality'].isna()]

In [None]:
df['subject_id_roman_x'].nunique()

In [None]:
len(df)

#### Frequencies for analysis

In [None]:
df['next_prompt_compliance'].describe()

In [None]:
df['next_prompt_compliance'].value_counts()

In [None]:
df['compliance'].value_counts()

In [None]:
df['longstring'].value_counts()

In [None]:
df['response delay'].describe()

In [None]:
df['ct_per_survey_item'].describe()

In [None]:
df.head()

##### Remove any participants that still have missing values in any of the predictors

In [None]:
columns_to_check = ['sleep_quality', 'stress', 'affect', 'fatigue', 'hunger']

# Missingness is judged on completed surveys only (compliance != 0).
df_filtered = df[df['compliance'] != 0]

# Completed surveys that lack a value in at least one of the predictor columns
nan_values = df_filtered[columns_to_check].isnull().any(axis=1)
filtered_df = df_filtered[nan_values]

# Participants owning at least one such row; they are excluded entirely
subjects_with_missing_values = filtered_df['subject_id_roman_x'].unique()

# Completed surveys of the remaining participants
df_predict = df_filtered[~df_filtered['subject_id_roman_x'].isin(subjects_with_missing_values)]

# Report the excluded participants. The rows are taken from df_filtered on purpose:
# df_predict no longer contains any participant with a missing value, so deriving the
# report from df_predict would always produce an empty list and an empty CSV.
unique_participant_count = filtered_df['subject_id_roman_x'].nunique()
print(f"Number of unique participant IDs with NaN values in at least one column: {unique_participant_count}")

# Get all unique participant IDs from filtered_df
unique_participant_ids = filtered_df['subject_id_roman_x'].unique()

# Print the unique participant IDs
print("Unique Participant IDs in df_missing_predictors:")
print(unique_participant_ids)

# Print the count of unique participant IDs
print("Number of Unique Participant IDs:", len(unique_participant_ids))

# Persist the list of excluded participants
unique_participant_ids = pd.DataFrame(unique_participant_ids)
unique_participant_ids.to_csv('../outputs/participant_missing_predictors.csv', index=False)


In [None]:

# Get the participant IDs with NaN values in at least one column
# NOTE(review): if df_predict comes from the preceding cell, it already excludes every
# participant that has a missing value in columns_to_check, so this array is empty,
# df_no_nan_values equals df_predict and df_with_nan_values has no rows.
participants_with_nan = df_predict[df_predict[columns_to_check].isnull().any(axis=1)]['subject_id_roman_x'].unique()

# Drop rows corresponding to the participant IDs with NaN values
df_no_nan_values = df_predict[~df_predict['subject_id_roman_x'].isin(participants_with_nan)]
df_with_nan_values = df_predict[df_predict['subject_id_roman_x'].isin(participants_with_nan)]

# Print the shape of the original and filtered DataFrames for comparison
# (the "original" shape is that of df, which is not restricted to df_predict's rows)
print("Original DataFrame shape:", df.shape)
print("DataFrame after dropping rows with NaN values shape:", df_no_nan_values.shape)

print("Number of unique participants in df_no_nan_values:", df_no_nan_values['subject_id_roman_x'].nunique())


In [None]:
subjects_with_missing_values

In [None]:
len(df_filtered)

In [None]:
len(df)

In [474]:
df = df[~df['subject_id_roman_x'].isin(subjects_with_missing_values)]

In [None]:
len(df)

In [None]:
participants_with_nan

In [None]:
df_no_nan_values.head()

In [None]:
df['ct_per_survey_item'].describe()

In [None]:
df['subject_id_roman_x'].nunique()

In [480]:
inf_rows = df[df['ct_per_survey_item'] == float('inf')]

In [None]:
inf_rows.head()

In [482]:
df.to_csv(f'../outputs/Full_df_final_descriptives.csv', index=False)

##### Remove all non-compliant rows

In [None]:
len(df)

In [None]:
df['compliance'].value_counts()

In [485]:
df = df[df['compliance'] != 0]

In [None]:
df['compliance'].value_counts()

In [None]:
len(df)

In [None]:
df['subject_id_roman_x'].nunique()

In [None]:
df['obscomp_steps'].value_counts()

In [None]:
df['obscomp_steps'].describe()

In [None]:
df['next_prompt_compliance'].isna().sum()

In [492]:
df_p = df[df['next_prompt_completion_time'].isna()]

In [None]:
df_p.head()

In [None]:
len(df)

In [495]:
df.to_csv(f'../outputs/Full_df_final_descriptives_no_non-compliant.csv', index=False)

#### Make careless_response column

In [None]:
df['careless_response'].value_counts()

### Changes for next-prompt compliance outcome  (For response delay and completion time, skip these sections)

In [497]:
df_ema = df.copy()

Also, the last survey of each day will afterwards be removed, therefore we assign those surveys the value NA in the obscomp steps column

In [498]:
df_ema.loc[df_ema['time_of_the_day_categories'] == 6, 'obscomp_steps'] = np.nan

Since there may still be a few compliance = 0 rows that do not have an obscomp_steps value of NA, the cell below sets obscomp_steps to NA for all compliance = 0 rows:

In [None]:
df_ema.head()

In [500]:
df_ema.loc[df_ema['compliance'] == 0, 'obscomp_steps'] = np.nan

In [None]:
df_ema.head(60).tail(3)

In [None]:
len(df_ema)

In [None]:
df_ema.head(2)

In [504]:
df_nextprompt_na = df_ema[df_ema['next_prompt_compliance'].isna()]

In [None]:
df_ema['next_prompt_compliance'].isna().sum()

In [None]:
df_nextprompt_na.head()

In [None]:
df_ema.head(60).tail(2)

In [None]:
df_ema['obscomp_steps'].value_counts()

In [None]:
df_ema['next_prompt_compliance'].describe()

In [None]:
df_ema['next_prompt_compliance'].value_counts()

In [None]:
df_ema['obscomp_steps'].notna().sum()

In [None]:
df_ema['obscomp_steps'].isna().sum()

In [None]:
df_ema['compliance'].value_counts()

In [None]:
len(df_ema)

In [None]:
df_ema['subject_id_roman_x'].nunique()

#### Export clean

In [None]:
df_ema.head(35000).tail(5)

#### Set next-prompt completion time of last survey of the study to NAN 

In [517]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_ct_per_survey_item'] = float('nan')


In [518]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_completion_time'] = float('nan')


In [519]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_long_no_comp'] = float('nan')


#### Set ipi of last survey of the study to NAN 

In [520]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'ipi'] = float('nan')


#### Set next-prompt longstring and longstring_careless_one of the last survey of the study to NAN 

In [521]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_longstring_careless_one'] = float('nan')

In [522]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_longstring'] = float('nan')

In [523]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_longstring_two'] = float('nan')

In [524]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_longstring_three'] = float('nan')

In [525]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_longstring_four'] = float('nan')

In [526]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_longstring_five'] = float('nan')

In [527]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_longstring_six'] = float('nan')

#### Set next-prompt compliance of last survey of the study to NAN 

In [528]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_compliance'] = float('nan')


#### Set next-prompt careless of last survey of the study to NAN 

In [529]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_careless'] = float('nan')


#### Set next-prompt invalid of last survey of the study to NAN 

In [530]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_invalid'] = float('nan')


#### Set next-prompt careless_one and careless_one_point_five of last survey of the study to NAN 

In [531]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_careless_one'] = float('nan')


In [532]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_careless_one_point_five'] = float('nan')


#### Set next-prompt fast response between one and two (fast_one_two) of last survey of the study to NAN 

In [533]:
df_ema.loc[df_ema['number_of_survey'] == 54, 'next_prompt_fast_one_two'] = float('nan')


In [534]:
df_ema.to_csv(f'../outputs/DependentEMA_questionnaire_features_clean.csv', index=False)

In [None]:
df_ema['compliance'].value_counts()

In [None]:
df_ema['next_prompt_compliance'].describe()

In [None]:
df_ema['next_prompt_compliance'].value_counts()

In [None]:
df_ema['next_prompt_compliance'].isna().sum()

In [539]:
df_parts_characters = df_ema.copy()

In [None]:
# Participant-level descriptives, printed only (nothing is written to disk here).
# One row per participant: drop_duplicates keeps the first row found for each ID.
unique_df = df_parts_characters.drop_duplicates(subset=['subject_id_roman_x'])
filtered_df = unique_df.copy()
# Numeric columns feed the summary statistics; object columns other than the ID are tabulated.
numeric_df = filtered_df.select_dtypes(include='number')
categorical_df = filtered_df.select_dtypes(include='object').drop(columns=['subject_id_roman_x'])
mean_values = numeric_df.mean()
std_dev_values = numeric_df.std()
median_values = numeric_df.median()
# One row per numeric column, one column per statistic
stats_df = pd.DataFrame({
    'Mean': mean_values,
    'Standard Deviation': std_dev_values,
    'Median': median_values
})
num_numeric_vars = len(numeric_df.columns)
# Frequency table for every categorical column, keyed by column name
value_counts_dict = {col: categorical_df[col].value_counts() for col in categorical_df.columns}
print("Mean and Standard Deviation of all numeric columns:")
print(stats_df)
print(f"\nNumber of numeric variables included: {num_numeric_vars}")

print("\nValue counts for all categorical columns (excluding 'subject_id_roman_x'):")
for col, counts in value_counts_dict.items():
    print(f"\nColumn: {col}")
    print(counts)


In [None]:
# Participant-level descriptives (mean/SD of numeric columns plus value counts of all
# object columns), exported as a single CSV.
# One row per participant: drop_duplicates keeps the first row found for each ID.
unique_df = df_parts_characters.drop_duplicates(subset=['subject_id_roman_x'])

filtered_df = unique_df.copy()

numeric_df = filtered_df.select_dtypes(include='number')
categorical_df = filtered_df.select_dtypes(include='object').drop(columns=['subject_id_roman_x'])

mean_values = numeric_df.mean()
std_dev_values = numeric_df.std()

stats_df = pd.DataFrame({
    'Mean': mean_values,
    'Standard Deviation': std_dev_values
})

num_numeric_vars = len(numeric_df.columns)

value_counts_dict = {col: categorical_df[col].value_counts() for col in categorical_df.columns}

# One column per categorical variable
value_counts_df = pd.concat(value_counts_dict, axis=1)

# Statistics and value counts are placed side by side in one table
combined_df = pd.concat([stats_df, value_counts_df], axis=1)

# NOTE: a later cell writes to this same path using an explicit list of categorical
# columns, so this file is replaced when the notebook is run top to bottom.
combined_df.to_csv('../outputs/Part_characteristics.csv', index=True)

print("Mean and Standard Deviation of all numeric columns:")
print(stats_df)
print(f"\nNumber of numeric variables included: {num_numeric_vars}")

print("\nValue counts for all categorical columns (excluding 'subject_id_roman_x'):")
for col, counts in value_counts_dict.items():
    print(f"\nColumn: {col}")
    print(counts)


In [None]:
# Participant-level descriptives with an explicit list of categorical variables,
# exported as a single CSV.
# One row per participant: drop_duplicates keeps the first row found for each ID.
unique_df = df_parts_characters.drop_duplicates(subset=['subject_id_roman_x'])
filtered_df = unique_df.copy()
numeric_df = filtered_df.select_dtypes(include='number')
# Variables to tabulate are chosen by name here rather than by dtype
categorical_columns = [
    'incentive_category',
    'first_question_length',
    'location_current_ordinal',
    'gender',
    'ethnicity',
    'education_level',
    'familiarity_digital_devices',
    'employment_status',
    'night_shifts_binary',
    'alcohol_consumption',
    'smoking_binary',
    'employment_status_binary',
    'household_income'
]

categorical_df = filtered_df[categorical_columns]
mean_values = numeric_df.mean()
std_dev_values = numeric_df.std()
stats_df = pd.DataFrame({
    'Mean': mean_values,
    'Standard Deviation': std_dev_values
})

num_numeric_vars = len(numeric_df.columns)
value_counts_dict = {col: categorical_df[col].value_counts() for col in categorical_df.columns}

# One column per categorical variable
value_counts_df = pd.concat(value_counts_dict, axis=1)

# Statistics and value counts are placed side by side in one table
combined_df = pd.concat([stats_df, value_counts_df], axis=1)

# NOTE: an earlier cell writes a dtype-based version of this table to the same path;
# this call replaces that file.
combined_df.to_csv('../outputs/Part_characteristics.csv', index=True)

print("Mean and Standard Deviation of all numeric columns:")
print(stats_df)
print(f"\nNumber of numeric variables included: {num_numeric_vars}")

print("\nValue counts for specified categorical columns:")
for col, counts in value_counts_dict.items():
    print(f"\nColumn: {col}")
    print(counts)


In [None]:
# Participant-level descriptives, exported as two separate CSV files:
# numeric statistics in one, categorical value counts in the other.
# One row per participant: drop_duplicates keeps the first row found for each ID.
unique_df = df_parts_characters.drop_duplicates(subset=['subject_id_roman_x'])

filtered_df = unique_df.copy()

numeric_df = filtered_df.select_dtypes(include='number')

# Variables to tabulate, chosen by name ('household_income' is not part of this list)
categorical_columns = [
    'incentive_category',
    'first_question_length',
    'location_current_ordinal',
    'gender',
    'ethnicity',
    'education_level',
    'familiarity_digital_devices',
    'employment_status',
    'night_shifts_binary',
    'alcohol_consumption',
    'smoking_binary',
    'employment_status_binary'
]

categorical_df = filtered_df[categorical_columns]

mean_values = numeric_df.mean()
std_dev_values = numeric_df.std()

stats_df = pd.DataFrame({
    'Mean': mean_values,
    'Standard Deviation': std_dev_values
})

num_numeric_vars = len(numeric_df.columns)

value_counts_dict = {col: categorical_df[col].value_counts() for col in categorical_df.columns}

# One column per categorical variable
value_counts_df = pd.concat(value_counts_dict, axis=1)

stats_df.to_csv('../outputs/Part_characteristics_numeric_stats.csv', index=True)

value_counts_df.to_csv('../outputs/Part_characteristics_value_counts.csv', index=True)

print("Mean and Standard Deviation of all numeric columns:")
print(stats_df)
print(f"\nNumber of numeric variables included: {num_numeric_vars}")

print("\nValue counts for specified categorical columns:")
for col, counts in value_counts_dict.items():
    print(f"\nColumn: {col}")
    print(counts)


# Cleaning Model Ready df

In [544]:
df_full = df_ema.copy()


In [545]:
df_full = df_full.reset_index(drop=True)

In [None]:
df_full.head(4)

In [None]:
df_full['ipi'].describe()

In [None]:
len(df_full)

In [None]:
df_full.head(2)

### Make a survey number categorization from 1-54 (excluding the ones that were at the last timepoint)

#### Exclude any surveys that are larger than 54

In [550]:
mask = df_full['number_of_survey'] <= 54

# Apply the mask to filter the DataFrame
df_full = df_full[mask]

In [None]:
df_full.head(930).tail(2)

Because the variable above has two participants with a missing 54th survey, the number_of_survey_wo_54 factor does not include the 54th survey

In [552]:
# Surveys 1-53 keep their number; every other row gets NaN in the new column.
mask = df_full['number_of_survey'] <= 53

# df_full was produced by boolean filtering, so assigning a column to it directly
# raises pandas' SettingWithCopyWarning. assign() returns a new DataFrame instead,
# and where() yields the same values as selecting by mask and re-aligning.
df_full = df_full.assign(number_of_survey_wo_54=df_full['number_of_survey'].where(mask))

In [None]:
df_full['number_of_survey_wo_54'].value_counts()

In [None]:
df_full['number_of_survey'].value_counts()

In [None]:
print(df_full['number_of_survey'].value_counts())

In [None]:
print(df_full['number_of_survey'].value_counts())


In [None]:
counts = df_full['number_of_survey'].value_counts().reset_index()

counts.columns = ['survey_number_wo_last', 'frequency']

plt.figure(figsize=(10, 6))
plt.bar(counts['survey_number_wo_last'], counts['frequency'])
plt.xlabel('Number of Surveys')
plt.ylabel('Frequency')
plt.title('Frequency of Number of Surveys')
plt.show()



In [None]:
df_full['next_prompt_compliance'].value_counts()

#### Select a random sample of 100 and 50 participants respectively

In [559]:

# Draw 100 distinct participants. Sampling must happen on the de-duplicated IDs:
# sampling the raw column draws rows, which can pick the same participant several
# times and therefore yields fewer than 100 participants.
participant_pool = df_full['subject_id_roman_x'].drop_duplicates()
random_participant_ids = participant_pool.sample(n=100, random_state=42).tolist()
# Keep all surveys of the selected participants
df_100 = df_full[df_full['subject_id_roman_x'].isin(random_participant_ids)]


In [560]:
# Draw 50 distinct participants. Sampling must happen on the de-duplicated IDs:
# sampling the raw column draws rows, which can pick the same participant several
# times and therefore yields fewer than 50 participants.
participant_pool = df_full['subject_id_roman_x'].drop_duplicates()
random_participant_ids = participant_pool.sample(n=50, random_state=42).tolist()
# Keep all surveys of the selected participants
df_50 = df_full[df_full['subject_id_roman_x'].isin(random_participant_ids)]


In [561]:
df_100.to_parquet(f'../outputs/Model_ready_df/df_100.parquet', index=False)

In [562]:
df_50.to_parquet(f'../outputs/Model_ready_df/df_50.parquet', index=False)

### Excluding any rows that have a non-completed survey before

In [None]:
df_full.head(60).tail(10)

In [None]:
df_full.head(930).tail(10)

In [None]:
len(df_full)

In [None]:
df_full['next_prompt_compliance'].value_counts()

In [None]:
df_full['next_prompt_compliance'].isna().sum()

In [None]:
df_full['obscomp_steps'].value_counts()

In [None]:
df_full['obscomp_steps'].notna().sum()

In [None]:
df_full['compliance'].value_counts()

In [None]:
df_full['obscomp_steps'].isna().sum()

In [None]:
df_full['compliance'].isna().sum()

full_surveys is actually not necessary since all surveys with an NA in compliance have already been excluded before

#### Remove all last prompt of day surveys

In [573]:
df_no_lapro = df_full[df_full['time_of_the_day_categories'] != 6]

In [None]:
len(df_no_lapro)

In [None]:
df_no_lapro['next_prompt_compliance'].describe()

In [None]:
df_no_lapro['obscomp_steps'].value_counts()

In [None]:
df_no_lapro['obscomp_steps'].notna().sum()

In [None]:
df_no_lapro['obscomp_steps'].isna().sum()

In [None]:
df_no_lapro['compliance'].notna().sum()

In [None]:
df_no_lapro['compliance'].isna().sum()

In [None]:
df_no_lapro['next_prompt_compliance'].notna().sum()

In [None]:
df_no_lapro['next_prompt_compliance'].value_counts()

In [None]:
df_no_lapro['compliance'].value_counts()

In [None]:
len(df_no_lapro)

It turns out that the labelling of na rows in the obscomp_steps column did not work as desired for all! compliance = 0 columns, where obscomp steps should actually be 0 

In [None]:
df_no_lapro.head(60).tail(5)

In [None]:
df_no_lapro['ct_per_survey_item'].describe()

In [587]:
df_full.to_parquet(f'../outputs/Model_ready_df/df_full_surveys_incl_next_prompt_0.parquet', index=False)

In [588]:
df_no_lapro.to_parquet(f'../outputs/Model_ready_df/df_no_lapro.parquet', index=False)

#### Drop any rows where obscomp_steps = NA (essentially any rows where compliance = 0)

In [589]:
df_no_lapro_noobsc = df_no_lapro[df_no_lapro['obscomp_steps'].notna()].copy()


In [None]:
df_no_lapro_noobsc['compliance'].value_counts()

In [None]:
df_no_lapro_noobsc['next_prompt_compliance'].value_counts()

In [None]:
df_no_lapro_noobsc['next_prompt_compliance'].notna().sum()

In [None]:
df_no_lapro_noobsc['compliance'].isna().sum()

In [None]:
df_no_lapro_noobsc['compliance'].notna().sum()

In [None]:
df_no_lapro_noobsc['obscomp_steps'].notna().sum()

In [None]:
df_no_lapro_noobsc['obscomp_steps'].isna().sum()

In [None]:
df_no_lapro_noobsc['obscomp_steps'].value_counts()

In [None]:
df_no_lapro_noobsc['obscomp_steps'].describe()

In [None]:
len(df_no_lapro_noobsc)

In [600]:
df_no_lapro_noobsc.to_parquet(f'../outputs/Model_ready_df/df_full_no_last_prompt_full_surveys.parquet', index=False)

##### Frequencies next-prompt compliance outcome

In [601]:
# Outcome, predictors and identifiers summarised for the next-prompt compliance analysis
columns_of_interest = ['next_prompt_compliance', 'sleep_quality', 'stress', 'affect', 'hunger', 'fatigue', 'obscomp_steps', 
                       'age', 'gender', 'day_of_monitoring', 'time_of_the_day_categories_dummy', 'day_of_the_week_binary', 'ipi', 
                       'subject_id_roman_x']

# Calculating descriptive statistics
# NOTE(review): with default arguments describe() summarises only the numeric columns of a
# mixed-dtype frame, so non-numeric entries of the list above (presumably
# subject_id_roman_x) will not appear in the result - confirm this is intended.
descriptive_stats = df_no_lapro_noobsc[columns_of_interest].describe()

# One row per variable, one column per statistic
descriptive_stats_transposed = descriptive_stats.transpose()

In [None]:
descriptive_stats_transposed

In [603]:
descriptive_stats_transposed.to_csv('../outputs/for_draft/frequency_tables/frequencies_next_prompt_compliance.csv')

In [None]:
df_no_lapro_noobsc.head()

In [None]:
unique_subjects_with_nan_sleep_quality = df_no_lapro_noobsc.loc[df_no_lapro_noobsc['sleep_quality'].isna(), 'subject_id_roman_x'].nunique()

print("Number of unique subject_id_roman_x values where sleep_quality is NaN:", unique_subjects_with_nan_sleep_quality)


In [606]:
# Predictor columns that must be complete for a participant to stay in the dataset
columns_to_check = ['sleep_quality', 'stress', 'affect', 'fatigue', 'hunger']

# Identify subject_id_roman_x values with missing values in specified columns
# (a single row with a NaN in any of the columns is enough to flag the participant)
subjects_with_missing_values = df_no_lapro_noobsc[df_no_lapro_noobsc[columns_to_check].isnull().any(axis=1)]['subject_id_roman_x'].unique()

# Create a new DataFrame that drops ALL rows of the flagged participants,
# not only the rows that contain the missing values
df_no_lapro_noobscpredict = df_no_lapro_noobsc[~df_no_lapro_noobsc['subject_id_roman_x'].isin(subjects_with_missing_values)]




In [None]:
print(subjects_with_missing_values)

In [None]:
# Predictors whose missingness is being audited.
columns_to_check = ['sleep_quality', 'stress', 'affect', 'fatigue', 'hunger']

# Row-level flag: True where at least one of the predictors is missing.
nan_values = df_no_lapro_noobsc[columns_to_check].isnull().any(axis=1)

# Rows with at least one missing predictor.
filtered_df = df_no_lapro_noobsc[nan_values]

# Number of distinct participants contributing such rows.
unique_participant_count = filtered_df['subject_id_roman_x'].nunique()

print(f"Number of unique participant IDs with NaN values in at least one column: {unique_participant_count}")


In [None]:
# Preview the first rows that have a missing value in one of the checked columns.
filtered_df.head()

In [None]:
# Predictors that must be complete for a participant to be kept.
columns_to_check = ['sleep_quality', 'stress', 'affect', 'fatigue', 'hunger']

# IDs of participants with at least one row missing any of those predictors.
participants_with_nan = df_no_lapro_noobsc.loc[
    df_no_lapro_noobsc[columns_to_check].isna().any(axis=1),
    'subject_id_roman_x',
].unique()

# Remove every row belonging to those participants (participant-level exclusion).
df_no_nan_values = df_no_lapro_noobsc.loc[
    ~df_no_lapro_noobsc['subject_id_roman_x'].isin(participants_with_nan)
]

# Report the size before and after the exclusion.
print("Original DataFrame shape:", df_no_lapro_noobsc.shape)
print("DataFrame after dropping rows with NaN values shape:", df_no_nan_values.shape)


All rows from 5 participants are removed because those participants still have missing values in at least one of the 5 main variables (sleep quality, stress, affect, fatigue, hunger).

In [None]:
# Number of participants remaining after the exclusion.
df_no_nan_values['subject_id_roman_x'].nunique()

In [None]:
# Preview the first rows of the filtered frame.
df_no_nan_values.head()

In [613]:
# Rows of df_no_lapro_noobsc that have no sleep_quality value.
df_missing_sleep_quality = df_no_lapro_noobsc.loc[
    df_no_lapro_noobsc['sleep_quality'].isna()
]



In [None]:
# Show one example row with a missing sleep_quality value.
df_missing_sleep_quality.head(1)

In [None]:
# Row count of the participant-filtered frame.
len(df_no_lapro_noobscpredict)

In [None]:
# Number of participants in the participant-filtered frame.
df_no_lapro_noobscpredict['subject_id_roman_x'].nunique()

In [None]:
# Cell-level missingness of the participant-filtered frame, computed once and reused below.
missing_cells = df_no_lapro_noobscpredict.isna()

# Per-column flag: True when the column holds at least one NaN.
nan_values = missing_cells.any()

# Names of the columns that still contain NaN values.
columns_with_nan = nan_values.index[nan_values].tolist()
print("Columns with NaN values:", columns_with_nan)

# How many NaN values each column contains.
nan_count_per_column = missing_cells.sum()
print("Number of NaN values in each column:")
print(nan_count_per_column)


#### Build the dataset for the careless-responding outcomes: remove all surveys with next-prompt compliance = 0, because their careless-responding measures (longstring / completion time) should be NaN when the next survey has not been completed

In [None]:
# Preview the first rows of df_no_lapro_noobsc.
df_no_lapro_noobsc.head()

In [None]:
# Frequency of each next_prompt_compliance value.
df_no_lapro_noobsc['next_prompt_compliance'].value_counts()

In [None]:
# Number of rows without a next_prompt_ct_per_survey_item value.
df_no_lapro_noobsc['next_prompt_ct_per_survey_item'].isna().sum()

In [None]:
# Number of rows without a next_prompt_completion_time value.
df_no_lapro_noobsc['next_prompt_completion_time'].isna().sum()

In [622]:
# Drop the rows whose next_prompt_compliance equals 0.
# NOTE(review): the "not equal to 0" test also keeps rows where next_prompt_compliance
# is NaN — confirm that no such rows exist at this stage.
fil_df = df_no_lapro_noobsc.loc[df_no_lapro_noobsc['next_prompt_compliance'].ne(0)]

# Independent copy, so later changes do not touch df_no_lapro_noobsc.
df_no_lapro_noobsc_no_null_nepro_comp = fil_df.copy()

In [None]:
# Remaining rows without a next_prompt_ct_per_survey_item value after the filter.
df_no_lapro_noobsc_no_null_nepro_comp['next_prompt_ct_per_survey_item'].isna().sum()

In [None]:
# Summary statistics for next_prompt_ct_per_survey_item.
df_no_lapro_noobsc_no_null_nepro_comp['next_prompt_ct_per_survey_item'].describe()

In [None]:
# Frequency of each next_prompt_longstring value.
df_no_lapro_noobsc_no_null_nepro_comp['next_prompt_longstring'].value_counts()

In [626]:
# Write the careless-responding frame to the Model_ready_df output folder; index=False
# keeps the DataFrame index out of the file. The path is a plain literal (it has no
# placeholders, so no f-string is needed).
df_no_lapro_noobsc_no_null_nepro_comp.to_parquet('../outputs/Model_ready_df/df_nepro_careless.parquet', index=False)

In [None]:
# Row count of the careless-responding frame.
len(df_no_lapro_noobsc_no_null_nepro_comp)

#### Frequencies Next-prompt Straightlining

In [628]:
# Columns summarised in the next-prompt straightlining frequency table: the outcome first,
# then the predictors and covariates, and finally the participant identifier.
columns_of_interest = [
    'next_prompt_longstring',
    'sleep_quality', 'stress', 'affect', 'hunger', 'fatigue',
    'obscomp_steps',
    'age', 'gender',
    'day_of_monitoring', 'time_of_the_day_categories_dummy', 'day_of_the_week_binary', 'ipi',
    'subject_id_roman_x',
]

# Summary statistics per variable.
# NOTE(review): with default arguments describe() reports numeric columns only when the
# selection mixes dtypes, so a non-numeric subject_id_roman_x would be left out — confirm.
descriptive_stats = df_no_lapro_noobsc_no_null_nepro_comp.loc[:, columns_of_interest].describe()

# One row per variable.
descriptive_stats_transposed = descriptive_stats.T

In [629]:
# Save the straightlining summary table to the for_draft/frequency_tables output folder.
descriptive_stats_transposed.to_csv('../outputs/for_draft/frequency_tables/frequencies_next_prompt_straightlining.csv')

Finished Next-prompt_compliance

### Dataset for response delay and completion time outcome (including last survey of the day)

In [None]:
# Preview the first rows of df.
# NOTE(review): presumably the frame that still includes each day's last survey,
# per the section heading — confirm where df is built.
df.head()

In [None]:
# Row count of df.
len(df)

In [632]:
# Work on an independent copy so the filtering below does not modify df.
df_time = df.copy()

In [None]:
# Frequency of each next_prompt_compliance value.
df_time['next_prompt_compliance'].value_counts()

In [None]:
# Rows without a next_prompt_compliance value.
df_time['next_prompt_compliance'].isna().sum()

In [None]:
# Rows with a next_prompt_compliance value.
df_time['next_prompt_compliance'].notna().sum()

In [None]:
# Frequency of each compliance value.
df_time['compliance'].value_counts()

In [None]:
# Rows without a compliance value.
df_time['compliance'].isna().sum()

In [None]:
# Rows with a compliance value.
df_time['compliance'].notna().sum()

In [None]:
# Rows with an obscomp_steps value.
df_time['obscomp_steps'].notna().sum()

In [None]:
# Rows without an obscomp_steps value.
df_time['obscomp_steps'].isna().sum()

In [None]:
# Frequency of each distinct obscomp_steps value.
df_time['obscomp_steps'].value_counts()

In [642]:
# Predictors that must be fully observed for a participant to be retained.
columns_to_check = ['sleep_quality', 'stress', 'affect', 'fatigue', 'hunger']

# Participants owning at least one df_time row in which any of those predictors is missing.
subjects_with_missing_values = df_time.loc[
    df_time[columns_to_check].isna().any(axis=1),
    'subject_id_roman_x',
].unique()

# Participant-level exclusion: every row of a flagged participant is dropped.
# NOTE(review): this rebinds df_no_lapro_noobscpredict, a name assigned earlier in the
# notebook from df_no_lapro_noobsc rather than df_time — confirm the overwrite is intended.
df_no_lapro_noobscpredict = df_time.loc[
    ~df_time['subject_id_roman_x'].isin(subjects_with_missing_values)
]




In [None]:
# Predictors whose missingness is being audited.
columns_to_check = ['sleep_quality', 'stress', 'affect', 'fatigue', 'hunger']

# Row-level flag: True where at least one predictor is missing in df_time.
nan_values = df_time[columns_to_check].isna().any(axis='columns')

# Rows with at least one missing predictor.
filtered_df = df_time.loc[nan_values]

# Number of distinct participants contributing such rows.
unique_participant_count = filtered_df.loc[:, 'subject_id_roman_x'].nunique()

print(f"Number of unique participant IDs with NaN values in at least one column: {unique_participant_count}")


In [None]:
# Predictors that must be complete for a participant to be kept.
columns_to_check = ['sleep_quality', 'stress', 'affect', 'fatigue', 'hunger']

# IDs of participants with at least one df_time row missing any of those predictors.
participants_with_nan = df_time.loc[
    df_time[columns_to_check].isna().any(axis=1),
    'subject_id_roman_x',
].unique()

# Remove every row belonging to those participants (participant-level exclusion).
df_no_nan_values = df_time.loc[
    ~df_time['subject_id_roman_x'].isin(participants_with_nan)
]

# Report the size before and after the exclusion.
print("Original DataFrame shape:", df_time.shape)
print("DataFrame after dropping rows with NaN values shape:", df_no_nan_values.shape)


All rows from 5 participants are removed because those participants still have missing values in at least one of the 5 main variables (sleep quality, stress, affect, fatigue, hunger).

In [None]:
# Number of participants remaining after the exclusion.
df_no_nan_values['subject_id_roman_x'].nunique()

In [None]:
# Show one example row of the filtered frame.
df_no_nan_values.head(1)

In [647]:
# From here on df_time is the participant-filtered frame (an independent copy).
df_time = df_no_nan_values.copy()

In [648]:
# Drop the rows whose compliance equals 0.
# NOTE(review): the "not equal to 0" test also keeps rows where compliance is NaN —
# confirm that this is intended.
df_time_comp = df_time.loc[df_time['compliance'].ne(0)]

In [None]:
# Rows without an obscomp_steps value.
df_time_comp['obscomp_steps'].isna().sum()

In [None]:
# Rows with an obscomp_steps value.
df_time_comp['obscomp_steps'].notna().sum()

In [None]:
# Frequency of each distinct obscomp_steps value.
df_time_comp['obscomp_steps'].value_counts()

In [None]:
# Summary statistics for obscomp_steps.
df_time_comp['obscomp_steps'].describe()

In [None]:
# Rows with a compliance value.
df_time_comp['compliance'].notna().sum()

In [None]:
# Rows without a compliance value.
df_time_comp['compliance'].isna().sum()

In [None]:
# Frequency of each compliance value after the filter.
df_time_comp['compliance'].value_counts()

#### Remove all rows with response delay > 10 mins

In [None]:
# Preview the first rows of df_time_comp (no rows are removed in this cell).
df_time_comp.head()

#### This should be the output for completion time

In [None]:
# Summary statistics for completion_time.
df_time_comp['completion_time'].describe()

In [None]:
# Number of participants in df_time_comp.
df_time_comp['subject_id_roman_x'].nunique()

# Write the completion-time frame to the Model_ready_df output folder; index=False keeps
# the DataFrame index out of the file. The path is a plain literal (it has no
# placeholders, so no f-string is needed).
df_time_comp.to_parquet('../outputs/Model_ready_df/df_completion_time.parquet', index=False)

##### Make output TPI one_median

In [None]:
# Row count of df_time_comp.
len(df_time_comp)

In [660]:
# Independent copy used for the time-per-item (TPI) output.
df_tpi = df_time_comp.copy()

In [None]:
# Row count of df_tpi (a copy, so it matches df_time_comp).
len(df_tpi)

In [None]:
# Summary statistics for ct_per_survey_item.
df_tpi['ct_per_survey_item'].describe()

In [663]:
# Write the TPI frame to the Model_ready_df output folder; index=False keeps the
# DataFrame index out of the file. The path is a plain literal (it has no placeholders,
# so no f-string is needed).
df_tpi.to_parquet('../outputs/Model_ready_df/df_tpi.parquet', index=False)

##### Frequencies fast responding / completion time per survey item (TPI)

In [664]:
# Columns summarised in the fast-responding frequency table: the one-median outcome first,
# then the predictors and covariates, the participant identifier, and the two alternative
# median-based outcome columns.
columns_of_interest = [
    'ct_per_survey_item_one_median',
    'sleep_quality', 'stress', 'affect', 'hunger', 'fatigue',
    'obscomp_steps',
    'age', 'gender',
    'day_of_monitoring', 'time_of_the_day_categories_dummy', 'day_of_the_week_binary', 'ipi',
    'subject_id_roman_x',
    'ct_per_survey_item_one_point_five_median',
    'ct_per_survey_item_two_median',
]

# Summary statistics per variable.
# NOTE(review): with default arguments describe() reports numeric columns only when the
# selection mixes dtypes, so a non-numeric subject_id_roman_x would be left out — confirm.
descriptive_stats = df_tpi.loc[:, columns_of_interest].describe()

# One row per variable.
descriptive_stats_transposed = descriptive_stats.T

In [665]:
# Save the fast-responding summary table to the for_draft/frequency_tables output folder.
descriptive_stats_transposed.to_csv('../outputs/for_draft/frequency_tables/frequencies_fast_responding.csv')

#### This is the output for response delay

In [666]:
# Keep only the rows that have a response_delay_10min value
# (axis and how are spelled out at their default values).
df_time_comp_no_respona = df_time_comp.dropna(
    axis='index',
    how='any',
    subset=['response_delay_10min'],
)

In [None]:
# Summary statistics for response_delay_10min.
df_time_comp_no_respona['response_delay_10min'].describe()

In [None]:
# Rows with a compliance value.
df_time_comp_no_respona['compliance'].notna().sum()

In [None]:
# Frequency of each distinct obscomp_steps value.
df_time_comp_no_respona['obscomp_steps'].value_counts()

In [None]:
# Summary statistics for obscomp_steps.
df_time_comp_no_respona['obscomp_steps'].describe()

In [None]:
# Rows with an obscomp_steps value.
df_time_comp_no_respona['obscomp_steps'].notna().sum()

In [672]:
# Write the response-delay frame to the Model_ready_df output folder; index=False keeps
# the DataFrame index out of the file. The path is a plain literal (it has no
# placeholders, so no f-string is needed).
df_time_comp_no_respona.to_parquet('../outputs/Model_ready_df/df_response_delay_10mins.parquet', index=False)

##### Frequencies late responding

In [673]:
# Columns summarised in the late-responding frequency table: the outcome first, then the
# predictors and covariates, the participant identifier, and two additional columns.
columns_of_interest = [
    'response_delay_10min',
    'sleep_quality', 'stress', 'affect', 'hunger', 'fatigue',
    'obscomp_steps',
    'age', 'gender',
    'day_of_monitoring', 'time_of_the_day_categories_dummy', 'day_of_the_week_binary', 'ipi',
    'subject_id_roman_x',
    'obscomp_steps_ten',
    'survey_length_n',
]

# Summary statistics per variable.
# NOTE(review): with default arguments describe() reports numeric columns only when the
# selection mixes dtypes, so a non-numeric subject_id_roman_x would be left out — confirm.
descriptive_stats = df_time_comp_no_respona.loc[:, columns_of_interest].describe()

# One row per variable.
descriptive_stats_transposed = descriptive_stats.T

In [None]:
# Summary statistics for survey_length_n.
df_time_comp_no_respona['survey_length_n'].describe()

In [None]:
# Preview the first rows of the response-delay frame.
df_time_comp_no_respona.head()

In [676]:
# Save the late-responding summary table to the for_draft/frequency_tables output folder.
descriptive_stats_transposed.to_csv('../outputs/for_draft/frequency_tables/frequencies_late_responding.csv')

### Visualisations

In [None]:
# Preview the first rows of the response-delay frame.
df_time_comp_no_respona.head()

In [None]:
# Preview the first rows of df_time_comp.
df_time_comp.head()

In [None]:
# Frequency of each first_question_length value, ordered by the value itself.
first_q_length_counts = df_ema['first_question_length'].value_counts().sort_index()

# Bar chart of those frequencies; labels and title are set on the returned axes.
first_q_axes = first_q_length_counts.plot.bar(color='skyblue')
first_q_axes.set_xlabel('First question Length')
first_q_axes.set_ylabel('Count')
first_q_axes.set_title('Distribution of first question Length')
plt.show()


In [None]:
# Frequency of each first_question_length value (ordered by count, not by value).
df_ema['first_question_length'].value_counts()

In [None]:
# Frequency table of the length_zero_no_careless_one_point_five column.
df_ema['length_zero_no_careless_one_point_five'].value_counts()

In [None]:
# Frequency table of the length_zero_no_careless_one column.
df_ema['length_zero_no_careless_one'].value_counts()

In [None]:
# Frequency table of the length_zero_no_careless_two column.
df_ema['length_zero_no_careless_two'].value_counts()

In [None]:
# Frequency table of the nothing_no_careless_two_nulllength column.
df_ema['nothing_no_careless_two_nulllength'].value_counts()

In [None]:
# Frequency table of the nothing_no_careless_one_nulllength column.
df_ema['nothing_no_careless_one_nulllength'].value_counts()

In [None]:
# Frequency table of the nothing_no_careless_one_point_five_nulllength column.
df_ema['nothing_no_careless_one_point_five_nulllength'].value_counts()

In [None]:
# Number of participants in df_tpi.
df_tpi['subject_id_roman_x'].nunique()

#### Final participants count

In [None]:
# Participants in the response-delay frame.
df_time_comp_no_respona['subject_id_roman_x'].nunique()

In [None]:
# Rows in the response-delay frame.
len(df_time_comp_no_respona)

In [None]:
# Participants in the completion-time frame.
df_time_comp['subject_id_roman_x'].nunique()

In [None]:
# Participants in the TPI frame.
df_tpi['subject_id_roman_x'].nunique()

In [None]:
# Rows in the TPI frame.
len(df_tpi)

In [None]:
# Participants in the careless-responding frame.
df_no_lapro_noobsc_no_null_nepro_comp['subject_id_roman_x'].nunique()

In [None]:
# Rows in the careless-responding frame.
len(df_no_lapro_noobsc_no_null_nepro_comp)

In [None]:
# Participants in df_no_lapro_noobsc.
df_no_lapro_noobsc['subject_id_roman_x'].nunique()

In [None]:
# Rows in df_no_lapro_noobsc.
len(df_no_lapro_noobsc)

In [None]:
# Preview the first rows of the response-delay frame.
df_time_comp_no_respona.head()

#### Calculate average age and gender for the included participants

In [None]:
# One age per participant: the mean of that participant's age values across rows,
# so participants with more rows do not weigh more in the sample statistics.
avg_age = df_no_lapro_noobsc.groupby('subject_id_roman_x')['age'].mean()

# Sample-level mean and standard deviation of the per-participant ages.
overall_avg_age, std_avg_age = avg_age.mean(), avg_age.std()

print("Overall Average Age:", overall_avg_age)
print("Standard Deviation of Average Age:", std_avg_age)

In [None]:
# One age per participant: the mean of that participant's age values across rows.
avg_age = df_no_lapro_noobsc.groupby('subject_id_roman_x')['age'].mean()

# Youngest and oldest participant according to those per-participant ages.
min_avg_age, max_avg_age = avg_age.min(), avg_age.max()

print("Minimum Average Age:", min_avg_age)
print("Maximum Average Age:", max_avg_age)

In [None]:
# Distinct gender codes recorded for each participant.
gender_distribution = df_no_lapro_noobsc.groupby('subject_id_roman_x')['gender'].unique()

# Number of participants whose recorded codes include 0 (male) and 1 (female) respectively.
# A participant carrying both codes would be counted in both totals.
gender_counts = {
    code: gender_distribution.apply(lambda seen, code=code: code in seen).sum()
    for code in (0, 1)
}
male_count = gender_counts[0]
female_count = gender_counts[1]

print("Male Count:", male_count)
print("Female Count:", female_count)