In [None]:
import pandas as pd
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import datetime

# Configuration

Specify the configuration file name that contains information about:

- the type of the EMA survey (weekly, morning, midday1, midday2, evening)
- amount by which the recorded time of survey responses should be shifted to become PT (UW phase I data is in MT zone).
- the cutoff for a response being considered empty
- the input files where column and scale names are specified.
- output files where various results (tables and figures) are stored.

Note: the input files can be found in the script-input repository.

In [None]:
# Text shown to the user when asking for the location of the configuration file.
prompt = """\
Specify the **absolute path** of the configuration file containing information about:

- the type of the EMA survey (weekly, morning, midday1, midday2, evening)
- amount by which the recorded time of survey responses should be shifted to become PT (UW phase I data is in MT zone).
- the cutoff for a response being considered empty
- the input files where column and scale names are specified.
- output files where various results (tables and figures) are stored.

Note: the input files can be found in script-input repository.

Example (find a sample in script-input repository): emaprep-config-evening.json 

Tips:

- Place your configuration files in the same directory as this notebook.
- Use a different configuration file for each different analysis rather than modifying a single configuration file.
  For example, have separate files for weekly surveys and EMA surveys.

  
"""
# For a non-interactive run, assign the path directly instead of prompting,
# e.g. config_file = 'emaprep-config-evening.json'
# The answer is stripped because a pasted path often carries a trailing space or newline,
# which would make the later open(config_file) fail with a confusing "file not found".
config_file = input(prompt).strip()
print('using configurations specified in {}'.format(config_file))

In [None]:
# Parse the JSON configuration selected above into a plain dictionary.
with open(config_file, 'r') as config_fh:
    config = json.load(config_fh)

# Unpack every setting into its own name; a missing key raises KeyError right here.
# -- what is processed
institution = config['institution']
survey = config['survey']
# -- inputs and cleaning parameters
data_file = config['data_file']
timeshift = config['timeshift']
survey_date_file = config['survey_date_file']
empty_count_threshold = config['empty_count_threshold']
column_name_file = config['column_name_file']
column_grouping_file = config['column_grouping_file']
# -- outputs
cleaned_data_file = config['cleaned_data_file']
# TO-DO there should not be a separate file for discrimination, substance use, or stressor info.
#       There should be one file at the end that captures all of these.
data_with_discrimination_info = config['data_with_discrimination_info']
data_with_substance_info = config['data_with_substance_info']

In [None]:
# Echo the effective configuration so the notebook output documents the run.
# Each tuple holds the arguments of one print call (print joins them with single spaces).
summary_lines = [
    ('processing survey', survey, 'of', institution, '...'),
    ('   data is obtained from', data_file),
    ('   the times recorded in survey responses are shifted by', timeshift, 'hours to reflect PT/PST time.'),
    ('   the response dates are matched against the survey schedule dates recorded in ', survey_date_file),
    ('   any response with more than', empty_count_threshold, 'empty items is removed from further processing.'),
    ('  ', column_name_file, 'provides the more readable column names; it is a mapping between column names in the data file and the more readable names.'),
    ('  ', column_grouping_file, 'provides the grouping of columns for further score calculations.'),
    ('   data is stored in', cleaned_data_file, 'after clean-up.'),
    ('   data with discrimination summary column is stored in', data_with_discrimination_info),
    ('   data with substance use summary columns is stored in', data_with_substance_info),
]
for parts in summary_lines:
    print(*parts)

# Setup

In [None]:
# read column names: one name per line of the file, with surrounding whitespace
# (including the line break) removed
with open(column_name_file, 'r') as names_fh:
    columns = [line.strip() for line in names_fh]

In [None]:
# read column groupings: a JSON object that is indexed by group name throughout the notebook
# (e.g. 'time', 'default', 'discrimination', 'substance'), each entry listing column names
with open(column_grouping_file, 'r') as grouping_fh:
    scale_grouping = json.load(grouping_fh)

In [None]:
# read EMA responses
# The file's own header is not used: the first two rows are skipped and the readable
# names from the column name file are applied instead. Every row is then labelled
# with the institution and survey type taken from the configuration.
responses = pd.read_csv(data_file, header=None, names=columns, skiprows=2).assign(
    institution=institution,
    survey=survey,
)
#responses.info()

# Cleaning

In [None]:
# clean-up - keep only responses whose Qualtrics status is "normal"
# status codes according to qualtrics (https://goo.gl/p4g16k):
#     0 --> normal
#     1 --> preview
#     2 --> test (NA in our data)
#     4 --> imported (NA in our data)
#     8 --> spam (e.g. because of duplicate submissions)
#    16 --> offline (NA in our data)
valid = responses['status'] == 0
print('removing {} response(s) with invalid status'.format(len(responses) - valid.sum()))
responses = responses.loc[valid]
#responses.info()

In [None]:
# clean-up - keep only responses that were actually submitted
# finished flag according to qualtrics (https://goo.gl/p4g16k):
#    0 --> closed without completion (progress < 100)
#    1 --> submitted (progress == 100)
finished = responses['finished'] == 1
print('removing {} unfinished response(s)'.format(len(responses) - finished.sum()))
responses = responses.loc[finished]
#responses.info()

In [None]:
# Collect the columns of every group except the bookkeeping groups
# ('time', 'default', 'identifier'); the result is used below to count missing answers.
non_survey_groups = ('time', 'default', 'identifier')
columns_survey = [
    column
    for group, members in scale_grouping.items()
    if group not in non_survey_groups
    for column in members
]
#print(columns_survey)

In [None]:
# clean-up - remove empty responses (rows in which every survey column is missing)
empty = responses[columns_survey].isnull().all(axis=1)
print('removing {} empty response(s)'.format(sum(empty)))
# .copy() makes the filtered frame independent of the frame it was selected from, so
# columns can be added to it afterwards without pandas raising SettingWithCopyWarning.
responses = responses[~empty].copy()
# sanity check: this count is expected to be zero after the filter above
print('remaining empty response(s): {}'.format(sum(responses[columns_survey].isnull().all(axis=1))))
#responses.info()

In [None]:
# clean-up - remove near empty responses
# empty_count is the number of unanswered survey columns in each response.
responses['empty_count'] = responses[columns_survey].isnull().sum(axis=1)
# NOTE(review): responses with empty_count >= empty_count_threshold are removed, whereas the
#               configuration summary printed earlier says "more than" the threshold;
#               confirm which boundary is intended.
print('removing {} near empty response(s)'.format(sum(responses['empty_count'] >= empty_count_threshold)))
# .copy() makes the filtered frame independent of the frame it was selected from, so
# columns can be added to it afterwards without pandas raising SettingWithCopyWarning.
responses = responses[responses['empty_count'] < empty_count_threshold].copy()
#responses.info()

In [None]:
# clean-up - tag long submissions
# A submission is tagged long when its duration is at least two standard deviations above the
# mean duration of all remaining submissions. The standard error is computed for reporting only.
# TO-DO double check with Jen if we want to have a fixed cut-off for each survey type or it is fine to have this
#       data-driven cut-off regime
durations = responses['duration']
min_duration = durations.min()
max_duration = durations.max()
mean_duration = durations.mean()
std_duration = durations.std()
stderr_duration = std_duration / math.sqrt(len(responses))
explanation = 'time on survey varies'
summary_template = '{} from {} (sec) to {} (sec) (M = {:.3f}, std = {:.3f}, std_err = {:.3f})'
print(summary_template.format(explanation,
                              min_duration,
                              max_duration,
                              mean_duration,
                              std_duration,
                              stderr_duration))
cut_off = mean_duration + 2 * std_duration
responses['long'] = responses['duration'] >= cut_off
long_template = 'tagged {} response(s) longer than two standard deviation of the mean (>= {:.3f} secs)'
print(long_template.format(responses['long'].sum(), cut_off))

In [None]:
print('distribution of responses NOT considered long')
# Histogram of the durations of responses that were not tagged as long.
ax = responses[~responses['long']].hist(column='duration',
                                        bins=100,
                                        grid=False,
                                        figsize=(12,8),
                                        color='#86bf91',
                                        zorder=2,
                                        rwidth=0.9)

# hist returns a 2-D array of axes; iterate over its first row
ax = ax[0]
for x in ax:

    # Despine
    x.spines['right'].set_visible(False)
    x.spines['top'].set_visible(False)
    x.spines['left'].set_visible(False)

    # Switch off ticks but keep the tick labels.
    # tick_params expects booleans; the strings "off"/"on" are non-empty and therefore
    # truthy, which would leave the ticks switched on.
    x.tick_params(axis="both",
                  which="both",
                  bottom=False,
                  top=False,
                  labelbottom=True,
                  left=False,
                  right=False,
                  labelleft=True)

    # Draw horizontal axis lines
    vals = x.get_yticks()
    for tick in vals:
        x.axhline(y=tick, linestyle='dashed', alpha=0.4, color='#eeeeee', zorder=1)

    # Remove title
    x.set_title("")

    # Set x-axis label
    x.set_xlabel("Response Duration Not Considered Long (Seconds)", labelpad=20, weight='bold', size=12)

    # Set y-axis label
    x.set_ylabel("Responses", labelpad=20, weight='bold', size=12)

In [None]:
# conversion - parse the date columns as datetimes and shift them by the configured time zone offset
# All columns of the 'time' group except 'duration' are treated as timestamps.
columns_date = list(scale_grouping['time'])
columns_date.remove('duration')
parsed_dates = responses[columns_date].apply(pd.to_datetime)
shifted_dates = parsed_dates + pd.DateOffset(hours=timeshift) # records of phase I are in MT
# second pass through to_datetime to ensure the shifted columns are datetime objects
responses[columns_date] = shifted_dates.apply(pd.to_datetime)
# NOTE: DateOffset data is stored as object data rather than datetime data
#       I compared the results of time related calculations below with and without the additional type conversion.
#       Everything remained the same in terms of late tagging and values of not_late_min, not_late_max, not_late_mean,
#       not_late_std, and not_late_stderr.
#responses.head()

In [None]:
# multisubmission removal - this is relevant if there are multiple submissions by the same person on the same date
# Unfortunately, the Qualtrics status code has failed to identify some of the multi submissions (e.g. because they
# were made from different IP addresses).
# date_ is the calendar day of start_date, stored as datetime64 (midnight of that day)
responses['date_'] = responses['start_date'].dt.date
responses['date_'] = responses['date_'].astype('datetime64[ns]')
# NOTE I decided not to correct for start_time before 10am. While an outlier in terms of submission
#      time the responses are arguably valid. Also, it is unclear by how much I should correct and
#      with something as long as 3 hours I may mess up with the logic below for removing the multi
#      submissions (e.g. if the correction causes the later response to go before the earlier one)
# columns to keep from the selected row; date_ and PID come back from the group keys via
# reset_index, and empty_count is not carried forward
cols_ = list(responses.columns)
cols_.remove('date_')
cols_.remove('PID')
cols_.remove('empty_count')
# NOTE of multi responses pick the one w/ smaller number of empty questions, earlier start_date, and shorter duration
responses = responses.groupby(by=['date_', 'PID']).apply(lambda x: x.sort_values(
                                                                     by=['empty_count', 'start_date', 'duration'],
                                                                     ascending=[True, True, True]).iloc[0])[cols_].reset_index()
# NOTE this is a special case of a very late submission. Given I did not want to correct late submissions here
# and this was the only case where this is happening
if(survey == 'weekly'):
    # date_ is a datetime64 column, so it is compared against a Timestamp; comparing it with a
    # datetime.date object matches nothing in current pandas and the row would silently stay.
    ind = responses[(responses['PID'] == 83) & (responses['date_'] == pd.Timestamp(2018, 1, 22))].index
    responses = responses.drop(ind)

# verify that at most one response per participant and day is left
counter = responses.groupby(['date_', 'PID']).size()
multisub = counter[counter > 1]
print('number of multi submissions after cleanup: {} (must be zero)'.format(len(multisub)))
responses = responses.drop(columns=['date_'])

In [None]:
# helper for tagging late submissions: marks the responses that fall inside one survey window
def check_late(srvy, resps):
    """\
    Return a boolean Series aligned with resps that is True for every response that is
    acceptable (i.e. not late) for the survey window srvy.

    srvy  -- a mapping/row providing the window bounds under the keys 'from' and 'to'
    resps -- a dataframe of responses with 'start_date' and 'end_date' columns

    A response is acceptable when it started at or after srvy['from'] and ended at or
    before srvy['to'].
    """
    # NOTE: end_date is used on purpose instead of recorded_date. end_date is the time the
    #       responder submits the response, whereas recorded_date is the time the data appears
    #       on Qualtrics; with a slow connection recorded_date can be much later than end_date.
    started_in_window = resps['start_date'] >= srvy['from']
    ended_in_window = resps['end_date'] <= srvy['to']
    return started_in_window & ended_in_window

In [None]:
# clean-up - tag late submissions
# A submission is acceptable (not late) when, for at least one scheduled date of this survey type, it
# started at or after the scheduled start time and ended (end_date) at or before the expiry time;
# see check_late. Any other submission is late.
survey_dates = pd.read_csv(survey_date_file)
# 'from' / 'to' combine the schedule date with its start / expiry time into full timestamps
survey_dates['from'] = pd.to_datetime(survey_dates['date'] + ' ' + survey_dates['start'])
survey_dates['to'] = pd.to_datetime(survey_dates['date'] + ' ' + survey_dates['expiry'])
survey_dates['date'] = pd.to_datetime(survey_dates['date'])
survey_dates['start'] = pd.to_datetime(survey_dates['start'],format= '%H:%M:%S').dt.time
survey_dates['expiry'] = pd.to_datetime(survey_dates['expiry'],format= '%H:%M:%S').dt.time
# check_late is evaluated once per schedule row of this survey type; after the transpose below
# there is one row per response and one column per schedule row
inds = survey_dates[survey_dates['type'] == survey].apply(lambda x : check_late(x, responses), axis = 1)
# start from "everything is late" and clear the tag for responses inside any schedule window
responses['late'] = True
responses.loc[inds.T.any(axis=1), 'late'] = False
print('tagged {} late response(s)'.format(responses[responses['late']].shape[0]))
# TO-DO consider a 10 minute of grace period
# TO-DO test: systematically evaluate entries with late = True and late = False if they make sense
#       (I eye balled the results for a few dates and they were OK)

In [None]:
# Descriptive statistics of the time spent on the survey, restricted to responses not tagged late.
not_late_durations = responses.loc[~responses['late'], 'duration']
not_late_min = not_late_durations.min()
not_late_max = not_late_durations.max()
not_late_mean = not_late_durations.mean()
not_late_std = not_late_durations.std()
not_late_stderr = not_late_std / math.sqrt(len(not_late_durations))
explanation = 'time on survey for responses that are NOT late varies'
not_late_template = '{} from {} (sec) to {} (sec) (M = {:.3f}, std = {:.3f}, std_err = {:.3f})'
print(not_late_template.format(explanation,
                               not_late_min,
                               not_late_max,
                               not_late_mean,
                               not_late_std,
                               not_late_stderr))
# overlap between the two tags: late responses that were also tagged long
late_and_long = responses.loc[responses['late'], 'long'].sum()
print('{} late response(s) are also long'.format(late_and_long))

In [None]:
# grouping for tagging information
# Registers the columns added by this notebook ('survey' label, 'long' tag, 'late' tag)
# as their own group next to the groups read from the column grouping file.
scale_grouping['tags'] = ['survey', 'long', 'late']

In [None]:
# clean-up - remove all the columns no longer needed
# (the columns listed under the 'default' group of the column grouping file)
columns_drop = scale_grouping['default']
responses = responses.drop(columns=columns_drop)

In [None]:
# store the cleaned up data
# Written to the path given by 'cleaned_data_file' in the configuration; the dataframe
# index is not written (index=False).
responses.to_csv(cleaned_data_file, index=False)

# Coherence Validation

In [None]:
# TO-DO test if values for each Likert Style or Yes/No scale fall in the expected range of values

In [None]:
# TO-DO test if values add up within each scale
#       (e.g. "did not experience stress" and any other stressor are not both selected,
#       or "did not use drug" and a type of drug are not selected at the same time)

In [None]:
# TO-DO check that unfair_not and other columns within discrimination are not checked at the same time 
# TO-DO similarly check that demand_not and other columns within demand are not checked at the same time
# TO-DO similarly check that alcohol_yesno as no and alcohol_amount or alcohol_duration don't have values at the same time
# TO-DO similarly check that drug_not and other columns within drug are not checked at the same time
# TO-DO similarly check that alcohol_yesno as no or drug_not do not co-occur with substance_negative or substance_alone

# Compliance Analysis

In [None]:
# TO-DO prepare the study size per survey date
#       - can add this to the survey date file and rename it as survey_file
#       - or can create a separate file containing this information
# TO-DO find the number of unique valid responses (not long, not late) for each survey
#       - create a function similar to check_late that returns the number of unique and not late responses
#       - similarly use apply and lambda to get the number 
# TO-DO the compliance rate for each date is the ratio of unique number of responses over the study size
# TO-DO run t-test to compare weekly rates (Sun vs. Wed) and ANOVA to compare daily rates (morning, midday1, midday2, evening)
# TO-DO find whose responses are missing on each survey date
#       - prepare a table where active PID's are listed for each survey date
#       - join these PID's with the unique PID's from responses

In [None]:
def completion_data(completions, surveys_per_day=4):
    """\
    Summarise survey completion for a set of participant records.

    completions     -- dataframe with a 'miss count' column holding the number of missed
                       surveys per record; records with a null 'miss count' are ignored
    surveys_per_day -- number of responses expected from each record (default 4, the value
                       that was previously hard-coded)

    Returns a Series with
        'missing'  -- total number of missed responses,
        'expected' -- number of non-null records * surveys_per_day,
        'rate'     -- completion rate in percent (NaN when no response is expected).
    """
    completions = completions[completions['miss count'].notnull()]
    number_of_responses_expected = completions.shape[0] * surveys_per_day
    number_of_response_missing = completions['miss count'].sum()
    if number_of_responses_expected == 0:
        # nothing was expected, so the rate is undefined; state that explicitly instead of
        # relying on a 0/0 division (which also emits a runtime warning)
        rate = float('nan')
    else:
        rate = 100 - number_of_response_missing / number_of_responses_expected * 100
    return pd.Series({'missing': number_of_response_missing, 'expected': number_of_responses_expected, 'rate': rate})

In [None]:
# temporary analysis of completion rates based on compliance status obtained through excel for daily surveys
# if completion is irrelevant to a participant on a certain date (e.g. because they have dropped out of the
# study), there is NA in the excel files. This is read as null in the pandas dataframe using read_csv command
# below.
# NOTE(review): the path is an absolute path on one user's machine, so this cell only runs there;
#               consider moving it into the configuration file.
daily_completions = pd.read_csv('/Users/yasaman/Downloads/data/ema-daily-completions.csv')

In [None]:
# Participant IDs excluded from the completion-rate analysis.
# NOTE(review): the reason for excluding these IDs is not recorded in this notebook.
PID_drop = [
    18, 26, 27, 41, 53, 56, 69, 71, 81, 83, 85,
    89, 100, 101, 107, 112, 114, 119, 121, 129, 131, 133,
    135, 139, 141, 147, 152, 164, 182, 192, 197, 200, 208,
]
is_dropped = daily_completions['PID'].isin(PID_drop)
daily_completions = daily_completions[~is_dropped]

In [None]:
# one row per date with the 'missing', 'expected' and 'rate' values computed by completion_data
daily_completions = daily_completions.groupby(by=['date']).apply(completion_data).reset_index()

In [None]:
# store the per-date completion rates
# NOTE(review): absolute path on one user's machine; consider moving it into the configuration file.
daily_completions.to_csv('/Users/yasaman/Downloads/results/ema-daily-completion-rates_176.csv', index=False)

In [None]:
# temporary analysis of weekly average of completions rates
# NOTE(review): this reads 'completion_rates_176.csv', which is not the file name written by the
#               daily completion step ('ema-daily-completion-rates_176.csv'); the file must provide a
#               'week' column for the grouping below - confirm how it is produced.
# NOTE(review): absolute path on one user's machine; consider moving it into the configuration file.
rates = pd.read_csv('/Users/yasaman/Downloads/results/completion_rates_176.csv')

In [None]:
# average of every numeric column per week
# numeric_only=True is passed explicitly: recent pandas versions raise on non-numeric columns
# (e.g. a text date column) instead of silently leaving them out of the mean.
avg_rates = rates.groupby('week').mean(numeric_only=True).reset_index()

In [None]:
# store the weekly averages
# NOTE(review): absolute path on one user's machine; consider moving it into the configuration file.
avg_rates.to_csv('/Users/yasaman/Downloads/results/average_completion_rates_176.csv', index=False)

# Response Characteristics

distribution of responses for items of the following scales

- affect (anxiety, depression, frustration, happiness, being overwhelmed, loneliness, social interactions)
- stress
- coping
- substance use

In [None]:
# TO-DO for affect scales, create the bar plot over the response range (1 (not at all) to 5 (extremely))
# TO-DO for stress, create a bar plot for different kinds of stressors
#       also create a bar plot over the response range for the stress forecast
# TO-DO for coping, create a bar plot for different coping skills
#       also create a bar plot over the response range for effectiveness
# TO-DO consider other evening survey metrics

# Summary Information

<span style="color:red">TO-DO remove this once it is implemented in EMAaggregation for both horizontal and vertical aggregation. </span>

## discrimination

In [None]:
# TO-DO only applicable to weekly and evening surveys; make sure to condition the following steps on survey type

In [None]:
# prepare discrimination data
# columns_unfair ends up holding the discrimination columns other than 'unfair_not'
# (and other than 'unfair_yesno' when that column is part of the group).
columns_unfair = scale_grouping['discrimination'].copy()
columns_unfair.remove('unfair_not')
has_yesno = 'unfair_yesno' in columns_unfair
if has_yesno:
    columns_unfair.remove('unfair_yesno')

# a response reports unfair treatment when at least one unfair column is filled in,
# and reports none when all of them are empty
unfair_reported = responses[columns_unfair].notnull().any(axis=1)
no_unfair_reported = responses[columns_unfair].isnull().all(axis=1) # TO-DO test
if has_yesno:
    # NOTE: responses to weekly EMA's of phase I are only valid for discrimination analysis if unfair_yesno column is
    #       NULL while at least another unfair column is not NULL
    yesno_missing = responses['unfair_yesno'].isnull()
    unfair_reported = yesno_missing & unfair_reported
    no_unfair_reported = yesno_missing & no_unfair_reported
    #print('valid dates for discrimination analysis\n',
    #       responses[unfair_reported]['recorded_date'].dt.date.unique())

# summary column: 'YES' / 'NO'; rows matching neither mask are left without a value
responses.loc[unfair_reported, 'discriminated'] = 'YES'
responses.loc[no_unfair_reported, 'discriminated'] = 'NO'
unfair_subset = responses[unfair_reported]
#unfair_subset = unfair_subset.sort_values('PID')
no_unfair_subset = responses[no_unfair_reported]
#no_unfair_subset = no_unfair_subset.sort_values('PID')

In [None]:
# store discrimination information together with the data if applicable
# Writes the full responses frame, including the 'discriminated' summary column, to the path
# given by 'data_with_discrimination_info' in the configuration.
responses.to_csv(data_with_discrimination_info, index=False) # TO-DO this should be done at the end after all aggregations

## substance use (alcohol, stimulants, drugs)

In [None]:
# prepare substance use data (weekly and morning surveys only)
if survey in ('weekly', 'morning'):
    # NOTE(review): the drug columns are picked by position (entries 6 to 15 of the 'substance'
    #               group), so this depends on the order in the column grouping file.
    drug_columns = scale_grouping['substance'][6:16]
    reported_drug_column = responses[drug_columns].any(axis=1)
    responses['any_drug'] = reported_drug_column | (responses['drug_yesno'] == 1)
    used_stimulant = responses['stimulant_yesno'] == 1
    used_alcohol = responses['alcohol_yesno'] == 1
    responses['any_substance'] = responses['any_drug'] | used_stimulant | used_alcohol
    responses.to_csv(data_with_substance_info, index=False) # TO-DO this should be done at the end after all aggregations

## stressors

In [None]:
# TO-DO

## TO-DO other data

In [None]:
# TO-DO