# Exploring the UTx000 Dataset
From the first cohort in Spring 2020

In [2]:
import warnings
warnings.filterwarnings('ignore')

# EMA Summary
This dataset was more or less a trial run, but some of the data could still be useful. Here we look at what kind of data we can recover from the ecological momentary assessments (EMAs) and compute some statistics on the level of participation.

In [3]:
import os

import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import seaborn as sns

# Data Import
We can import the morning and evening surveys now that they have been processed.

In [8]:
# Load the processed morning and evening survey tables into a dict keyed by
# survey timing ('morning' / 'evening'). The first CSV column is used as the
# index and parsed as datetimes.
survey = {}
for timing in ['morning','evening']:
    # `infer_datetime_format` is intentionally not passed: pandas deprecated it
    # in 2.0, where it no longer has any effect.
    df = pd.read_csv(f'../data/processed/bpeace1-{timing}-survey.csv',
                     index_col=0,parse_dates=True)
    # Preview the first rows of each table as a quick sanity check.
    print(df.head())
    survey[timing] = df

                           ID  Content  Stress  Lonely  Sad  Energy  \
2020-02-03 11:06:25  6mkypp1o        2       1     0.0    0       2   
2020-03-01 20:01:45  6mkypp1o        2       1     0.0    0       3   
2020-02-28 09:06:27  6mkypp1o        1       1     0.0    0       1   
2020-02-21 08:30:11  6mkypp1o        2       0     0.0    0       1   
2020-02-10 11:25:38  6mkypp1o        1       1     0.0    0       2   

                            TST            SOL  NAW  Restful  
2020-02-03 11:06:25   6-7 hours  10-20 minutes  NaN        2  
2020-03-01 20:01:45  9-10 hours  10-20 minutes  NaN        2  
2020-02-28 09:06:27   6-7 hours  10-20 minutes  2.0        1  
2020-02-21 08:30:11   5-6 hours  10-20 minutes  2.0        1  
2020-02-10 11:25:38   6-7 hours  10-20 minutes  NaN        1  
                           ID  Content  Stress  Lonely  Sad  Energy
2020-02-04 11:10:22  6mkypp1o      2.0     2.0       1    1       3
2020-03-07 12:19:00  6mkypp1o      1.0     2.0       0    2

In [27]:
# Load the processed weekly survey (first CSV column -> datetime index) and
# recode the text answer choices as integers in a single chained expression,
# so `weekly` is assigned exactly once and re-running the cell is idempotent.
# `infer_datetime_format` is intentionally not passed: pandas deprecated it in
# 2.0, where it no longer has any effect.
weekly = pd.read_csv(
    '../data/processed/bpeace1-weekly-survey.csv',
    index_col=0,parse_dates=True,
).replace({
    # agreement-style choices
    'Not at all':0,'A little bit':1,'Quite a bit':2,'Very Much':3,
    # frequency-style choices
    'Never':0,'Almost Never':1,'Sometimes':2,'Fairly Often':3,'Very Often':4,
    # energy choices (both capitalizations are mapped)
    'Low energy':0,'Low Energy':0,'Somewhat low energy':1,'Neutral':2,'Somewhat high energy':3,'High energy':4,'High Energy':4,
    # restfulness choices
    'Not at all restful':0,'Slightly restful':1,'Somewhat restful':2,'Very restful':3,
    # every non-response marker collapses to -1
    'NO_ANSWER_SELECTED':-1,'NOT_PRESENTED':-1,'SKIP QUESTION':-1,
})
weekly

Unnamed: 0,ID,Upset,Unable,Stressed,Confident,Your_Way,Cope,Able,Top,Angered,Overcome
2020-01-31 12:12:29,6mkypp1o,,2,1,2,3,2,0,3,2,2
2020-02-23 11:28:13,6mkypp1o,,2,0,2,3,3,1,3,2,2
2020-02-15 22:15:59,6mkypp1o,,2,0,3,2,2,0,3,2,2
2020-02-29 09:28:55,6mkypp1o,,1,0,2,3,3,1,3,2,1
2020-02-03 10:05:29,6mkypp1o,,2,1,2,3,2,0,3,2,2
...,...,...,...,...,...,...,...,...,...,...,...
2020-02-08 18:12:10,ozxf6hkg,,2,2,2,2,2,1,2,2,2
2020-01-25 08:22:46,ozxf6hkg,,0,0,1,2,3,0,2,3,1
2020-01-22 12:06:20,ozxf6hkg,,0,0,1,2,3,0,3,3,1
2020-02-29 20:09:14,ozxf6hkg,,0,0,0,2,2,0,2,2,0


In [28]:
# Rebuild a single weekly-survey row straight from one raw answer file.
parent_dir = '../data/raw/bpeace1/beiwe/survey_answers/'
morning_survey_id = 'vBewaVfZ6oWcsiAoPvF6CZi7'
evening_survey_id = 'OymqfwTdyaHFIsJoUNIfPWyG'
weekly_survey_id = 'aMIwBMFUgO8Rtw2ZFjyMTzDn'
participant = '6mkypp1o'
file = '2020-01-31 18_12_29.csv'
df = pd.read_csv(f'{parent_dir}{participant}/survey_answers/{weekly_survey_id}/{file}')

weekly_columns = ['ID','Upset','Unable','Stressed','Confident','Your_Way','Cope','Able','Top','Angered','Overcome']
participant_df = pd.DataFrame(columns=weekly_columns)

# The file name (minus the ".csv" suffix) encodes a timestamp, which is shifted
# back by six hours to form the row label.
# NOTE(review): presumably a UTC -> US Central conversion; a fixed offset
# would not follow daylight-saving changes -- confirm.
row_time = datetime.strptime(file[:-4],'%Y-%m-%d %H_%M_%S') - timedelta(hours=6)

# Rows 1 through 10 of the raw file's 'answer' column supply the ten responses,
# in the same order as the item columns above.
row_values = [participant] + [df.loc[i,'answer'] for i in range(1,11)]
participant_df.loc[row_time] = row_values

# Recode the text answer choices as integers; non-response markers become -1.
participant_df = participant_df.replace({
    'Not at all':0,'A little bit':1,'Quite a bit':2,'Very Much':3,
    'Never':0,'Almost Never':1,'Sometimes':2,'Fairly Often':3,'Very Often':4,
    'Low energy':0,'Low Energy':0,'Somewhat low energy':1,'Neutral':2,'Somewhat high energy':3,'High energy':4,'High Energy':4,
    'Not at all restful':0,'Slightly restful':1,'Somewhat restful':2,'Very restful':3,
    'NO_ANSWER_SELECTED':-1,'NOT_PRESENTED':-1,'SKIP QUESTION':-1,
})
participant_df


Unnamed: 0,ID,Upset,Unable,Stressed,Confident,Your_Way,Cope,Able,Top,Angered,Overcome
2020-01-31 12:12:29,6mkypp1o,2,1,2,3,2,0,3,2,2,0


# Data Inspection
Here we do some simple visualizations to check out how much data we have available.

## By the Numbers


In [9]:
# For each survey timing, report how many distinct participant IDs appear.
for key, val in survey.items():
    # dropna=False keeps the count identical to len(unique()), which treats a
    # missing ID as one more distinct value.
    n = val['ID'].nunique(dropna=False)
    print(f'Number of Participants submitting {key} surveys:\t{n}')

Number of Participants submitting morning surveys:	73
Number of Participants submitting evening surveys:	73


## Heatmap
