### Load Data

In [1]:
import pickle
import numpy as np
import pandas as pd
from functools import reduce
import datetime

In [2]:
# Labels for the R00 and DND experiments (going by the file name — TODO confirm).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# NOTE(review): `exp` is not referenced again in the cells visible here.
with open('EXPERIENCING_SAMPLE_R00_DND.pickle', 'rb') as handle:
    exp = pickle.load(handle)

In [3]:
# Labels for the R00 experiment only. Later cells read its 'subject' and
# 'start_survey' columns and use 'p' as the regression target.
with open('EXPERIENCING_SAMPLE_DF.pickle', 'rb') as handle:
    exp_roo = pickle.load(handle)

In [4]:
# Predictor: step-count time series.
# Later cells read its 'ID', 'Date', 'TIME' and 'VALUE' columns.
with open('STEPS_TIMESERIES_R00.pickle', 'rb') as handle:
    steps = pickle.load(handle)

In [5]:
# Predictor: heart-rate time series.
# Later cells read its 'ID', 'Date', 'TIME' and 'VALUE' columns.
# NOTE(review): unlike the other inputs, this file name has no '.pickle'
# extension — confirm that is intentional.
with open('HR_with_dates_R00', 'rb') as handle:
    hr = pickle.load(handle)

In [6]:
# Predictor: Sleep
with open('SLEEP_R00.pickle', 'rb') as handle:
    sleep = pickle.load(handle)
# Cast the subject id to int — presumably so it matches the 'subject' key it is
# merged against later (on ['ID', 'Date']); confirm the dtype of 'subject'.
sleep['ID'] = sleep['ID'].astype('int')

In [7]:
# Predictor: Survey.
# The loaded object is indexed by questionnaire name (e.g. 'Demographics', 'AVI')
# in the cells below, each entry being a table with a 'Subject' column.
with open('SURVEY_DF_age_bmi.pickle', 'rb') as handle:
    survey = pickle.load(handle)

### Pre-processing

##### Add timestamp for sleep, hr, step

(1) HR

In [8]:
# Combine the separate date and time-of-day columns into one timestamp column.
# Combinations that cannot be parsed become NaT (errors='coerce') instead of raising.
hr['datetime'] = pd.to_datetime(
    hr['Date'].astype(str) + ' ' + hr['TIME'].astype(str),
    errors='coerce',
)

(2) Steps

In [9]:
# Combine the separate date and time-of-day columns into one timestamp column.
# Combinations that cannot be parsed become NaT (errors='coerce') instead of raising.
steps['datetime'] = pd.to_datetime(
    steps['Date'].astype(str) + ' ' + steps['TIME'].astype(str),
    errors='coerce',
)

##### Merge Survey Data

In [8]:
survey.keys()

odict_keys(['Demographics', 'Medical Screening', 'AVI', 'BISBAS', 'BIS', 'TPQ-NS', 'NEO-SF', 'SWLS', 'FTP', 'SBQ'])

In [9]:
# Medical Screening: keep the subject key and the 'BMI' column
med_cols = ['Subject', 'BMI']
med_df = survey['Medical Screening'][med_cols]

In [10]:
# Get columns for SBQ: keep the subject key and the 'SBQ' column.
# (The bare `survey['SBQ']` expression was dropped: it was not the last
# statement of the cell, so its value was computed and silently discarded.)
sbq_df = survey['SBQ'][['Subject', 'SBQ']]

In [11]:
# Get columns for FTP: keep the subject key and the 'FTP' column
ftp_df = survey['FTP'][['Subject', 'FTP']]

In [12]:
# Get columns for SWLS: keep the subject key and the 'SWLS' column.
# (The bare `survey['SWLS']` expression was dropped: it was not the last
# statement of the cell, so its value was computed and silently discarded.)
swls_df = survey['SWLS'][['Subject', 'SWLS']]

In [13]:
# NEO-SF: keep the subject key and the five trait columns
neosf_cols = [
    'Subject',
    'Neuroticism',
    'Extraversion',
    'Openness',
    'Agreeableness',
    'Conscientiousness',
]
neosf_df = survey['NEO-SF'][neosf_cols]

In [14]:
# TPQ-NS: keep the subject key and the 'NS_total' column
tpqns_cols = ['Subject', 'NS_total']
tpqns_df = survey['TPQ-NS'][tpqns_cols]

In [15]:
# BIS: keep the subject key and the 'BIS_total' column
bis_cols = ['Subject', 'BIS_total']
bis_df = survey['BIS'][bis_cols]

In [16]:
# Get columns for BISBAS: keep the subject key and the four listed subscale columns.
# (The bare `survey['BISBAS'].head().T` expression was dropped: it was not the
# last statement of the cell, so its value was computed and silently discarded.)
bisbas_cols = ['Subject', 'BIS.5', 'BAS_D', 'BAS_FS', 'BAS_RR']
bisbas_df = survey['BISBAS'][bisbas_cols]

In [17]:
# Get columns for AVI: the subject key plus every column whose name contains
# 'actual' or 'ideal'.
avi_col_actual = [x for x in survey['AVI'].columns.values if 'actual' in x]
avi_col_ideal = [x for x in survey['AVI'].columns.values if 'ideal' in x]
avi_col = np.concatenate([['Subject'], avi_col_actual,avi_col_ideal])
# .copy() makes avi_df an independent frame, so the dtype cast below assigns into
# avi_df itself rather than into a slice of survey['AVI'] (which raised
# SettingWithCopyWarning).
avi_df = survey['AVI'][avi_col].copy()
avi_df['Subject'] = avi_df['Subject'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
# Demographics: keep the subject key and the listed demographic columns
demo_cols = [
    'Subject',
    'Education',
    'Ethnicity',
    'Sex',
    'Marital_Status',
    'Children',
    'Household_income',
    'Religion',
    'Age',
    'Medications',
]
demo_df = survey['Demographics'][demo_cols]

In [19]:
# Left-join every questionnaire table onto the first one (SBQ), one table at a
# time, matching rows on 'Subject'.
data_frames = [sbq_df, ftp_df, swls_df, neosf_df, tpqns_df, bis_df, bisbas_df, avi_df, demo_df, med_df]
survey_df = data_frames[0]
for next_frame in data_frames[1:]:
    survey_df = survey_df.merge(next_frame, on=['Subject'], how='left')

In [20]:
# Store the categorical demographic columns as strings
categorical_cols = [
    'Ethnicity',
    'Sex',
    'Marital_Status',
    'Household_income',
    'Religion',
    'Medications',
]
for categorical_col in categorical_cols:
    survey_df[categorical_col] = survey_df[categorical_col].astype('str')

In [21]:
survey_df.head()

(36, 41)

In [24]:
# Convert categorical to numerical: integer-encode each string column into a new
# '<name>_ca' column (the originals are kept).
# NOTE(review): the same LabelEncoder instance is handed to every column via
# .apply, so afterwards `le` presumably only holds the classes of the last column
# it was fitted on — confirm before using it to decode values.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
survey_df[['Ethnicity_ca','Sex_ca','Marital_Status_ca','Household_income_ca','Religion_ca','Medications_ca']] = survey_df[['Ethnicity','Sex','Marital_Status','Household_income','Religion','Medications']].apply(le.fit_transform)

In [26]:
# Survey table used for modelling: the subject key, the questionnaire columns,
# and the demographic columns in both raw string and label-encoded ('_ca') form.
survey_num = survey_df[['Subject', 'SBQ', 'FTP', 'SWLS', 'Neuroticism', 'Extraversion',
       'Openness', 'Agreeableness', 'Conscientiousness', 'NS_total',
       'BIS_total', 'BIS.5', 'BAS_D', 'BAS_FS', 'BAS_RR', 'HAP_actual',
       'P_actual', 'LAP_actual', 'LA_actual', 'LAN_actual', 'N_actual',
       'HAN_actual', 'HA_actual', 'HAP_ideal', 'P_ideal', 'LAP_ideal',
       'LA_ideal', 'LAN_ideal', 'N_ideal', 'HAN_ideal', 'HA_ideal',
       'Education', 'Children', 'Age', 'BMI',
       'Ethnicity', 'Sex', 'Marital_Status', 'Household_income','Religion','Medications',
       'Ethnicity_ca', 'Sex_ca', 'Marital_Status_ca', 'Household_income_ca',
       'Religion_ca', 'Medications_ca']]

### Feature Engineering

**(1) Steps**

In [27]:
# Parse 'start_survey' to datetime
exp_roo['start_survey'] = pd.to_datetime(exp_roo['start_survey'])
# Add 'survey_date': the survey timestamp reduced to its calendar date and stored
# as a datetime, so surveys can later be matched to sleep records on
# ['subject', 'survey_date'].
exp_roo['survey_date'] = pd.to_datetime(exp_roo['start_survey'].dt.date)

In [28]:
# Start of the 30-minute, 1-hour and 3-hour look-back windows that end at the
# survey time. Despite the '_ahead' suffix, each timestamp lies before 'start_survey'.
survey_start = exp_roo['start_survey']
exp_roo['start_survey_30m_ahead'] = survey_start - datetime.timedelta(minutes=30)
exp_roo['start_survey_1h_ahead'] = survey_start - datetime.timedelta(hours=1)
exp_roo['start_survey_3h_ahead'] = survey_start - datetime.timedelta(hours=3)

In [40]:
def step_feature_gen(df, thresholds=(0, 10, 20, 30)):
    """Summarise the step counts of one time window.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the window; only the 'VALUE' column is read.
    thresholds : sequence of numbers, optional
        Strict lower bounds on 'VALUE'. One rate is produced per bound: the
        share of rows whose 'VALUE' exceeds it. The default reproduces the
        original move / active / very-active / running cut-offs.

    Returns
    -------
    list
        [mean, var, median, rate_1, ..., rate_k]. With the default thresholds
        this is [mean, var, median, move_rate, active_rate, very_active_rate,
        running_rate]. If df has no rows, a list of None of the same length.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        # Keep the placeholder the same length as a real feature vector.
        return [None] * (3 + len(thresholds))
    values = df['VALUE']
    rates = [int((values > bound).sum()) / n_rows for bound in thresholds]
    # pandas' default variance is the sample variance, so a one-row window yields NaN.
    return [values.mean(), values.var(), values.median()] + rates

def gen_step_features(row):
    """Build the step features of one survey row for the 3h, 1h and 30m windows.

    Reads 'subject', 'start_survey' and the three 'start_survey_*_ahead'
    timestamps from `row`, selects that subject's rows from the module-level
    `steps` frame whose 'datetime' falls in each window (both ends inclusive),
    and concatenates the `step_feature_gen` output of the three windows in the
    order 3h, 1h, 30m.
    """
    subject_id = row['subject']
    end_time = row['start_survey']
    window_starts = [
        row['start_survey_3h_ahead'],
        row['start_survey_1h_ahead'],
        row['start_survey_30m_ahead'],
    ]
    # Select the subject's rows once instead of re-casting and re-scanning the
    # whole table for every window.
    subject_steps = steps[steps['ID'].astype(int) == subject_id]
    stamps = subject_steps['datetime']
    features = []
    for start_time in window_starts:
        window = subject_steps[(stamps >= start_time) & (stamps <= end_time)]
        features += step_feature_gen(window)
    return features
        

In [41]:
# Compute the step features for every survey row; result_type='expand' spreads
# each returned list over positional columns (0, 1, 2, ...). Windows without any
# step rows contribute None placeholders.
steps_features = exp_roo.apply(gen_step_features, axis=1, result_type='expand')

In [44]:
steps_features.shape

(779, 21)

In [46]:
# Names for the 21 positional columns, in the order gen_step_features returns them
# (3h window, then 1h, then 30m).
steps_features_col_names = ['steps_mean_3h', 'steps_var_3h', 'steps_median_3h', 'move_rate_3h', 'active_rate_3h', 'very_active_rate_3h', 'running_rate_3h',
                'steps_mean_1h', 'steps_var_1h', 'steps_median_1h', 'move_rate_1h', 'active_rate_1h', 'very_active_rate_1h', 'running_rate_1h',
                'steps_mean_30m', 'steps_var_30m', 'steps_median_30m', 'move_rate_30m', 'active_rate_30m', 'very_active_rate_30m', 'running_rate_30m']
# Keep only surveys with a complete feature vector, then rename the columns in
# place. Renaming (instead of appending 21 named copies next to the 21 positional
# columns) keeps the frame at 21 columns, so re-running this cell still works.
steps_features = steps_features.dropna()
steps_features.columns = steps_features_col_names

In [52]:
# Clean, with named columns
steps_features_cl = steps_features[steps_features_col_names]

**(2) Heart Rate**

In [53]:
# Approximate each subject's resting heart rate by their minimum recorded value.
# NOTE(review): resting_hr is not referenced again in the cells visible here.
resting_hr = hr.groupby('ID')['VALUE'].min().reset_index()

In [56]:
# Calculate per-subject heart-rate quantiles (0, 0.3, 0.5, 0.8): one row per
# subject, one column per quantile, with 'ID' as an integer column.
hr_quantiles = hr.groupby('ID')['VALUE'].quantile([0, .3, .5, .8])
rest_hr = hr_quantiles.unstack().reset_index()
rest_hr['ID'] = rest_hr['ID'].astype('int')

In [61]:
def hr_feature_gen(df, rest_thred, moderate_thred, active_thred):
    """Summarise the heart-rate readings of one time window.

    Only the 'VALUE' column of `df` is read. Returns
    [mean, var, std, median, rest_rate, moderate_rate, very_active_rate], where
    rest_rate is the share of rows at or below `rest_thred` and the other two
    rates are the shares strictly above `moderate_thred` / `active_thred`.
    An empty window yields seven None values.
    """
    total = len(df)
    if not total:
        return [None] * 7
    values = df['VALUE']
    rest_share = len(df[values <= rest_thred]) / total
    moderate_share = len(df[values > moderate_thred]) / total
    active_share = len(df[values > active_thred]) / total
    return [
        values.mean(),
        values.var(),
        values.std(),
        values.median(),
        rest_share,
        moderate_share,
        active_share,
    ]
def gen_hr_features(row):
    """Build the heart-rate features of one survey row for the 3h, 1h and 30m windows.

    Reads 'subject', 'start_survey' and the three 'start_survey_*_ahead'
    timestamps from `row`. The subject's 0.3 / 0.5 / 0.8 quantile columns in the
    module-level `rest_hr` frame serve as the rest / moderate / active
    thresholds. For each window (both ends inclusive) the subject's rows of the
    module-level `hr` frame are passed to `hr_feature_gen`; the three results
    are concatenated in the order 3h, 1h, 30m.

    Returns 21 None values when `rest_hr` has no row for the subject, matching
    the placeholder used for empty windows (previously this raised IndexError).
    """
    subject_id = row['subject']
    end_time = row['start_survey']
    window_starts = [
        row['start_survey_3h_ahead'],
        row['start_survey_1h_ahead'],
        row['start_survey_30m_ahead'],
    ]
    is_subject = rest_hr['ID'] == subject_id
    if not is_subject.any():
        # hr_feature_gen yields 7 values per window.
        return [None] * (7 * len(window_starts))
    rest_thred = rest_hr.loc[is_subject, 0.3].values[0]
    moderate_thred = rest_hr.loc[is_subject, 0.5].values[0]
    active_thred = rest_hr.loc[is_subject, 0.8].values[0]
    # Select the subject's rows once instead of re-casting and re-scanning the
    # whole table for every window.
    subject_hr = hr[hr['ID'].astype(int) == subject_id]
    stamps = subject_hr['datetime']
    features = []
    for start_time in window_starts:
        window = subject_hr[(stamps >= start_time) & (stamps <= end_time)]
        features += hr_feature_gen(window, rest_thred, moderate_thred, active_thred)
    return features

In [62]:
# Compute the heart-rate features for every survey row; result_type='expand'
# spreads each returned list over positional columns (0, 1, 2, ...). Windows
# without any heart-rate rows contribute None placeholders.
hr_features = exp_roo.apply(gen_hr_features, axis=1, result_type='expand')

In [63]:
hr_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,79.605805,107.223097,10.354859,77.0,0.012172,0.444757,0.104869,76.411043,31.498216,5.612327,...,0.01227,0.309816,0.009202,78.654545,37.800665,6.148225,77.0,0.0,0.460606,0.018182
4,58.940524,93.349638,9.661762,58.0,0.844758,0.044355,0.006048,59.724234,113.496366,10.653467,...,0.816156,0.052925,0.016713,60.913514,150.818566,12.280821,57.0,0.854054,0.102703,0.032432


In [64]:
# Names for the 21 positional columns, in the order gen_hr_features returns them
# (3h window, then 1h, then 30m).
hr_features_col_names = ['hr_mean_3h', 'hr_var_3h', 'hr_std_3h', 'hr_median_3h', 'hr_rest_rate_3h', 'hr_moderate_rate_3h', 'hr_very_active_rate_3h',
                        'hr_mean_1h', 'hr_var_1h', 'hr_std_1h', 'hr_median_1h', 'hr_rest_rate_1h', 'hr_moderate_rate_1h', 'hr_very_active_rate_1h',
                        'hr_mean_30m', 'hr_var_30m', 'hr_std_30m', 'hr_median_30m', 'hr_rest_rate_30m', 'hr_moderate_rate_30m', 'hr_very_active_rate_30m']
# Keep only surveys with a complete feature vector, then rename the columns in
# place. Renaming (instead of appending 21 named copies next to the 21 positional
# columns) keeps the frame at 21 columns, so re-running this cell still works.
hr_features = hr_features.dropna()
hr_features.columns = hr_features_col_names

In [65]:
# Clean, with named columns
hr_features_cl = hr_features[hr_features_col_names]

In [66]:
# Build the design matrix step by step:
#   1. step features + survey labels, matched on the row index
#   2. heart-rate features on top, matched on the row index
#   3. sleep records, matched on subject id and survey date
exp_roo_steps = steps_features_cl.merge(
    exp_roo, how='inner', left_index=True, right_index=True,
)
exp_roo_steps_hr = hr_features_cl.merge(
    exp_roo_steps, how='inner', left_index=True, right_index=True,
)
exp_roo_steps_hr_sleep = sleep.merge(
    exp_roo_steps_hr,
    how='inner',
    left_on=['ID', 'Date'],
    right_on=['subject', 'survey_date'],
)

In [67]:
exp_roo_steps_hr_sleep.to_csv('exp_roo_steps_hr_sleep.csv')

In [68]:
# Attach the per-subject survey columns; the left join keeps every design-matrix row.
exp_roo_steps_hr_sleep_survey = exp_roo_steps_hr_sleep.merge(
    survey_num,
    how='left',
    left_on=['ID'],
    right_on=['Subject'],
)

In [None]:
# Add the hour of day at which the survey started as a feature
survey_start_times = exp_roo_steps_hr_sleep_survey['start_survey']
exp_roo_steps_hr_sleep_survey['survey_hour'] = survey_start_times.dt.hour

In [69]:
exp_roo_steps_hr_sleep_survey.to_csv('exp_roo_steps_hr_sleep_survey.csv')

In [70]:
exp_roo_steps_hr.columns

Index(['hr_mean_3h', 'hr_var_3h', 'hr_std_3h', 'hr_median_3h',
       'hr_rest_rate_3h', 'hr_moderate_rate_3h', 'hr_very_active_rate_3h',
       'hr_mean_1h', 'hr_var_1h', 'hr_std_1h', 'hr_median_1h',
       'hr_rest_rate_1h', 'hr_moderate_rate_1h', 'hr_very_active_rate_1h',
       'hr_mean_30m', 'hr_var_30m', 'hr_std_30m', 'hr_median_30m',
       'hr_rest_rate_30m', 'hr_moderate_rate_30m', 'hr_very_active_rate_30m',
       'steps_mean_3h', 'steps_var_3h', 'steps_median_3h', 'move_rate_3h',
       'active_rate_3h', 'very_active_rate_3h', 'running_rate_3h',
       'steps_mean_1h', 'steps_var_1h', 'steps_median_1h', 'move_rate_1h',
       'active_rate_1h', 'very_active_rate_1h', 'running_rate_1h',
       'steps_mean_30m', 'steps_var_30m', 'steps_median_30m', 'move_rate_30m',
       'active_rate_30m', 'very_active_rate_30m', 'running_rate_30m',
       'subject', 'la_p', 'ha_p', 'ha_n', 'la_n', 'la', 'p', 'n', 'ha',
       'start_survey', 'survey_no', 'survey_date', 'start_survey_30m_ahead

In [181]:
# This is our design matrix
exp_roo_steps_hr_sleep

Unnamed: 0,Date,Minutes Asleep,Minutes Awake,Number of Awakenings,Time in Bed,DATE,ID,Experiment,hr_mean,hr_var,...,la_n,la,p,n,ha,start_survey,survey_no,survey_date,start_survey_30m_ahead,start_survey_3h_ahead
0,2015-07-16,477,20,16,505,0716,1002,R00,67.014184,36.371226,...,2,1,4,0,0,2015-07-16 00:24:25,2,2015-07-16,2015-07-15 23:54:25,2015-07-15 21:24:25
1,2015-07-16,477,20,16,505,0716,1002,R00,55.404715,92.215592,...,1,4,4,0,0,2015-07-16 10:37:28,3,2015-07-16,2015-07-16 10:07:28,2015-07-16 07:37:28
2,2015-07-16,477,20,16,505,0716,1002,R00,91.353989,196.070719,...,0,4,4,0,0,2015-07-16 15:33:23,4,2015-07-16,2015-07-16 15:03:23,2015-07-16 12:33:23
3,2015-07-16,477,20,16,505,0716,1002,R00,99.977011,73.491665,...,1,3,4,0,0,2015-07-16 21:45:35,5,2015-07-16,2015-07-16 21:15:35,2015-07-16 18:45:35
4,2015-07-17,404,45,17,462,0717,1002,R00,55.383227,38.189857,...,2,1,3,0,0,2015-07-17 10:02:31,6,2015-07-17,2015-07-17 09:32:31,2015-07-17 07:02:31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,2015-07-06,460,17,13,479,0706,1004,R00,69.185902,129.985472,...,1,1,2,0,1,2015-07-06 19:47:37,14,2015-07-06,2015-07-06 19:17:37,2015-07-06 16:47:37
1067,2015-07-07,440,20,9,474,0707,1004,R00,68.645468,149.915912,...,0,1,1,1,0,2015-07-07 12:41:00,15,2015-07-07,2015-07-07 12:11:00,2015-07-07 09:41:00
1068,2015-07-08,74,7,4,86,0708,1004,R00,63.573407,205.193633,...,3,1,1,0,1,2015-07-08 09:02:37,16,2015-07-08,2015-07-08 08:32:37,2015-07-08 06:02:37
1069,2015-07-09,184,5,5,194,0709,1004,R00,43.843504,63.830657,...,2,0,1,1,0,2015-07-09 09:16:06,18,2015-07-09,2015-07-09 08:46:06,2015-07-09 06:16:06


### Models

(1) Linear Regression

linear_model_1 with R-squared 0.031

In [189]:
import statsmodels.api as sm

In [206]:
# NOTE(review): `exp_roo_steps_sleep` is not defined in any cell visible in this
# notebook (the design matrix built above is `exp_roo_steps_hr_sleep`) — confirm
# where it comes from before relying on a fresh-kernel run.
# Constant column so the OLS fits include an intercept term.
exp_roo_steps_sleep['intercept'] = 1

In [210]:
# Ordinary least squares of column 'p' on the intercept and the listed predictor columns
linear_model_1 = sm.OLS(
    exp_roo_steps_sleep['p'],
    exp_roo_steps_sleep[[
        'intercept',
        'Minutes Asleep',
        'Minutes Awake',
        'Number of Awakenings',
        'Time in Bed',
        'mean',
        'var',
        'move_rate',
        'active_rate',
        'very_active_rate',
        'running_rate',
    ]],
).fit()

In [212]:
linear_model_1.summary()

0,1,2,3
Dep. Variable:,p,R-squared:,0.031
Model:,OLS,Adj. R-squared:,0.022
Method:,Least Squares,F-statistic:,3.551
Date:,"Thu, 23 Jan 2020",Prob (F-statistic):,0.000121
Time:,18:02:18,Log-Likelihood:,-1615.4
No. Observations:,1123,AIC:,3253.0
Df Residuals:,1112,BIC:,3308.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,2.1927,0.076,28.663,0.000,2.043,2.343
Minutes Asleep,-0.0018,0.002,-1.136,0.256,-0.005,0.001
Minutes Awake,-0.0079,0.003,-2.734,0.006,-0.014,-0.002
Number of Awakenings,-0.0011,0.006,-0.196,0.845,-0.012,0.010
Time in Bed,0.0020,0.002,1.295,0.196,-0.001,0.005
mean,0.0198,0.016,1.206,0.228,-0.012,0.052
var,-0.0004,0.000,-2.722,0.007,-0.001,-0.000
move_rate,0.8405,1.025,0.820,0.412,-1.170,2.851
active_rate,-3.4963,1.983,-1.764,0.078,-7.386,0.394

0,1,2,3
Omnibus:,16.947,Durbin-Watson:,1.405
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10.751
Skew:,-0.076,Prob(JB):,0.00463
Kurtosis:,2.545,Cond. No.,79800.0


We can observe that 'Minutes Awake' and 'var' are the two strongest predictors, each with p < 0.01.

The p-value for 'active_rate' (0.078) is also relatively small compared with those of the remaining predictors.

Let's build a model that uses only these 3 features.

In [220]:
# Reduced model: ordinary least squares of column 'p' on the intercept and the
# three predictors singled out from the first fit
linear_model_2 = sm.OLS(
    exp_roo_steps_sleep['p'],
    exp_roo_steps_sleep[[
        'intercept',
        'Minutes Awake',
        'var',
        'active_rate',
    ]],
).fit()

In [221]:
linear_model_2.summary()

0,1,2,3
Dep. Variable:,p,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,4.384
Date:,"Thu, 23 Jan 2020",Prob (F-statistic):,0.00446
Time:,18:08:40,Log-Likelihood:,-1626.5
No. Observations:,1123,AIC:,3261.0
Df Residuals:,1119,BIC:,3281.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,2.1524,0.058,36.802,0.000,2.038,2.267
Minutes Awake,-0.0039,0.002,-2.380,0.017,-0.007,-0.001
var,-3.503e-06,7.2e-05,-0.049,0.961,-0.000,0.000
active_rate,0.5146,0.232,2.217,0.027,0.059,0.970

0,1,2,3
Omnibus:,17.763,Durbin-Watson:,1.367
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10.813
Skew:,-0.053,Prob(JB):,0.00449
Kurtosis:,2.531,Cond. No.,5110.0
