# Daily Step Count Methods V3

Workbook created by Benjamin Winiarski, re-using code from several other notebooks.

The goal of this workbook is to create a pipeline of functions that will transform any dataset into a daily step count table, using different calculation methods and with the adherence measures applied to it.

In [1]:
# Importing required functions
import pandas as pd
import numpy as np
from datetime import datetime

Loading in the reading data functions that were created in Martin's notebooks

In [2]:
def read_CLEAN_data(filename):
    """Load a cleaned step-count CSV and convert its ``Date`` column to dates.

    Parameters
    ----------
    filename : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents, with ``Date`` parsed from ``YYYY-MM-DD`` text into
        ``datetime.date`` objects.
    """
    frame = pd.read_csv(filename)
    parsed_dates = pd.to_datetime(frame["Date"], format='%Y-%m-%d')
    # Keep only the calendar date, not a full timestamp.
    frame["Date"] = parsed_dates.dt.date
    return frame

Load in the adherence measures from Stanley's code

In [11]:
# More than `hours` rows per day with a non-zero step count (strictly greater than)
def ten_hours_non_zeros(df,hours):
    """Flag each date that has more than ``hours`` rows with non-zero steps.

    Parameters
    ----------
    df : DataFrame with at least ``Date`` and ``Steps`` columns.
    hours : threshold; a date passes only when its count of rows with
        ``Steps > 0`` is strictly greater than this value.

    Returns
    -------
    pandas.Series
        Boolean series named ``'Not Zero'``, indexed by ``Date``.
    """
    flagged = df.assign(**{'Not Zero': df['Steps'] > 0})
    # Summing booleans counts the rows that had any steps on each date.
    nonzero_rows_per_day = flagged.groupby('Date')['Not Zero'].sum()
    return nonzero_rows_per_day > hours

# More than `min_steps` total steps in the day (500 is the value used in the combined table below)
def greater_than_certain_steps(df,min_steps):
    """Flag each date whose total steps are strictly above ``min_steps``.

    Parameters
    ----------
    df : DataFrame with at least ``Date`` and ``Steps`` columns.
    min_steps : threshold the daily step total has to exceed.

    Returns
    -------
    pandas.Series
        Boolean series named ``'Steps'``, indexed by ``Date``.
    """
    daily_totals = df.groupby('Date')['Steps'].sum()
    return daily_totals.gt(min_steps)

#Active in 3 time blocks 
def three_time_blocks(df):
    """Flag each date that has steps recorded in all three time blocks.

    Hours are binned with the right-closed edges ``(-1, 2]``, ``(2, 10]``,
    ``(10, 14]``, ``(14, 25]`` and ``(25, inf]``. For whole-number hours this
    gives block ``'1'`` = hours 3-10, block ``'2'`` = hours 11-14 and block
    ``'3'`` = everything else (hours 0-2 and 15 onwards). Rows whose hour
    falls outside every bin (``<= -1`` or missing) belong to no block.

    Parameters
    ----------
    df : DataFrame with ``Date``, ``Hour`` and ``Steps`` columns.

    Returns
    -------
    pandas.Series
        Boolean series named ``'Steps within timeblock'``, indexed by
        ``Date``; True when all three blocks have a step total above zero.
    """
    bins = [-1, 2, 10, 14, 25, np.inf]
    # Bin position -> block label. The first and the last two bins all belong
    # to block '3', so map them directly instead of labelling them '4'/'5'
    # and rewriting the labels afterwards.
    block_of_bin = {0: '3', 1: '1', 2: '2', 3: '3', 4: '3'}

    df1 = df[['Date', 'Hour', 'Steps']].copy()
    df1['3timeblock'] = pd.cut(df1['Hour'], bins, labels=False).map(block_of_bin)

    # Only the Steps column is aggregated, so the text block labels never
    # take part in a sum.
    block_steps = df1.groupby(['Date', '3timeblock'])['Steps'].sum()
    active_blocks = (block_steps > 0).groupby(level='Date').sum()

    # Report every date present in the input, including dates where no row
    # landed in a block (those count zero active blocks).
    all_days = df1.groupby('Date').size().index
    active_blocks = active_blocks.reindex(all_days, fill_value=0)

    return (active_blocks == 3).rename('Steps within timeblock')

#Combining it all into 1 dataframe
def three_method_table(x):
    """Build the per-day adherence table from the three adherence checks.

    Parameters
    ----------
    x : DataFrame with ``Date``, ``Hour`` and ``Steps`` columns.

    Returns
    -------
    (result, result_sum) : tuple
        ``result`` has one row per date with the boolean columns
        ``Ten Hours Non Zero``, ``Steps > Five Hundred``, ``Three Timeblock``
        and ``All Three True``. ``result_sum`` is a Series holding the number
        of True values in each of those four columns.
    """
    valid_1 = ten_hours_non_zeros(x, 10)
    valid_2 = greater_than_certain_steps(x, 500)
    valid_3 = three_time_blocks(x)

    # Inner join keeps only the dates present in all three checks.
    result = pd.concat([valid_1, valid_2, valid_3], axis=1, join='inner')
    result = result.reset_index()
    result.columns = ["Date", "Ten Hours Non Zero", "Steps > Five Hundred", "Three Timeblock"]
    result['All Three True'] = (
        result["Three Timeblock"]
        & result["Ten Hours Non Zero"]
        & result["Steps > Five Hundred"]
    )

    # Sum the flag columns only: Date holds date objects, which cannot be
    # added together and are no longer skipped automatically by newer pandas.
    result_sum = result.drop(columns="Date").sum()
    return result, result_sum

In [12]:
# Load the cleaned step data for user 1 and preview the first rows.
filename = "../../data/cleaned/User1.csv"
df = read_CLEAN_data(filename)
df.head()

Unnamed: 0,Date,Hour,Steps
0,2014-12-07,9,941.0
1,2014-12-07,10,408.0
2,2014-12-07,11,157.0
3,2014-12-07,12,1017.0
4,2014-12-07,13,0.0


In [13]:
# three_method_table returns (table, column sums); keep only the per-day table.
df2 = three_method_table(df)[0]
df2.head()

Unnamed: 0,Date,Ten Hours Non Zero,Steps > Five Hundred,Three Timeblock,All Three True
0,2014-12-07,False,True,True,False
1,2014-12-08,True,True,True,True
2,2014-12-09,False,True,True,False
3,2014-12-10,False,True,True,False
4,2014-12-11,False,True,False,False


In [14]:
# Attach each date's adherence flags to every row recorded on that date.
df3 = df.merge(df2,on ='Date')
df3.head()

Unnamed: 0,Date,Hour,Steps,Ten Hours Non Zero,Steps > Five Hundred,Three Timeblock,All Three True
0,2014-12-07,9,941.0,False,True,True,False
1,2014-12-07,10,408.0,False,True,True,False
2,2014-12-07,11,157.0,False,True,True,False
3,2014-12-07,12,1017.0,False,True,True,False
4,2014-12-07,13,0.0,False,True,True,False


## Create a Function to return the daily steps with one of the adherence measures used

In [15]:
def filter_by_adherence(df, measure):
    """Keep only the rows from dates that pass the chosen adherence measure.

    Parameters
    ----------
    df : DataFrame with ``Date``, ``Hour`` and ``Steps`` columns.
    measure : int
        1 = ``Ten Hours Non Zero``, 2 = ``Steps > Five Hundred``,
        3 = ``Three Timeblock``, 4 = ``All Three True``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose date passes the measure, without the
        adherence flag columns.

    Raises
    ------
    ValueError
        If ``measure`` is not one of 1, 2, 3 or 4.
    """
    flag_by_measure = {
        1: 'Ten Hours Non Zero',
        2: 'Steps > Five Hundred',
        3: 'Three Timeblock',
        4: 'All Three True',
    }
    # Validate first so a bad measure fails with a clear message instead of
    # an unbound-variable error after the table has already been computed.
    try:
        flag_column = flag_by_measure[measure]
    except (KeyError, TypeError):
        raise ValueError(
            "measure must be 1, 2, 3 or 4, got {!r}".format(measure)
        ) from None

    flags = three_method_table(df)[0]
    merged = df.merge(flags, on='Date')
    # The flag columns are boolean, so the chosen one is used as the row mask.
    filtered_df = merged.loc[merged[flag_column]]
    return filtered_df.drop(columns=list(flag_by_measure.values()))

Test filter function

In [16]:
# Reload user 1's data and keep only the days that pass measure 1
# ("Ten Hours Non Zero" in filter_by_adherence).
filename = "../../data/cleaned/User1.csv"
df = read_CLEAN_data(filename)
measure = 1
df2 = filter_by_adherence(df, measure)
df2.head()

Unnamed: 0,Date,Hour,Steps
15,2014-12-08,0,0.0
16,2014-12-08,1,0.0
17,2014-12-08,2,0.0
18,2014-12-08,3,0.0
19,2014-12-08,4,0.0


Works, so create the full function

In [26]:
# First I need my method functions

# Method 1 calculates daily steps based on all the step activity that is available during the day
def all_step_activity(df):
    """Method 1: daily step count from all step activity recorded in the day.

    Parameters
    ----------
    df : DataFrame with ``Date`` (``YYYY-MM-DD`` values), ``Hour`` and
        ``Steps`` columns. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day that has data, with ``Date`` followed by the
        summed remaining columns (``Steps``). Days without any data are left
        out.
    """
    dated = df.assign(Date=pd.to_datetime(df["Date"], format='%Y-%m-%d'))
    daily_step_count = (
        dated.drop(columns=['Hour'])
        # Move Date into the index (rather than copying it there) so the
        # datetime column is not itself part of the summed columns.
        .set_index('Date')
        .resample('D')
        # min_count=1 leaves days without rows as NaN so dropna removes them.
        .sum(min_count=1)
        .dropna()
        .reset_index()
    )
    return daily_step_count

# Method 2 calculates daily steps based on the 10 most active hours of the day
def top_ten_most_active_hours(df):
    """Method 2: daily step count from the 10 most active rows of each day.

    Parameters
    ----------
    df : DataFrame with ``Date`` (``YYYY-MM-DD`` values), ``Hour`` and
        ``Steps`` columns. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day that has data, with ``Date`` followed by the
        summed ``Steps`` of that day's ten highest-step rows. Days without any
        data are left out.
    """
    # Sorting by Steps descending within each date makes head(10) pick the
    # ten highest-step rows of that date.
    busiest = (
        df.sort_values(by=["Date", "Steps"], ascending=False)
        .groupby('Date')
        .head(10)
    )
    busiest = busiest.assign(
        Date=pd.to_datetime(busiest["Date"], format='%Y-%m-%d')
    )
    daily_step_count = (
        busiest.drop(columns=['Hour'])
        # Move Date into the index (rather than copying it there) so the
        # datetime column is not itself part of the summed columns.
        .set_index('Date')
        .resample('D')
        # min_count=1 leaves days without rows as NaN so dropna removes them.
        .sum(min_count=1)
        .dropna()
        .reset_index()
    )
    return daily_step_count

# Method 3 calculates daily steps based on a 10 hour time block during the day
def ten_hour_time_block(df, start_hour, end_hour):
    """Method 3: daily step count from a fixed block of hours in the day.

    Parameters
    ----------
    df : DataFrame with ``Date`` (``YYYY-MM-DD`` values), ``Hour`` and
        ``Steps`` columns. The input is not modified.
    start_hour, end_hour : bounds of the block. Both ends are inclusive, so
        rows with ``Hour == start_hour`` and ``Hour == end_hour`` are counted.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day that has rows inside the block, with ``Date``
        followed by the summed ``Steps`` of those rows. Days without any rows
        in the block are left out.
    """
    dated = df.assign(Date=pd.to_datetime(df["Date"], format='%Y-%m-%d'))
    in_block = dated.loc[
        (dated['Hour'] >= start_hour) & (dated['Hour'] <= end_hour)
    ]
    daily_step_count = (
        in_block.drop(columns=['Hour'])
        # Move Date into the index (rather than copying it there) so the
        # datetime column is not itself part of the summed columns.
        .set_index('Date')
        .resample('D')
        # min_count=1 leaves days without rows as NaN so dropna removes them.
        .sum(min_count=1)
        .dropna()
        .reset_index()
    )
    return daily_step_count

In [27]:
# Test the methods out
# NOTE(review): this calls the adherence check ten_hours_non_zeros (which
# returns a True/False flag per date), not one of the daily step count methods
# defined in the cell above (all_step_activity, top_ten_most_active_hours,
# ten_hour_time_block) -- confirm which function was meant to be tested here.
dailysteps1 = ten_hours_non_zeros(df2, 10)
dailysteps1.head()

Date
2014-12-08    True
2014-12-12    True
2014-12-17    True
2014-12-25    True
2015-01-05    True
Name: Not Zero, dtype: bool

Method looks to work well

### Full Pipeline Function

In [28]:
def calculate_daily_steps(file_name, filter_measure, start_time, end_time):
    """Run the full pipeline: load, filter by adherence, compute daily steps.

    Parameters
    ----------
    file_name : path of the cleaned CSV to load with ``read_CLEAN_data``.
    filter_measure : adherence measure passed to ``filter_by_adherence``.
    start_time, end_time : hour bounds passed to ``ten_hour_time_block``.

    Returns
    -------
    pandas.DataFrame
        One row per day present in all three method results, with the columns
        ``Date``, ``all_step_activity``, ``top_ten_most_active_hours`` and
        ``ten_hour_time_block``.
    """
    # Read the file that was passed in, not a notebook-level ``filename``
    # variable that happens to exist.
    df = read_CLEAN_data(file_name)
    filtered_df = filter_by_adherence(df, filter_measure)

    dailysteps_method1 = all_step_activity(filtered_df)
    dailysteps_method2 = top_ten_most_active_hours(filtered_df)
    dailysteps_method3 = ten_hour_time_block(filtered_df, start_time, end_time)

    # Default inner merges: only days present in all three results are kept.
    dailysteps_merged = (
        dailysteps_method1
        .merge(dailysteps_method2, on='Date')
        .merge(dailysteps_method3, on='Date')
    )
    # A flat list gives ordinary column labels; a nested list would wrap
    # each label in an extra level.
    dailysteps_merged.columns = [
        "Date",
        "all_step_activity",
        "top_ten_most_active_hours",
        "ten_hour_time_block",
    ]

    return dailysteps_merged
    

Test Pipeline Function out

In [29]:
# Use the same "User1.csv" casing as the earlier cells that load this file;
# "user1.csv" would not resolve on a case-sensitive filesystem.
filename_user1 = "../../data/cleaned/User1.csv"

# Measure 4 keeps only days that pass all three adherence checks.
filter_measure = 4
# Hour bounds for the time-block method; both ends are included.
start_time = 8
end_time = 18

daily_steps_user1 = calculate_daily_steps(filename_user1, filter_measure, start_time, end_time)
daily_steps_user1.head()

Unnamed: 0,Date,all_step_activity,top_ten_most_active_hours,ten_hour_time_block
0,2014-12-08,6567.0,6550.0,6173.0
1,2014-12-12,5978.0,5886.460839,5034.0
2,2014-12-17,7192.0,6999.0,4773.0
3,2014-12-25,7227.0,7122.0,6939.0
4,2015-01-05,8010.0,7836.0,6913.0


In [30]:
# Summary statistics for the daily step counts produced by each method.
daily_steps_user1.describe()

Unnamed: 0,all_step_activity,top_ten_most_active_hours,ten_hour_time_block
count,419.0,419.0,419.0
mean,8399.615993,7600.989275,6556.824145
std,3828.483717,3934.16836,3505.329823
min,751.0,717.0,440.0
25%,5904.763502,4814.23051,4006.080856
50%,7477.0,7009.659578,5927.726869
75%,10296.33899,9713.649943,8610.886991
max,30234.470106,27521.159646,22032.174027


As you can see, by adding the adherence measure, we are getting very different results when comparing the 3 methods.