### Prototype code to slide though a dataframe of activities and select columns for feature engineering  

We start from a dataframe that includes all activities done by all subjects at all times. We want to create time slices of a specified length for each activity and then calculate statistical properties of the feature columns for those slices. We need to ensure that each slice corresponds to one subject doing one activity.

In [1]:
import pandas as pd
import numpy as np

In [101]:
all_data = pd.read_csv("All_subjects_activities.csv")

In [102]:
all_data.head()

Unnamed: 0,timestamp,activityID,heartrate,T_hand,A1C1_hand,A1C2_hand,A1C3_hand,A2C1_hand,A2C2_hand,A2C3_hand,...,G1C2_ankle,G1C3_ankle,M1C1_ankle,M1C2_ankle,M1C3_ankle,O1_ankle,O2_ankle,O3_ankle,O4_ankle,subject
0,8.38,0,104.0,30.0,2.37223,8.60074,3.51048,2.43954,8.76165,3.35465,...,0.00925,-0.01758,-61.1888,-38.9599,-58.1438,1.0,0.0,0.0,0.0,1.0
1,8.39,0,,30.0,2.18837,8.5656,3.66179,2.39494,8.55081,3.64207,...,-0.004638,0.000368,-59.8479,-38.8919,-58.5253,1.0,0.0,0.0,0.0,1.0
2,8.4,0,,30.0,2.37357,8.60107,3.54898,2.30514,8.53644,3.7328,...,0.000148,0.022495,-60.7361,-39.4138,-58.3999,1.0,0.0,0.0,0.0,1.0
3,8.41,0,,30.0,2.07473,8.52853,3.66021,2.33528,8.53622,3.73277,...,-0.020301,0.011275,-60.4091,-38.7635,-58.3956,1.0,0.0,0.0,0.0,1.0
4,8.42,0,,30.0,2.22936,8.83122,3.7,2.23055,8.59741,3.76295,...,-0.014303,-0.002823,-61.5199,-39.3879,-58.2694,1.0,0.0,0.0,0.0,1.0


In [103]:
#Choose to remove activity 0?
all_data = all_data[all_data['activityID'] != 0]

In [104]:
dt = 0.01 #sample rate in seconds
sliding_window_length = 512 #window length in samples
sliding_window_offset = 100 #offset between windows in samples
#Values from Reiss (2012)


for subjectID in all_data['subject'].unique():
    
    print('\n-------------------------------')
    print('Subject %s' %subjectID)
    
    subjectDF = all_data[all_data['subject'] == subjectID]
    
    for activityID in subjectDF['activityID'].unique():
        
        print("Activity %s" %activityID)
        
        activityDF = subjectDF[subjectDF['activityID'] == activityID].reset_index(drop=True)
        #print(activityDF.head())
        
        #check that time is continuous and ascending within each activityDF 
        time_values = activityDF['timestamp']
        
        if time_values.is_monotonic:
        
            #Check for gaps in the time series
            diffs = np.diff(time_values)
            gap_indices = np.where(abs(np.diff(time_values)-dt)>1e-6)[0]
            
            #print(gap_indices)

            #If gaps exist, take note of their indices so that we can deal with the
            #continuous timeseries in sections that don't contan gaps
            gap_indices_list = []
            
            if len(gap_indices > 0):
                for i in range(len(gap_indices)):
                    if i == 0:
                        gap_indices_list.append((0,gap_indices[i]))
                    elif i == (len(gap_indices)-1):
                        gap_indices_list.append((gap_indices[i]+1,len(activityDF)-1))
                    else:
                        gap_indices_list.append((gap_indices[i-1]+1,gap_indices[i]))
            else:
                gap_indices_list.append((0,len(activityDF)))
                
            #print(gap_indices_list)
            
            for index_pair in gap_indices_list:
                
                #This dt_slice dataframe will be sampled at constant dt
                
                constant_dt_slice = activityDF.loc[index_pair[0]:index_pair[1]].reset_index(drop=True)
                       
                #Check if the slice that we've made has enough samples to calculate features
                
                if len(constant_dt_slice) < sliding_window_length:
                    print("Not enough points to create slice of length %i" %sliding_window_length)
                    
                #Check that the slice is indeed sampled at constant dt
                    
                elif (abs(np.mean(np.diff(constant_dt_slice['timestamp']))) - dt > 1e-6):
                    print("Error in slicing gap indices!")
                
                else:
                                        
                    #Move though the slice in this number of units. We will lose some data
                    #at the end of each constant_dt_slice dataframe because we must take an integer
                    #number of steps 
                
                    nslices = int(np.floor((len(constant_dt_slice)-sliding_window_length)/sliding_window_offset))
                    
                    print("Length of slice in samples: %i" %len(constant_dt_slice))
                    print("Number of feature calculations to be done: %i" %nslices)
             
                    t1 = 0
                    for j in range(nslices):
                        t2 = t1 + sliding_window_length - 1
                        #print(t1,t2)

                        # This is the dataframe on which we will calculate statistical features
                        feature_slice = constant_dt_slice.loc[t1:t2]
                        
                        t1 = t1 + sliding_window_offset
                        #print(len(feature_slice))
                
        
        else:
            
            #This shouldn't happen
            print('Error: Timeseries for activity %s is not monotonic' %activityID)
        


-------------------------------
Subject 1.0
Activity 1
Length of slice in samples: 27187
Number of feature calculations to be done: 266
Activity 2
Length of slice in samples: 23480
Number of feature calculations to be done: 229
Activity 3
Length of slice in samples: 21717
Number of feature calculations to be done: 212
Activity 17
Length of slice in samples: 23573
Number of feature calculations to be done: 230
Activity 16
Length of slice in samples: 22941
Number of feature calculations to be done: 224
Activity 12
Length of slice in samples: 8120
Number of feature calculations to be done: 76
Activity 13
Length of slice in samples: 7480
Number of feature calculations to be done: 69
Activity 4
Length of slice in samples: 22253
Number of feature calculations to be done: 217
Activity 7
Length of slice in samples: 20265
Number of feature calculations to be done: 197
Activity 6
Length of slice in samples: 23575
Number of feature calculations to be done: 230
Activity 5
Length of slice in sampl