### Prototype code to slide though a dataframe of activities and select columns for feature engineering  

We start from a dataframe that includes all activities done by all subjects at all times. We want to create time slices of a specified length for each activity and then calculate statistical properties of the feature columns for those slices. We need to ensure that each slice corresponds to one subject doing one activity.

In [13]:
import pandas as pd
import numpy as np

In [14]:
all_data = pd.read_csv("All_subject_data.csv")

In [15]:
all_data.head()

Unnamed: 0,index,timestamp,activityID,heart_rate,hand_temp,hand_acc16g_x,hand_acc16g_y,hand_acc16g_z,hand_acc6g_x,hand_acc6g_y,...,ankle_gyro_y,ankle_gyro_z,ankle_mag_x,ankle_mag_y,ankle_mag_z,ankle_ori_0,ankle_ori_1,ankle_ori_2,ankle_ori_3,subject
0,1000,18.38,0,101.0,30.125,2.38774,8.71824,3.93367,2.29398,8.62749,...,0.007882,0.02102,-60.8641,-38.629,-59.0093,1.0,0.0,0.0,0.0,1.0
1,1001,18.39,0,101.0,30.125,2.50057,8.83085,3.8965,2.28004,8.70315,...,-0.005661,0.005893,-60.7469,-38.857,-58.7652,1.0,0.0,0.0,0.0,1.0
2,1002,18.4,0,101.0,30.125,2.23391,8.71874,3.85439,2.28143,8.77874,...,-0.029526,0.032437,-60.741,-38.4355,-57.2754,1.0,0.0,0.0,0.0,1.0
3,1003,18.41,0,101.0,30.125,2.12001,8.64374,3.85294,2.26535,8.73348,...,-0.016553,-0.010741,-60.5084,-39.3196,-57.7818,1.0,0.0,0.0,0.0,1.0
4,1004,18.42,0,101.0,30.125,2.2028,8.72071,4.04636,2.26484,8.65799,...,-0.038506,0.012706,-59.9536,-38.7909,-57.5332,1.0,0.0,0.0,0.0,1.0


In [4]:
#Choose to remove activity 0?
all_data = all_data[all_data['activityID'] != 0]

Faster option - look for breaks in activity within subject and then take those timeseries and chunk them

In [24]:
dt = 0.01 #sample rate in seconds
sliding_window_length = 512 #window length in samples
sliding_window_offset = 100 #offset between windows in samples
#Values from Reiss (2012)


for subjectID in all_data['subject'].unique():
    
    print('\n-------------------------------')
    print('Subject %s' %subjectID)
    
    subjectDF = all_data[all_data['subject'] == subjectID]
    subjectDF.reset_index(inplace=True,drop=True)
      
    #Detect changes in activity ID
    activities = subjectDF['activityID']
    diffs = np.diff(activities)
    gap_indices = np.where(abs(diffs)>1e-6)[0]
    
    #Generate a list of the start and end indices of each activity
    gap_indices_list = []

    if len(gap_indices > 0):
        for i in range(len(gap_indices)):
            if i == 0:
                gap_indices_list.append((0,gap_indices[i]))
            else:
                gap_indices_list.append((gap_indices[i-1]+1,gap_indices[i]))
                
        gap_indices_list.append((gap_indices[-1]+1,len(subjectDF)))
    else:
        gap_indices_list.append((0,len(subjectDF)))
            
    print(gap_indices_list)

    for index_pair in gap_indices_list:

        constant_activity_slice = subjectDF.loc[index_pair[0]:index_pair[1]]
        constant_activity_slice.reset_index(inplace=True,drop=True)
        #print(constant_dt_slice['timestamp'].is_monotonic)
        
        print('Activity %i' %np.mean(constant_activity_slice['activityID']))

        #Check if the slice that we've made has enough samples to calculate features

        if len(constant_activity_slice) < sliding_window_length:
            print("Not enough points to create slice of length %i" %sliding_window_length)

        #Check that the slice is sampled at constant dt. Discard if it isn't!

        else:

            #Move though the slice in this number of units. We will lose some data
            #at the end of each constant_dt_slice dataframe because we must take an integer
            #number of steps 

            nslices = int(np.floor((len(constant_activity_slice)-sliding_window_length)/sliding_window_offset))

            print("Length of slice in samples: %i" %len(constant_activity_slice))
            print("Number of feature calculations to be done: %i" %nslices)

            t1 = 0
            for j in range(nslices):
                t2 = t1 + sliding_window_length - 1
                #print(t1,t2)

                # This is the dataframe on which we will calculate statistical features
                feature_slice = constant_activity_slice.loc[t1:t2]
                
                if (abs(np.mean(np.diff(feature_slice['timestamp']))) - dt > 1e-6):
                    print("Error in slicing gap indices! Data chunk is not sampled at constant dt!")

                t1 = t1 + sliding_window_offset
                #print(len(feature_slice))
                
    
        


-------------------------------
Subject 1.0
[(0, 927), (928, 26114), (26115, 47594), (47595, 67311), (67312, 74966), (74967, 96539), (96540, 104759), (104760, 125700), (125701, 136280), (136281, 142400), (142401, 151395), (151396, 156875), (156876, 158561), (158562, 164331), (164332, 169750), (169751, 191758), (191759, 212011), (212012, 217174), (217175, 235439), (235440, 243095), (243096, 264670), (264671, 268741), (268742, 288006), (288007, 300887), (300888, 311799), (311800, 341353), (341354, 393872), (393873, 441937), (441938, 523583), (523584, 527438), (527439, 579527), (579528, 596642), (596643, 621756), (621757, 625771)]
Activity 0
Length of slice in samples: 928
Number of feature calculations to be done: 4
Activity 1
Length of slice in samples: 25187
Number of feature calculations to be done: 246
Activity 2
Length of slice in samples: 21480
Number of feature calculations to be done: 209
Activity 3
Length of slice in samples: 19717
Number of feature calculations to be done: 192

In [6]:
time_values

0        31.20
1        31.21
2        31.22
3        31.23
4        31.24
5        31.25
6        31.26
7        31.27
8        31.28
9        31.29
10       31.30
11       31.31
12       31.32
13       31.33
14       31.34
15       31.35
16       31.36
17       31.37
18       31.38
19       31.39
20       31.40
21       31.41
22       31.42
23       31.43
24       31.44
25       31.45
26       31.46
27       31.47
28       31.48
29       31.49
         ...  
12752    94.81
12753    94.82
12754    94.83
12755    94.84
12756    94.85
12757    94.86
12758    94.87
12759    94.88
12760    94.89
12761    94.90
12762    94.91
12763    94.92
12764    94.93
12765    94.94
12766    94.95
12767    94.96
12768    94.97
12769    94.98
12770    94.99
12771    95.00
12772    95.01
12773    95.02
12774    95.03
12775    95.04
12776    95.05
12777    95.06
12778    95.07
12779    95.08
12780    95.09
12781    95.10
Name: timestamp, Length: 12782, dtype: float64

In [7]:
time_values.diff

<bound method Series.diff of 0        31.20
1        31.21
2        31.22
3        31.23
4        31.24
5        31.25
6        31.26
7        31.27
8        31.28
9        31.29
10       31.30
11       31.31
12       31.32
13       31.33
14       31.34
15       31.35
16       31.36
17       31.37
18       31.38
19       31.39
20       31.40
21       31.41
22       31.42
23       31.43
24       31.44
25       31.45
26       31.46
27       31.47
28       31.48
29       31.49
         ...  
12752    94.81
12753    94.82
12754    94.83
12755    94.84
12756    94.85
12757    94.86
12758    94.87
12759    94.88
12760    94.89
12761    94.90
12762    94.91
12763    94.92
12764    94.93
12765    94.94
12766    94.95
12767    94.96
12768    94.97
12769    94.98
12770    94.99
12771    95.00
12772    95.01
12773    95.02
12774    95.03
12775    95.04
12776    95.05
12777    95.06
12778    95.07
12779    95.08
12780    95.09
12781    95.10
Name: timestamp, Length: 12782, dtype: float64>

In [8]:
np.diff(time_values )

array([0.01, 0.01, 0.01, ..., 0.01, 0.01, 0.01])

In [10]:
time_values.

AttributeError: 'Series' object has no attribute 'is_increasing'