#### Here we load all the data, interpolate it, redact and combine into one dataframe

In [4]:
import pandas as pd
import numpy as np
import os
import glob
import time

In [81]:
columns_to_keep = ['timestamp', 'activityID', 'heart_rate','hand_temp', 'hand_acc16g_x', 'hand_acc16g_y', 
                   'hand_acc16g_z','hand_gyro_x', 'hand_gyro_y', 'hand_gyro_z', 
                   'hand_mag_x', 'hand_mag_y', 'hand_mag_z','chest_temp', 'chest_acc16g_x', 
                   'chest_acc16g_y', 'chest_acc16g_z','chest_gyro_x', 'chest_gyro_y', 
                   'chest_gyro_z', 'chest_mag_x', 'chest_mag_y', 'chest_mag_z','ankle_temp', 
                   'ankle_acc16g_x', 'ankle_acc16g_y', 'ankle_acc16g_z','ankle_gyro_x', 
                   'ankle_gyro_y', 'ankle_gyro_z', 'ankle_mag_x', 'ankle_mag_y', 'ankle_mag_z','subject']
                   

In [71]:
#Functions to load data 
#Slightly adapted from Chiu-Yun's code

def exportColName():
    
    '''
    Create columns names
    '''
    
    handColName=['hand_temp', 'hand_acc16g_x', 'hand_acc16g_y', 'hand_acc16g_z', 'hand_acc6g_x', 'hand_acc6g_y', 'hand_acc6g_z', 
             'hand_gyro_x', 'hand_gyro_y', 'hand_gyro_z', 'hand_mag_x', 'hand_mag_y', 'hand_mag_z', 'hand_ori_0', 'hand_ori_1', 
             'hand_ori_2', 'hand_ori_3']
    chestColName=['chest_temp', 'chest_acc16g_x', 'chest_acc16g_y', 'chest_acc16g_z', 'chest_acc6g_x', 'chest_acc6g_y', 
                  'chest_acc6g_z', 'chest_gyro_x', 'chest_gyro_y', 'chest_gyro_z', 'chest_mag_x', 'chest_mag_y', 'chest_mag_z', 
                  'chest_ori_0', 'chest_ori_1', 'chest_ori_2', 'chest_ori_3']
    ankleColName=['ankle_temp', 'ankle_acc16g_x', 'ankle_acc16g_y', 'ankle_acc16g_z', 'ankle_acc6g_x', 'ankle_acc6g_y', 
                  'ankle_acc6g_z', 'ankle_gyro_x', 'ankle_gyro_y', 'ankle_gyro_z', 'ankle_mag_x', 'ankle_mag_y', 'ankle_mag_z', 
                  'ankle_ori_0', 'ankle_ori_1', 'ankle_ori_2', 'ankle_ori_3']
    return ['timestamp', 'activityID', 'heart_rate']+handColName+chestColName+ankleColName


def remove_activity_start_end(df,n=1000,dt=0.01):
    
    '''
    Remove n samples from the start and end of each activity within a subject dataframe. The Reiss paper used
    10 seconds or 1000 samples
    '''
    
    df_redacted_parts = []
        
    #Detect changes in activity ID
    activities = df['activityID']
    diffs = np.diff(activities)
    gap_indices = np.where(abs(diffs)>1e-6)[0]
    
    #Generate a list of the start and end indices of each activity
    gap_indices_list = []

    if len(gap_indices > 0):
        for i in range(len(gap_indices)):
            if i == 0:
                gap_indices_list.append((0,gap_indices[i]))
            else:
                gap_indices_list.append((gap_indices[i-1]+1,gap_indices[i]))
                
        gap_indices_list.append((gap_indices[-1]+1,len(df)))
    else:
        gap_indices_list.append((0,len(df)))
    
    #Loop though the start and end index pairs and remove the first and last n datapoints
    
    for index_pair in gap_indices_list:
        if ((index_pair[1]-n)-(index_pair[0]+n)) > 0:
            df_redacted_parts.append(df.loc[index_pair[0]+n:index_pair[1]-n])
        else:
            print(index_pair[1]-n,index_pair[0]+n)
    
    redacted_df = pd.concat(df_redacted_parts)
    
    return redacted_df
        

def loadSubject(filename):
    
    '''
    Load a single subject from file and return a dataframe
    '''
    
    col=exportColName()
    index = int(filename.split('.')[0][-1])
    tempData = pd.read_csv(filename, sep=' ', names=col)
    tempData['subject'] = (index)*np.ones(len(tempData))
    interpData = interpolate_all(tempData)
    redacteddf = remove_activity_start_end(interpData)
    redacteddf.reset_index(inplace=True)
    return redacteddf

def interpolate_all(df):
    
    '''
    Interpolate values in the columns of a dataframe so that they have the
    same sampling rate as the other columns
    '''
    
    df.interpolate(inplace=True)
    
    return df

def loadAllSubjects(dirname):
    
    '''
    Load all subject files & return a dataframe
    '''
    
    if os.path.exists(dirname):
        dfiles = list(sorted(glob.glob("%s/*.dat" %dirname)))
    else:
        print ("Given dirname %s not found" %dirname)
        
    col=exportColName()
    dfs = []
    
    for i in range(len(dfiles)):
        filename=dfiles[i]
        df = loadSubject(filename)
        dfs.append(df)
    
    allData = pd.concat(dfs)
    allData.dropna(inplace=True)
    allData.reset_index(drop=True,inplace=True)
    
    return allData


In [63]:
t1 = "/Users/rmartinshort/Documents/Berkeley/GDSO/PAMAP2_data/PAMAP2_Dataset/Protocol/subject101.dat"

In [64]:
df1 = loadSubject(t1)

[  2927  30114  53594  75311  84966 108539 118759 141700 154280 162400
 173395 180875 184561 192331 199750 223758 246011 253174 273439 283095
 306670 312741 334006 348887 361799]


In [65]:
len(df1)

324418

In [69]:
protocol_datadir = "/Users/rmartinshort/Documents/Berkeley/GDSO/PAMAP2_data/PAMAP2_Dataset/Protocol"
optional_datadir = "/Users/rmartinshort/Documents/Berkeley/GDSO/PAMAP2_data/PAMAP2_Dataset/Optional"

In [72]:
st= time.time()
all_data_protocol = loadAllSubjects(protocol_datadir)
ed = time.time()

169395 170309
328576 330575
360817 362561
50160 52154
572 1000
7477 8964


In [73]:
print(ed-st)

65.4854679107666


In [74]:
st= time.time()
all_data_optional = loadAllSubjects(optional_datadir)
ed = time.time()

128963 129524
167472 168617
179412 180652
192472 194008


In [75]:
print(ed-st)

23.092968940734863


In [76]:
all_data = pd.concat([all_data_protocol,all_data_optional])

In [77]:
all_data.head()

Unnamed: 0,index,timestamp,activityID,heart_rate,hand_temp,hand_acc16g_x,hand_acc16g_y,hand_acc16g_z,hand_acc6g_x,hand_acc6g_y,...,ankle_gyro_y,ankle_gyro_z,ankle_mag_x,ankle_mag_y,ankle_mag_z,ankle_ori_0,ankle_ori_1,ankle_ori_2,ankle_ori_3,subject
0,1000,18.38,0,101.0,30.125,2.38774,8.71824,3.93367,2.29398,8.62749,...,0.007882,0.02102,-60.8641,-38.629,-59.0093,1.0,0.0,0.0,0.0,1.0
1,1001,18.39,0,101.0,30.125,2.50057,8.83085,3.8965,2.28004,8.70315,...,-0.005661,0.005893,-60.7469,-38.857,-58.7652,1.0,0.0,0.0,0.0,1.0
2,1002,18.4,0,101.0,30.125,2.23391,8.71874,3.85439,2.28143,8.77874,...,-0.029526,0.032437,-60.741,-38.4355,-57.2754,1.0,0.0,0.0,0.0,1.0
3,1003,18.41,0,101.0,30.125,2.12001,8.64374,3.85294,2.26535,8.73348,...,-0.016553,-0.010742,-60.5084,-39.3196,-57.7818,1.0,0.0,0.0,0.0,1.0
4,1004,18.42,0,101.0,30.125,2.2028,8.72071,4.04636,2.26484,8.65799,...,-0.038506,0.012706,-59.9536,-38.7909,-57.5332,1.0,0.0,0.0,0.0,1.0


### Check for NaNs 

In [78]:
numRow, numCol=all_data.shape
all_data.isnull().sum()/numRow*100

index             0.0
timestamp         0.0
activityID        0.0
heart_rate        0.0
hand_temp         0.0
hand_acc16g_x     0.0
hand_acc16g_y     0.0
hand_acc16g_z     0.0
hand_acc6g_x      0.0
hand_acc6g_y      0.0
hand_acc6g_z      0.0
hand_gyro_x       0.0
hand_gyro_y       0.0
hand_gyro_z       0.0
hand_mag_x        0.0
hand_mag_y        0.0
hand_mag_z        0.0
hand_ori_0        0.0
hand_ori_1        0.0
hand_ori_2        0.0
hand_ori_3        0.0
chest_temp        0.0
chest_acc16g_x    0.0
chest_acc16g_y    0.0
chest_acc16g_z    0.0
chest_acc6g_x     0.0
chest_acc6g_y     0.0
chest_acc6g_z     0.0
chest_gyro_x      0.0
chest_gyro_y      0.0
chest_gyro_z      0.0
chest_mag_x       0.0
chest_mag_y       0.0
chest_mag_z       0.0
chest_ori_0       0.0
chest_ori_1       0.0
chest_ori_2       0.0
chest_ori_3       0.0
ankle_temp        0.0
ankle_acc16g_x    0.0
ankle_acc16g_y    0.0
ankle_acc16g_z    0.0
ankle_acc6g_x     0.0
ankle_acc6g_y     0.0
ankle_acc6g_z     0.0
ankle_gyro

We still have some NaNs left over from the interpolation. Not sure why this is, but lets just remove them. 

### Write to file - this will be our starting point for future work

In [82]:
all_data[columns_to_keep].to_csv("All_subject_data.csv",index=False)