In [30]:
import numpy as np
import pandas as pd
import os
from glob import glob
import turicreate as tc

In [31]:
dataset_root = "../dataset/HAPT Data Set/"
idle_label = '<IDLE>'
train_subjects = [i for i in range(1, 22)]
test_subjecs = [i for i in range(22, 31)]

In [32]:
def activity_map(dataset_root=None):
    act_map = {}
    with open(os.path.join(dataset_root, "activity_labels.txt"), "r") as al:
        for line in al.readlines():
            line = line.strip()
            label, activity = line.split(" ")
            act_map[int(label)] = activity
              
    return act_map

In [33]:
act_map = activity_map(dataset_root)

In [34]:
print act_map

{1: 'WALKING', 2: 'WALKING_UPSTAIRS', 3: 'WALKING_DOWNSTAIRS', 4: 'SITTING', 5: 'STANDING', 6: 'LAYING', 7: 'STAND_TO_SIT', 8: 'SIT_TO_STAND', 9: 'SIT_TO_LIE', 10: 'LIE_TO_SIT', 11: 'STAND_TO_LIE', 12: 'LIE_TO_STAND'}


In [35]:
gyro_files = glob(os.path.join(dataset_root, "RawData", "gyro_*.txt"))
acc_files = glob(os.path.join(dataset_root, "RawData", "acc_*.txt"))
label_file = os.path.join(dataset_root, "RawData", "labels.txt")                        

In [36]:
print len(gyro_files)
print len(acc_files)

61
61


### Turicreate

In [48]:
def find_label_for_containing_interval(intervals, index):
    shifted_index = index + intervals[:,1][0]
    containing_interval = intervals[:, 0][(intervals[:, 1] <= shifted_index) & (shifted_index <= intervals[:, 2])]
    
    if len(containing_interval) == 1:
        return containing_interval[0]
        

In [49]:
labels = tc.SFrame.read_csv(dataset_root + '/RawData/labels.txt', delimiter=' ', header=False, verbose=False)
labels = labels.rename({'X1': 'exp_id', 'X2': 'user_id', 'X3': 'activity_id', 'X4': 'start', 'X5': 'end'})
labels

exp_id,user_id,activity_id,start,end
1,1,5,250,1232
1,1,7,1233,1392
1,1,4,1393,2194
1,1,8,2195,2359
1,1,5,2360,3374
1,1,11,3375,3662
1,1,6,3663,4538
1,1,10,4539,4735
1,1,4,4736,5667
1,1,9,5668,5859


In [64]:
data = tc.SFrame()
files = zip(sorted(acc_files), sorted(gyro_files))
for acc_file, gyro_file in files:
    exp_id = int(acc_file.split('_')[1][-2:])
    usr_id = int(acc_file.split('_')[2].split('.')[0][-2:])
    # Load accel data
    sf = tc.SFrame.read_csv(acc_file, delimiter=' ', header=False, verbose=False)
    sf = sf.rename({'X1': 'acc_x', 'X2': 'acc_y', 'X3': 'acc_z'})

    # Load gyro data
    gyro_sf = tc.SFrame.read_csv(gyro_file, delimiter=' ', header=False, verbose=False)
    gyro_sf = gyro_sf.rename({'X1': 'gyro_x', 'X2': 'gyro_y', 'X3': 'gyro_z'})
    sf = sf.add_columns(gyro_sf)

    
    # Calc labels
    exp_labels = labels[labels['exp_id'] == exp_id][['activity_id', 'start', 'end']].to_numpy()
    sf = sf.add_row_number()
    sf['activity_id'] = sf['id'].apply(lambda x: find_label_for_containing_interval(exp_labels, x))
    sf = sf.remove_columns(['id'])
    sf['exp_id'] = exp_id
    sf['usr_id'] = usr_id
    data = data.append(sf)

In [65]:
data_df = data.to_dataframe()

In [66]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1122772 entries, 0 to 1122771
Data columns (total 9 columns):
acc_x          1122772 non-null float64
acc_y          1122772 non-null float64
acc_z          1122772 non-null float64
gyro_x         1122772 non-null float64
gyro_y         1122772 non-null float64
gyro_z         1122772 non-null float64
activity_id    815614 non-null float64
exp_id         1122772 non-null int64
usr_id         1122772 non-null int64
dtypes: float64(7), int64(2)
memory usage: 77.1 MB


In [67]:
data_df.to_pickle('dataset_imputed.pkl')

In [68]:
def get_activity(act_id):
    if pd.isnull(act_id):
        return '<IDLE>'
    else: 
        return act_map[int(act_id)]

In [69]:
train, test = tc.activity_classifier.util.random_split_by_session(data, session_id='exp_id', fraction=0.7)
train.materialize()
test.materialize()

In [70]:
# Imputed Dataset Splits
temp_df = train.to_dataframe()
temp_df['activity'] = temp_df['activity_id'].apply(lambda x: get_activity(x))
temp_df = temp_df.drop(columns = ['activity_id'])
temp_df.to_pickle('train_imputed.pkl')

temp_df = test.to_dataframe()
temp_df['activity'] = temp_df['activity_id'].apply(lambda x: get_activity(x))
temp_df = temp_df.drop(columns = ['activity_id'])
temp_df.to_pickle('test_imputed.pkl')

In [72]:
data_df['activity'] = data_df['activity_id'].apply(lambda x: get_activity(x))
data_df = data_df.drop(columns = ['activity_id'])
data_df = data_df[data_df['activity'] != '<IDLE>']
data_df.info()
data_df.to_pickle('dataset.pkl')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 815614 entries, 0 to 1121213
Data columns (total 9 columns):
acc_x       815614 non-null float64
acc_y       815614 non-null float64
acc_z       815614 non-null float64
gyro_x      815614 non-null float64
gyro_y      815614 non-null float64
gyro_z      815614 non-null float64
exp_id      815614 non-null int64
usr_id      815614 non-null int64
activity    815614 non-null object
dtypes: float64(6), int64(2), object(1)
memory usage: 62.2+ MB


In [73]:
data =  tc.SFrame(data_df)
train, test = tc.activity_classifier.util.random_split_by_session(data, session_id='exp_id', fraction=0.7)
train.materialize()
test.materialize()

In [75]:
# Non-Imputed Dataset Splits
temp_df = train.to_dataframe()
temp_df.to_pickle('train.pkl')
temp_df = test.to_dataframe()
temp_df.to_pickle('test.pkl')

In [158]:
gyro_data = []
acc_data = []
exp_ids = []
sub_ids = []
for g_f, _ in zip(gyro_files, acc_files):
    file_name = os.path.basename(g_f)
    _, exp_id, sub_id = file_name.split("_")
    a_f = os.path.join(
        os.path.dirname(g_f), "acc_" + exp_id + "_" + sub_id)

    exp_ids.append(int(exp_id[3:]))
    sub_ids.append(int(sub_id.split(".")[0][4:]))

    gyro_ = np.loadtxt(g_f, delimiter=" ")
    acc_ = np.loadtxt(a_f, delimiter=" ")
    
    gyro_data.append(gyro_)
    acc_data.append(acc_)
    
print len(gyro_data[0])
print len(acc_data)
# gyro_data = np.vstack(gyro_data)
# acc_data = np.vstack(acc_data)

sensor_data = np.hstack([acc_data, gyro_data])

20598
61


In [138]:
sensor_data.shape

(122,)

In [295]:
label_data = pd.read_csv(label_file, sep=" ", header=None, names=['exp_id', 'user_id', 'activity_id', 'start', 'end'])

In [296]:
label_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1214 entries, 0 to 1213
Data columns (total 5 columns):
exp_id         1214 non-null int64
user_id        1214 non-null int64
activity_id    1214 non-null int64
start          1214 non-null int64
end            1214 non-null int64
dtypes: int64(5)
memory usage: 47.5 KB


In [297]:
label_gp = label_data.groupby(['exp_id'])

In [310]:
df_dict = {}
for key, item in label_gp:
    df_dict[key] = pd.DataFrame(label_gp.get_group(key))
    print df_dict[key]

    exp_id  user_id  activity_id  start    end
0        1        1            5    250   1232
1        1        1            7   1233   1392
2        1        1            4   1393   2194
3        1        1            8   2195   2359
4        1        1            5   2360   3374
5        1        1           11   3375   3662
6        1        1            6   3663   4538
7        1        1           10   4539   4735
8        1        1            4   4736   5667
9        1        1            9   5668   5859
10       1        1            6   5860   6786
11       1        1           12   6787   6977
12       1        1            1   7496   8078
13       1        1            1   8356   9250
14       1        1            1   9657  10567
15       1        1            1  10750  11714
16       1        1            3  13191  13846
17       1        1            2  14069  14699
18       1        1            3  14869  15492
19       1        1            2  15712  16377
20       1   

     exp_id  user_id  activity_id  start    end
673      35       17            5    414   1680
674      35       17            7   1681   1834
675      35       17            4   1835   2969
676      35       17            8   2970   3127
677      35       17            5   3128   4409
678      35       17           11   4410   4745
679      35       17            6   4746   5811
680      35       17           10   5812   6082
681      35       17            4   6083   7145
682      35       17            9   7146   7347
683      35       17            6   7348   8875
684      35       17           12   8876   9113
685      35       17            1  10091  11142
686      35       17            1  11545  12559
687      35       17            3  13292  13886
688      35       17            2  14099  14736
689      35       17            3  15049  15585
690      35       17            2  15877  16503
691      35       17            3  16811  17400
692      35       17            2  17666

In [299]:
type(df_dict[1])

pandas.core.frame.DataFrame

In [209]:
def expand_labels(exp_labels, length):
#     print(exp_labels)
#     exp_labels = exp_labels.sort_values(by=["start"], axis=1)
    activity_labels = exp_labels.apply(lambda x: expand_along_activity(x), axis=1, reduce=True)
    activity_labels = np.hstack(activity_labels)
    
#     print(activity_labels.shape, length)
    
def expand_along_activity(x):
#     print(x)
    start = x['start']
    end = x['end']
    activity = x['activity_id']
    return [activity]*(end-start)

In [210]:
for exp in exp_ids:
    labels = label_data[label_data['exp_id']==exp]
    labels.apply(lambda x: expand_labels)

In [300]:
label_data.head()

Unnamed: 0,exp_id,user_id,activity_id,start,end
0,1,1,5,250,1232
1,1,1,7,1233,1392
2,1,1,4,1393,2194
3,1,1,8,2195,2359
4,1,1,5,2360,3374


In [301]:
label_data.sort_values(['start'], ascending=[1])

Unnamed: 0,exp_id,user_id,activity_id,start,end
167,9,5,5,136,1221
187,10,5,5,153,1152
593,31,15,5,185,1229
473,25,12,5,189,1753
126,7,4,5,198,1291
308,16,8,5,202,1313
227,12,6,5,207,1218
533,28,14,5,210,1372
207,11,6,5,210,1116
147,8,4,5,230,1292


In [312]:
def loop_activities(exp,user,activity,n_rows):
    rows_act = []
    for i in xrange(n_rows):
        new_row = []
        new_row.append(exp)
        new_row.append(user)
        new_row.append(activity)
        rows_act.append(new_row)
    return rows_act
    

In [359]:
labels_padded_list = []
time_ac = []
time_my = []
for g in df_dict.keys():
    
    g_data = df_dict[g]
    fr = g_data.iloc[0:]['start'].values[0]
    lr = g_data.iloc[-1:]['end'].values[0]
    time_ac.append(lr-fr)
    print "timesteps:"+str(lr - fr)
#     print g_data
    total_time = 0
    for i,(index, row) in enumerate(g_data.iterrows(),0):
        
        timesteps = g_data.iloc[i]['end'] - g_data.iloc[i]['start']
        
        if i == 0:
            rows_act = loop_activities(row['exp_id'],row['user_id'],act_map[str(row['activity_id'])],timesteps)
            labels_padded_list.extend(rows_act)
            total_time+=timesteps
#             print "norm:"+str(timesteps)
            continue
    #     print labels.iloc[i]['start'] - labels.iloc[i-1]['end']
#         print i
        if g_data.iloc[i]['start'] - g_data.iloc[i-1]['end'] > 1:
            idle_time = g_data.iloc[i]['start'] - g_data.iloc[i-1]['end'] - 2
            rows_act = loop_activities(row['exp_id'],row['user_id'],act_map['0'],idle_time)
            labels_padded_list.extend(rows_act)
            total_time+=idle_time
#             print "idle:"+str(idle_time)
            rows_act = loop_activities(row['exp_id'],row['user_id'],act_map[str(row['activity_id'])],timesteps)
            labels_padded_list.extend(rows_act)
            total_time+=timesteps
#             print "norm:"+str(timesteps)
        else:
            rows_act = loop_activities(row['exp_id'],row['user_id'],act_map[str(row['activity_id'])],timesteps)
            labels_padded_list.extend(rows_act)
            total_time+=timesteps
    time_my.append(total_time)
#             print "norm:"+str(timesteps)
#     row = g_data.iloc[-1:]
#     timesteps = row['end'].values[0] - row['start'].values[0]
#     rows_act = loop_activities(str(row['exp_id'].values[0]),str(row['user_id'].values[0]),act_map[str(row['activity_id'].values[0])],timesteps)
#     labels_padded_list.extend(rows_act)
#     print "norm:"+str(timesteps)
#     print len(labels_padded_list)

timesteps:17720
timesteps:18174
timesteps:16572
timesteps:14750
timesteps:19909
timesteps:16297
timesteps:16616
timesteps:14777
timesteps:15589
timesteps:14010
timesteps:15553
timesteps:14875
timesteps:15818
timesteps:14611
timesteps:14571
timesteps:15504
timesteps:14988
timesteps:14006
timesteps:14663
timesteps:6721
timesteps:6679
timesteps:14958
timesteps:15176
timesteps:15073
timesteps:15025
timesteps:16665
timesteps:15418
timesteps:16497
timesteps:15411
timesteps:15675
timesteps:15551
timesteps:16993
timesteps:17781
timesteps:19526
timesteps:17889
timesteps:20709
timesteps:18176
timesteps:16828
timesteps:15711
timesteps:17882
timesteps:16435
timesteps:19389
timesteps:18028
timesteps:16371
timesteps:16028
timesteps:17501
timesteps:16931
timesteps:19257
timesteps:17760
timesteps:19245
timesteps:19020
timesteps:19391
timesteps:18673
timesteps:16907
timesteps:17594
timesteps:19274
timesteps:15830
timesteps:17363
timesteps:16288
timesteps:18806
timesteps:17523


In [360]:
time_ac = np.array(time_ac)

In [362]:
time_my = np.array(time_my)

In [361]:
time_ac.sum()

1002961

In [363]:
time_my.sum()

1001297

In [349]:

labels_padded_df = pd.DataFrame(labels_padded_list)
labels_padded_df.columns = ['exp_id', 'user_id', 'activity_name']

In [350]:
labels_padded_df

Unnamed: 0,exp_id,user_id,activity_name
0,1,1,STANDING
1,1,1,STANDING
2,1,1,STANDING
3,1,1,STANDING
4,1,1,STANDING
5,1,1,STANDING
6,1,1,STANDING
7,1,1,STANDING
8,1,1,STANDING
9,1,1,STANDING


In [112]:
labels = pd.read_csv(label_file, delimiter=' ', header=None, verbose=False, names=['exp_id', 'user_id', 'activity_id', 'start', 'end'])

In [113]:
data = pd.DataFrame()
files = zip(sorted(acc_files), sorted(gyro_files))
for acc_file, gyro_file in files:
    _, exp_id, sub_id = os.path.basename(acc_file).split("_")
    
    exp_id = int(exp_id[3:])
    user_id = int(sub_id.split(".")[0][4:])

    # Load accel data
    sf = pd.read_csv(acc_file, delimiter=' ', header=None, verbose=False, names=['acc_x', 'acc_y', 'acc_z'])
    sf.insert(0, 'exp_id',  [exp_id]*len(sf))
    sf.insert(0, 'user_id',  [user_id]*len(sf))
    
    # Load gyro data
    gyro_sf = pd.read_csv(gyro_file, delimiter=' ', header=None, verbose=False, names=['gyro_x', 'gyro_y', 'gyro_z'])
    sf = pd.concat([sf, gyro_sf], axis=1)
    
    # Calc labels
    exp_labels = labels[labels['exp_id'] == exp_id]
#     sf.insert(0, "id", range(0, len(sf)))
    
    sf.insert(2, 'activity_id', expand_labels(exp_labels, len(sf)))

    data = data.append(sf)

  after removing the cwd from sys.path.


In [165]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1122772 entries, 0 to 19081
Data columns (total 9 columns):
user_id        1122772 non-null int64
exp_id         1122772 non-null int64
activity_id    0 non-null object
acc_x          1122772 non-null float64
acc_y          1122772 non-null float64
acc_z          1122772 non-null float64
gyro_x         1122772 non-null float64
gyro_y         1122772 non-null float64
gyro_z         1122772 non-null float64
dtypes: float64(6), int64(2), object(1)
memory usage: 85.7+ MB


In [None]:
                           Score
Classifiers                     
KNeighborsClassifier    0.748558
GaussianNB              0.724805
SVC                     0.769596
DecisionTreeClassifier  0.725144
Classifier:KNeighborsClassifier
              precision    recall  f1-score   support

           0       0.93      0.73      0.82       496
           1       0.96      0.75      0.84       471
           2       0.99      0.36      0.53       420
           3       0.61      0.82      0.70       491
           4       0.48      0.75      0.58       532
           5       1.00      1.00      1.00       537

   micro avg       0.75      0.75      0.75      2947
   macro avg       0.83      0.74      0.75      2947
weighted avg       0.82      0.75      0.75      2947

Classifier:GaussianNB
              precision    recall  f1-score   support

           0       0.49      0.77      0.60       496
           1       0.67      0.47      0.55       471
           2       0.72      0.59      0.65       420
           3       0.91      0.57      0.70       491
           4       0.71      0.89      0.79       532
           5       1.00      0.99      1.00       537

   micro avg       0.72      0.72      0.72      2947
   macro avg       0.75      0.71      0.72      2947
weighted avg       0.76      0.72      0.72      2947

Classifier:SVC
              precision    recall  f1-score   support

           0       0.57      0.82      0.67       496
           1       0.74      0.55      0.64       471
           2       0.69      0.52      0.59       420
           3       0.87      0.76      0.81       491
           4       0.78      0.89      0.83       532
           5       1.00      1.00      1.00       537

   micro avg       0.77      0.77      0.77      2947
   macro avg       0.78      0.76      0.76      2947
weighted avg       0.78      0.77      0.77      2947

Classifier:DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.60      0.60      0.60       496
           1       0.66      0.60      0.63       471
           2       0.61      0.62      0.62       420
           3       0.70      0.74      0.72       491
           4       0.73      0.74      0.74       532
           5       1.00      1.00      1.00       537

   micro avg       0.73      0.73      0.73      2947
   macro avg       0.72      0.72      0.72      2947
weighted avg       0.72      0.73      0.72      2947

              precision    recall  f1-score   support

           0       0.42      0.36      0.39       496
           1       0.49      0.45      0.47       471
           2       0.44      0.28      0.34       420
           3       0.65      0.75      0.70       491
           4       0.52      0.70      0.60       532
           5       1.00      1.00      1.00       537

   micro avg       0.61      0.61      0.61      2947
   macro avg       0.59      0.59      0.58      2947
weighted avg       0.60      0.61      0.60      2947