## Load Dataset and Create Dataframe

In [1]:
import pandas as pd
import os
from random import randrange
%load_ext autoreload
%autoreload 2

In [2]:
def load_to_dataframe(path, field_names, device):
    """Read one header-less sensor CSV into a DataFrame with 'timestamp' first.

    Parameters
    ----------
    path : str or path-like
        File passed to ``pd.read_csv(..., header=None)``.
    field_names : list of str
        Names given to the first ``len(field_names)`` columns, in file order.
        One of them must be ``'timestamp'``.
    device : str
        ``'phone'`` yields phone=1 / watch=0; any other value yields
        phone=0 / watch=1.

    Returns
    -------
    pandas.DataFrame
        Columns reordered so 'timestamp' comes first, the last data column
        converted to float with any ';' removed, 'timestamp' converted with
        ``pd.to_datetime``, and two indicator columns 'phone' and 'watch'.

    Raises
    ------
    ValueError
        If no column ends up named 'timestamp', or if a value in the last
        column cannot be parsed as a float once ';' is removed.
    """
    data = pd.read_csv(path, header=None)
    data = data.rename(columns={i: field_names[i] for i in range(len(field_names))})

    # change column order: timestamp first, everything else in file order
    cols = data.columns.to_list()
    if 'timestamp' not in cols:
        raise ValueError("field_names must name a 'timestamp' column present in the file")
    cols.remove('timestamp')
    cols.insert(0, 'timestamp')
    data = data[cols]

    # The last column may carry a ';' (e.g. '0.25;'). Going through str() first
    # means values that pandas already parsed as numbers (or missing values)
    # are handled too instead of failing on a missing .replace attribute.
    last_col = cols[-1]
    data[last_col] = data[last_col].apply(lambda value: float(str(value).replace(';', '')))

    data['timestamp'] = pd.to_datetime(data['timestamp'])

    # add device columns (anything other than 'phone' is treated as the watch)
    is_phone = device == 'phone'
    data['phone'] = int(is_phone)
    data['watch'] = int(not is_phone)

    return data

In [3]:
dirs = ['datasets/phone/accel/', 'datasets/phone/gyro/', 'datasets/watch/accel/', 'datasets/watch/gyro/']

def make_filenames_list(path):
    """Return the sorted paths of all entries in `path`, skipping '.DS_Store'.

    Each entry name is joined to `path` with ``os.path.join``, so the
    directory may be given with or without a trailing separator.
    """
    return sorted(
        os.path.join(path, name)
        for name in os.listdir(path)
        if name != '.DS_Store'
    )

# one sorted file list per sensor directory, in the same order as `dirs`
datasets = [make_filenames_list(sensor_dir) for sensor_dir in dirs]

In [4]:
# single-letter activity codes (as they appear in the 'activity' column)
# mapped to readable activity names
labels = dict(
    A='walking',
    B='jogging',
    C='stairs',
    D='sitting',
    E='standing',
    F='typing',
    G='brushing_teeth',
    H='eating_soup',
    I='eating_chips',
    J='eating_pasta',
    K='drinking_from_cup',
    L='eating_sandwich',
    M='kicking',
    O='playing_catch',
    P='dribbling',
    Q='writing',
    R='clapping',
    S='folding',
)

# Column names for the raw sensor files; the first three are shared by both sensors.
# NOTE(review): gyroscopes normally report angular velocity, so the '(m/s^2)' suffix
# on the gyro names looks copied from the accelerometer; it is left unchanged because
# later cells select columns by these exact strings.
_shared_fields = ['id', 'activity', 'timestamp']
field_names_accel = _shared_fields + [
    'Acceleration x (m/s^2)',
    'Acceleration y (m/s^2)',
    'Acceleration z (m/s^2)',
]
field_names_gyro = _shared_fields + [
    'Gyroscope x (m/s^2)',
    'Gyroscope y (m/s^2)',
    'Gyroscope z (m/s^2)',
]

In [5]:
def make_df(path, field_names, device):
    """Load one sensor file and split it into one DataFrame per activity.

    The file is read with ``load_to_dataframe`` and grouped on the 'activity'
    column; the per-group frames are returned as a list, in the order pandas
    reports the groups.
    """
    loaded = load_to_dataframe(path, field_names, device)
    return [activity_frame for _, activity_frame in loaded.groupby('activity')]

In [6]:
# number of files found in the first sensor directory (dirs[0])
len(datasets[0])

51

In [7]:
# participant_no = randrange(len(datasets[0]))  # was randrange(52); only 51 files exist, so index 51 would be out of range

In [8]:
# position in the sorted per-sensor file lists (not the subject id stored in the data)
participant_no = 21

In [9]:
# take the same list position from each of the four per-sensor file lists
phone_accel_file, phone_gyro_file, watch_accel_file, watch_gyro_file = (
    sensor_files[participant_no] for sensor_files in datasets
)

In [10]:
# split each of the four selected sensor files into per-activity frames
phone_accel_list = make_df(path=phone_accel_file, field_names=field_names_accel, device='phone')
phone_gyro_list = make_df(path=phone_gyro_file, field_names=field_names_gyro, device='phone')
watch_accel_list = make_df(path=watch_accel_file, field_names=field_names_accel, device='watch')
watch_gyro_list = make_df(path=watch_gyro_file, field_names=field_names_gyro, device='watch')

In [58]:
# inspect the last per-activity frame of the phone accelerometer data
phone_accel_list[-1]

Unnamed: 0,timestamp,id,activity,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),phone,watch
77177,1970-01-02 23:31:40.062018259,1621,S,0.462679,8.185177,-0.650026,1,0
77178,1970-01-02 23:31:40.084421228,1621,S,0.046088,8.541314,-0.271143,1,0
77179,1970-01-02 23:31:40.124182530,1621,S,0.494403,10.111312,-0.038906,1,0
77180,1970-01-02 23:31:40.163981279,1621,S,0.440533,9.911995,-0.849343,1,0
77181,1970-01-02 23:31:40.204192791,1621,S,0.377087,9.461287,-1.225831,1,0
...,...,...,...,...,...,...,...,...
81683,1970-01-02 23:34:39.811654907,1621,S,-0.620098,9.828198,-0.551265,1,0
81684,1970-01-02 23:34:39.851297722,1621,S,-1.008557,9.937134,-0.517746,1,0
81685,1970-01-02 23:34:39.891509022,1621,S,-0.910395,9.874287,-0.440533,1,0
81686,1970-01-02 23:34:39.931381679,1621,S,-0.551864,9.809045,-0.395642,1,0


## Preprocess data

In [12]:
from Chapter2.CreateDataset_old import CreateDataset
from util.VisualizeDataset import VisualizeDataset
from util import util
from pathlib import Path
import copy
import sys
import pickle
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from tqdm import tqdm
from functools import reduce

In [13]:
# aggregate phone data: feed every per-activity frame of each sensor into
# CreateDataset, concatenate per sensor, then merge the two sensors
# (what each CreateDataset method does internally is defined in Chapter2.CreateDataset_old)
preprocessed_phone = CreateDataset('', granularity=250)

for sensor_name, sensor_frames in (('accel', phone_accel_list), ('gyro', phone_gyro_list)):
    for activity_df in sensor_frames:
        preprocessed_phone.add_numerical_dataset(activity_df, sensor_name)
    preprocessed_phone.concat_datasets(sensor_name)

preprocessed_phone.merge_datasets()

In [14]:
# column order for the merged tables (reused for the watch table further down)
cols = [
    'id', 'activity', 'phone', 'watch',
    'Acceleration x (m/s^2)', 'Acceleration y (m/s^2)', 'Acceleration z (m/s^2)',
    'Gyroscope x (m/s^2)', 'Gyroscope y (m/s^2)', 'Gyroscope z (m/s^2)',
    '_merge',
]

preprocessed_phone.data_table = preprocessed_phone.data_table.loc[:, cols]

In [30]:
# index values of the merged phone table as a plain Python list
# (not referenced again in the cells visible here)
timestamps = preprocessed_phone.data_table.index.to_list()

In [39]:
# rows of the merged phone table whose activity code is 'B'
preprocessed_phone.data_table[preprocessed_phone.data_table['activity'] == 'B']

Unnamed: 0_level_0,id,activity,phone,watch,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),Gyroscope x (m/s^2),Gyroscope y (m/s^2),Gyroscope z (m/s^2),_merge
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1970-01-02 22:24:11.500,1621,B,1,0,4.034529,7.090029,0.526325,-0.060187,0.095341,-0.311057,both
1970-01-02 22:24:11.750,1621,B,1,0,1.229921,6.679823,-3.969786,1.037035,1.504864,-0.300227,both
1970-01-02 22:24:12.000,1621,B,1,0,1.938448,10.001521,-1.370253,-0.148985,0.237706,-0.020240,both
1970-01-02 22:24:12.250,1621,B,1,0,2.524287,8.686662,-1.346539,-0.781371,-0.196719,0.156949,both
1970-01-02 22:24:12.500,1621,B,1,0,3.896364,11.012430,1.032200,-0.114161,0.656736,-0.454690,both
...,...,...,...,...,...,...,...,...,...,...,...
1970-01-02 22:27:10.250,1621,B,1,0,0.838484,5.687084,1.320659,-0.224010,-0.804275,0.200270,both
1970-01-02 22:27:10.500,1621,B,1,0,-1.040380,7.798713,-0.002294,-0.712839,0.110965,-0.468006,both
1970-01-02 22:27:10.750,1621,B,1,0,0.889745,10.819497,0.938128,0.395568,0.757758,-0.025389,both
1970-01-02 22:27:11.000,1621,B,1,0,0.100385,5.012347,1.661747,-0.329167,0.036828,0.278795,both


In [15]:
# dtypes and non-null counts of the merged phone table
preprocessed_phone.data_table.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12972 entries, 1970-01-02 22:20:33.750000 to 1970-01-02 23:34:40
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   id                      12972 non-null  int64   
 1   activity                12972 non-null  object  
 2   phone                   12972 non-null  int64   
 3   watch                   12972 non-null  int64   
 4   Acceleration x (m/s^2)  12971 non-null  float64 
 5   Acceleration y (m/s^2)  12971 non-null  float64 
 6   Acceleration z (m/s^2)  12971 non-null  float64 
 7   Gyroscope x (m/s^2)     12972 non-null  float64 
 8   Gyroscope y (m/s^2)     12972 non-null  float64 
 9   Gyroscope z (m/s^2)     12972 non-null  float64 
 10  _merge                  12972 non-null  category
dtypes: category(1), float64(6), int64(3), object(1)
memory usage: 1.1+ MB


In [16]:
# number of missing values per column in the merged phone table
preprocessed_phone.data_table.isna().sum()

id                        0
activity                  0
phone                     0
watch                     0
Acceleration x (m/s^2)    1
Acceleration y (m/s^2)    1
Acceleration z (m/s^2)    1
Gyroscope x (m/s^2)       0
Gyroscope y (m/s^2)       0
Gyroscope z (m/s^2)       0
_merge                    0
dtype: int64

In [17]:
# preprocessed_phone.data_table.loc[preprocessed_phone.data_table['activity'] == 'D', :]

In [18]:
# rows where the accel/gyro merge found data on one side only ('_merge' is not 'both')
preprocessed_phone.data_table[preprocessed_phone.data_table['_merge'] != 'both']

Unnamed: 0_level_0,id,activity,phone,watch,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),Gyroscope x (m/s^2),Gyroscope y (m/s^2),Gyroscope z (m/s^2),_merge
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1970-01-02 23:34:40,1621,S,1,0,,,,-0.094809,0.0,0.084156,right_only


In [19]:
# save dataframe to csv; the number in the file name is participant_no
# (the list position chosen earlier), not the subject id in the 'id' column
phone_csv_path = Path('intermediate_datafiles/preprocessed_phone_data_person_{}.csv'.format(participant_no))
# to_csv does not create missing folders, so make sure the target directory exists
phone_csv_path.parent.mkdir(parents=True, exist_ok=True)
preprocessed_phone.data_table.to_csv(phone_csv_path)

In [20]:
# aggregate watch data: feed every per-activity frame of each sensor into
# CreateDataset, concatenate per sensor, then merge the two sensors
# (what each CreateDataset method does internally is defined in Chapter2.CreateDataset_old)
preprocessed_watch = CreateDataset('', granularity=250)

for sensor_name, sensor_frames in (('accel', watch_accel_list), ('gyro', watch_gyro_list)):
    for activity_df in sensor_frames:
        preprocessed_watch.add_numerical_dataset(activity_df, sensor_name)
    preprocessed_watch.concat_datasets(sensor_name)

preprocessed_watch.merge_datasets()

In [21]:
# put the watch table's columns in the same order as the phone table (`cols`)
preprocessed_watch.data_table = preprocessed_watch.data_table.loc[:, cols]

In [22]:
# display the merged watch table
preprocessed_watch.data_table

Unnamed: 0_level_0,id,activity,phone,watch,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),Gyroscope x (m/s^2),Gyroscope y (m/s^2),Gyroscope z (m/s^2),_merge
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1970-01-03 01:14:27.000,1621.0,A,0.0,1.0,-4.573973,-8.353220,2.070985,0.773692,-0.355514,0.165107,both
1970-01-03 01:14:27.250,1621.0,A,0.0,1.0,-2.865710,-7.348733,-0.980665,-3.969433,0.894113,-0.683696,both
1970-01-03 01:14:27.500,1621.0,A,0.0,1.0,-2.073708,-7.208911,-7.482838,-0.212902,-0.045846,-0.408432,both
1970-01-03 01:14:27.750,1621.0,A,0.0,1.0,-2.842566,-4.964826,-8.095993,-1.392197,-0.615743,-0.372852,both
1970-01-03 01:14:28.000,1621.0,A,0.0,1.0,-3.209038,-5.278227,-6.037698,0.850212,-0.022589,-0.084165,both
...,...,...,...,...,...,...,...,...,...,...,...
1970-01-03 02:28:27.000,1621.0,S,0.0,1.0,5.044433,-4.596987,5.981524,-0.613955,-1.475738,0.280963,both
1970-01-03 02:28:27.250,1621.0,S,0.0,1.0,9.728928,-5.214212,5.998283,-1.108878,-0.395771,-0.180083,both
1970-01-03 02:28:27.500,1621.0,S,0.0,1.0,4.657052,-10.631333,-1.227896,-2.777724,0.597272,1.910605,both
1970-01-03 02:28:27.750,1621.0,S,0.0,1.0,1.127998,-8.841428,-0.667653,0.186056,0.225919,2.030127,both


In [23]:
# rows where the accel/gyro merge found data on one side only ('_merge' is not 'both')
preprocessed_watch.data_table[preprocessed_watch.data_table['_merge'] != 'both']

Unnamed: 0_level_0,id,activity,phone,watch,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),Gyroscope x (m/s^2),Gyroscope y (m/s^2),Gyroscope z (m/s^2),_merge
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1970-01-03 01:21:44.500,1621.0,P,0.0,1.0,10.319159,-16.939426,4.869806,,,,left_only
1970-01-03 01:24:44.500,1621.0,P,0.0,1.0,-11.798476,-19.649662,19.64203,,,,left_only
1970-01-03 01:28:59.000,1621.0,O,0.0,1.0,17.11151,-9.95554,-6.964733,,,,left_only
1970-01-03 01:49:38.000,1621.0,H,0.0,1.0,-3.504363,-8.239196,5.701941,,,,left_only
1970-01-03 02:07:54.500,1621.0,G,0.0,1.0,-10.199449,-0.374094,-3.92664,,,,left_only


In [24]:
# dtypes and non-null counts of the merged watch table
# NOTE(review): id/phone/watch show up as float64 with one missing value each in the
# output below, presumably from a row that only exists on the gyro side; confirm
preprocessed_watch.data_table.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12965 entries, 1970-01-03 01:14:27 to 1970-01-03 02:28:28
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   id                      12964 non-null  float64 
 1   activity                12965 non-null  object  
 2   phone                   12964 non-null  float64 
 3   watch                   12964 non-null  float64 
 4   Acceleration x (m/s^2)  12964 non-null  float64 
 5   Acceleration y (m/s^2)  12964 non-null  float64 
 6   Acceleration z (m/s^2)  12964 non-null  float64 
 7   Gyroscope x (m/s^2)     12959 non-null  float64 
 8   Gyroscope y (m/s^2)     12959 non-null  float64 
 9   Gyroscope z (m/s^2)     12959 non-null  float64 
 10  _merge                  12965 non-null  category
dtypes: category(1), float64(9), object(1)
memory usage: 1.1+ MB


In [25]:
# number of missing values per column in the merged watch table
preprocessed_watch.data_table.isna().sum()

id                        1
activity                  0
phone                     1
watch                     1
Acceleration x (m/s^2)    1
Acceleration y (m/s^2)    1
Acceleration z (m/s^2)    1
Gyroscope x (m/s^2)       6
Gyroscope y (m/s^2)       6
Gyroscope z (m/s^2)       6
_merge                    0
dtype: int64

In [26]:
# save dataframe to csv; the number in the file name is participant_no
# (the list position chosen earlier), not the subject id in the 'id' column
watch_csv_path = Path('intermediate_datafiles/preprocessed_watch_data_person_{}.csv'.format(participant_no))
# to_csv does not create missing folders, so make sure the target directory exists
watch_csv_path.parent.mkdir(parents=True, exist_ok=True)
preprocessed_watch.data_table.to_csv(watch_csv_path)