## Load Dataset and Create Dataframe

In [1]:
import pandas as pd
import os
from random import randrange
%load_ext autoreload
%autoreload 2

In [2]:
def load_to_dataframe(path, field_names, device):
    """Read one raw sensor CSV into a DataFrame with named columns.

    The file is read without a header row. The first ``len(field_names)``
    columns are renamed to ``field_names``, ``timestamp`` is moved to the
    front, every ``;`` is removed from the last column before it is cast to
    float, and ``timestamp`` is parsed with ``pd.to_datetime``.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pd.read_csv``.
    field_names : list of str
        Column names in file order; must contain ``'timestamp'``.
    device : str
        ``'phone'`` sets ``phone=1, watch=0``; any other value sets
        ``phone=0, watch=1``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``timestamp``, the remaining fields in file order, then the
        ``phone`` and ``watch`` indicator columns.

    Raises
    ------
    ValueError
        If ``'timestamp'`` is not among the resulting column names.
    """
    data = pd.read_csv(path, header=None)
    data = data.rename(columns=dict(enumerate(field_names)))

    # change column order: timestamp first, everything else keeps file order
    cols = data.columns.to_list()
    cols.remove('timestamp')
    cols.insert(0, 'timestamp')
    data = data[cols]

    # Strip the ';' from the last column and convert it to float.
    # Going through str first keeps this working when pandas has already
    # parsed the column as numeric (no ';' present) or a value is missing,
    # cases where calling .replace on each raw value would raise.
    last_col = cols[-1]
    data[last_col] = (
        data[last_col]
        .astype(str)
        .str.replace(';', '', regex=False)
        .astype(float)
    )

    data['timestamp'] = pd.to_datetime(data['timestamp'])

    # add device columns (anything other than 'phone' is flagged as watch)
    is_phone = device == 'phone'
    data['phone'] = int(is_phone)
    data['watch'] = int(not is_phone)

    return data

In [3]:
# one input directory per device/sensor combination
dirs = [
    'datasets/phone/accel/',
    'datasets/phone/gyro/',
    'datasets/watch/accel/',
    'datasets/watch/gyro/',
]

def make_filenames_list(path):
    """Return the sorted paths of the entries in ``path``, skipping '.DS_Store'.

    Each entry name is joined to ``path`` with ``os.path.join``, so the result
    is correct whether or not ``path`` ends with a separator.

    Parameters
    ----------
    path : str
        Directory to list.

    Returns
    -------
    list of str
        ``path`` joined with each entry name, in sorted order.
    """
    return sorted(
        os.path.join(path, name)
        for name in os.listdir(path)
        if name != '.DS_Store'
    )

# one sorted file listing per entry of `dirs`, in the same order
datasets = [make_filenames_list(directory) for directory in dirs]

In [4]:
# single-letter activity code -> readable activity name
# (the mapping has no 'N' entry)
labels = dict([
    ('A', 'walking'),
    ('B', 'jogging'),
    ('C', 'stairs'),
    ('D', 'sitting'),
    ('E', 'standing'),
    ('F', 'typing'),
    ('G', 'brushing_teeth'),
    ('H', 'eating_soup'),
    ('I', 'eating_chips'),
    ('J', 'eating_pasta'),
    ('K', 'drinking_from_cup'),
    ('L', 'eating_sandwich'),
    ('M', 'kicking'),
    ('O', 'playing_catch'),
    ('P', 'dribbling'),
    ('Q', 'writing'),
    ('R', 'clapping'),
    ('S', 'folding'),
])

# column names for the raw accelerometer files, in file order
field_names_accel = [
    'id',
    'activity',
    'timestamp',
    'Acceleration x (m/s^2)',
    'Acceleration y (m/s^2)',
    'Acceleration z (m/s^2)',
]
# column names for the raw gyroscope files, in file order
field_names_gyro = [
    'id',
    'activity',
    'timestamp',
    'Gyroscope x (rad/s)',
    'Gyroscope y (rad/s)',
    'Gyroscope z (rad/s)',
]

In [5]:
def make_df(path, field_names, device):
    """Load one sensor file and split it into one DataFrame per activity.

    The file is loaded with ``load_to_dataframe`` and grouped on the
    ``'activity'`` column.

    Returns
    -------
    list of pandas.DataFrame
        One frame per group, in the order of the groupby's ``groups`` keys.
    """
    by_activity = load_to_dataframe(path, field_names, device).groupby('activity')

    per_activity = []
    for activity_code in by_activity.groups:
        per_activity.append(by_activity.get_group(activity_code))
    return per_activity

In [6]:
# number of files found in dirs[0] ('datasets/phone/accel/')
len(datasets[0])

51

In [7]:
# Pick a random participant index. randrange's stop is exclusive, so bounding
# it by the shortest file listing keeps the index valid for all four lists.
# A hard-coded 52 can return 51, one past the last index when there are 51 files.
participant_no = randrange(min(len(files) for files in datasets))

In [8]:
# fixed participant index; this assignment replaces whatever value
# participant_no held before, so the rest of the notebook always uses 21
participant_no = 21

In [9]:
# the same participant index selects one file from each of the four listings
# (order follows `datasets`: phone accel, phone gyro, watch accel, watch gyro)
(
    phone_accel_file,
    phone_gyro_file,
    watch_accel_file,
    watch_gyro_file,
) = (files[participant_no] for files in datasets)

In [10]:
# each make_df call returns a list of per-activity DataFrames
phone_accel_list, phone_gyro_list = (
    make_df(phone_accel_file, field_names_accel, 'phone'),
    make_df(phone_gyro_file, field_names_gyro, 'phone'),
)
watch_accel_list, watch_gyro_list = (
    make_df(watch_accel_file, field_names_accel, 'watch'),
    make_df(watch_gyro_file, field_names_gyro, 'watch'),
)

In [11]:
# fourth per-activity frame of the watch accelerometer data
# (activity 'D' in the output shown below)
watch_accel_list[3]

Unnamed: 0,timestamp,id,activity,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),phone,watch
10806,1970-01-03 01:40:27.127288180,1621,D,8.008755,-4.649989,2.640805,0,1
10807,1970-01-03 01:40:27.176788180,1621,D,8.801235,-5.396979,2.341529,0,1
10808,1970-01-03 01:40:27.226288180,1621,D,8.592939,-4.061015,2.427720,0,1
10809,1970-01-03 01:40:27.275788180,1621,D,8.288876,-5.054609,2.597709,0,1
10810,1970-01-03 01:40:27.325288180,1621,D,8.655189,-3.182343,2.525883,0,1
...,...,...,...,...,...,...,...,...
18721,1970-01-03 01:43:26.985041501,1621,D,-0.120159,8.532186,4.580257,0,1
18722,1970-01-03 01:43:26.995012631,1621,D,-0.144101,8.503456,4.549133,0,1
18723,1970-01-03 01:43:27.004983761,1621,D,-0.170437,8.510639,4.553921,0,1
18724,1970-01-03 01:43:27.014954891,1621,D,-0.177620,8.491486,4.630536,0,1


## Preprocess data

In [12]:
from Chapter2.CreateDataset_old import CreateDataset
from util.VisualizeDataset import VisualizeDataset
from util import util
from pathlib import Path
import copy
import sys
import pickle
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from tqdm import tqdm
from functools import reduce

In [13]:
# aggregate phone data
# NOTE(review): granularity=250 is presumably milliseconds (the index of the
# resulting table steps by 250 ms in the displayed output) — confirm against CreateDataset.
preprocessed_phone = CreateDataset('', granularity=250)

# per sensor: add every per-activity frame, then concatenate that sensor's data
for sensor_name, sensor_frames in (('accel', phone_accel_list), ('gyro', phone_gyro_list)):
    for activity_df in sensor_frames:
        preprocessed_phone.add_numerical_dataset(activity_df, sensor_name)
    preprocessed_phone.concat_datasets(sensor_name)

# combine the accel and gyro data into preprocessed_phone.data_table
preprocessed_phone.merge_datasets()

In [14]:
# column order for the merged tables: identifiers, device flags,
# the three accelerometer axes, the three gyroscope axes, then '_merge'
cols = [
    'id',
    'activity',
    'phone',
    'watch',
    'Acceleration x (m/s^2)',
    'Acceleration y (m/s^2)',
    'Acceleration z (m/s^2)',
    'Gyroscope x (rad/s)',
    'Gyroscope y (rad/s)',
    'Gyroscope z (rad/s)',
    '_merge',
]

preprocessed_phone.data_table = preprocessed_phone.data_table[cols]

In [15]:
# replace label values: map the single-letter codes in the 'activity' column
# to the readable names defined in `labels`
preprocessed_phone.data_table = preprocessed_phone.data_table.replace({'activity': labels})

In [16]:
# one 0/1 indicator column per activity, named 'label_<activity>',
# created in order of first appearance in the table
labels_list = list(preprocessed_phone.data_table.activity.unique())

for label in labels_list:
    column_name = 'label_' + label
    matches_label = preprocessed_phone.data_table['activity'] == label
    preprocessed_phone.data_table[column_name] = matches_label.astype('int64')

In [17]:
# the '_merge' column is not needed once the sensors are combined
preprocessed_phone.data_table = preprocessed_phone.data_table.drop(columns=['_merge'])

In [18]:
# display the processed phone table (the rendered output is truncated)
preprocessed_phone.data_table

Unnamed: 0_level_0,id,activity,phone,watch,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),Gyroscope x (rad/s),Gyroscope y (rad/s),Gyroscope z (rad/s),...,label_eating_soup,label_standing,label_clapping,label_eating_chips,label_brushing_teeth,label_eating_pasta,label_eating_sandwich,label_writing,label_typing,label_folding
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-01-02 22:20:33.750,1621,walking,1,0,-4.883174,5.048174,5.753865,-0.581279,-1.742062,0.300049,...,0,0,0,0,0,0,0,0,0,0
1970-01-02 22:20:34.000,1621,walking,1,0,1.296289,4.474848,7.304880,-0.863169,-3.958675,0.747816,...,0,0,0,0,0,0,0,0,0,0
1970-01-02 22:20:34.250,1621,walking,1,0,8.671200,2.888305,3.798700,0.111498,-2.537282,0.423798,...,0,0,0,0,0,0,0,0,0,0
1970-01-02 22:20:34.500,1621,walking,1,0,9.415797,2.884714,1.443105,0.126411,-0.448476,0.045984,...,0,0,0,0,0,0,0,0,0,0
1970-01-02 22:20:34.750,1621,walking,1,0,8.788815,2.858876,0.847946,0.028585,-0.191215,-0.061608,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-02 23:34:39.000,1621,folding,1,0,1.660279,9.538400,0.291095,0.027874,0.000888,-0.077764,...,0,0,0,0,0,0,0,0,0,1
1970-01-02 23:34:39.250,1621,folding,1,0,0.959476,9.708830,0.012057,0.109722,-0.046111,-0.295687,...,0,0,0,0,0,0,0,0,0,1
1970-01-02 23:34:39.500,1621,folding,1,0,0.376289,9.648633,-0.197621,0.080605,-0.053796,-0.255486,...,0,0,0,0,0,0,0,0,0,1
1970-01-02 23:34:39.750,1621,folding,1,0,-0.600147,9.826702,-0.464874,0.033911,0.026632,-0.094986,...,0,0,0,0,0,0,0,0,0,1


In [19]:
# column dtypes and non-null counts of the phone table
preprocessed_phone.data_table.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12972 entries, 1970-01-02 22:20:33.750000 to 1970-01-02 23:34:40
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       12972 non-null  int64  
 1   activity                 12972 non-null  object 
 2   phone                    12972 non-null  int64  
 3   watch                    12972 non-null  int64  
 4   Acceleration x (m/s^2)   12971 non-null  float64
 5   Acceleration y (m/s^2)   12971 non-null  float64
 6   Acceleration z (m/s^2)   12971 non-null  float64
 7   Gyroscope x (rad/s)      12972 non-null  float64
 8   Gyroscope y (rad/s)      12972 non-null  float64
 9   Gyroscope z (rad/s)      12972 non-null  float64
 10  label_walking            12972 non-null  int64  
 11  label_jogging            12972 non-null  int64  
 12  label_dribbling          12972 non-null  int64  
 13  label_playing_catch      12972 non

In [20]:
# number of missing values per column of the phone table
preprocessed_phone.data_table.isna().sum()

id                         0
activity                   0
phone                      0
watch                      0
Acceleration x (m/s^2)     1
Acceleration y (m/s^2)     1
Acceleration z (m/s^2)     1
Gyroscope x (rad/s)        0
Gyroscope y (rad/s)        0
Gyroscope z (rad/s)        0
label_walking              0
label_jogging              0
label_dribbling            0
label_playing_catch        0
label_kicking              0
label_stairs               0
label_sitting              0
label_drinking_from_cup    0
label_eating_soup          0
label_standing             0
label_clapping             0
label_eating_chips         0
label_brushing_teeth       0
label_eating_pasta         0
label_eating_sandwich      0
label_writing              0
label_typing               0
label_folding              0
dtype: int64

In [21]:
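# inspect the rows of a single activity; the codes were replaced by names above, so filter on e.g. 'sitting' rather than 'D'
# preprocessed_phone.data_table.loc[preprocessed_phone.data_table['activity'] == 'sitting', :]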

In [22]:
# save dataframe to csv
# dropna() returns a new frame and leaves data_table unchanged, so its result
# has to be the frame that is written; otherwise the rows with missing values
# are still saved to the file.
preprocessed_phone.data_table.dropna().to_csv('intermediate_datafiles/data_phone.csv')

In [23]:
# aggregate watch data
# NOTE(review): granularity=250 is presumably milliseconds (the index of the
# resulting table steps by 250 ms in the displayed output) — confirm against CreateDataset.
preprocessed_watch = CreateDataset('', granularity=250)

# per sensor: add every per-activity frame, then concatenate that sensor's data
for sensor_name, sensor_frames in (('accel', watch_accel_list), ('gyro', watch_gyro_list)):
    for activity_df in sensor_frames:
        preprocessed_watch.add_numerical_dataset(activity_df, sensor_name)
    preprocessed_watch.concat_datasets(sensor_name)

# combine the accel and gyro data into preprocessed_watch.data_table
preprocessed_watch.merge_datasets()

In [24]:
# same column selection and order as for the phone table
# (`cols` is the list defined in an earlier cell)
preprocessed_watch.data_table = preprocessed_watch.data_table[cols]

In [25]:
# replace label values: map the single-letter codes in the 'activity' column
# to the readable names defined in `labels`
preprocessed_watch.data_table = preprocessed_watch.data_table.replace({'activity': labels})

In [26]:
# display the merged watch table ('_merge' is still present at this point)
preprocessed_watch.data_table

Unnamed: 0_level_0,id,activity,phone,watch,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),Gyroscope x (rad/s),Gyroscope y (rad/s),Gyroscope z (rad/s),_merge
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1970-01-03 01:14:27.000,1621.0,walking,0.0,1.0,-4.573973,-8.353220,2.070985,0.773692,-0.355514,0.165107,both
1970-01-03 01:14:27.250,1621.0,walking,0.0,1.0,-2.865710,-7.348733,-0.980665,-3.969433,0.894113,-0.683696,both
1970-01-03 01:14:27.500,1621.0,walking,0.0,1.0,-2.073708,-7.208911,-7.482838,-0.212902,-0.045846,-0.408432,both
1970-01-03 01:14:27.750,1621.0,walking,0.0,1.0,-2.842566,-4.964826,-8.095993,-1.392197,-0.615743,-0.372852,both
1970-01-03 01:14:28.000,1621.0,walking,0.0,1.0,-3.209038,-5.278227,-6.037698,0.850212,-0.022589,-0.084165,both
...,...,...,...,...,...,...,...,...,...,...,...
1970-01-03 02:28:27.000,1621.0,folding,0.0,1.0,5.044433,-4.596987,5.981524,-0.613955,-1.475738,0.280963,both
1970-01-03 02:28:27.250,1621.0,folding,0.0,1.0,9.728928,-5.214212,5.998283,-1.108878,-0.395771,-0.180083,both
1970-01-03 02:28:27.500,1621.0,folding,0.0,1.0,4.657052,-10.631333,-1.227896,-2.777724,0.597272,1.910605,both
1970-01-03 02:28:27.750,1621.0,folding,0.0,1.0,1.127998,-8.841428,-0.667653,0.186056,0.225919,2.030127,both


In [27]:
# one 0/1 indicator column per activity, named 'label_<activity>',
# created in order of first appearance in the table
labels_list = list(preprocessed_watch.data_table.activity.unique())

for label in labels_list:
    column_name = 'label_' + label
    matches_label = preprocessed_watch.data_table['activity'] == label
    preprocessed_watch.data_table[column_name] = matches_label.astype('int64')

In [28]:
# the '_merge' column is not needed once the sensors are combined
preprocessed_watch.data_table = preprocessed_watch.data_table.drop(columns=['_merge'])

In [29]:
# display the processed watch table (the rendered output is truncated)
preprocessed_watch.data_table

Unnamed: 0_level_0,id,activity,phone,watch,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),Gyroscope x (rad/s),Gyroscope y (rad/s),Gyroscope z (rad/s),...,label_eating_soup,label_standing,label_clapping,label_eating_chips,label_brushing_teeth,label_eating_pasta,label_eating_sandwich,label_writing,label_typing,label_folding
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-01-03 01:14:27.000,1621.0,walking,0.0,1.0,-4.573973,-8.353220,2.070985,0.773692,-0.355514,0.165107,...,0,0,0,0,0,0,0,0,0,0
1970-01-03 01:14:27.250,1621.0,walking,0.0,1.0,-2.865710,-7.348733,-0.980665,-3.969433,0.894113,-0.683696,...,0,0,0,0,0,0,0,0,0,0
1970-01-03 01:14:27.500,1621.0,walking,0.0,1.0,-2.073708,-7.208911,-7.482838,-0.212902,-0.045846,-0.408432,...,0,0,0,0,0,0,0,0,0,0
1970-01-03 01:14:27.750,1621.0,walking,0.0,1.0,-2.842566,-4.964826,-8.095993,-1.392197,-0.615743,-0.372852,...,0,0,0,0,0,0,0,0,0,0
1970-01-03 01:14:28.000,1621.0,walking,0.0,1.0,-3.209038,-5.278227,-6.037698,0.850212,-0.022589,-0.084165,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-03 02:28:27.000,1621.0,folding,0.0,1.0,5.044433,-4.596987,5.981524,-0.613955,-1.475738,0.280963,...,0,0,0,0,0,0,0,0,0,1
1970-01-03 02:28:27.250,1621.0,folding,0.0,1.0,9.728928,-5.214212,5.998283,-1.108878,-0.395771,-0.180083,...,0,0,0,0,0,0,0,0,0,1
1970-01-03 02:28:27.500,1621.0,folding,0.0,1.0,4.657052,-10.631333,-1.227896,-2.777724,0.597272,1.910605,...,0,0,0,0,0,0,0,0,0,1
1970-01-03 02:28:27.750,1621.0,folding,0.0,1.0,1.127998,-8.841428,-0.667653,0.186056,0.225919,2.030127,...,0,0,0,0,0,0,0,0,0,1


In [30]:
# column dtypes and non-null counts of the watch table
preprocessed_watch.data_table.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12965 entries, 1970-01-03 01:14:27 to 1970-01-03 02:28:28
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       12964 non-null  float64
 1   activity                 12965 non-null  object 
 2   phone                    12964 non-null  float64
 3   watch                    12964 non-null  float64
 4   Acceleration x (m/s^2)   12964 non-null  float64
 5   Acceleration y (m/s^2)   12964 non-null  float64
 6   Acceleration z (m/s^2)   12964 non-null  float64
 7   Gyroscope x (rad/s)      12959 non-null  float64
 8   Gyroscope y (rad/s)      12959 non-null  float64
 9   Gyroscope z (rad/s)      12959 non-null  float64
 10  label_walking            12965 non-null  int64  
 11  label_jogging            12965 non-null  int64  
 12  label_dribbling          12965 non-null  int64  
 13  label_playing_catch      12965 non-null  

In [31]:
# number of missing values per column of the watch table
preprocessed_watch.data_table.isna().sum()

id                         1
activity                   0
phone                      1
watch                      1
Acceleration x (m/s^2)     1
Acceleration y (m/s^2)     1
Acceleration z (m/s^2)     1
Gyroscope x (rad/s)        6
Gyroscope y (rad/s)        6
Gyroscope z (rad/s)        6
label_walking              0
label_jogging              0
label_dribbling            0
label_playing_catch        0
label_kicking              0
label_stairs               0
label_sitting              0
label_drinking_from_cup    0
label_eating_soup          0
label_standing             0
label_clapping             0
label_eating_chips         0
label_brushing_teeth       0
label_eating_pasta         0
label_eating_sandwich      0
label_writing              0
label_typing               0
label_folding              0
dtype: int64

In [32]:
# save dataframe to csv
# dropna() returns a new frame and leaves data_table unchanged, so its result
# has to be the frame that is written; otherwise the rows with missing values
# are still saved to the file.
preprocessed_watch.data_table.dropna().to_csv('intermediate_datafiles/data_watch.csv')