In [1]:
import datetime
import glob
import os

import numpy as np
import pandas as pd

## Reading in Data

In [2]:
def read_in_data(location, path):
    """Load every file in ``<path><location>_agg/`` into a single DataFrame.

    Each file's rows get a ``file`` column holding the text that follows the
    last "agg" in the file's path, minus one leading character (the path
    separator) and the final four characters (e.g. ".csv"). For the "wrist"
    and "hip" locations an ``id`` column is added as well: the first four
    characters of the second underscore-separated piece of that name.

    Parameters
    ----------
    location : str
        Monitor location; selects the ``<location>_agg`` directory.
    path : str
        Parent directory, including its trailing separator.

    Returns
    -------
    pandas.DataFrame
        The files stacked in sorted path order. Each file keeps its own row
        index, so index values repeat across files.

    Raises
    ------
    FileNotFoundError
        If no file matches ``<path><location>_agg/*``.
    """
    pattern = path + location + "_agg/*"
    # glob returns matches in arbitrary order; sorting makes the row order of
    # the combined frame reproducible between runs and machines.
    files = sorted(glob.glob(pattern))
    if not files:
        raise FileNotFoundError("No files found matching " + pattern)

    hip_or_wrist = location in ['wrist', 'hip']
    frames = []
    for file in files:
        new_df = pd.read_csv(file)
        filename = file.split("agg")[-1][1:-4]
        new_df["file"] = filename
        if hip_or_wrist:
            new_df['id'] = filename.split('_')[1][:4]
        frames.append(new_df)

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and copied all accumulated rows on every iteration.
    return pd.concat(frames)

In [3]:
# Root directory holding wrist_agg, hip_agg, chest_agg, thigh_agg and processed.
path = "E:/Data/Monitor Data/"

# One combined frame per monitor location, read in wrist, hip, chest, thigh order.
df_wrist, df_hip, df_chest, df_thigh = (
    read_in_data(monitor_location, path)
    for monitor_location in ("wrist", "hip", "chest", "thigh")
)

# The groundtruth CSV carries a leftover "Unnamed: 0" column that is discarded on load.
groundtruth_csv = path + "processed/groundtruth.csv"
groundtruth = pd.read_csv(groundtruth_csv).drop(columns="Unnamed: 0")

In [4]:
# Row count of each loaded frame: wrist, hip, chest, thigh, then groundtruth.
for loaded_frame in (df_wrist, df_hip, df_chest, df_thigh, groundtruth):
    print(len(loaded_frame))

15795431
16777678
524525
521887
341101


## Wrist and Hip Merge With Groundtruth on ID and Date

In [5]:
# `time` is still a raw string at this point; the calendar date is the text
# before its first space.
groundtruth['date'] = groundtruth['time'].str.split(' ').apply(lambda pieces: pieces[0])

# Split the date once and pick each component by position (month/day/year order).
date_fields = groundtruth['date'].str.split('/')
groundtruth['year'] = date_fields.apply(lambda fields: fields[2])
groundtruth['month'] = date_fields.apply(lambda fields: fields[0])
groundtruth['day'] = date_fields.apply(lambda fields: fields[1])

# Rebuild the date through to_datetime, render it as text, and append the
# observed clock time to form the `actual_datetime` string.
date_as_text = pd.to_datetime(groundtruth[['year', 'month', 'day']]).astype(str)
groundtruth['datetime'] = date_as_text
groundtruth['actual_datetime'] = date_as_text + ' ' + groundtruth['actual_time']

In [6]:
# Remove the intermediate date-parsing columns; only actual_datetime is kept.
date_helper_columns = ['date', 'year', 'month', 'day', 'datetime']
groundtruth.drop(columns=date_helper_columns, inplace=True)

In [7]:
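# NOTE: this id-aware merge is disabled. The merge performed later in the
# notebook joins each monitor frame to groundtruth on the timestamp only.
# df_wrist = df_wrist.merge(groundtruth, on = ['actual_datetime', 'id'], how = 'inner')
# df_hip = df_hip.merge(groundtruth, on = ['actual_datetime', 'id'], how = 'inner')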

## Feature Selection and Engineering

In [8]:
# The chest and thigh files label their axes "Accel <axis> (g)"; rename them to
# the "Accelerometer <axis>" names so all four frames share the same columns.
accel_column_names = {
    "Accel X (g)": "Accelerometer X",
    "Accel Y (g)": "Accelerometer Y",
    "Accel Z (g)": "Accelerometer Z",
}
for accel_frame in (df_chest, df_thigh):
    accel_frame.rename(columns=accel_column_names, inplace=True)

In [9]:
# Groundtruth columns that hold timestamps and must be parsed from text.
# NOTE(review): actual_time appears to be a clock time without a date (it is
# appended to a date string earlier) - confirm which date to_datetime gives it.
dt_vars = ["time", "actual_time", "start.time"]
for column_name in dt_vars:
    parsed_column = pd.to_datetime(groundtruth[column_name])
    groundtruth[column_name] = parsed_column

In [10]:
# Give `time` second-level resolution by adding the seconds component of
# `actual_time` to it.
# Flooring to the minute first makes this cell idempotent: without it, every
# re-run would add the seconds again on top of the previous result.
# NOTE(review): assumes the raw `time` values carry no seconds of their own
# (the sequential :13, :14, :15 output suggests so) - confirm against the CSV.
groundtruth["time"] = (
    groundtruth["time"].dt.floor("min")
    + pd.to_timedelta(groundtruth["actual_time"].dt.second, unit="s")
)
groundtruth["time"].head()

0   2017-08-02 18:30:13
1   2017-08-02 18:30:14
2   2017-08-02 18:30:15
3   2017-08-02 18:30:16
4   2017-08-02 18:30:17
Name: time, dtype: datetime64[ns]

In [11]:
# Inspect the column dtypes; time, actual_time and start.time should now be datetime64.
groundtruth.dtypes

time                     datetime64[ns]
id                               object
coding                           object
primary_behavior                 object
primary_posture                  object
primary_upperbody                object
primary_intensity                object
secondary_behavior               object
secondary_posture                object
secondary_upperbody              object
secondary_intensity              object
num_postures                      int64
transition                        int64
actual_time              datetime64[ns]
posture_coding                   object
type                             object
start.time               datetime64[ns]
broad_activity                   object
detailed_activity                object
walking_running_bouts            object
updated_activity                 object
act_type                         object
act_type_broad                   object
actual_datetime                  object
dtype: object

In [12]:
# Groundtruth columns retained for the merge with the monitor data.
# The start-time column is spelled "start.time" in groundtruth (with a dot),
# so it is listed under that name here.
kept_merge_variables = ["id", "time", "primary_behavior", "primary_posture", "primary_upperbody", 
                        "primary_intensity", "secondary_behavior", "secondary_posture", "secondary_upperbody", 
                        "secondary_intensity", "num_postures", "transition", "actual_time", "type", "start.time",
                        "broad_activity", "detailed_activity", "updated_activity", "act_type", "act_type_broad"]

# Groundtruth columns removed before the merge, with the reason for each.
dropped_merge_variables = ["coding", # Duplicates primary_intensity
                           "posture_coding", # Duplicates primary_posture
                           "walking_running_bouts" # NaNs
                          ]

In [13]:
# Remove the columns listed in dropped_merge_variables from groundtruth, in place.
groundtruth.drop(columns=dropped_merge_variables, inplace=True)

In [14]:
# Parse each monitor frame's actual_datetime column into datetime64 values.
for monitor_frame in (df_wrist, df_hip, df_chest, df_thigh):
    monitor_frame["actual_datetime"] = pd.to_datetime(monitor_frame["actual_datetime"])

In [15]:
# Show, for each monitor location, the distinct calendar dates recorded in October.
dfs = [df_wrist, df_hip, df_chest, df_thigh]
locations = ["Wrist", "Hip", "Chest", "Thigh"]
for location_label, location_frame in zip(locations, dfs):
    print(location_label + ":")
    is_october = location_frame["actual_datetime"].dt.month == 10
    october_rows = location_frame[is_october]
    display(october_rows["actual_datetime"].dt.date.unique())

Wrist:


array([datetime.date(2017, 10, 2), datetime.date(2017, 10, 3),
       datetime.date(2017, 10, 4), datetime.date(2017, 10, 5),
       datetime.date(2017, 10, 6), datetime.date(2017, 10, 7),
       datetime.date(2017, 10, 8), datetime.date(2017, 10, 9),
       datetime.date(2017, 10, 10), datetime.date(2017, 10, 11),
       datetime.date(2017, 10, 12), datetime.date(2017, 10, 13),
       datetime.date(2017, 10, 14), datetime.date(2017, 10, 15),
       datetime.date(2017, 10, 16), datetime.date(2017, 10, 23),
       datetime.date(2017, 10, 24), datetime.date(2017, 10, 25),
       datetime.date(2017, 10, 26), datetime.date(2017, 10, 27),
       datetime.date(2017, 10, 28), datetime.date(2017, 10, 29),
       datetime.date(2017, 10, 30)], dtype=object)

Hip:


array([datetime.date(2017, 10, 2), datetime.date(2017, 10, 3),
       datetime.date(2017, 10, 4), datetime.date(2017, 10, 5),
       datetime.date(2017, 10, 6), datetime.date(2017, 10, 7),
       datetime.date(2017, 10, 8), datetime.date(2017, 10, 9),
       datetime.date(2017, 10, 10), datetime.date(2017, 10, 11),
       datetime.date(2017, 10, 12), datetime.date(2017, 10, 13),
       datetime.date(2017, 10, 14), datetime.date(2017, 10, 15),
       datetime.date(2017, 10, 16), datetime.date(2017, 10, 23),
       datetime.date(2017, 10, 24), datetime.date(2017, 10, 25),
       datetime.date(2017, 10, 26), datetime.date(2017, 10, 27),
       datetime.date(2017, 10, 28), datetime.date(2017, 10, 29),
       datetime.date(2017, 10, 30)], dtype=object)

Chest:


array([datetime.date(2017, 10, 4), datetime.date(2017, 10, 6),
       datetime.date(2017, 10, 7), datetime.date(2017, 10, 13),
       datetime.date(2017, 10, 14), datetime.date(2017, 10, 8),
       datetime.date(2017, 10, 11), datetime.date(2017, 10, 25),
       datetime.date(2017, 10, 27)], dtype=object)

Thigh:


array([datetime.date(2017, 10, 4), datetime.date(2017, 10, 6),
       datetime.date(2017, 10, 7), datetime.date(2017, 10, 13),
       datetime.date(2017, 10, 14), datetime.date(2017, 10, 8),
       datetime.date(2017, 10, 11), datetime.date(2017, 10, 25),
       datetime.date(2017, 10, 27)], dtype=object)

In [16]:
# Lower-case location names and their frames, listed in matching order.
# This rebinds `locations`, replacing the capitalised labels used above.
locations = ["wrist", "hip", "chest", "thigh"]
all_dfs = [df_wrist, df_hip, df_chest, df_thigh]

In [17]:
# Compare the column sets of a frame that has an `id` column (wrist) and one that does not (chest).
display(df_wrist.columns, df_chest.columns)

Index(['Accelerometer X', 'Accelerometer Y', 'Accelerometer Z',
       'actual_datetime', 'file', 'id'],
      dtype='object')

Index(['Accelerometer X', 'Accelerometer Y', 'Accelerometer Z',
       'actual_datetime', 'file'],
      dtype='object')

In [18]:
def _add_engineered_features(frame):
    """Add the change, elapsed-time and calendar columns to a merged frame.

    The frame is modified in place and also returned. It must contain the
    three "Accelerometer <axis>" columns and a datetime ``time`` column.
    """
    # Row-to-row change of each accelerometer axis; the first row has no
    # predecessor and is set to 0.
    # NOTE(review): the difference is taken between consecutive rows of the
    # whole merged frame, so it also spans boundaries between source files.
    change_columns = (
        ("magnitude_of_change_X", "Accelerometer X"),
        ("magnitude_of_change_Y", "Accelerometer Y"),
        ("magnitude_of_change_Z", "Accelerometer Z"),
    )
    for change_name, axis_name in change_columns:
        frame[change_name] = frame[axis_name].diff().fillna(0)

    # Whole seconds elapsed since the previous row; the first row is set to 1.
    elapsed = frame["time"] - frame["time"].shift()
    frame["time_diff_seconds"] = elapsed.dt.total_seconds().fillna(1).astype(int)

    # Calendar components of the groundtruth timestamp.
    timestamp = frame["time"].dt
    calendar_columns = (
        ("year", "year"),
        ("month", "month"),
        ("day_of_week", "dayofweek"),
        ("day", "day"),
        ("hour", "hour"),
        ("minute", "minute"),
        ("second", "second"),
    )
    for calendar_name, accessor_name in calendar_columns:
        frame[calendar_name] = getattr(timestamp, accessor_name)

    return frame


# Inner-join every monitor frame to groundtruth and add the engineered columns.
# NOTE(review): the join key is the timestamp alone (monitor actual_datetime
# against groundtruth time); `id` is not part of the key - confirm this is intended.
new_dfs = []
for df in all_dfs:
    merged_df = df.merge(groundtruth, how="inner", left_on="actual_datetime", right_on="time")
    new_dfs.append(_add_engineered_features(merged_df))

In [19]:
output_dir = "E:/Data/Monitor Data/data_frames/"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# `locations` and `new_dfs` are paired by position; guard against the two
# lists drifting apart before any file is written.
assert len(locations) == len(new_dfs), "locations and new_dfs must have the same length"

# Write one CSV per monitor location, named "<location>.csv".
for location_name, location_df in zip(locations, new_dfs):
    location_df.to_csv(output_dir + location_name + ".csv", index=False)