# Extract raw data, use timestamps to annotate tasks, save as pickle

In [3]:
# Importing the Libraries
import os
import platform
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import pathlib
import pickle
from itertools import product
from scipy.stats import skew, kurtosis, pearsonr
from scipy.signal import butter, welch, filtfilt, resample
import time
import re
import copy

In [4]:
# Data locations used by the cells below.
# (The Windows 7 platform check that once guarded these assignments is disabled,
# so the paths are set unconditionally.)
# folder_path - root folder of the biostamp data
# path        - folder passed (joined with a subject ID) to the extraction functions
# dict_path   - folder for the pickled data dictionaries
folder_path = r'//FS2.smpp.local\RTO\Inpatient Sensors -Stroke\Data\biostamp_data'
path = r'//FS2.smpp.local\RTO\Inpatient Sensors -Stroke\Data\biostamp_data\controls'
dict_path = r'//FS2.smpp.local\RTO\Inpatient Sensors -Stroke\Data\biostamp_data\Data_dict'
# Disabled: features_path = r'X:\CIS-PD Study\FeatureMatrix'

In [5]:
def process_annotations(path):
    """
    Processes raw annotations file to extract start / end timestamps and remove unnecessary data

    Only rows whose EventType starts with 'Activity' are kept. Each kept row is then
    labelled with the Value of the *next* kept row (Value is shifted up by one), and
    every row left with a missing field -- which includes the last kept row -- is dropped.

    Inputs:  path - filepath of the subject folder containing annotations.csv
    Outputs: df - dataframe indexed by activity name (Value); keeps the remaining
                  annotation columns (e.g. 'Start Timestamp (ms)') and adds 'Trial',
                  the 1-based rank of the start timestamp within each activity
    """
    df = pd.read_csv(os.path.join(path, 'annotations.csv'))
    df = df.drop(columns=['Timestamp (ms)', 'AnnotationId', 'AuthorId'])

    # Subset Activity Recognition data: str.match anchors at the start of the string.
    # na=False treats a missing EventType as "no match" instead of putting NaN in the mask.
    is_activity = df['EventType'].str.match('Activity', na=False)
    # drop() returns a new frame, so the edits below never touch a slice of the original
    df = df[is_activity].drop(columns='EventType')

    df['Value'] = df['Value'].shift(-1)
    df = df.dropna()

    # Number the repetitions of each activity in order of their start time
    df['Trial'] = df.groupby('Value')['Start Timestamp (ms)'].rank(ascending=True).astype(int)

    return df.reset_index(drop=True).set_index('Value')

## Extract Data

In [6]:
def extract_data(SubID, path):
    """
    For a given subject, extracts and separates accelerometer, gyroscope, and
    EMG/ECG data into trials and sensor per activity

    Inputs:  SubID - subject identifier (not used inside this function; kept for callers)
             path - filepath of the subject folder; it holds annotations.csv and one
                    sub-folder per sensor location, searched recursively for files
                    ending in accel.csv / gyro.csv / elec.csv
    Outputs: act_dict - nested dictionary act_dict[activity][trial][location][sensor]
                        where sensor is 'accel', 'gyro' or 'elec' and the value is the
                        slice of that sensor's data between the trial's start and stop
                        timestamps (inclusive); an empty dataframe when the sensor has
                        no data. An activity that never occurs in the annotations keeps
                        an empty dataframe in place of its trial dictionary.
             timestamps - the cleaned annotations returned by process_annotations(path)
    """
    ## This is the annotations.csv dataset cleaned
    ## Used to match timestamp ranges to the accel, gyro, elec data
    timestamps = process_annotations(path)

    # Sensor locations are the folders directly inside the subject's raw data directory
    locations = [locs for locs in os.listdir(path) if os.path.isdir(os.path.join(path, locs))]

    # File-name suffix identifying each sensor type (insertion order fixes the key order of the output)
    suffixes = {'accel': 'accel.csv', 'gyro': 'gyro.csv', 'elec': 'elec.csv'}

    # Collect every file's dataframe per sensor type and location, then concatenate once.
    # (DataFrame.append no longer exists in pandas 2.x and re-copied the data on every call.)
    chunks = {sensor: {locs: [] for locs in locations} for sensor in suffixes}
    for root, dirs, files in os.walk(path, topdown=True):
        for filenames in files:
            for sensor, suffix in suffixes.items():
                if filenames.endswith(suffix):
                    p = pathlib.Path(os.path.join(root, filenames))
                    # First path component below the subject folder is the sensor location;
                    # .parts is independent of the platform's path separator
                    location = p.relative_to(path).parts[0]
                    chunks[sensor][location].append(pd.read_csv(p).set_index('Timestamp (ms)'))
                    break

    merged = {
        sensor: {locs: (pd.concat(parts) if parts else pd.DataFrame())
                 for locs, parts in per_location.items()}
        for sensor, per_location in chunks.items()
    }

    complete_acts = ['LYING', 'SITTING', 'STANDING', 'WALKING', 'STAIRS DOWN', 'STAIRS UP']

    # Complete dictionary of all activities
    act_dict = {acts: pd.DataFrame() for acts in complete_acts}

    # Populate dictionary keys per activity with every iteration / trial
    for activities in complete_acts:

        # Activity never annotated for this subject: leave the empty placeholder
        if activities not in timestamps.index:
            continue

        # List indexer always yields a dataframe, whether the activity has one trial or many
        activity_rows = timestamps.loc[[activities]]
        startTimestamp = activity_rows['Start Timestamp (ms)'].values
        endTimestamp = activity_rows['Stop Timestamp (ms)'].values

        # Trial dictionary: each key holds all sensor data for one trial of the activity
        trial_dict = {}
        for trials, (startTime, endTime) in enumerate(zip(startTimestamp, endTimestamp)):

            # Sensor location dictionary: each key corresponds to one sensor location
            sensor_dict = {}
            for location in locations:
                data = {}
                for sensor in suffixes:
                    sensorData = merged[sensor][location]
                    if sensorData.empty:
                        data[sensor] = pd.DataFrame()
                    else:
                        in_trial = (sensorData.index >= startTime) & (sensorData.index <= endTime)
                        data[sensor] = sensorData[in_trial]
                sensor_dict[location] = data

            trial_dict[trials] = sensor_dict

        act_dict[activities] = trial_dict

    return act_dict, timestamps

# Run extract_data()

In [None]:
# Build the activity dictionary for every subject that still needs processing:
# `f` lists the candidate subject IDs, `fd` the IDs to leave out.
# (Earlier scaffolding -- a `data_all` list and a `complete` list of the six
# activity names -- is disabled; extract_data defines the activity list itself.)
f = ['HC02']
fd = []

for SubID in set(f) - set(fd):
    print('Loading Subject ' + SubID + ' Data...')
    subject_dir = os.path.join(path, SubID)
    act_dict, timestamps = extract_data(SubID, subject_dir)
    print('Extract data complete.')

    # Saving the dictionary to a pickle file is currently disabled:
    #     filename = os.path.join(dict_path, SubID + 'dict.pkl')
    #     #filename = SubID + 'dict.pkl' # will need to move to Y: drive
    #     with open(filename, 'wb') as out_file:
    #         pickle.dump(act_dict, out_file)
    #     print(filename + ' ' + 'File Saved\n')

In [8]:
# Spot-check: first five accelerometer rows for trial 0 of LYING at the
# tibialis_anterior_left sensor (this frame should contain data)
act_dict['LYING'][0]['tibialis_anterior_left']['accel'].head(5)

Unnamed: 0_level_0,Accel X (g),Accel Y (g),Accel Z (g)
Timestamp (ms),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1510606218556,-0.046815,-0.89839,0.468818
1510606218588,-0.051698,-0.899855,0.479561
1510606218620,-0.054139,-0.908156,0.477607
1510606218652,-0.050721,-0.893507,0.478584
1510606218684,-0.058534,-0.900832,0.46833


In [9]:
# Spot-check: display the gyroscope frame for the same trial and sensor
# to see whether it is empty
act_dict['LYING'][0]['tibialis_anterior_left']['gyro']

# Load HC02 pickle

In [7]:
# Load the pickled activity dictionary of one subject from <dict_path2>/<subj>dict.pkl
dict_path2 = r'//FS2.smpp.local\RTO\Inpatient Sensors -Stroke\Data\biostamp_data\Data_dict'
subj = 'HC02'
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
# The with-block closes the file even when pickle.load raises.
with open(os.path.join(dict_path2, subj + 'dict.pkl'), 'rb') as f:
    act_dict = pickle.load(f)

# Check timestamp annotations

In [None]:
# Rebuild the cleaned annotation table for one subject directly from its annotations.csv
# NOTE(review): assigning to `time` rebinds the `time` module imported at the top of
# this file; any later call such as time.time() would fail until it is re-imported.
SubID = 'HC02'
time = process_annotations(os.path.join(path, SubID))

In [None]:
# Display the annotation table assigned to `time` in the previous cell
time

# Unstack dictionary

In [28]:
# Unstack the nested dictionary (task -> trial -> location -> sensor) into a
# table with one row per sensor recording; 'rawdata' holds the dataframe itself.
# Column names are given to the constructor so that an act_dict producing no
# rows yields an empty table instead of a length-mismatch error.
df = pd.DataFrame(
    [(task, trial, location, sensor, rawdata)
     for task, trial_map in act_dict.items()
     for trial, location_map in trial_map.items()
     for location, sensor_map in location_map.items()
     for sensor, rawdata in sensor_map.items()],
    columns=['task', 'trial', 'location', 'sensor', 'rawdata'],
)

In [32]:
# Display the unstacked table (columns: task, trial, location, sensor, rawdata)
df

Unnamed: 0,task,trial,location,sensor,rawdata
0,LYING,0,bicep_left,accel,Accel X (g) Accel Y (g) Acce...
1,LYING,0,bicep_left,gyro,Empty DataFrame Columns: [] Index: []
2,LYING,0,bicep_left,elec,Sample (V) Timestamp (ms) ...
3,LYING,0,bicep_right,accel,Accel X (g) Accel Y (g) Acce...
4,LYING,0,bicep_right,gyro,Empty DataFrame Columns: [] Index: []
5,LYING,0,bicep_right,elec,Sample (V) Timestamp (ms) ...
6,LYING,0,biceps_femoris_left,accel,Accel X (g) Accel Y (g) Acce...
7,LYING,0,biceps_femoris_left,gyro,Empty DataFrame Columns: [] Index: []
8,LYING,0,biceps_femoris_left,elec,Sample (V) Timestamp (ms) ...
9,LYING,0,biceps_femoris_right,accel,Accel X (g) Accel Y (g) Acce...
