# Test HC02 line by line - compare to HC01

In [1]:
cd Y:\Inpatient Sensors -Stroke\Data\biostamp_data

Y:\Inpatient Sensors -Stroke\Data\biostamp_data


In [2]:
# Importing the Libraries
import os
import platform
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import pathlib
import pickle #to save files
from itertools import product
from scipy.stats import skew, kurtosis, pearsonr
from scipy.signal import butter, welch, filtfilt, resample
import time
import re
import copy

In [3]:
if platform.system() == 'Windows':
    if platform.release() == '7':
        path = r'Y:\Inpatient Sensors -Stroke\Data\biostamp_data\controls'
        folder_path = r'Y:\Inpatient Sensors -Stroke\Data\biostamp_data'
        dict_path = r'Y:\Inpatient Sensors -Stroke\Data\biostamp_data\Data_dict'

In [4]:
# List based on Value data of Activity Recognition
complete= list(['LYING','SITTING','STANDING','WALKING','STAIRS DOWN','STAIRS UP'])

## Extract data without 'trial' structure

In [5]:
def process_annotations(path):
#---------------------------------------------------------------------------------------------------------
# Processes raw annotations file to extract start / end timestamps and remove unnecessary data
#
# Inputs:  path - filepath of the subject folder containing annotations.csv
#
# Outputs: df - dataframe containing list of activities and their start / end timestamps
#---------------------------------------------------------------------------------------------------------
    df = pd.read_csv(os.path.join(path, 'annotations.csv'))
    del df['Timestamp (ms)']
    del df['AnnotationId']
    del df['AuthorId']
    
    # subset Activity Recognition data by partially match EventType string
    df = df[df['EventType'].str.match('Activity')]
    del df['EventType']
    df.Value = df.Value.shift(-1)
    df = df.dropna()
    
    # Create Trial column for Value
    sorter = set(df.Value.unique().flatten())
    sorterIndex = dict(zip(sorter, range(len(sorter))))        
    df['Value_Rank'] = df['Value'].map(sorterIndex)
    df['Trial'] = df.groupby('Value')['Start Timestamp (ms)'].rank(ascending=True).astype(int)
    del df['Value_Rank']
    df = df.reset_index(drop=True).set_index('Value')
    
    return df

# Check timestamps in annotations file

In [16]:
# Compare annotations file for HC01 vs 2
# note: HC01 only has 1 trial, HC02 and subsequent HCs seems to have 2 trials

SubID = 'HC01'
timestamps = process_annotations(os.path.join(path, SubID))
print(timestamps)

             Start Timestamp (ms)  Stop Timestamp (ms)  Trial
Value                                                        
LYING               1510002069639        1510002130051      1
SITTING             1510002153976        1510002214893      1
STANDING            1510002250787        1510002311224      1
WALKING             1510002352692        1510002384115      1
STANDING            1510002415166        1510002475751      2
STAIRS DOWN         1510002535559        1510002547458      1
STAIRS UP           1510002568801        1510002581432      1
STANDING            1510002620624        1510002651580      3
WALKING             1510002668365        1510002721676      2
WALKING             1510002852155        1510002863746      3
STANDING            1510002895517        1510002925845      4
SITTING             1510002949365        1510003009363      2


In [25]:
SubID = 'HC02'
timestamps = process_annotations(os.path.join(path, SubID))
print(timestamps)

             Start Timestamp (ms)  Stop Timestamp (ms)  Trial
Value                                                        
LYING               1510606218530        1510606279050      1
SITTING             1510606310861        1510606371482      1
STANDING            1510606397801        1510606428207      1
WALKING             1510606486679        1510606500080      1
STANDING            1510606531436        1510606561917      2
STAIRS DOWN         1510606596903        1510606607282      1
STAIRS UP           1510606636754        1510606647562      1
STANDING            1510606672709        1510606703132      3
WALKING             1510606756302        1510606807686      2
STANDING            1510606834853        1510606865396      4
SITTING             1510606887461        1510606947863      2
LYING               1510611171960        1510611232456      2
SITTING             1510611263250        1510611323624      3
STANDING            1510611425390        1510611455823      5
WALKING 

# Check extraction

In [25]:
# HC01 timestamps
#print(timestamps)

             Start Timestamp (ms)  Stop Timestamp (ms)  Trial
Value                                                        
LYING               1510002069639        1510002130051      1
SITTING             1510002153976        1510002214893      1
STANDING            1510002250787        1510002311224      1
WALKING             1510002352692        1510002384115      1
STANDING            1510002415166        1510002475751      2
STAIRS DOWN         1510002535559        1510002547458      1
STAIRS UP           1510002568801        1510002581432      1
STANDING            1510002620624        1510002651580      3
WALKING             1510002668365        1510002721676      2
WALKING             1510002852155        1510002863746      3
STANDING            1510002895517        1510002925845      4
SITTING             1510002949365        1510003009363      2


In [7]:
locations = [locs for locs in os.listdir(path) if os.path.isdir(os.path.join(path, locs))]
accel = {locs: pd.DataFrame() for locs in locations}
gyro = {locs: pd.DataFrame() for locs in locations}
elec = {locs: pd.DataFrame() for locs in locations}

In [8]:
print(path)
print(os.path.join(path, SubID))

Y:\Inpatient Sensors -Stroke\Data\biostamp_data\controls
Y:\Inpatient Sensors -Stroke\Data\biostamp_data\controls\HC02


In [10]:
for root, dirs, files in os.walk(os.path.join(path, SubID), topdown=True):
    for filenames in files:
        if filenames.endswith('accel.csv'):
            p = pathlib.Path(os.path.join(root, filenames))
            location = str(p.relative_to(path)).split("\\")[0]
            temp_df = pd.read_csv(p).set_index('Timestamp (ms)')
            accel[location] = accel[location].append(temp_df)

        elif filenames.endswith('gyro.csv'):
            p = pathlib.Path(os.path.join(root, filenames))
            location = str(p.relative_to(path)).split("\\")[0]
            temp_df = pd.read_csv(p).set_index('Timestamp (ms)')
            gyro[location] = gyro[location].append(temp_df)

        elif filenames.endswith('elec.csv'):
            p = pathlib.Path(os.path.join(root, filenames))
            location = str(p.relative_to(path)).split("\\")[0]
            temp_df = pd.read_csv(p).set_index('Timestamp (ms)')
            elec[location] = elec[location].append(temp_df)

In [11]:
accel['HC02'].head(5)

Unnamed: 0_level_0,Accel X (g),Accel Y (g),Accel Z (g)
Timestamp (ms),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1510600271651,-0.988724,-0.08246,-0.052186
1510600271683,-0.99263,-0.07953,-0.053162
1510600271715,-0.9897,-0.06537,-0.056092
1510600271747,-0.996536,-0.075135,-0.046326
1510600271779,-0.992142,-0.064881,-0.057069


In [19]:
complete= list(['LYING','SITTING','STANDING','WALKING','STAIRS DOWN','STAIRS UP'])
complete_acts = complete
act_dict = {acts: pd.DataFrame() for acts in complete_acts}

In [47]:
# Populate dictionary keys per activity with every sensor
for activities in complete_acts:

    startSize = timestamps.loc[activities, 'Start Timestamp (ms)']

    if np.size(startSize) == 1:
        startTimestamp = timestamps.loc[activities, 'Start Timestamp (ms)']
        endTimestamp = timestamps.loc[activities, 'Stop Timestamp (ms)']
    else:
        startTimestamp = timestamps.loc[activities, 'Start Timestamp (ms)'].values
        endTimestamp = timestamps.loc[activities, 'Stop Timestamp (ms)'].values

    # Create sensor location dictionary with each key corresponding to sensor locations
    sensor_dict = {locs: pd.DataFrame() for locs in locations}

    # Extract sensor data and populate sensor_dict with sensor data
    for location in locations:
        print(location)#######################################

        data = {'accel': pd.DataFrame(), 'gyro': pd.DataFrame(), 'elec': pd.DataFrame()}

        if not accel[location].empty:
            accelData = accel[location]
            #data['accel'] = accelData[(accelData.index >= startTimestamp) & (accelData.index <= endTimestamp)]
            ###########
            #print(bool(data))
            if not bool(data):
                data['accel'] = accelData[(accelData.index >= startTimestamp) & (accelData.index <= endTimestamp)]
                print(bool(data))
                print('added data') ###########
            else:
                print(bool(data))
                data['accel'] = data['accel'].append(accelData[(accelData.index >= startTimestamp) & (accelData.index <= endTimestamp)])

        if not gyro[location].empty:
            gyroData = gyro[location]
            data['gyro'] = gyroData[(gyroData.index >= startTimestamp) & (gyroData.index <= endTimestamp)]

        if not elec[location].empty:
            elecData = elec[location]
            data['elec'] = elecData[(elecData.index >= startTimestamp) & (elecData.index <= endTimestamp)]

        sensor_dict[location] = data

    act_dict[activities] = sensor_dict

HC20
HC26
HC13
HC06
HC28
HC01
True
HC30
HC05
HC08
HC09
HC07
HC27
HC34
HC35
HC22
HC33
HC21
HC25
HC04
HC11
HC24
HC18
HC14
HC17
HC31
HC03
HC15
HC29
HC10
HC12
HC19
HC32
HC16
HC23
HC02
HC20
HC26
HC13
HC06
HC28
HC01
True


ValueError: Lengths must match to compare

In [15]:
accel['HC02'].shape

(7388797, 3)

In [26]:
accel[location].empty

False

In [27]:
accelData = accel['HC02']
data = {'accel': pd.DataFrame(), 'gyro': pd.DataFrame(), 'elec': pd.DataFrame()}
print(data['accel'])

Empty DataFrame
Columns: []
Index: []


In [38]:
for activities in complete_acts:
        
    startSize = timestamps.loc[activities, 'Start Timestamp (ms)']

    if np.size(startSize) == 1:
        startTimestamp = timestamps.loc[activities, 'Start Timestamp (ms)']
        endTimestamp = timestamps.loc[activities, 'Stop Timestamp (ms)']
    else:
        startTimestamp = timestamps.loc[activities, 'Start Timestamp (ms)'].values
        endTimestamp = timestamps.loc[activities, 'Stop Timestamp (ms)'].values

    # Create trial dictionary with each key containing all sensor data related with each activity's trial
    trial_dict = {trials: pd.DataFrame() for trials in range(0, np.size(startTimestamp))}

    # Populate trial directory keys
    for trials in range(0, np.size(startTimestamp)):

        if np.size(startSize) == 1:
            startTime = startTimestamp
            endTime = endTimestamp
        else:
            startTime = startTimestamp[trials]
            endTime = endTimestamp[trials]

        # Create sensor location dictionary with each key corresponding to sensor locations
        sensor_dict = {locs: pd.DataFrame() for locs in locations}

        # Extract sensor data and populate sensor_dict with sensor data
        for location in locations:

            data = {'accel': pd.DataFrame(), 'gyro': pd.DataFrame(), 'elec': pd.DataFrame()}

            if not accel[location].empty:
                accelData = accel[location]
                data['accel'] = accelData[(accelData.index >= startTime) & (accelData.index <= endTime)]

            if not gyro[location].empty:
                gyroData = gyro[location]
                data['gyro'] = gyroData[(gyroData.index >= startTime) & (gyroData.index <= endTime)]

            if not elec[location].empty:
                elecData = elec[location]
                data['elec'] = elecData[(elecData.index >= startTime) & (elecData.index <= endTime)]

        trial_dict[trials] = sensor_dict

    act_dict[activities] = trial_dict

In [39]:
act_dict

{'LYING': {0: {'HC20': {'accel': Empty DataFrame
    Columns: []
    Index: [], 'gyro': Empty DataFrame
    Columns: []
    Index: [], 'elec': Empty DataFrame
    Columns: []
    Index: []}, 'HC26': {'accel': Empty DataFrame
    Columns: []
    Index: [], 'gyro': Empty DataFrame
    Columns: []
    Index: [], 'elec': Empty DataFrame
    Columns: []
    Index: []}, 'HC13': {'accel': Empty DataFrame
    Columns: []
    Index: [], 'gyro': Empty DataFrame
    Columns: []
    Index: [], 'elec': Empty DataFrame
    Columns: []
    Index: []}, 'HC06': {'accel': Empty DataFrame
    Columns: []
    Index: [], 'gyro': Empty DataFrame
    Columns: []
    Index: [], 'elec': Empty DataFrame
    Columns: []
    Index: []}, 'HC28': {'accel': Empty DataFrame
    Columns: []
    Index: [], 'gyro': Empty DataFrame
    Columns: []
    Index: [], 'elec': Empty DataFrame
    Columns: []
    Index: []}, 'HC01': {'accel': Empty DataFrame
    Columns: []
    Index: [], 'gyro': Empty DataFrame
    Columns: []


In [34]:
accel['HC02'].empty

False

In [44]:
#For a given subject, extracts and separates accelerometer, gyroscope, and 
#EMG/ECG data into trials and sensor per activity
def  extract_data(SubID, path):

    ## This is the annotations.csv dataset cleaned
    ## Used to match timestamp ranges to the accel, gyro, elec data
    timestamps = process_annotations(path)
#    timestamps = fix_errors(SubID, timestamps)
#    timestamps = add_unstruct_data(timestamps)
    
    # Creates list of sensor locations from folders within subject's raw data directory
    locations = [locs for locs in os.listdir(path) if os.path.isdir(os.path.join(path, locs))]
    
    # Creates dictionary of empty dataframes to merge all accelerometer, gyroscope, and EMG/ECG data for each sensor
    accel = {locs: pd.DataFrame() for locs in locations}
    gyro = {locs: pd.DataFrame() for locs in locations}
    elec = {locs: pd.DataFrame() for locs in locations}
    
    # Finds and merges all accelerometer, gyroscope, and EMG/ECG data for each sensor, retains datetime information
    for root, dirs, files in os.walk(path, topdown=True):
        for filenames in files:
            if filenames.endswith('accel.csv'):
                p = pathlib.Path(os.path.join(root, filenames))
                location = str(p.relative_to(path)).split("\\")[0]
                temp_df = pd.read_csv(p).set_index('Timestamp (ms)')
                accel[location] = accel[location].append(temp_df)

            elif filenames.endswith('gyro.csv'):
                p = pathlib.Path(os.path.join(root, filenames))
                location = str(p.relative_to(path)).split("\\")[0]
                temp_df = pd.read_csv(p).set_index('Timestamp (ms)')
                gyro[location] = gyro[location].append(temp_df)

            elif filenames.endswith('elec.csv'):
                p = pathlib.Path(os.path.join(root, filenames))
                location = str(p.relative_to(path)).split("\\")[0]
                temp_df = pd.read_csv(p).set_index('Timestamp (ms)')
                elec[location] = elec[location].append(temp_df)
                
    complete_acts = complete
    
    # Complete dictionary of all activities
    act_dict = {acts: pd.DataFrame() for acts in complete_acts}
    
    # Populate dictionary keys per activity with every iteration / trial
    for activities in complete_acts:
        
        startSize = timestamps.loc[activities, 'Start Timestamp (ms)']
        
        if np.size(startSize) == 1:
            startTimestamp = timestamps.loc[activities, 'Start Timestamp (ms)']
            endTimestamp = timestamps.loc[activities, 'Stop Timestamp (ms)']
        else:
            startTimestamp = timestamps.loc[activities, 'Start Timestamp (ms)'].values
            endTimestamp = timestamps.loc[activities, 'Stop Timestamp (ms)'].values

        # Create trial dictionary with each key containing all sensor data related with each activity's trial
        trial_dict = {trials: pd.DataFrame() for trials in range(0, np.size(startTimestamp))}

        # Populate trial directory keys
        for trials in range(0, np.size(startTimestamp)):

            if np.size(startSize) == 1:
                startTime = startTimestamp
                endTime = endTimestamp
            else:
                startTime = startTimestamp[trials]
                endTime = endTimestamp[trials]

            # Create sensor location dictionary with each key corresponding to sensor locations
            sensor_dict = {locs: pd.DataFrame() for locs in locations}

            # Extract sensor data and populate sensor_dict with sensor data
            for location in locations:

                data = {'accel': pd.DataFrame(), 'gyro': pd.DataFrame(), 'elec': pd.DataFrame()}

                if not accel[location].empty:
                    accelData = accel[location]
                    data['accel'] = accelData[(accelData.index >= startTime) & (accelData.index <= endTime)]  
 
                if not gyro[location].empty:
                    gyroData = gyro[location]
                    data['gyro'] = gyroData[(gyroData.index >= startTime) & (gyroData.index <= endTime)]
                   
                if not elec[location].empty:
                    elecData = elec[location]
                    data['elec'] = elecData[(elecData.index >= startTime) & (elecData.index <= endTime)]
                   
                sensor_dict[location] = data

            trial_dict[trials] = sensor_dict

        act_dict[activities] = trial_dict
    
    return act_dict, timestamps

In [45]:
#For a given subject, extracts and separates accelerometer, gyroscope, and 
#EMG/ECG data into trials and sensor per activity
def  extract_data(SubID, path):

    ## This is the annotations.csv dataset cleaned
    ## Used to match timestamp ranges to the accel, gyro, elec data
    timestamps = process_annotations(path)
#    timestamps = fix_errors(SubID, timestamps)
#    timestamps = add_unstruct_data(timestamps)
    
    # Creates list of sensor locations from folders within subject's raw data directory
    locations = [locs for locs in os.listdir(path) if os.path.isdir(os.path.join(path, locs))]
    
    # Creates dictionary of empty dataframes to merge all accelerometer, gyroscope, and EMG/ECG data for each sensor
    accel = {locs: pd.DataFrame() for locs in locations}
    gyro = {locs: pd.DataFrame() for locs in locations}
    elec = {locs: pd.DataFrame() for locs in locations}
    
    # Finds and merges all accelerometer, gyroscope, and EMG/ECG data for each sensor, retains datetime information
    for root, dirs, files in os.walk(path, topdown=True):
        for filenames in files:
            if filenames.endswith('accel.csv'):
                p = pathlib.Path(os.path.join(root, filenames))
                location = str(p.relative_to(path)).split("\\")[0]
                temp_df = pd.read_csv(p).set_index('Timestamp (ms)')
                accel[location] = accel[location].append(temp_df)

            elif filenames.endswith('gyro.csv'):
                p = pathlib.Path(os.path.join(root, filenames))
                location = str(p.relative_to(path)).split("\\")[0]
                temp_df = pd.read_csv(p).set_index('Timestamp (ms)')
                gyro[location] = gyro[location].append(temp_df)

            elif filenames.endswith('elec.csv'):
                p = pathlib.Path(os.path.join(root, filenames))
                location = str(p.relative_to(path)).split("\\")[0]
                temp_df = pd.read_csv(p).set_index('Timestamp (ms)')
                elec[location] = elec[location].append(temp_df)
                
    complete_acts = complete
    
    # Complete dictionary of all activities
    act_dict = {acts: pd.DataFrame() for acts in complete_acts}
    
    # Populate dictionary keys per activity with every iteration / trial
    for activities in complete_acts:
        
        startSize = timestamps.loc[activities, 'Start Timestamp (ms)']
        
        if np.size(startSize) == 1:
            startTimestamp = timestamps.loc[activities, 'Start Timestamp (ms)']
            endTimestamp = timestamps.loc[activities, 'Stop Timestamp (ms)']
        else:
            startTimestamp = timestamps.loc[activities, 'Start Timestamp (ms)'].values
            endTimestamp = timestamps.loc[activities, 'Stop Timestamp (ms)'].values

        # Create trial dictionary with each key containing all sensor data related with each activity's trial
        trial_dict = {trials: pd.DataFrame() for trials in range(0, np.size(startTimestamp))}

        # Populate trial directory keys
        for trials in range(0, np.size(startTimestamp)):

            if np.size(startSize) == 1:
                startTime = startTimestamp
                endTime = endTimestamp
            else:
                startTime = startTimestamp[trials]
                endTime = endTimestamp[trials]

            # Create sensor location dictionary with each key corresponding to sensor locations
            sensor_dict = {locs: pd.DataFrame() for locs in locations}

            # Extract sensor data and populate sensor_dict with sensor data
            for location in locations:

                data = {'accel': pd.DataFrame(), 'gyro': pd.DataFrame(), 'elec': pd.DataFrame()}

                if not accel[location].empty:
                    accelData = accel[location]
                    data['accel'] = accelData[(accelData.index >= startTime) & (accelData.index <= endTime)]  
 
                if not gyro[location].empty:
                    gyroData = gyro[location]
                    data['gyro'] = gyroData[(gyroData.index >= startTime) & (gyroData.index <= endTime)]
                   
                if not elec[location].empty:
                    elecData = elec[location]
                    data['elec'] = elecData[(elecData.index >= startTime) & (elecData.index <= endTime)]
                   
                sensor_dict[location] = data

            trial_dict[trials] = sensor_dict

        act_dict[activities] = trial_dict
    
    return act_dict, timestamps

In [41]:
print(SubID)
print(path)

HC02
Y:\Inpatient Sensors -Stroke\Data\biostamp_data\controls


In [46]:
act_dict, timestamps = extract_data(SubID, os.path.join(path, SubID))

In [47]:
act_dict

{'LYING': {0: {'tibialis_anterior_left': {'accel':                 Accel X (g)  Accel Y (g)  Accel Z (g)
    Timestamp (ms)                                       
    1510606218556     -0.046815    -0.898390     0.468818
    1510606218588     -0.051698    -0.899855     0.479561
    1510606218620     -0.054139    -0.908156     0.477607
    1510606218652     -0.050721    -0.893507     0.478584
    1510606218684     -0.058534    -0.900832     0.468330
    1510606218716     -0.050233    -0.904250     0.489326
    1510606218748     -0.045838    -0.912551     0.477119
    1510606218780     -0.048280    -0.902296     0.463935
    1510606218812     -0.045350    -0.900832     0.476143
    1510606218844     -0.055116    -0.906203     0.466865
    1510606218876     -0.047303    -0.898878     0.467842
    1510606218908     -0.038514    -0.901320     0.481514
    1510606218940     -0.049744    -0.902296     0.485420
    1510606218972     -0.052186    -0.903273     0.471748
    1510606219004     -0.

In [48]:
timestamps

Unnamed: 0_level_0,Start Timestamp (ms),Stop Timestamp (ms),Trial
Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LYING,1510606218530,1510606279050,1
SITTING,1510606310861,1510606371482,1
STANDING,1510606397801,1510606428207,1
WALKING,1510606486679,1510606500080,1
STANDING,1510606531436,1510606561917,2
STAIRS DOWN,1510606596903,1510606607282,1
STAIRS UP,1510606636754,1510606647562,1
STANDING,1510606672709,1510606703132,3
WALKING,1510606756302,1510606807686,2
STANDING,1510606834853,1510606865396,4


In [49]:
#lying
1510606279050 - 1510606218530

60520

In [50]:
# stairs up
1510606647562 - 1510606636754

10808

## Manually extract HC01 data 

HC01dict.pkl seems too small, so run by itself

In [None]:
dict_path2 = r'Y:\Inpatient Sensors -Stroke\Data\biostamp_data\HC01test'
SubID = 'HC01'
path2 = os.path.join(path, SubID)

In [None]:
# Plot Sacrum walking
rawdata = act_dict['WALKING']['sacrum']['accel']
rawdata.plot(figsize=(8,4))