## Using multiple strides in treadmill-acquired gait data for Multiple Sclerosis prediction 
### Extracting raw treadmill features (COPX, COPY, ForceZ and belt speed) for each stride and time-normalizing the data within each stride, retaining 30 samples per stride after downsampling with a smoothing approach.
* Each stride is a 2D data with 30 time steps and 4 features (COPX, COPY, ForceZ and belt speed)
* Keep track of the ordering of strides and whether they are consecutive, since later on we need to retain only groups of 5 consecutive strides for our analysis
* Keep track of PID, Trial ID (W/WT) and labels for each stride, maybe in a separate .csv file where each csv file for 2D data has a corresponding PID, TrialID and label 
* **Sanity check to make sure we retained only the same strides as we used for the other domain knowledge features:** Overall, 1654 (HOA: 905, PwMS: 749) and 1576 (HOA: 878, PwMS: 698) strides were retrieved from W and WT trials, respectively, across 35 subjects (HOA: 18, PwMS: 17). 

In [2]:
import numpy as np
import pandas as pd
import math
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Root folder holding the per-subject treadmill exports (the *_RAWDATA.csv and *_GAITCYCLES.csv files)
path = 'C:\\Users\\Rachneet Kaur\\Box\\GAIT\\sample_data\\data_export\\'

In [4]:
# Build the full path of every export file: one list per cohort, trial and file type.
# Each list is ordered like the ID list it was built from.
control_ids = list(range(200, 220))
ms_ids = list(range(300, 315)) + [318, 320, 321, 322, 323]


def _export_files(subject_ids, trial_tag, file_tag):
    """Return the export file path for every subject ID, in the order given."""
    return [path + str(subject_id) + '_B3_' + trial_tag + '_' + file_tag + '.csv'
            for subject_id in subject_ids]


# Raw treadmill recordings
raw_controls_t1 = _export_files(control_ids, 'TRIAL01', 'RAWDATA')
raw_controls_t2 = _export_files(control_ids, 'TRIAL02', 'RAWDATA')
raw_ms_t1 = _export_files(ms_ids, 'TRIAL01', 'RAWDATA')
raw_ms_t2 = _export_files(ms_ids, 'TRIAL02', 'RAWDATA')

# Gait event tables
gait_controls_t1 = _export_files(control_ids, 'TRIAL01', 'GAITCYCLES')
gait_controls_t2 = _export_files(control_ids, 'TRIAL02', 'GAITCYCLES')
gait_ms_t1 = _export_files(ms_ids, 'TRIAL01', 'GAITCYCLES')
gait_ms_t2 = _export_files(ms_ids, 'TRIAL02', 'GAITCYCLES')

In [5]:
# In every GaitCycles file a walking sequence always starts with a heel strike on the right foot,
# so the gait events of a stride occur in the order HSR, TOL, MidSSR, HSL, TOR, MidSSL.
gait_type = np.array(['HSR', 'TOL', 'MidSSR', 'HSL', 'TOR', 'MidSSL'])

# Sampling interval in seconds
delta_time = 0.002 # the data is collected at a frequency of 500 Hz

In [6]:
#functions to drop missing values and invalid data 
def drop_unnamed(dataframe):
    """Return a copy of `dataframe` without the 'Unnamed: 0' and 'Unnamed: 0.1' columns.

    Raises KeyError when either of the two columns is absent.
    """
    leftover_index_columns = ['Unnamed: 0', 'Unnamed: 0.1']
    return dataframe.drop(columns=leftover_index_columns)

#Eliminate missing values
def drop_na(dataframe):
    """Return the rows of `dataframe` that have no missing value in any column."""
    complete_rows = dataframe.dropna()
    return complete_rows

#Eliminate invalid data 
def get_valid(dataframe):
    """Return only the rows whose `Valid` column equals True (all columns kept)."""
    is_valid = dataframe['Valid'] == True
    return dataframe[is_valid]

# Valid strides in the gait_cycles.csv file 
def get_cycle(dataframe):
    """Crop `dataframe` to the span from its first HSR row through its last MidSSL row.

    Both ends are index labels and both are included in the result.
    Raises ValueError when there is no HSR or no MidSSL event.
    """
    event = dataframe['EventType']
    first_hsr = min(event[event == 'HSR'].index)
    last_midssl = max(event[event == 'MidSSL'].index)
    return dataframe.loc[first_hsr:last_midssl]

# Restore the indexing for the cropped dataframe 
def change_index(dataframe):
    """Renumber the rows of `dataframe` as 0..n-1 in place and return the same object."""
    row_count = len(dataframe)
    dataframe.index = pd.RangeIndex(row_count)
    return dataframe

# get all the valid index in order: HSR-TOL-MidSSR-HSL-TOR-MidSSL
def set_complete(data_frame):
    """Return the index labels of all rows that belong to a complete stride.

    A complete stride is an HSR row (right heel strike) whose next five index
    labels all exist in `data_frame` and carry, in this order, the events
    TOL, MidSSR, HSL, TOR and MidSSL.

    Parameters
    ----------
    data_frame : DataFrame
        Gait events with an `EventType` column and an integer index. It is
        expected to contain ONLY valid points; rows removed earlier show up as
        gaps in the index, and a stride that spans such a gap is rejected.

    Returns
    -------
    list
        Six consecutive labels per complete stride, in the order the strides
        appear. Empty when the frame has no HSR event at all.
    """
    HSR = data_frame.index[data_frame.EventType == 'HSR']
    if len(HSR) == 0:
        # No heel strike means no stride can start; nothing to report
        return []

    # If fewer than 5 labels follow the last HSR, that stride cannot be complete,
    # so it is not even considered
    if (data_frame.index[-1] - HSR[-1]) < 5:
        HSR = HSR[:-1]

    # Events that must follow an HSR, at label offsets +1 .. +5
    following_events = ('TOL', 'MidSSR', 'HSL', 'TOR', 'MidSSL')
    labels = data_frame.index
    events = data_frame.EventType

    valid = []
    for idx_HSR in HSR:
        stride_labels = range(idx_HSR, idx_HSR + 6)
        followers = stride_labels[1:]
        # Every label of the stride has to be present (no dropped row in between)
        if not all(label in labels for label in followers):
            continue
        # ...and the events have to come in the expected order
        if all(events.loc[label] == expected
               for label, expected in zip(followers, following_events)):
            valid.extend(stride_labels)
    # the list of valid indices which form complete strides
    return valid

In [7]:
#Preprocessing the files to delete missing and invalid data 
#For one person (control or MS) and one trial (1 or 2)
def cleaning(pid, trial = 1, cohort = 'controls'):
    """Load one subject's trial and reduce its gait events to complete, valid strides.

    Parameters
    ----------
    pid : int
        Position of the subject inside the cohort's file lists
        (e.g. `gait_controls_t1`), NOT the subject ID itself.
    trial : int
        1 selects the trial-1 file lists; any other value selects the trial-2 lists.
    cohort : str
        'controls' or 'pwms'.

    Returns
    -------
    indices_complete : list
        Labels of the rows forming complete strides, as they were numbered
        before the index of `gait` was reset.
    gait : DataFrame
        Gait events of the complete strides only, re-indexed from 0.
    raw : DataFrame
        The raw data file exactly as read.

    Raises
    ------
    ValueError
        If `cohort` is neither 'controls' nor 'pwms'.
    """
    if cohort not in ('controls', 'pwms'):
        raise ValueError("cohort must be 'controls' or 'pwms', got " + repr(cohort))

    # (gait files, raw files) for trial 1 and for trial 2 of each cohort
    files_by_cohort = {
        'controls': ((gait_controls_t1, raw_controls_t1), (gait_controls_t2, raw_controls_t2)),
        'pwms': ((gait_ms_t1, raw_ms_t1), (gait_ms_t2, raw_ms_t2)),
    }
    gait_files, raw_files = files_by_cohort[cohort][0 if trial == 1 else 1]
    gait = pd.read_csv(gait_files[pid])
    raw = pd.read_csv(raw_files[pid])

    #Deleting missing and invalid gait events
    gait = get_valid(drop_na(gait))

    #Reducing to complete strides data 
    gait = get_cycle(gait)
    indices_complete = set_complete(gait)

    #Resetting the index 
    gait = change_index(gait.loc[indices_complete])
    return indices_complete, gait, raw

## Creating and saving the raw data for original length strides for PIDs and trials of interest

In [8]:
# Folder where every original-length stride is written as its own .csv file
path_to_save_original_strides = 'C:\\Users\\Rachneet Kaur\\Box\\GaitLSTMproject\\raw_treadmill_features\\original_strides\\'
# Columns of the raw data file that are kept for each stride
features_of_interest = ['Speed', 'TreadMill_FZ', 'COPX', 'COPY']
# Log with one row per extracted stride; rows are appended while the strides are extracted
raw_labels = pd.DataFrame(columns = ['PID', 'TrialID', 'StrideIndex', 'FileName', 'label', 'OriginalStrideLength'])

In [9]:
#Extracting the original stride's raw data features 
def stride_original_raw_data(index, pid, trial_id, cohort):
    """Write the raw features of every stride of one subject/trial and log each stride.

    A stride runs from one right heel strike (HSR) up to, but not including,
    the next one. Its rows of `features_of_interest` are saved as
    '<pid>_<trial_id>_<stride number>.csv' in `path_to_save_original_strides`,
    and one row per stride is appended to the global `raw_labels`.

    Strides that are not directly followed by the next complete stride are
    still logged, but with NaN as file name and length, and no file is written.
    The last stride has no closing heel strike and is neither saved nor logged.

    Parameters
    ----------
    index : int
        Position of the subject in the cohort's file lists (passed to `cleaning`).
    pid : int
        Subject ID, used for the file name and the log only.
    trial_id : int
        Trial number (passed to `cleaning`, used for the file name and the log).
    cohort : str
        'controls' gives label 0; anything else gives label 1.

    Errors while slicing or writing a stride are raised instead of being
    silently skipped.
    """
    indices_complete, gait, raw = cleaning(index, trial_id, cohort)
    stride_count = int(gait.shape[0]/6) #6 events in each stride
    label = 0 if cohort == 'controls' else 1

    #Distance between the first labels of successive complete strides.
    #Anything other than 6 means the two strides are not consecutive, so the
    #earlier one has no usable end point and is logged as NaN.
    stride_starts = np.array(indices_complete[::6])
    nan_stride_idx = set(np.flatnonzero(np.diff(stride_starts) != 6).tolist())

    #Exact times of HSR
    HSR_times = gait['Time'][gait.EventType == 'HSR']
    #For each HSR, the first time stamp in the RAWDATA.csv file that lies after it
    raw_time = raw['Time']
    HSR_times_raw = [raw_time[raw_time > hsr_time].iloc[0]
                     for hsr_time in HSR_times.iloc[:stride_count]]

    #Each stride ends at the HSR of the next one, so the last stride cannot be closed
    for idx in range(stride_count - 1):
        if idx in nan_stride_idx:
            original_stride_filename = np.nan
            original_stride_length = np.nan
        else:
            #Rows of the raw data that fall inside the current stride
            in_stride = (raw_time >= HSR_times_raw[idx]) & (raw_time < HSR_times_raw[idx + 1])
            original_stride = raw.loc[in_stride, features_of_interest].reset_index(drop = True)
            original_stride_length = len(original_stride)
            original_stride_filename = str(pid) + '_' + str(trial_id) + '_' + str(idx) + '.csv'
            #Saving the original length strides to csv
            original_stride.to_csv(path_to_save_original_strides + original_stride_filename)
        raw_labels.loc[len(raw_labels)] = [pid, trial_id, idx, original_stride_filename, label, original_stride_length]

In [78]:
cohorts = ['controls', 'pwms']
trials = [1, 2]
#Subjects left out of the analysis
subjects_to_delete = [212, 213, 309, 310, 311]
#Subject IDs of each cohort. The file lists were built from these ID lists, so the
#position of an ID in its list is also the position of that subject's files.
ids_by_cohort = {'controls': control_ids, 'pwms': ms_ids}
for trial_id in trials:
    for cohort in cohorts:
        #Walk over the IDs themselves rather than a fixed count, so the loop
        #always matches the length of the ID lists
        for index, pid in enumerate(ids_by_cohort[cohort]):
            if pid in subjects_to_delete:
                continue
            stride_original_raw_data(index, pid, trial_id, cohort)

#Saving the log of each stride created in this labels file 
raw_labels.to_csv(path_to_save_original_strides + '..\\original_stride_raw_labels.csv')
print ('No. of PIDs in these raw strides: ', len(raw_labels['PID'].unique()))

No. of PIDs in these raw strides:  35


### Downsample with smoothing the original length strides to define fixed shape input tensor for models with raw treadmill features 
* Basically, the downsampled stride's .csv is a windowed mean version (with 30 windows) of the original length stride's .csv.
* We create a downsampled_labels.csv with columns for cohort (HOA/MS), trial (W/WT), file name (212_W_1_25), PID (212), stride number (23), length of the downsampled stride and label (0 - HOA/1-MS).
* The idea is to use this file downsampled_labels.csv to decide the identifiers for the training/testing sets/folds, then load the .csv files of the same names as samples and pull the labels of the corresponding strides from the label column of the labels file. 

In [11]:
#Path for downsampled strides (each file has 30 downsampled/smoothed frames and 4 features)
path_to_save_downsampled_strides = 'C:\\Users\\Rachneet Kaur\\Box\\GaitLSTMproject\\raw_treadmill_features\\downsampled_strides\\'

#Keeping 30 stride-normalized time steps for each stride while downsampling
downsampled_steps_per_stride = 30

#Dataframe to keep track of all the downsampled strides.
#It is a copy of raw_labels: with a plain assignment both names would refer to the
#same object, and the column added below would then also appear in raw_labels.
downsampled_labels = raw_labels.copy()
downsampled_labels['DownsampledStrideLength'] = downsampled_steps_per_stride

In [88]:
#Downsample every original stride by averaging over disjoint windows (this also smooths it)
original_strides = os.listdir(path_to_save_original_strides)
print ('No. of original strides = ', len(original_strides))
for original_stride in original_strides:
    print ('Currently processing ', original_stride)
    original_stride_df = pd.read_csv(path_to_save_original_strides+original_stride, index_col = 0)
    #Cut the n time steps of the stride into 30 windows of approximately equal size
    splits = np.array_split(original_stride_df.index, downsampled_steps_per_stride)
    #One downsampled frame per window: the mean of every feature over that window
    window_means = [original_stride_df.loc[split].mean() for split in splits]
    downsampled_stride = pd.DataFrame(window_means, columns = original_stride_df.columns)
    #The 30 x 4 downsampled stride keeps the file name (unique key) of its original
    downsampled_stride.to_csv(path_to_save_downsampled_strides+original_stride)
#Saving the log of each stride created in this labels file 
downsampled_labels.to_csv(path_to_save_downsampled_strides + '..\\downsampled_stride_raw_labels.csv')

No. of original strides =  3393
Currently processing  200_1_0.csv
Currently processing  200_1_1.csv
Currently processing  200_1_10.csv
Currently processing  200_1_11.csv
Currently processing  200_1_12.csv
Currently processing  200_1_13.csv
Currently processing  200_1_14.csv
Currently processing  200_1_15.csv
Currently processing  200_1_16.csv
Currently processing  200_1_17.csv
Currently processing  200_1_18.csv
Currently processing  200_1_19.csv
Currently processing  200_1_2.csv
Currently processing  200_1_20.csv
Currently processing  200_1_21.csv
Currently processing  200_1_22.csv
Currently processing  200_1_23.csv
Currently processing  200_1_24.csv
Currently processing  200_1_25.csv
Currently processing  200_1_26.csv
Currently processing  200_1_27.csv
Currently processing  200_1_28.csv
Currently processing  200_1_29.csv
Currently processing  200_1_3.csv
Currently processing  200_1_30.csv
Currently processing  200_1_31.csv
Currently processing  200_1_32.csv
Currently processing  200_1

### Creating the grouped 5 strides from the downsampled strides, such that we have 5x30 = 150 time steps and 4 features for each grouped stride for multi-stride analysis on raw treadmill features 
* Make sure to not mix trials or PIDs when creating the grouped strides 
* We only create grouped strides if the 5 strides were consecutive 
* The gait domain knowledge features have 1437 sequences of 5 strides each

In [18]:
#Reload the stride log from disk and make sure the subject and trial identifiers are integers
labels_csv = path_to_save_downsampled_strides + '..\\downsampled_stride_raw_labels.csv'
downsampled_labels = pd.read_csv(labels_csv, index_col = 0).astype({'PID': int, 'TrialID': int})

In [19]:
#Folder where every group of strides is written as one .csv file
path_to_save_grouped_strides = 'C:\\Users\\Rachneet Kaur\\Box\\GaitLSTMproject\\raw_treadmill_features\\grouped_5strides\\'

#Keeping 5 strides per group
strides_per_sequence = 5
#The start of the window advances by 2 strides from one candidate group to the next,
#so with 5 strides per group neighbouring windows share 3 stride positions
skippedSteps = 2

#Dataframe to keep track of all the grouped strides (one row per grouped file)
grouped_labels = pd.DataFrame(columns = ['PID', 'TrialID', 'GroupIndex', 'FileName', 'label', 'StridesInThisGroup'])

In [20]:
PIDs = downsampled_labels.PID.unique()
trials = downsampled_labels.TrialID.unique()
for PID in PIDs:
    #IDs below 300 get label 0, all others label 1
    label = 0 if PID<300 else 1
    for trial in trials:
        print ('Processing PID ', PID, ' and trial ', trial)
        #Strides of this subject and this trial only, so groups never mix PIDs or trials
        same_pid = downsampled_labels.PID == PID
        same_trial = downsampled_labels.TrialID == trial
        snip = downsampled_labels[same_pid & same_trial]
        group_index = 0
        #Slide a window of strides_per_sequence strides over the trial
        for index in range(0, len(snip)-strides_per_sequence+1, skippedSteps):
            snippet = snip[index: index+strides_per_sequence]['FileName']
            #A missing file name marks a stride that is not consecutive with its successor
            if snippet.isna().any():
                print ('This is a NaN/inconsequetive strides group')
                continue
            filenames = list(snippet)
            #Stack the strides of the window on top of each other, in order
            stride_frames = [pd.read_csv(path_to_save_downsampled_strides+filename, index_col = 0)
                             for filename in filenames]
            temp = pd.concat(stride_frames, ignore_index=True)
            grouped_filename = str(PID)+ '_' + str(trial) + '_' + str(group_index) + '.csv'
            temp.to_csv(path_to_save_grouped_strides + grouped_filename)
            grouped_labels.loc[len(grouped_labels)] = [PID, trial, group_index, grouped_filename, label, filenames]
            group_index+=1
grouped_labels.to_csv(path_to_save_grouped_strides + '..\\grouped_labels.csv')

Processing PID  200  and trial  1
Processing PID  200  and trial  2
Processing PID  201  and trial  1
Processing PID  201  and trial  2
This is a NaN/inconsequetive strides group
This is a NaN/inconsequetive strides group
Processing PID  202  and trial  1
Processing PID  202  and trial  2
Processing PID  203  and trial  1
This is a NaN/inconsequetive strides group
This is a NaN/inconsequetive strides group
This is a NaN/inconsequetive strides group
Processing PID  203  and trial  2
Processing PID  204  and trial  1
This is a NaN/inconsequetive strides group
Processing PID  204  and trial  2
Processing PID  205  and trial  1
This is a NaN/inconsequetive strides group
Processing PID  205  and trial  2
Processing PID  206  and trial  1
Processing PID  206  and trial  2
Processing PID  207  and trial  1
Processing PID  207  and trial  2
Processing PID  208  and trial  1
Processing PID  208  and trial  2
Processing PID  209  and trial  1
Processing PID  209  and trial  2
This is a NaN/incon

In [None]:
# Number of rows in the grouped-strides log, i.e. number of grouped files that were written
print (len(grouped_labels))