## GAAIMS: Predicting Multiple Sclerosis from Dynamics of Gait Variability Using an Instrumented Treadmill - A Machine Learning-Based Approach
## Feature engineering 

### Package imports

In [90]:
import numpy as np
import pandas as pd
import math
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [91]:
# Folder holding the exported *_RAWDATA.csv and *_GAITCYCLES.csv files.
# The trailing separator is required because file names are appended to it by
# plain string concatenation. Machine-specific: adjust before running.
path = 'C:\\Users\\Rachneet Kaur\\Dropbox\\GAIT\\sample_data\\data_export\\'

In [92]:
# Build the full path of every exported file in the data directory
control_ids = list(range(200, 220))
ms_ids = list(range(300, 315)) + [318, 320, 321, 322, 323]


def _export_paths(subject_ids, suffix):
    """Return one path per subject id, formed as path + id + suffix."""
    return [path + str(subject_id) + suffix for subject_id in subject_ids]


# Raw treadmill recordings, controls then MS subjects, trial 1 and trial 2
raw_controls_t1 = _export_paths(control_ids, '_B3_TRIAL01_RAWDATA.csv')
raw_controls_t2 = _export_paths(control_ids, '_B3_TRIAL02_RAWDATA.csv')

raw_ms_t1 = _export_paths(ms_ids, '_B3_TRIAL01_RAWDATA.csv')
raw_ms_t2 = _export_paths(ms_ids, '_B3_TRIAL02_RAWDATA.csv')

# Gait-event files, same ordering as the raw lists above
gait_controls_t1 = _export_paths(control_ids, '_B3_TRIAL01_GAITCYCLES.csv')
gait_controls_t2 = _export_paths(control_ids, '_B3_TRIAL02_GAITCYCLES.csv')

gait_ms_t1 = _export_paths(ms_ids, '_B3_TRIAL01_GAITCYCLES.csv')
gait_ms_t2 = _export_paths(ms_ids, '_B3_TRIAL02_GAITCYCLES.csv')

In [93]:
# In every GaitCycle file a walking sequence starts with a heel strike of the
# right foot, so the gait events of one stride are ordered
# HSR, TOL, MidSSR, HSL, TOR, MidSSL.
gait_type = np.array([
    'HSR',
    'TOL',
    'MidSSR',
    'HSL',
    'TOR',
    'MidSSL',
])

# Time between two consecutive samples: the data is collected at 500 Hz
delta_time = 1.0 / 500

### Utility functions

In [94]:
#functions to drop missing values and invalid data 
def drop_unnamed(dataframe):
    """Return a copy of ``dataframe`` without the two 'Unnamed' columns.

    Raises KeyError when either column is absent.
    """
    leftover_columns = ['Unnamed: 0', 'Unnamed: 0.1']
    return dataframe.drop(columns=leftover_columns)

#Eliminate missing values
def drop_na(dataframe):
    """Return ``dataframe`` without the rows that contain a missing value."""
    return dataframe.dropna()

#Eliminate invalid data 
def get_valid(dataframe):
    """Return the rows of ``dataframe`` whose 'Valid' column equals True."""
    is_valid = dataframe['Valid'] == True
    return dataframe[is_valid]

# Valid strides in the gait_cycles.csv file 
def get_cycle(dataframe):
    """Crop ``dataframe`` to the rows from the first HSR to the last MidSSL.

    Both boundary rows are included (label-based slice).
    """
    events = dataframe['EventType']
    first_hsr = min(events[events == 'HSR'].index)
    last_midssl = max(events[events == 'MidSSL'].index)
    return dataframe.loc[first_hsr:last_midssl]

# Restore the indexing for the cropped dataframe 
def change_index(dataframe):
    """Renumber the index of ``dataframe`` to 0..n-1 in place and return it."""
    row_count = len(dataframe)
    dataframe.index = pd.RangeIndex(row_count)
    return dataframe

# get all the valid index in order: HSR-TOL-MidSSR-HSL-TOR-MidSSL
def set_complete(data_frame):
    """Return the index labels of the rows that form complete strides.

    A stride is complete when an HSR row at label ``i`` is followed by rows
    labelled ``i+1`` .. ``i+5`` that all exist in ``data_frame`` and carry the
    events TOL, MidSSR, HSL, TOR, MidSSL in that order.

    Parameters
    ----------
    data_frame : DataFrame holding ONLY valid gait events, with an integer
        index and an 'EventType' column.

    Returns
    -------
    list of int
        The six labels of every complete stride, stride after stride. Empty
        when ``data_frame`` contains no HSR event.
    """
    events = data_frame['EventType']
    # Each stride starts with a heel strike of the right foot (HSR)
    hsr_labels = events[events == 'HSR'].index
    if len(hsr_labels) == 0:
        # Nothing to anchor a stride on (previously an IndexError)
        return []

    # If fewer than 5 labels separate the last HSR from the last row, that
    # stride cannot be complete, so the last HSR is not considered.
    if (data_frame.index[-1] - hsr_labels[-1]) < 5:
        hsr_labels = hsr_labels[:-1]

    following_events = ('TOL', 'MidSSR', 'HSL', 'TOR', 'MidSSL')
    valid = []
    for hsr_label in hsr_labels:
        following_labels = range(hsr_label + 1, hsr_label + 6)
        # All five follow-up labels must exist (no row was dropped in between) ...
        if not all(label in data_frame.index for label in following_labels):
            continue
        # ... and must hold the expected events in the expected order
        if all(events.loc[label] == event
               for label, event in zip(following_labels, following_events)):
            valid.extend(range(hsr_label, hsr_label + 6))
    # List of valid indices which form complete strides
    return valid

### Data Preprocessing

In [95]:
#Preprocessing the files to delete missing and invalid data 
#For one person (control or MS) in the requested trial
def cleaning(pid, trial = 1, cohort = 'controls'):
    """Load one subject's files and reduce the gait events to complete strides.

    Parameters
    ----------
    pid : position of the subject in the cohort's file lists (not the subject ID).
    trial : 1 selects the trial-1 files; any other value selects the trial-2 files.
    cohort : 'controls' or 'pwms'.

    Returns
    -------
    indices_complete : labels (before re-indexing) of the rows forming complete strides.
    gait : gait events without missing/invalid rows, reduced to complete strides
        and re-indexed from 0.
    raw : the raw data file, as read.

    Raises
    ------
    ValueError
        If ``cohort`` is neither 'controls' nor 'pwms'.
    """
    if cohort == 'controls':
        if trial == 1:
            gait_file, raw_file = gait_controls_t1[pid], raw_controls_t1[pid]
        else:
            gait_file, raw_file = gait_controls_t2[pid], raw_controls_t2[pid]
    elif cohort == 'pwms':
        if trial == 1:
            gait_file, raw_file = gait_ms_t1[pid], raw_ms_t1[pid]
        else:
            gait_file, raw_file = gait_ms_t2[pid], raw_ms_t2[pid]
    else:
        # Fail early with a clear message instead of an UnboundLocalError below
        raise ValueError("cohort must be 'controls' or 'pwms', got %r" % (cohort,))

    gait = pd.read_csv(gait_file)
    raw = pd.read_csv(raw_file)

    #Deleting missing and invalid data
    gait = drop_na(gait)
    gait = get_valid(gait)

    #Reducing to complete strides data
    gait = get_cycle(gait)
    indices_complete = set_complete(gait)
    gait = gait.loc[indices_complete]

    #Resetting the index
    gait = change_index(gait)
    return indices_complete, gait, raw

### Our gait cycle would be HSR, TOL, MidSSR, HSL, TOR and MidSSL

### Supporting times

In [96]:
#Supporting Times
#Double support: HSR-TOL
#Single support (Right): TOL-MidSSR-HSL
#Double Support: HSL-TOR
#Single support (Left): TOR - MidSSL-HSR (of the next stride)
# Note, for counting supporting time of a foot for current stride, we need the HSR for the next stride
def get_cycle_double_single(Dataframe):
    """Crop ``Dataframe`` to the rows from its first HSR to its last HSR.

    Both boundary rows are included (label-based slice).
    """
    hsr_labels = Dataframe.index[(Dataframe['EventType'] == 'HSR').values]
    first_hsr = min(hsr_labels)
    last_hsr = max(hsr_labels)
    return Dataframe.loc[first_hsr:last_hsr]

# delete the 'mid' points for calculating supporting time for convenience
def delete_mid(Dataframe):
    """Return ``Dataframe`` without its MidSSL and MidSSR rows.

    The remaining rows keep their original index labels. A boolean mask is used
    instead of ``pd.Int64Index`` (removed in pandas 2.0), so the index does not
    have to be 0..n-1.
    """
    is_mid = Dataframe['EventType'].isin(['MidSSL', 'MidSSR'])
    return Dataframe.loc[~is_mid]

#This function computes the 4 features for supporting times ,namely, double support (on right and left heels) and single support 
#(on left and right foot) 
def support(gait):
    """Compute the four supporting times of every stride.

    The events are cropped to first HSR .. last HSR, re-indexed, and the
    MidSSR/MidSSL rows are removed, leaving HSR-TOL-HSL-TOR repeated. The time
    differences between consecutive events are then dealt out in groups of 4.

    Returns
    -------
    Four lists, in this order:
        double support HSR-TOL, single support right TOL-HSL,
        double support HSL-TOR, single support left TOR-HSR (next stride).

    NOTE(review): differences are taken between neighbouring rows of ``gait``;
    if the strides in ``gait`` are not consecutive, the TOR-HSR interval spans
    the gap -- confirm this is handled downstream.
    """
    # First HSR to last HSR, renumbered so the mid-stance rows can be removed
    events = delete_mid(change_index(get_cycle_double_single(gait)))

    # Elapsed time between each event and the next one
    stamps = np.array(list(events['Time']))
    intervals = list(stamps[1:] - stamps[:-1])

    # Events cycle as HSR-TOL-HSL-TOR, so every 4th interval belongs to the same phase
    double_hsr_tol, single_right, double_hsl_tor, single_left = (
        intervals[offset::4] for offset in range(4)
    )
    return double_hsr_tol, single_right, double_hsl_tor, single_left

### Treadmill self-controlled speed and Ground reaction forces at gait events 

In [97]:
#Function returning the treadmill speed or ground reaction forces at 6 gait events 
def tspeeds_forces(gait, raw, stride_count, feature = 'Speed'):
    """Sample a raw-data column at the six gait events of every stride.

    For each event time in ``gait``, the value of ``raw[feature]`` is taken at
    the first raw sample whose 'Time' is strictly greater than the event time.

    Parameters
    ----------
    gait : gait events with 'EventType' and 'Time' columns.
    raw : raw data with a 'Time' column and the ``feature`` column.
    stride_count : number of events of each type to sample.
    feature : raw column to read, e.g. 'Speed' or 'TreadMill_FZ'.

    Returns
    -------
    Six lists of length ``stride_count``, ordered
    HSR, MidSSR, TOR, HSL, TOL, MidSSL.
    """
    raw_times = raw['Time']
    raw_values = raw[feature]

    def sample_at(event):
        # Exact times of this event, then the raw value right after each of them
        event_times = gait['Time'][gait.EventType == event]
        return [raw_values[raw_times > event_times.iloc[stride]].iloc[0]
                for stride in range(stride_count)]

    output_order = ('HSR', 'MidSSR', 'TOR', 'HSL', 'TOL', 'MidSSL')
    return tuple(sample_at(event) for event in output_order)

### Stride, swing and stance times for each stride 

In [98]:
#Function computing the stride, swing and stance times for each stride 
def times(gait):
    """Compute the stride, swing and stance times from the gait events.

    Parameters
    ----------
    gait : gait events with 'EventType' and 'Time' columns, holding the same
        number of HSR and TOR events.

    Returns
    -------
    stride_times : next HSR time - current HSR time (one fewer than the strides).
    swing_times : next HSR time - current TOR time (one fewer than the strides).
    stance_times : current TOR time - current HSR time (one per stride).
    """
    event_times = gait['Time']
    # Work on plain arrays so that slicing is positional
    hsr_times = event_times[gait.EventType == 'HSR'].values
    tor_times = event_times[gait.EventType == 'TOR'].values

    #Stride Time = Next HSR Time - Current HSR Time
    stride_times = hsr_times[1:] - hsr_times[:-1]

    #Swing time = Next HSR time - Current TOR time
    swing_times = hsr_times[1:] - tor_times[:-1]

    #Stance time = Current TOR time - Current HSR time
    stance_times = tor_times - hsr_times

    return stride_times, swing_times, stance_times

### Stride length

In [99]:
#Function returning the difference between consecutive Y-coordinates 
def stride_len(y1, y2):
    """Return the signed distance from ``y1`` to ``y2``, i.e. ``y2 - y1``."""
    difference = y2 - y1
    return difference

def length(gait, raw, indices_complete, stride_count, dt=0.002):
    """Compute the stride length (HSR to next HSR) of every stride but the last.

    The Y-coordinate of the next heel strike is corrected by the distance the
    belt travelled in between, obtained by integrating ``raw['Speed']`` over the
    raw samples from the current to the next heel strike.

    Parameters
    ----------
    gait : gait events with 'EventType', 'Time' and 'Y' columns.
    raw : raw data with 'Time' and 'Speed' columns.
    indices_complete : original index labels of the events in ``gait``, six per
        stride; used to detect strides that do not directly follow each other.
    stride_count : number of strides in ``gait``.
    dt : time between two raw samples in seconds (0.002 for 500 Hz data).

    Returns
    -------
    numpy array of ``stride_count - 1`` floats; NaN where the next stride in
    ``gait`` is not the directly following one.
    """
    # np.trapz is deprecated in NumPy 2.x in favour of np.trapezoid
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    is_hsr = gait['EventType'] == 'HSR'
    hsr_times = gait['Time'][is_hsr]
    hsr_y = gait['Y'][is_hsr].values

    raw_time = raw['Time']
    raw_speed = raw['Speed']

    #For each HSR, the time of the first raw sample after it
    hsr_times_raw = [raw_time[raw_time > hsr_times.iloc[i]].iloc[0]
                     for i in range(stride_count)]

    #Belt travel = area under the speed curve * dt, from HSR(i) to HSR(i+1).
    #Pairing consecutive heel strikes replaces the former bare try/except
    #that relied on an IndexError to stop at the last stride.
    belt_travel = np.array(
        [trapezoid(raw_speed[(raw_time >= start) & (raw_time < stop)].values) * dt
         for start, stop in zip(hsr_times_raw[:-1], hsr_times_raw[1:])],
        dtype=float)

    #Next HSR's Y shifted by the belt travel, minus the current HSR's Y
    stride_lengths = ((hsr_y[1:] + belt_travel) - hsr_y[:-1]).astype(float)

    #Labels of consecutive strides differ by exactly 6; any other difference
    #means strides were dropped in between, so no length can be computed
    stride_starts = np.asarray(indices_complete[::6])
    not_consecutive = np.flatnonzero(np.diff(stride_starts) != 6)
    stride_lengths[not_consecutive] = np.nan
    return stride_lengths

### Stride width

In [100]:
#Stride Width = abs((x2-x1)*(y1-y0) - (x1-x0)*(y2-y1)) / np.sqrt(np.square(x2-x1) + np.square(y2-y1))
#where (x0, y0) is the point from HS of opposite feet (i.e. coordinates of HSL)
#and (x1,y1), (x2,y2) are coordinates that make the line joining HS(i-1) and HS(i) of same feet
def stride_wid(x0, y0, x1, y1, x2, y2):
    """Return the distance of point (x0, y0) from the line through (x1, y1) and (x2, y2)."""
    run = x2 - x1
    rise = y2 - y1
    cross = run*(y1-y0) - (x1-x0)*rise
    return np.abs(cross) / np.sqrt(run**2 + rise**2)

In [101]:
#Function returning the stride width
def calc_width(gait, raw, indices_complete, stride_count, dt=0.002):
    """Compute the stride width of every stride but the last.

    The width is the distance of the left heel strike (HSL) from the line
    joining the current and the next right heel strike (HSR). The Y-coordinates
    of the later events are corrected by the distance the belt travelled since
    the current HSR, obtained by integrating ``raw['Speed']``.

    Parameters
    ----------
    gait : gait events with 'EventType', 'Time', 'X' and 'Y' columns.
    raw : raw data with 'Time' and 'Speed' columns.
    indices_complete : original index labels of the events in ``gait``, six per
        stride; used to detect strides that do not directly follow each other.
    stride_count : number of strides in ``gait``.
    dt : time between two raw samples in seconds (0.002 for 500 Hz data).

    Returns
    -------
    numpy array of ``stride_count - 1`` floats; NaN where the next stride in
    ``gait`` is not the directly following one.
    """
    # np.trapz is deprecated in NumPy 2.x in favour of np.trapezoid
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    is_hsr = gait['EventType'] == 'HSR'
    is_hsl = gait['EventType'] == 'HSL'
    hsr_times = gait['Time'][is_hsr]
    hsl_times = gait['Time'][is_hsl]

    raw_time = raw['Time']
    raw_speed = raw['Speed']

    #For each HSR / HSL, the time of the first raw sample after it
    hsr_times_raw = [raw_time[raw_time > hsr_times.iloc[i]].iloc[0]
                     for i in range(stride_count)]
    hsl_times_raw = [raw_time[raw_time > hsl_times.iloc[i]].iloc[0]
                     for i in range(stride_count)]

    #Coordinates of the heel strikes
    hsr_x = gait['X'][is_hsr].values
    hsr_y = gait['Y'][is_hsr].values
    hsl_x = gait['X'][is_hsl].values
    hsl_y = gait['Y'][is_hsl].values

    def belt_travel(start, stop):
        #Area under the speed curve * dt for the raw samples in [start, stop)
        return trapezoid(raw_speed[(raw_time >= start) & (raw_time < stop)].values) * dt

    #One entry per stride that has a following stride. Iterating over the
    #pairs replaces the former bare try/except that relied on an IndexError
    #to stop at the last stride.
    travel_to_next_hsr = np.array(
        [belt_travel(start, stop)
         for start, stop in zip(hsr_times_raw[:-1], hsr_times_raw[1:])],
        dtype=float)
    travel_to_hsl = np.array(
        [belt_travel(start, stop)
         for start, stop in zip(hsr_times_raw[:-1], hsl_times_raw[:-1])],
        dtype=float)

    #Right foot: next HSR's Y shifted by the belt travel since the current HSR
    rel_hsr_y = hsr_y[1:] + travel_to_next_hsr
    #Left foot: HSL's Y shifted by the belt travel since the same stride's HSR
    rel_hsl_y = hsl_y[:-1] + travel_to_hsl

    #Stride width HSR-HSL-NextHSR (stride_wid works element-wise on arrays)
    width = np.asarray(
        stride_wid(hsl_x[:-1], rel_hsl_y, hsr_x[:-1], hsr_y[:-1], hsr_x[1:], rel_hsr_y),
        dtype=float)

    #Labels of consecutive strides differ by exactly 6; any other difference
    #means strides were dropped in between, so no width can be computed
    stride_starts = np.asarray(indices_complete[::6])
    not_consecutive = np.flatnonzero(np.diff(stride_starts) != 6)
    width[not_consecutive] = np.nan
    return width

### Computing all the gait-based features

In [102]:
#Appending the 24 features 
def gait_features(pid, trial_id, cohort):
    """Build the per-stride feature table of one subject and trial.

    Parameters
    ----------
    pid : position of the subject in the cohort's file lists.
    trial_id : trial number, passed on to ``cleaning``.
    cohort : cohort name, passed on to ``cleaning``.

    Returns
    -------
    DataFrame with one row per complete stride and the columns
    DS_R, SS_R, DS_L, SS_L, tspeed_*, force_*, stride_time, swing_time,
    stance_time, stride_length, stride_width, stride_speed, cadence, walk_ratio.
    Features that need the following stride are NaN in the last row.

    NOTE(review): stride_length and stride_width are set to NaN when the next
    stride is not the directly following one; the time-based features are not
    masked here -- confirm whether that is intended.
    """
    indices_complete, gait, raw = cleaning(pid, trial_id, cohort)
    stride_count = len(gait) // 6  #6 events in each stride

    def pad_last(values):
        #The last stride has no following stride, so its value is NaN
        return np.append(values, np.nan)

    df = pd.DataFrame()

    #Supporting times (SS_L needs the HSR of the next stride)
    support_columns = ['DS_R', 'SS_R', 'DS_L', 'SS_L']
    for column, values in zip(support_columns, support(gait)):
        df[column] = pad_last(values)

    #Treadmill speeds at the six gait events
    speed_columns = ['tspeed_HSR', 'tspeed_MidSSR', 'tspeed_TOR',
                     'tspeed_HSL', 'tspeed_TOL', 'tspeed_MidSSL']
    for column, values in zip(speed_columns, tspeeds_forces(gait, raw, stride_count, 'Speed')):
        df[column] = values

    #Ground reaction forces at the six gait events
    force_columns = ['force_HSR', 'force_MidSSR', 'force_TOR',
                     'force_HSL', 'force_TOL', 'force_MidSSL']
    for column, values in zip(force_columns, tspeeds_forces(gait, raw, stride_count, 'TreadMill_FZ')):
        df[column] = values

    #Stride and swing times need the next stride; stance time does not
    stride_time, swing_time, stance_time = times(gait)
    df['stride_time'] = pad_last(stride_time)
    df['swing_time'] = pad_last(swing_time)
    df['stance_time'] = stance_time

    #Stride length and width, both undefined for the last stride
    df['stride_length'] = pad_last(length(gait, raw, indices_complete, stride_count))
    df['stride_width'] = pad_last(calc_width(gait, raw, indices_complete, stride_count))

    #Stride speed
    df['stride_speed'] = df['stride_length']/df['stride_time']

    #Cadence in steps per minute: 1 stride has 2 steps and stride time is in seconds
    df['cadence'] = 60*2/df['stride_time']

    #Walk ratio = stride_length/(strides per minute), with strides per minute = cadence/2
    df['walk_ratio'] = 2*df['stride_length']/df['cadence']
    return df

In [103]:
#Dataframe with subject ID, trial ID and the per-stride gait features of every subject and trial
cohorts = ['controls', 'pwms']
trials = [1, 2]
cohort_ids = {'controls': control_ids, 'pwms': ms_ids}

#Collect one frame per (trial, cohort, subject) and concatenate once at the end:
#DataFrame.append was removed in pandas 2.0 and copied the accumulated frame on every call
frames = []
for trial_id in trials:
    for cohort in cohorts:
        #index is the position in the cohort's file lists, pid the actual subject ID
        for index, pid in enumerate(cohort_ids[cohort]):
            df = gait_features(index, trial_id, cohort)

            temp_df = pd.DataFrame({'PID': [pid]*len(df), 'TrialID': [trial_id]*len(df)})
            temp_df = pd.concat([temp_df, df], axis = 1)
            frames.append(temp_df)

final_df = pd.concat(frames, ignore_index = True)

In [171]:
#Combine the FPA columns (including the labels), the butterfly features and the
#gait features side by side into one final raw dataframe
FPAs = pd.read_csv(path+'..\\FPA\\FPA_feature.csv')
butterfly = pd.read_csv(path+'..\\..\\ButterflyFeatures.csv')
fpa_columns = FPAs[['LeftFPA', 'RightFPA', 'Label']]
butterfly_columns = butterfly[['Butterfly_x_abs', 'Butterfly_y_abs', 'ButterflySQ_x', 'ButterflySQ_y']]
whole_df = pd.concat([fpa_columns, butterfly_columns, final_df], axis = 1)

#Delete the subjects who were holding the handrail, to remove the bias involved in the forces
subjects_to_delete = [212, 213, 309, 310, 311]
index_to_delete = whole_df[whole_df['PID'].isin(subjects_to_delete)].index
whole_df.drop(index_to_delete, inplace=True)

#Save all trials to .csv
whole_df.to_csv(path + '..\\..\\gait_features.csv')

#Save trial 1 to .csv
whole_df_trial1 = whole_df.loc[whole_df['TrialID'].eq(1)]
whole_df_trial1.to_csv(path + '..\\..\\gait_features_trial1.csv')

#Save trial 2 to .csv
whole_df_trial2 = whole_df.loc[whole_df['TrialID'].eq(2)]
whole_df_trial2.to_csv(path + '..\\..\\gait_features_trial2.csv')

In [174]:
#Report the (rows, columns) shape of the trial 1, trial 2 and combined feature tables
shapes = [whole_df_trial1.shape, whole_df_trial2.shape, whole_df.shape]
print(*shapes)

(1776, 33) (1710, 33) (3486, 33)
