# Predicting Multiple Sclerosis from Dynamics of Gait Variability - A Deep Learning Approach
### Creating the summary statistics files for the raw, size-N and regress-N features to be used by traditional ML algorithms on task/subject generalization frameworks 
This code generates summary statistics (namely mean and standard deviation) over multiple strides of raw, size-normalized and regression-normalized gait features. \
Traditional ML algorithms are sample-to-label classifiers: they infer a single label for each data sample, where a sample may have multiple dimensions corresponding to different features. To keep the dimensionality in check and thereby reduce the risk of overfitting, we compute aggregated gait parameters over the multiple strides in each window. \
From 21 gait features across the strides of each window (the number of strides per window is set by `strides_per_sequence_` below), we create a set of 42 mean and standard deviation parameters for each window. 

In [7]:
from importlib import reload
import utils.utils_traditional_methods
reload(utils.utils_traditional_methods)
from utils.utils_traditional_methods import read_data, genarate_sequences, generate_summary_dataframe
from utils.package_imports import *

In [8]:
# Load the raw, size-N and regress-N feature tables and preview each one.
# To check a table for missing values, inspect e.g. raw_data.isnull().values.any()


def _preview(title, frame):
    """Print `title`, then display the first rows and the shape of `frame`."""
    print(title)
    display(frame.head(), frame.shape)


# Raw features
raw_data = read_data("data/gait_features.csv")
_preview('raw data', raw_data)

# Size-normalized features
sizeN_data = read_data("data/size_normalized_gait_features.csv")
_preview('size-N data', sizeN_data)

# Regression-normalized features (read with drop_cols disabled)
regressN_data = read_data(
    "data/mr_scaled_features_30controlsTrialW.csv",
    drop_cols=False,
)
_preview('regress-N data', regressN_data)

# Strides to consider in one sequence/group
strides_per_sequence_ = 5

raw data


Unnamed: 0,LeftFPA,RightFPA,Label,Butterfly_x_abs,ButterflySQ_x,PID,TrialID,DS_R,SS_R,DS_L,...,force_MidSSL,stride_time,swing_time,stance_time,stride_length,stride_width,stride_speed,cadence,walk_ratio,id
0,,,0,0.031715,0.001013,200,1,0.408,0.296,0.486,...,668.865206,1.554,0.364,1.19,0.555127,0.20092,0.357225,77.220077,0.014378,2001
1,0.454459,1.03904,0,0.013971,0.000198,200,1,0.41,0.454,0.408,...,715.837557,1.698,0.426,1.272,0.608529,0.204193,0.35838,70.671378,0.017221,2001
2,-0.267919,-2.942001,0,0.013155,0.000176,200,1,0.466,0.362,0.286,...,676.951255,1.726,0.612,1.114,0.48389,0.207683,0.280354,69.524913,0.01392,2001
3,0.669773,3.611119,0,0.018508,0.000347,200,1,0.342,0.444,0.334,...,683.155797,1.546,0.426,1.12,0.656292,0.217538,0.424509,77.619664,0.01691,2001
4,-8.515633,3.855825,0,0.028962,0.000832,200,1,0.324,0.59,0.292,...,733.838192,1.8,0.594,1.206,0.609956,0.184868,0.338864,66.666667,0.018299,2001


(3486, 25)

size-N data


Unnamed: 0,PID,TrialID,Label,stride_length,stride_width,DS_R,SS_R,DS_L,stride_time,stance_time,...,force_TOL,force_MidSSL,cadence,stride_speed,walk_ratio,LeftFPA,RightFPA,Butterfly_x_abs,ButterflySQ_x,id
0,200,1,0,0.346955,0.125575,1.010264,0.732936,1.203402,3.847916,2.946602,...,0.977063,0.895952,0.519762,0.090167,1.335052,,,0.300633,0.021467,2001
1,200,1,0,0.38033,0.127621,1.015216,1.124166,1.010264,4.204479,3.149645,...,0.96676,0.958872,0.475683,0.090458,1.599091,0.454459,1.03904,0.132927,0.003906,2001
2,200,1,0,0.302431,0.129802,1.153879,0.896361,0.708175,4.273811,2.758416,...,0.978692,0.906784,0.467966,0.070764,1.292535,-0.267919,-2.942001,0.060477,0.001183,2001
3,200,1,0,0.410182,0.135961,0.846839,1.099404,0.827029,3.828106,2.773272,...,0.94886,0.915095,0.522452,0.10715,1.570221,0.669773,3.611119,0.114006,0.002811,2001
4,200,1,0,0.381222,0.115542,0.802268,1.46092,0.723032,4.457045,2.98622,...,0.934266,0.982984,0.448728,0.085533,1.699125,-8.515633,3.855825,0.047291,0.000773,2001


(3486, 25)

regress-N data


Unnamed: 0,ButterflySQ_x,Butterfly_x_abs,DS_L,DS_R,Label,LeftFPA,PID,RightFPA,SS_R,TrialID,...,force_TOL,force_TOR,stance_time,stride_length,stride_speed,stride_time,stride_width,swing_time,walk_ratio,id
0,,,,,,,200,,,1,...,,,,,,,,,,2001
1,13.720482,5.516202,2.039117,2.094537,0.0,1.298406,200,4.360225,1.189094,1,...,0.942396,0.906447,1.626875,0.524177,0.352733,1.484333,1.605389,1.121805,0.76872,2001
2,4.157315,2.50967,1.429381,2.38062,0.0,-0.765453,200,-12.345805,0.948133,1,...,0.954028,0.895213,1.424794,0.416816,0.275937,1.50881,1.632826,1.611607,0.621351,2001
3,9.873948,4.731046,1.669277,1.74715,0.0,1.913565,200,15.153691,1.162903,1,...,0.924948,0.909291,1.432468,0.565319,0.417821,1.35146,1.710305,1.121805,0.754842,2001
4,2.713923,1.962483,1.459368,1.655195,0.0,-24.329473,200,16.180574,1.545299,1,...,0.910721,0.937886,1.542461,0.525407,0.333526,1.573498,1.453448,1.564206,0.816809,2001


(3486, 25)

### Generating sequences of multiple strides 

In [9]:
# Group the per-stride rows of each table into multi-stride sequences.
# The number of strides per sequence comes from strides_per_sequence_.
# NOTE(review): skipped_steps=2 is passed straight to genarate_sequences; its
# exact meaning is defined in utils.utils_traditional_methods - confirm there.
_sequence_kwargs = {
    "strides_per_sequence": strides_per_sequence_,
    "skipped_steps": 2,
}

# Raw data sequences
print('raw data')
sequenecs_raw_data = genarate_sequences(raw_data, **_sequence_kwargs)
display(sequenecs_raw_data[0])

# Size-N data sequences
print('size-N data')
sequenecs_sizeN_data = genarate_sequences(sizeN_data, **_sequence_kwargs)
display(sequenecs_sizeN_data[0])

# Regress-N data sequences
print('regress-N data')
sequenecs_regressN_data = genarate_sequences(regressN_data, **_sequence_kwargs)
display(sequenecs_regressN_data[0])

raw data
dropped sequences:  225


Unnamed: 0,LeftFPA,RightFPA,Label,Butterfly_x_abs,ButterflySQ_x,PID,TrialID,DS_R,SS_R,DS_L,...,force_MidSSL,stride_time,swing_time,stance_time,stride_length,stride_width,stride_speed,cadence,walk_ratio,id
1,0.454459,1.03904,0,0.013971,0.000198,200,1,0.41,0.454,0.408,...,715.837557,1.698,0.426,1.272,0.608529,0.204193,0.35838,70.671378,0.017221,2001
2,-0.267919,-2.942001,0,0.013155,0.000176,200,1,0.466,0.362,0.286,...,676.951255,1.726,0.612,1.114,0.48389,0.207683,0.280354,69.524913,0.01392,2001
3,0.669773,3.611119,0,0.018508,0.000347,200,1,0.342,0.444,0.334,...,683.155797,1.546,0.426,1.12,0.656292,0.217538,0.424509,77.619664,0.01691,2001
4,-8.515633,3.855825,0,0.028962,0.000832,200,1,0.324,0.59,0.292,...,733.838192,1.8,0.594,1.206,0.609956,0.184868,0.338864,66.666667,0.018299,2001
5,13.063372,-5.439336,0,0.003943,1.6e-05,200,1,0.392,0.404,0.31,...,705.890064,1.514,0.408,1.106,0.638237,0.255631,0.421557,79.260238,0.016105,2001


size-N data
dropped sequences:  225


Unnamed: 0,PID,TrialID,Label,stride_length,stride_width,DS_R,SS_R,DS_L,stride_time,stance_time,...,force_TOL,force_MidSSL,cadence,stride_speed,walk_ratio,LeftFPA,RightFPA,Butterfly_x_abs,ButterflySQ_x,id
1,200,1,0,0.38033,0.127621,1.015216,1.124166,1.010264,4.204479,3.149645,...,0.96676,0.958872,0.475683,0.090458,1.599091,0.454459,1.03904,0.132927,0.003906,2001
2,200,1,0,0.302431,0.129802,1.153879,0.896361,0.708175,4.273811,2.758416,...,0.978692,0.906784,0.467966,0.070764,1.292535,-0.267919,-2.942001,0.060477,0.001183,2001
3,200,1,0,0.410182,0.135961,0.846839,1.099404,0.827029,3.828106,2.773272,...,0.94886,0.915095,0.522452,0.10715,1.570221,0.669773,3.611119,0.114006,0.002811,2001
4,200,1,0,0.381222,0.115542,0.802268,1.46092,0.723032,4.457045,2.98622,...,0.934266,0.982984,0.448728,0.085533,1.699125,-8.515633,3.855825,0.047291,0.000773,2001
5,200,1,0,0.398898,0.159769,0.970645,1.000359,0.767602,3.74887,2.738607,...,0.948954,0.945548,0.533494,0.106405,1.495416,13.063372,-5.439336,0.026973,8.9e-05,2001


regress-N data
dropped sequences:  225


Unnamed: 0,ButterflySQ_x,Butterfly_x_abs,DS_L,DS_R,Label,LeftFPA,PID,RightFPA,SS_R,TrialID,...,force_TOL,force_TOR,stance_time,stride_length,stride_speed,stride_time,stride_width,swing_time,walk_ratio,id
1,13.720482,5.516202,2.039117,2.094537,0.0,1.298406,200,4.360225,1.189094,1,...,0.942396,0.906447,1.626875,0.524177,0.352733,1.484333,1.605389,1.121805,0.76872,2001
2,4.157315,2.50967,1.429381,2.38062,0.0,-0.765453,200,-12.345805,0.948133,1,...,0.954028,0.895213,1.424794,0.416816,0.275937,1.50881,1.632826,1.611607,0.621351,2001
3,9.873948,4.731046,1.669277,1.74715,0.0,1.913565,200,15.153691,1.162903,1,...,0.924948,0.909291,1.432468,0.565319,0.417821,1.35146,1.710305,1.121805,0.754842,2001
4,2.713923,1.962483,1.459368,1.655195,0.0,-24.329473,200,16.180574,1.545299,1,...,0.910721,0.937886,1.542461,0.525407,0.333526,1.573498,1.453448,1.564206,0.816809,2001
5,0.311205,1.119339,1.549329,2.002582,0.0,37.322527,200,-22.825616,1.058137,1,...,0.925039,0.924717,1.414563,0.549767,0.414915,1.323487,2.009793,1.074404,0.718881,2001


### Generating the summary statistics dataframe 

In [10]:
# Build the summary-statistics table for each feature set, preview it, and
# write it to CSV. The strides-per-sequence value is embedded in each file name.

# Raw data
print('Raw data')
summary_raw_data = generate_summary_dataframe(sequenecs_raw_data)
print(summary_raw_data.shape)
display(summary_raw_data.head())
summary_raw_data.to_csv(
    f'data/summary_statistics_raw_data_{strides_per_sequence_}strides.csv',
    index=False,
)

# Size-N data
print('Size-N data')
summary_sizeN_data = generate_summary_dataframe(sequenecs_sizeN_data)
print(summary_sizeN_data.shape)
display(summary_sizeN_data.head())
summary_sizeN_data.to_csv(
    f'data/summary_statistics_sizeN_data_{strides_per_sequence_}strides.csv',
    index=False,
)

# Regress-N data
print('Regress-N data')
summary_regressN_data = generate_summary_dataframe(sequenecs_regressN_data)
print(summary_regressN_data.shape)
display(summary_regressN_data.head())
summary_regressN_data.to_csv(
    f'data/summary_statistics_regressN_data_{strides_per_sequence_}strides.csv',
    index=False,
)

Raw data
(1436, 46)


Unnamed: 0,LeftFPA_mean,RightFPA_mean,Butterfly_x_abs_mean,ButterflySQ_x_mean,DS_R_mean,SS_R_mean,DS_L_mean,force_HSR_mean,force_MidSSR_mean,force_TOR_mean,...,stance_time_std,stride_length_std,stride_width_std,stride_speed_std,cadence_std,walk_ratio_std,PID,TrialID,SeqNum,Label
0,1.08081,0.0249295,0.0157079,0.000313987,0.3868,0.4508,0.326,644.436,693.787,699.733,...,0.0728478,0.0676058,0.0261265,0.0604805,5.42738,0.00163834,200,1,1,0
1,-0.574026,0.073994,0.019813,0.000469561,0.3648,0.424,0.3388,651.462,690.174,710.761,...,0.0453299,0.0638136,0.0252558,0.0417772,6.47414,0.00246309,200,1,2,0
2,2.00011,-0.668423,0.0187047,0.000518411,0.3716,0.3744,0.342,660.986,692.667,716.364,...,0.0241661,0.0839904,0.0250833,0.053728,3.84421,0.00248645,200,1,3,0
3,0.875886,0.0409768,0.0220967,0.000728349,0.36,0.3668,0.3504,657.933,688.703,710.523,...,0.0498116,0.0830975,0.0237462,0.0644361,3.31821,0.00194405,200,1,4,0
4,-0.453779,-0.710192,0.0161205,0.000437939,0.3504,0.392,0.3264,669.811,686.876,714.094,...,0.054637,0.0865216,0.0196957,0.0765366,3.77445,0.00160714,200,1,5,0


Size-N data
(1436, 46)


Unnamed: 0,stride_length_mean,stride_width_mean,DS_R_mean,SS_R_mean,DS_L_mean,stride_time_mean,stance_time_mean,swing_time_mean,force_HSR_mean,force_MidSSR_mean,...,stride_speed_std,walk_ratio_std,LeftFPA_std,RightFPA_std,Butterfly_x_abs_std,ButterflySQ_x_std,PID,TrialID,SeqNum,Label
0,0.374613,0.133739,0.957769,1.11624,0.80722,4.10246,2.88123,1.22123,0.863229,0.929336,...,0.0152659,0.152128,7.7145,4.09957,0.045148,0.0015659,200,1,1,0
1,0.39485,0.135748,0.903294,1.04988,0.838915,3.91527,2.79209,1.12318,0.87264,0.924496,...,0.0105449,0.22871,8.29642,4.08745,0.0661144,0.00265152,200,1,2,0
2,0.410892,0.141361,0.920132,0.927065,0.846839,3.71916,2.69404,1.02512,0.885398,0.927835,...,0.0135614,0.23088,7.69475,5.99433,0.0692013,0.00272247,200,1,3,0
3,0.40653,0.140696,0.891409,0.908247,0.867638,3.60228,2.66729,0.934989,0.881308,0.922525,...,0.0162643,0.180515,5.08383,5.48361,0.0661874,0.00324351,200,1,4,0
4,0.433417,0.137974,0.867638,0.970645,0.808211,3.6003,2.64649,0.953808,0.89722,0.920077,...,0.0193186,0.149231,3.35768,3.0555,0.0805202,0.00725558,200,1,5,0


Regress-N data
(1436, 46)


Unnamed: 0,ButterflySQ_x_mean,Butterfly_x_abs_mean,DS_L_mean,DS_R_mean,LeftFPA_mean,RightFPA_mean,SS_R_mean,cadence_mean,force_HSL_mean,force_HSR_mean,...,stride_length_std,stride_speed_std,stride_time_std,stride_width_std,swing_time_std,walk_ratio_std,PID,TrialID,SeqNum,Label
0,6.15537,3.16775,1.62929,1.97602,3.08791,0.104614,1.18071,0.686188,0.813466,0.814138,...,0.0582346,0.0595277,0.10676,0.205409,0.265187,0.0731317,200,1,1,0
1,7.05747,3.01161,1.69327,1.86363,-1.64001,0.310508,1.11052,0.72015,0.827317,0.823014,...,0.0549681,0.041119,0.122424,0.198564,0.257541,0.109946,200,1,2,0
2,6.70432,2.81775,1.70926,1.89837,5.7144,-2.80497,0.98061,0.755012,0.833966,0.835046,...,0.072348,0.0528815,0.0648181,0.197207,0.158308,0.110989,200,1,3,0
3,8.56985,3.24439,1.75124,1.83911,2.50244,0.171955,0.960705,0.779028,0.847039,0.831189,...,0.0715789,0.0634209,0.0506956,0.186695,0.0938742,0.0867776,200,1,4,0
4,21.5824,5.66645,1.63129,1.79006,-1.29646,-2.98025,1.02671,0.779758,0.838245,0.846195,...,0.0745284,0.0753308,0.0579856,0.154849,0.0748166,0.0717388,200,1,5,0


In [17]:
# Report how many distinct PID values appear in the raw-data summary table.
print(
    'No. of subjects retained with',
    strides_per_sequence_,
    'strides in sequesnce:',
    len(summary_raw_data['PID'].unique()),
)

No. of subjects retained with 5 strides in sequesnce: 35
