## Gait Video Study 
### Traditional ML algorithms on subject generalization frameworks, namely a) W, b) WT, c) VBW and d) VBWT,  to classify HOA/MS/PD strides and subjects using cross validation 
#### Remember to add the original count of frames in a single stride (before down sampling via smoothing) for each stride as an additional artificial feature to add information about speed of the subject to the model
1. Save the optimal hyperparameters, confusion matrices and ROC curves for each algorithm.
2. Make sure to not use x, y, z, confidence = 0, 0, 0, 0 as points for the model since they are simply missing values and not data points, so make sure to treat them before inputting to model 
3. Make sure to normalize (z-score normalization) the features before we feed them to the model.
4. We use summary statistics (range, CoV, and asymmetry between the right and left limbs) as the features for the traditional models, which require a fixed-size 1D input for each training/testing set sample.
5. We use Group 5-fold stratified cross validation for evaluation.
6. Compare traditional algorithms among the 4 sub-frameworks of subject generalization by retaining only the subjects common to all 4 frameworks.

In [65]:
from importlib import reload
from ml_utils.imports import *

from ml_utils import subject_gen_traditionalML
reload(subject_gen_traditionalML)
from ml_utils.subject_gen_traditionalML import keep_common_PIDs, models, evaluate, run_ml_models
from ml_utils.subject_gen_traditionalML import design, plot_ROC

In [66]:
# Locations of the stride-level feature table and of the results folder.
# Every backslash is doubled: an unescaped '\G' (as in 'Box\Gait') is an invalid
# escape sequence that newer Python versions warn about, although it yields the
# same string at runtime.
# NOTE(review): these are machine-specific absolute paths; adjust before running elsewhere.
path = 'C:\\Users\\Rachneet Kaur\\Box\\Gait Video Project\\GaitVideoData\\video\\'
data_path = path + 'traditional_methods_dataframe.csv'
results_path = 'C:\\Users\\Rachneet Kaur\\Box\\Gait Video Project\\MLresults\\'

# Read the feature table, using the first CSV column as the row index
data = pd.read_csv(data_path, index_col=0)
# NOTE(review): the preview below shows an 'Unnamed: 0' header. If that is a real
# column rather than the index, it is not listed in cols_to_drop and would be fed
# to the models as a feature - confirm.
display(data.head())

# Whether to save the results (confusion matrices and RoC plots) or not
save_results = True

Unnamed: 0,key,cohort,trial,scenario,video,PID,stride_number,frame_count,label,right hip-x-CoV,...,ankle-z-asymmetry,heel-x-asymmetry,heel-y-asymmetry,heel-z-asymmetry,toe 1-x-asymmetry,toe 1-y-asymmetry,toe 1-z-asymmetry,toe 2-x-asymmetry,toe 2-y-asymmetry,toe 2-z-asymmetry
0,GVS_212_T_T1_1,HOA,BW,SLWT,GVS_212_T_T1,212,1,46,0,0.046077,...,14.426173,3.407379,10.662441,0.830365,0.50257,31.450487,8.644012,5.236678,31.182183,8.215725
1,GVS_212_T_T1_2,HOA,BW,SLWT,GVS_212_T_T1,212,2,39,0,0.021528,...,1.360847,5.155307,11.363806,4.333776,1.025647,28.2664,2.671081,6.678294,15.058825,4.903579
2,GVS_212_T_T1_3,HOA,BW,SLWT,GVS_212_T_T1,212,3,56,0,0.034394,...,1.341021,8.625363,7.159495,3.366152,1.759968,17.545787,5.921325,8.243491,9.578638,3.008162
3,GVS_212_T_T1_4,HOA,BW,SLWT,GVS_212_T_T1,212,4,53,0,0.028511,...,2.375934,6.728268,0.098235,0.999027,0.541911,7.843339,4.279617,0.748023,19.471731,5.086056
4,GVS_212_T_T1_5,HOA,BW,SLWT,GVS_212_T_T1,212,5,44,0,0.025213,...,8.525816,1.775282,0.03321,9.166863,1.354601,6.674183,8.47948,4.373622,0.315168,11.795593


### main()

### Subject generalization framework 1: walking (W) to classify HOA/MS/PD strides and subjects using cross validation 

In [67]:
# Framework 1 (W): select the strides recorded in the walking scenario
trialW = data.loc[data['scenario'] == 'W']
n_subjects_W = len(trialW['PID'].unique())
print('Original number of subjects in trial W for cross validation:', n_subjects_W)
# Taking the first row per PID counts every subject once in its cohort
cohort_per_subject_W = trialW.groupby('PID').first()['cohort']
print('Number of subjects in trial W in each cohort:\n', cohort_per_subject_W.value_counts())

# Identifier/metadata columns and the target label are not model inputs
cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']
# Reproducibly shuffle the stride order
trialW = shuffle(trialW, random_state=0)
# All strides go to cross validation (no separate train-test split)
X = trialW.drop(columns=cols_to_drop)
# PID is kept next to the label, presumably so folds can be grouped by subject
Y = trialW.loc[:, ['PID', 'label']]

# Total strides and class imbalance relative to the HOA cohort
stride_counts_W = trialW['cohort'].value_counts()
print('Strides in trial W for cross validation: ', len(trialW))
print('HOA, MS and PD strides in trial W:\n', stride_counts_W)
print('Imbalance ratio in trial W (controls:MS:PD)= 1:X:Y\n', stride_counts_W / stride_counts_W['HOA'])

# Framework tag passed to run_ml_models in the next cell
framework = 'W'

Original number of subjects in trial W for cross validation: 32
Number of subjects in trial W in each cohort:
 HOA    14
MS     10
PD      8
Name: cohort, dtype: int64
Strides in trial W for cross validation:  1380
HOA, MS and PD strides in trial W:
 HOA    658
MS     389
PD     333
Name: cohort, dtype: int64
Imbalance ratio in trial W (controls:MS:PD)= 1:X:Y
 HOA    1.000000
MS     0.591185
PD     0.506079
Name: cohort, dtype: float64


In [None]:
# Names of the traditional classifiers handed to run_ml_models.
# Tip: shrink this list (e.g. to ['logistic_regression']) for a quick run.
ml_models = [
    'random_forest',
    'adaboost',
    'kernel_svm',
    'gbm',
    'xgboost',
    'knn',
    'decision_tree',
    'linear_svm',
    'logistic_regression',
    'mlp',
]
# Run every listed model on the W features/labels prepared in the previous cell
metrics = run_ml_models(ml_models, X, Y, framework, results_path, save_results)

random_forest


In [None]:
# Display what run_ml_models returned for the W framework
metrics

### Subject generalization framework 2: walking while talking (WT) to classify strides and subjects of HOA/MS/PD

In [None]:
# Framework 2 (WT): select the strides recorded in the walking-while-talking scenario
trialWT = data.loc[data['scenario'] == 'WT']
n_subjects_WT = len(trialWT['PID'].unique())
print('Original number of subjects in trial WT for cross validation:', n_subjects_WT)
# Taking the first row per PID counts every subject once in its cohort
cohort_per_subject_WT = trialWT.groupby('PID').first()['cohort']
print('Number of subjects in trial WT in each cohort:\n', cohort_per_subject_WT.value_counts())

# Identifier/metadata columns and the target label are not model inputs
cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']
# Reproducibly shuffle the stride order
trialWT = shuffle(trialWT, random_state=0)
# All strides go to cross validation (no separate train-test split)
X_WT = trialWT.drop(columns=cols_to_drop)
# PID is kept next to the label, presumably so folds can be grouped by subject
Y_WT = trialWT.loc[:, ['PID', 'label']]

# Total strides and class imbalance relative to the HOA cohort
stride_counts_WT = trialWT['cohort'].value_counts()
print('Strides in trial WT for cross validation: ', len(trialWT))
print('HOA, MS and PD strides in trial WT:\n', stride_counts_WT)
print('Imbalance ratio in trial WT (controls:MS:PD)= 1:X:Y\n', stride_counts_WT / stride_counts_WT['HOA'])

# Framework tag passed to run_ml_models in the next cell
framework = 'WT'

In [None]:
# Names of the traditional classifiers handed to run_ml_models
ml_models = [
    'random_forest',
    'adaboost',
    'kernel_svm',
    'gbm',
    'xgboost',
    'knn',
    'decision_tree',
    'linear_svm',
    'logistic_regression',
    'mlp',
]
# Run every listed model on the WT features/labels prepared in the previous cell
metrics_WT = run_ml_models(ml_models, X_WT, Y_WT, framework, results_path, save_results)

In [None]:
# Display what run_ml_models returned for the WT framework
metrics_WT

### Subject generalization framework 3: virtual beam walking (VBW) to classify strides and subjects of HOA/MS/PD

In [None]:
# Framework 3 (VBW): virtual beam walking strides carry the scenario code 'SLW' in the data
trialVBW = data.loc[data['scenario'] == 'SLW']
n_subjects_VBW = len(trialVBW['PID'].unique())
print('Original number of subjects in trial VBW for cross validation:', n_subjects_VBW)
# Taking the first row per PID counts every subject once in its cohort
cohort_per_subject_VBW = trialVBW.groupby('PID').first()['cohort']
print('Number of subjects in trial VBW in each cohort:\n', cohort_per_subject_VBW.value_counts())

# Identifier/metadata columns and the target label are not model inputs
cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']
# Reproducibly shuffle the stride order
trialVBW = shuffle(trialVBW, random_state=0)
# All strides go to cross validation (no separate train-test split)
X_VBW = trialVBW.drop(columns=cols_to_drop)
# PID is kept next to the label, presumably so folds can be grouped by subject
Y_VBW = trialVBW.loc[:, ['PID', 'label']]

# Total strides and class imbalance relative to the HOA cohort
stride_counts_VBW = trialVBW['cohort'].value_counts()
print('Strides in trial VBW for cross validation: ', len(trialVBW))
print('HOA, MS and PD strides in trial VBW:\n', stride_counts_VBW)
print('Imbalance ratio in trial VBW (controls:MS:PD)= 1:X:Y\n', stride_counts_VBW / stride_counts_VBW['HOA'])

# Framework tag passed to run_ml_models in the next cell
framework = 'VBW'

In [None]:
# Names of the traditional classifiers handed to run_ml_models
ml_models = [
    'random_forest',
    'adaboost',
    'kernel_svm',
    'gbm',
    'xgboost',
    'knn',
    'decision_tree',
    'linear_svm',
    'logistic_regression',
    'mlp',
]
# Run every listed model on the VBW features/labels prepared in the previous cell
metrics_VBW = run_ml_models(ml_models, X_VBW, Y_VBW, framework, results_path, save_results)

In [None]:
# Display what run_ml_models returned for the VBW framework
metrics_VBW

### Subject generalization framework 4: virtual beam walking while talking (VBWT) to classify strides and subjects of HOA/MS/PD

In [None]:
# Framework 4 (VBWT): virtual beam walking while talking strides carry the scenario code 'SLWT'
trialVBWT = data.loc[data['scenario'] == 'SLWT']
n_subjects_VBWT = len(trialVBWT['PID'].unique())
print('Original number of subjects in trial VBWT for cross validation:', n_subjects_VBWT)
# Taking the first row per PID counts every subject once in its cohort
cohort_per_subject_VBWT = trialVBWT.groupby('PID').first()['cohort']
print('Number of subjects in trial VBWT in each cohort:\n', cohort_per_subject_VBWT.value_counts())

# Identifier/metadata columns and the target label are not model inputs
cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']
# Reproducibly shuffle the stride order
trialVBWT = shuffle(trialVBWT, random_state=0)
# All strides go to cross validation (no separate train-test split)
X_VBWT = trialVBWT.drop(columns=cols_to_drop)
# PID is kept next to the label, presumably so folds can be grouped by subject
Y_VBWT = trialVBWT.loc[:, ['PID', 'label']]

# Total strides and class imbalance relative to the HOA cohort
stride_counts_VBWT = trialVBWT['cohort'].value_counts()
print('Strides in trial VBWT for cross validation: ', len(trialVBWT))
print('HOA, MS and PD strides in trial VBWT:\n', stride_counts_VBWT)
print('Imbalance ratio in trial VBWT (controls:MS:PD)= 1:X:Y\n', stride_counts_VBWT / stride_counts_VBWT['HOA'])

# Framework tag passed to run_ml_models in the next cell
framework = 'VBWT'

In [None]:
# Names of the traditional classifiers handed to run_ml_models
ml_models = [
    'random_forest',
    'adaboost',
    'kernel_svm',
    'gbm',
    'xgboost',
    'knn',
    'decision_tree',
    'linear_svm',
    'logistic_regression',
    'mlp',
]
# Run every listed model on the VBWT features/labels prepared in the previous cell
metrics_VBWT = run_ml_models(ml_models, X_VBWT, Y_VBWT, framework, results_path, save_results)

In [None]:
# Display what run_ml_models returned for the VBWT framework
metrics_VBWT

In [None]:
#To do!
#ROC- Done 
#Confusion matrix files - Done
#CF itself - Done
#Saving all results to results folder - Done
#Comments in utility functions - Done
#Save the micro-, macro- and weighted-averaged results for all models
#How can we compare the different frameworks?


## Compare traditional algorithms among the 4 sub-frameworks of subject generalization

In [None]:
# To compare across the 4 sub-frameworks of subject generalization, we reduce to the
# subjects common to all 4 sub-frameworks and then compare the best accuracy/model
# in each sub-framework.

# Scenario codes of the 4 tasks (W, WT, VBW -> 'SLW', VBWT -> 'SLWT'); defined once so
# the PID filter and the report loop below cannot drift apart
scenarios_4 = ('W', 'WT', 'SLW', 'SLWT')

# Retaining the common PIDs across the 4 tasks
common_pids = keep_common_PIDs(data, list(scenarios_4))

# Retaining the data with only common PIDs
reduced_data = data[data.PID.isin(common_pids)]
print ('Number of subjects in each cohort in reduced data with common PIDs:\n', \
       reduced_data.groupby('PID').first()['cohort'].value_counts())
design()
# Checking the retained strides in each task after reducing to common PIDs only
for scen in scenarios_4:
    # reset_index(drop=True) discards the old index directly, whatever its name
    reduced_data_scen = reduced_data[reduced_data.scenario == scen].reset_index(drop=True)
    # Per-cohort stride counts, computed once and reused for the imbalance ratio
    scen_cohort_counts = reduced_data_scen['cohort'].value_counts()
    print ('No. of strides retained in scenario', scen, 'are: ', reduced_data_scen.shape)
    print ('No. of strides retained for each cohort in scenario', scen, 'are:\n', scen_cohort_counts)
    print ('Imbalance ratio in scenario', scen, '(controls:MS:PD)= 1:X:Y\n', \
           scen_cohort_counts / scen_cohort_counts['HOA'])
    design()

#### Trial W

In [None]:
# Running the traditional models again on each sub-framework of subject generalization,
# restricted to the common subjects, to rank the tasks from best to worst.

# Trial W: strides of the common subjects, re-indexed from 0.
# reset_index(drop=True) discards the old index directly, so it does not depend on
# the index being unnamed (reset_index().drop('index', ...) does).
reduced_data_W = reduced_data[reduced_data.scenario == 'W'].reset_index(drop=True)
# Identifier/metadata columns and the target label are not model inputs
cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']

# Reproducibly shuffle the stride order
reduced_data_W = shuffle(reduced_data_W, random_state=0)
# All strides go to cross validation (no separate train-test split)
X_reduced_data_W = reduced_data_W.drop(columns=cols_to_drop)
# PID is kept next to the label, presumably so folds can be grouped by subject
Y_reduced_data_W = reduced_data_W[['PID', 'label']]

# Framework tag passed to run_ml_models in the next cell
framework = 'reducedW_for_comparision'

In [None]:
# Names of the traditional classifiers handed to run_ml_models
ml_models = [
    'random_forest',
    'adaboost',
    'kernel_svm',
    'gbm',
    'xgboost',
    'knn',
    'decision_tree',
    'linear_svm',
    'logistic_regression',
    'mlp',
]
# Run every listed model on the common-subject W data prepared in the previous cell
metrics_reducedW_for_comparision = run_ml_models(
    ml_models, X_reduced_data_W, Y_reduced_data_W, framework, results_path, save_results
)

In [None]:
# Display what run_ml_models returned for the common-subject W comparison
metrics_reducedW_for_comparision

#### Trial WT

In [None]:
# Running the traditional models again on each sub-framework of subject generalization,
# restricted to the common subjects, to rank the tasks from best to worst.

# Trial WT: strides of the common subjects, re-indexed from 0.
# reset_index(drop=True) discards the old index directly, so it does not depend on
# the index being unnamed (reset_index().drop('index', ...) does).
reduced_data_WT = reduced_data[reduced_data.scenario == 'WT'].reset_index(drop=True)
# Identifier/metadata columns and the target label are not model inputs
cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']

# Reproducibly shuffle the stride order
reduced_data_WT = shuffle(reduced_data_WT, random_state=0)
# All strides go to cross validation (no separate train-test split)
X_reduced_data_WT = reduced_data_WT.drop(columns=cols_to_drop)
# PID is kept next to the label, presumably so folds can be grouped by subject
Y_reduced_data_WT = reduced_data_WT[['PID', 'label']]

# Framework tag passed to run_ml_models in the next cell
framework = 'reducedWT_for_comparision'

In [None]:
# Names of the traditional classifiers handed to run_ml_models
ml_models = [
    'random_forest',
    'adaboost',
    'kernel_svm',
    'gbm',
    'xgboost',
    'knn',
    'decision_tree',
    'linear_svm',
    'logistic_regression',
    'mlp',
]
# Run every listed model on the common-subject WT data prepared in the previous cell
metrics_reducedWT_for_comparision = run_ml_models(
    ml_models, X_reduced_data_WT, Y_reduced_data_WT, framework, results_path, save_results
)

In [None]:
# Display what run_ml_models returned for the common-subject WT comparison
metrics_reducedWT_for_comparision

#### Trial VBW

In [None]:
# Running the traditional models again on each sub-framework of subject generalization,
# restricted to the common subjects, to rank the tasks from best to worst.

# Trial VBW (scenario code 'SLW'): strides of the common subjects, re-indexed from 0.
# reset_index(drop=True) discards the old index directly, so it does not depend on
# the index being unnamed (reset_index().drop('index', ...) does).
reduced_data_VBW = reduced_data[reduced_data.scenario == 'SLW'].reset_index(drop=True)
# Identifier/metadata columns and the target label are not model inputs
cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']

# Reproducibly shuffle the stride order
reduced_data_VBW = shuffle(reduced_data_VBW, random_state=0)
# All strides go to cross validation (no separate train-test split)
X_reduced_data_VBW = reduced_data_VBW.drop(columns=cols_to_drop)
# PID is kept next to the label, presumably so folds can be grouped by subject
Y_reduced_data_VBW = reduced_data_VBW[['PID', 'label']]

# Framework tag passed to run_ml_models in the next cell
framework = 'reducedVBW_for_comparision'

In [None]:
# Names of the traditional classifiers handed to run_ml_models
ml_models = [
    'random_forest',
    'adaboost',
    'kernel_svm',
    'gbm',
    'xgboost',
    'knn',
    'decision_tree',
    'linear_svm',
    'logistic_regression',
    'mlp',
]
# Run every listed model on the common-subject VBW data prepared in the previous cell
metrics_reducedVBW_for_comparision = run_ml_models(
    ml_models, X_reduced_data_VBW, Y_reduced_data_VBW, framework, results_path, save_results
)

In [None]:
# Display what run_ml_models returned for the common-subject VBW comparison
metrics_reducedVBW_for_comparision
#Observation: HOA is doing better than in W and WT even though not in full model
#Sequential test idea - this is a screening test: first classify HOA or not using the VBW task, and then do further screening to separate PD from MS

#### Trial VBWT

In [None]:
# Running the traditional models again on each sub-framework of subject generalization,
# restricted to the common subjects, to rank the tasks from best to worst.

# Trial VBWT (scenario code 'SLWT'): strides of the common subjects, re-indexed from 0.
# reset_index(drop=True) discards the old index directly, so it does not depend on
# the index being unnamed (reset_index().drop('index', ...) does).
reduced_data_VBWT = reduced_data[reduced_data.scenario == 'SLWT'].reset_index(drop=True)
# Identifier/metadata columns and the target label are not model inputs
cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']

# Reproducibly shuffle the stride order
reduced_data_VBWT = shuffle(reduced_data_VBWT, random_state=0)
# All strides go to cross validation (no separate train-test split)
X_reduced_data_VBWT = reduced_data_VBWT.drop(columns=cols_to_drop)
# PID is kept next to the label, presumably so folds can be grouped by subject
Y_reduced_data_VBWT = reduced_data_VBWT[['PID', 'label']]

# Framework tag passed to run_ml_models in the next cell
framework = 'reducedVBWT_for_comparision'

In [None]:
# Names of the traditional classifiers handed to run_ml_models
ml_models = [
    'random_forest',
    'adaboost',
    'kernel_svm',
    'gbm',
    'xgboost',
    'knn',
    'decision_tree',
    'linear_svm',
    'logistic_regression',
    'mlp',
]
# Run every listed model on the common-subject VBWT data prepared in the previous cell
metrics_reducedVBWT_for_comparision = run_ml_models(
    ml_models, X_reduced_data_VBWT, Y_reduced_data_VBWT, framework, results_path, save_results
)

In [None]:
# Display what run_ml_models returned for the common-subject VBWT comparison
metrics_reducedVBWT_for_comparision


In [None]:
#RESULTS of comparing 4 sub-frameworks for subject generalization
#WT>>W>>VBWT>>VBW 
#VBW can serve as a screening test since it performs well (looking at the confusion matrix) at predicting HOA or not, but we need something more to 
#distinguish well between the two neurological populations (MS and PD).

## Compare traditional algorithms among the 2 sub-frameworks of subject generalization, namely W and WT 

In [None]:
# To compare across the 2 walking sub-frameworks of subject generalization (W and WT),
# we reduce to the subjects common to both and then compare the best accuracy/model
# in each sub-framework.
# NOTE: this reassigns `common_pids` and `reduced_data`, which the 4-task comparison
# section also assigns; rerun that section's first cell before rerunning its other cells.

# Scenario codes of the 2 tasks; defined once so the PID filter and the report loop
# below cannot drift apart
scenarios_2 = ('W', 'WT')

# Retaining the common PIDs across the 2 walking tasks
common_pids = keep_common_PIDs(data, list(scenarios_2))

# Retaining the data with only common PIDs
reduced_data = data[data.PID.isin(common_pids)]
print ('Number of subjects in each cohort in reduced data with common PIDs:\n', \
       reduced_data.groupby('PID').first()['cohort'].value_counts())
design()
# Checking the retained strides in each task after reducing to common PIDs only
for scen in scenarios_2:
    # reset_index(drop=True) discards the old index directly, whatever its name
    reduced_data_scen = reduced_data[reduced_data.scenario == scen].reset_index(drop=True)
    # Per-cohort stride counts, computed once and reused for the imbalance ratio
    scen_cohort_counts = reduced_data_scen['cohort'].value_counts()
    print ('No. of strides retained in scenario', scen, 'are: ', reduced_data_scen.shape)
    print ('No. of strides retained for each cohort in scenario', scen, 'are:\n', scen_cohort_counts)
    print ('Imbalance ratio in scenario', scen, '(controls:MS:PD)= 1:X:Y\n', \
           scen_cohort_counts / scen_cohort_counts['HOA'])
    design()

### Trial W

In [None]:
# Running the traditional models again on the W and WT sub-frameworks, restricted to
# the subjects common to both, to rank the two walking tasks.
# NOTE: `reduced_data` here must come from the W/WT common-PID cell; the names
# reduced_data_W / X_reduced_data_W / Y_reduced_data_W are also assigned in the
# 4-task comparison section and are overwritten by this cell.

# Trial W: strides of the common subjects, re-indexed from 0.
# reset_index(drop=True) discards the old index directly, so it does not depend on
# the index being unnamed (reset_index().drop('index', ...) does).
reduced_data_W = reduced_data[reduced_data.scenario == 'W'].reset_index(drop=True)
# Identifier/metadata columns and the target label are not model inputs
cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']

# Reproducibly shuffle the stride order
reduced_data_W = shuffle(reduced_data_W, random_state=0)
# All strides go to cross validation (no separate train-test split)
X_reduced_data_W = reduced_data_W.drop(columns=cols_to_drop)
# PID is kept next to the label, presumably so folds can be grouped by subject
Y_reduced_data_W = reduced_data_W[['PID', 'label']]

# Framework tag passed to run_ml_models in the next cell
framework = 'reducedW_for_comparision_WandWTonly'

In [None]:
# Names of the traditional classifiers handed to run_ml_models
ml_models = [
    'random_forest',
    'adaboost',
    'kernel_svm',
    'gbm',
    'xgboost',
    'knn',
    'decision_tree',
    'linear_svm',
    'logistic_regression',
    'mlp',
]
# Run every listed model on the W data prepared in the previous cell (W/WT common subjects)
metrics_reducedW_for_comparision_WandWTonly = run_ml_models(
    ml_models, X_reduced_data_W, Y_reduced_data_W, framework, results_path, save_results
)

In [None]:
# Display what run_ml_models returned for W in the W-vs-WT comparison
metrics_reducedW_for_comparision_WandWTonly

### Trial WT

In [None]:
# Running the traditional models again on the W and WT sub-frameworks, restricted to
# the subjects common to both, to rank the two walking tasks.
# NOTE: `reduced_data` here must come from the W/WT common-PID cell; the names
# reduced_data_WT / X_reduced_data_WT / Y_reduced_data_WT are also assigned in the
# 4-task comparison section and are overwritten by this cell.

# Trial WT: strides of the common subjects, re-indexed from 0.
# reset_index(drop=True) discards the old index directly, so it does not depend on
# the index being unnamed (reset_index().drop('index', ...) does).
reduced_data_WT = reduced_data[reduced_data.scenario == 'WT'].reset_index(drop=True)
# Identifier/metadata columns and the target label are not model inputs
cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']

# Reproducibly shuffle the stride order
reduced_data_WT = shuffle(reduced_data_WT, random_state=0)
# All strides go to cross validation (no separate train-test split)
X_reduced_data_WT = reduced_data_WT.drop(columns=cols_to_drop)
# PID is kept next to the label, presumably so folds can be grouped by subject
Y_reduced_data_WT = reduced_data_WT[['PID', 'label']]

# Framework tag passed to run_ml_models in the next cell
framework = 'reducedWT_for_comparision_WandWTonly'

In [None]:
# Names of the traditional classifiers handed to run_ml_models
ml_models = [
    'random_forest',
    'adaboost',
    'kernel_svm',
    'gbm',
    'xgboost',
    'knn',
    'decision_tree',
    'linear_svm',
    'logistic_regression',
    'mlp',
]
# Run every listed model on the WT data prepared in the previous cell (W/WT common subjects)
metrics_reducedWT_for_comparision_WandWTonly = run_ml_models(
    ml_models, X_reduced_data_WT, Y_reduced_data_WT, framework, results_path, save_results
)

In [None]:
# Display what run_ml_models returned for WT in the W-vs-WT comparison
metrics_reducedWT_for_comparision_WandWTonly