## Gait Video Study 
### Traditional ML algorithms on task generalization framework 1: W to WT to classify HOA/MS/PD strides and subjects 
#### Remember to add the original count of frames in a single stride (before down sampling via smoothing) for each stride as an additional artificial feature to add information about speed of the subject to the model

1. Save the optimal hyperparameters, confusion matrices and ROC curves for each algorithm.
2. Make sure to not use x, y, z, confidence = 0, 0, 0, 0 as points for the model since they are simply missing values and not data points, so make sure to treat them before inputting to model 
3. Make sure to normalize (mean substract) the features before we feed them to the model.
4. We use the summary statistics as range, CoV and asymmetry between the right and left limbs as the features to input to the traditional models requiring fixed size 1D input for each training/testing set sample.


In [2]:
# 33 subject in total (~10 per group) 
# 4500 strides - 2000 strides - 200 groups for 10 strides per group
# 90 features - 36 Cov, 36 Range, 18 assymetry
# Default + Dimensionality reduction - 3D space
# Try top 10 features 
# Subject generalization is where the overfitting issue is tested - If we get good results, that means we are not 
# overfitting 

In [1]:
import numpy as np
import cv2
import os
import glob
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from IPython.display import display, HTML
import seaborn as sns
import copy

import xgboost 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.externals import joblib
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
from inspect import signature
from scipy import interp
from pyitlib import discrete_random_variable as drv
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, roc_curve
import warnings
from sklearn.neural_network import MLPClassifier

In [2]:
path = 'C:\\Users\\Rachneet Kaur\\Box\\Gait Video Project\\GaitVideoData\\video\\'
data_path = path + 'downsampled_strides\\'
labels_path = path + 'labels.csv'

labels = pd.read_csv(labels_path, index_col= 0)
display(labels.head())

Unnamed: 0,cohort,trial,scenario,video,PID,stride_number,key,frame_count,label
0,HOA,BW,SLWT,GVS_212_T_T1,212,1,GVS_212_T_T1_1,46,0
1,HOA,BW,SLWT,GVS_212_T_T1,212,2,GVS_212_T_T1_2,39,0
2,HOA,BW,SLWT,GVS_212_T_T1,212,3,GVS_212_T_T1_3,56,0
3,HOA,BW,SLWT,GVS_212_T_T1,212,4,GVS_212_T_T1_4,53,0
4,HOA,BW,SLWT,GVS_212_T_T1,212,5,GVS_212_T_T1_5,44,0


In [None]:
#Making the summary statistics dataframe 
#It will have all columns same as labels.csv and 90 extra columns for 36 CoV, 36 range and 18 asymmetry features
#We can then use summary statistics dataframe for all frameworks with the traditional algorithms 

keys = labels['key'] #Using the key representing the video and stride as the unique identifier
markers = ['hip', 'knee', 'ankle', 'heel', 'toe 1', 'toe 2']
order = ['right hip', 'right knee', 'right ankle', 'left hip', 'left knee', 'left ankle', 'left toe 1', 'left toe 2', \
         'left heel', 'right toe 1', 'right toe 2', 'right heel']
coordinates = [o + '-'+ y for o in order for y in ['x', 'y', 'z']]
coordinates_asymmetry = [m + '-' + y for m in markers for y in ['x', 'y', 'z']] #Defining the column names for the asymmetry
right_side_markers = ['right ' + c_a for c_a in coordinates_asymmetry] #Defining the right side markers 
left_side_markers = ['left ' + c_a for c_a in coordinates_asymmetry] #Defining the left side markers 
#Columns for the summary features dataframe 
feature_cols = [c + '-'+ y for y in ['CoV', 'range']  for c in coordinates] + [c_a + '-asymmetry' for c_a in coordinates_asymmetry]
#Summary feature dataframe with index as keys and columns as 90 summary statistics (36 CoV, 36 range and 18 asymmetry)
features_dataframe = pd.DataFrame(columns = feature_cols, index = keys)

for key in keys: #Using the key as the unique identifier to loop through each stride in the dataset
    stride = pd.read_csv(data_path+key+'.csv', index_col = 0) 
#     display(stride.head())
    stride_cov = stride.std()/stride.mean() #Computing the stride's coefficient of variation across all 36 features 
    stride_range = stride.max()-stride.min() #Stride's range across all 36 features 
    #Stride's asymmetry across left and right side for 18 features 
    asymmetry_values = [np.abs(stride_range[x] - stride_range[y]) for x, y in zip(right_side_markers, left_side_markers)]
    #All the 90 summary statistics as a list 
    summary_stats = list(stride_cov.values)+list(stride_range.values)+asymmetry_values
#     print (len(summary_stats), summary_stats)
    #Assigning the row of the features dataframe with 90 values for summary statistics 
    features_dataframe.loc[key] = summary_stats

#After the features dataframe is ready, concatenating the features with the other information in the labels.csv 
#This data can now be use for all frameworks in traditional algorithms 
#This will have as many rows as strides in our dataset and 99 columns 
#(90 for features and 9 for information relative to the stride)
traditional_methods_dataframe = pd.concat((labels.set_index('key'), features_dataframe), axis = 1).reset_index()
#Saving to the .csv file 
traditional_methods_dataframe.to_csv(path+'traditional_methods_dataframe.csv')

### Utility functions 

In [5]:
#Standardize the data before ML methods 
#Take care that testing set is not used while normalizaing the training set, otherwise the train set indirectly contains 
#information about the test set
def normalize(dataframe, n_type): 
    '''
    Input: dataframe, type of normalization (z-score or min-max)
    '''
    col_names = list(dataframe.columns)
    if (n_type == 'z'): #z-score normalization 
        mean = dataframe.mean()
        sd = dataframe.std()
    else: #min-max normalization
        mean = dataframe.min()
        sd = dataframe.max()-dataframe.min()
    return mean, sd

In [6]:
#Trial W for training 
trialW = labels[labels['scenario']=='W']
# raw_trainX = raw_trial1.drop(['Label', 'PID', 'TrialID'], axis = 1)
# raw_trainY = raw_trial1[['PID', 'Label']]

#Trial WT for testing 
trialWT = labels[labels['scenario']=='WT']
# raw_testX = raw_trial2.drop(['Label', 'PID', 'TrialID'], axis = 1)
# raw_testY = raw_trial2[['PID', 'Label']] #PID to compute person based metrics later 

#Normalize according to z-score standardization
# norm_mean, norm_sd = normalize(raw_trainX, 'z')
# raw_trainX_norm = (raw_trainX-norm_mean)/norm_sd
# raw_testX_norm = (raw_testX-norm_mean)/norm_sd

#Total strides and imbalance of labels in the training and testing set
#Training set 
print('Strides in training set: ', len(trialW))
print ('HOA, MS and PD strides in training set:\n', trialW['cohort'].value_counts())

#Test Set
print('\nStrides in test set: ', len(trialWT)) 
print ('HOA, MS and PD strides in test set:\n', trialWT['cohort'].value_counts())
print ('Imbalance ratio (controls:MS:PD)= 1:X:Y\n', trialWT['cohort'].value_counts()/trialWT['cohort'].value_counts()['HOA'])

Strides in training set:  1651
HOA, MS and PD strides in training set:
 HOA    809
PD     453
MS     389
Name: cohort, dtype: int64

Strides in test set:  1176
HOA, MS and PD strides in test set:
 PD     493
HOA    351
MS     332
Name: cohort, dtype: int64
Imbalance ratio (controls:MS:PD)= 1:X:Y
 PD     1.404558
HOA    1.000000
MS     0.945869
Name: cohort, dtype: float64


In [7]:
print ('Number of subjects in training and test sets:', len(trialW['PID'].unique()), len(trialWT['PID'].unique()))

#Try to use same subjects in trials W and WT for testing on same subjects we train on
print ('Subjects in WT (test set), which are not in W (training set)')
for x in trialWT['PID'].unique():
    if x not in trialW['PID'].unique():
        print (x)
        
print ('Subjects in W (training set), which are not in WT (test set)')
for x in trialW['PID'].unique():
    if x not in trialWT['PID'].unique():
        print (x)

Number of subjects in training and test sets: 32 26
Subjects in WT (test set), which are not in W (training set)
403
Subjects in W (training set), which are not in WT (test set)
312
102
112
113
115
123
124


In [8]:
36+36+18, 809 - (90+95+57+52+118+63)

(90, 334)

In [9]:
trialW.groupby(['video']).count()

Unnamed: 0_level_0,cohort,trial,scenario,PID,stride_number,key,frame_count,label
video,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GVS_102_W_T1,90,90,90,90,90,90,90,90
GVS_112_W_T1,95,95,95,95,95,95,95,95
GVS_113_W_T1,57,57,57,57,57,57,57,57
GVS_115_W_T1,52,52,52,52,52,52,52,52
GVS_123_W_T1,118,118,118,118,118,118,118,118
GVS_124_W_T1,63,63,63,63,63,63,63,63
GVS_212_W_T2,44,44,44,44,44,44,44,44
GVS_213_W_T1,43,43,43,43,43,43,43,43
GVS_214_W_T1,38,38,38,38,38,38,38,38
GVS_215_W_T1,45,45,45,45,45,45,45,45


In [10]:
trialW.groupby(['video']).count()

Unnamed: 0_level_0,cohort,trial,scenario,PID,stride_number,key,frame_count,label
video,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GVS_102_W_T1,90,90,90,90,90,90,90,90
GVS_112_W_T1,95,95,95,95,95,95,95,95
GVS_113_W_T1,57,57,57,57,57,57,57,57
GVS_115_W_T1,52,52,52,52,52,52,52,52
GVS_123_W_T1,118,118,118,118,118,118,118,118
GVS_124_W_T1,63,63,63,63,63,63,63,63
GVS_212_W_T2,44,44,44,44,44,44,44,44
GVS_213_W_T1,43,43,43,43,43,43,43,43
GVS_214_W_T1,38,38,38,38,38,38,38,38
GVS_215_W_T1,45,45,45,45,45,45,45,45


In [32]:
labels.head()

Unnamed: 0,cohort,trial,scenario,video,PID,stride_number,key,frame_count,label
0,HOA,BW,SLWT,GVS_212_T_T1,212,1,GVS_212_T_T1_1,46,0
1,HOA,BW,SLWT,GVS_212_T_T1,212,2,GVS_212_T_T1_2,39,0
2,HOA,BW,SLWT,GVS_212_T_T1,212,3,GVS_212_T_T1_3,56,0
3,HOA,BW,SLWT,GVS_212_T_T1,212,4,GVS_212_T_T1_4,53,0
4,HOA,BW,SLWT,GVS_212_T_T1,212,5,GVS_212_T_T1_5,44,0


In [50]:
features_dataframe

Unnamed: 0_level_0,right hip-x-CoV,right hip-y-CoV,right hip-z-CoV,right knee-x-CoV,right knee-y-CoV,right knee-z-CoV,right ankle-x-CoV,right ankle-y-CoV,right ankle-z-CoV,left hip-x-CoV,...,ankle-z-asymmetry,heel-x-asymmetry,heel-y-asymmetry,heel-z-asymmetry,toe 1-x-asymmetry,toe 1-y-asymmetry,toe 1-z-asymmetry,toe 2-x-asymmetry,toe 2-y-asymmetry,toe 2-z-asymmetry
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GVS_212_T_T1_1,0.0460773,0.0348898,0.00429736,0.0955517,0.0234486,0.0342959,0.165017,0.140369,0.442588,0.233172,...,14.4262,3.40738,10.6624,0.830365,0.50257,31.4505,8.64401,5.23668,31.1822,8.21572
GVS_212_T_T1_2,,,,,,,,,,,...,,,,,,,,,,
GVS_212_T_T1_3,,,,,,,,,,,...,,,,,,,,,,
GVS_212_T_T1_4,,,,,,,,,,,...,,,,,,,,,,
GVS_212_T_T1_5,,,,,,,,,,,...,,,,,,,,,,
GVS_212_T_T1_6,,,,,,,,,,,...,,,,,,,,,,
GVS_212_T_T1_7,,,,,,,,,,,...,,,,,,,,,,
GVS_212_T_T1_8,,,,,,,,,,,...,,,,,,,,,,
GVS_212_T_T1_9,,,,,,,,,,,...,,,,,,,,,,
GVS_212_T_T1_10,,,,,,,,,,,...,,,,,,,,,,


In [34]:
stride_range

right hip-x       7.212309
right hip-y      20.620270
right hip-z       1.991843
right knee-x      9.964181
right knee-y     13.444621
right knee-z      9.038304
right ankle-x    21.594528
right ankle-y    76.063625
right ankle-z    31.074113
left hip-x       13.329466
left hip-y       21.961515
left hip-z        1.991843
left knee-x       9.858765
left knee-y      14.225126
left knee-z       3.894683
left ankle-x     14.372807
left ankle-y     55.463843
left ankle-z     16.647940
left toe 1-x     16.425025
left toe 1-y     65.548873
left toe 1-z     21.766010
left toe 2-x     22.508497
left toe 2-y     60.997153
left toe 2-z     22.014344
left heel-x      16.300582
left heel-y      58.040586
left heel-z      23.324031
right toe 1-x    15.922455
right toe 1-y    34.098386
right toe 1-z    13.121999
right toe 2-x    17.271819
right toe 2-y    29.814971
right toe 2-z    13.798620
right heel-x     12.893203
right heel-y     68.703027
right heel-z     22.493666
dtype: float64

In [43]:
coordinates_asymmetry, right_side_markers,

(['hip-x',
  'hip-y',
  'hip-z',
  'knee-x',
  'knee-y',
  'knee-z',
  'ankle-x',
  'ankle-y',
  'ankle-z',
  'heel-x',
  'heel-y',
  'heel-z',
  'toe 1-x',
  'toe 1-y',
  'toe 1-z',
  'toe 2-x',
  'toe 2-y',
  'toe 2-z'],
 ['right hip-x',
  'right hip-y',
  'right hip-z',
  'right knee-x',
  'right knee-y',
  'right knee-z',
  'right ankle-x',
  'right ankle-y',
  'right ankle-z',
  'right heel-x',
  'right heel-y',
  'right heel-z',
  'right toe 1-x',
  'right toe 1-y',
  'right toe 1-z',
  'right toe 2-x',
  'right toe 2-y',
  'right toe 2-z'])