## Gait Video Study 
### Traditional ML algorithms on subject generalization framework 1: walking (W) to classify HOA/MS/PD strides and subjects using cross validation 
#### Remember to add the original count of frames in a single stride (before down sampling via smoothing) for each stride as an additional artificial feature to add information about speed of the subject to the model

In [215]:
from importlib import reload
import imports 
reload(imports)
from imports import *
from split import StratifiedGroupKFold

In [151]:
path = 'C:\\Users\\Rachneet Kaur\\Box\\Gait Video Project\\GaitVideoData\\video\\'
data_path = path+'traditional_methods_dataframe.csv'

data = pd.read_csv(data_path, index_col= 0)
display(data.head())

Unnamed: 0,key,cohort,trial,scenario,video,PID,stride_number,frame_count,label,right hip-x-CoV,...,ankle-z-asymmetry,heel-x-asymmetry,heel-y-asymmetry,heel-z-asymmetry,toe 1-x-asymmetry,toe 1-y-asymmetry,toe 1-z-asymmetry,toe 2-x-asymmetry,toe 2-y-asymmetry,toe 2-z-asymmetry
0,GVS_212_T_T1_1,HOA,BW,SLWT,GVS_212_T_T1,212,1,46,0,0.046077,...,14.426173,3.407379,10.662441,0.830365,0.50257,31.450487,8.644012,5.236678,31.182183,8.215725
1,GVS_212_T_T1_2,HOA,BW,SLWT,GVS_212_T_T1,212,2,39,0,0.021528,...,1.360847,5.155307,11.363806,4.333776,1.025647,28.2664,2.671081,6.678294,15.058825,4.903579
2,GVS_212_T_T1_3,HOA,BW,SLWT,GVS_212_T_T1,212,3,56,0,0.034394,...,1.341021,8.625363,7.159495,3.366152,1.759968,17.545787,5.921325,8.243491,9.578638,3.008162
3,GVS_212_T_T1_4,HOA,BW,SLWT,GVS_212_T_T1,212,4,53,0,0.028511,...,2.375934,6.728268,0.098235,0.999027,0.541911,7.843339,4.279617,0.748023,19.471731,5.086056
4,GVS_212_T_T1_5,HOA,BW,SLWT,GVS_212_T_T1,212,5,44,0,0.025213,...,8.525816,1.775282,0.03321,9.166863,1.354601,6.674183,8.47948,4.373622,0.315168,11.795593


### Utility functions 

In [219]:
def evaluate(model, test_features, yoriginal_, ypredicted_):
    best_index = model.cv_results_['mean_test_accuracy'].argmax()
    print('best_params: ', model.cv_results_['params'][best_index])

    #Stride-wise metrics 
    stride_metrics_mean, stride_metrics_std = [], [] #Mean and SD of stride based metrics - Acc, P, R, F1, AUC (in order)
    scores={'accuracy': make_scorer(acc), 'precision':make_scorer(precision_score, average = 'macro'), \
            'recall':make_scorer(recall_score, average = 'macro'), 'f1': make_scorer(f1_score, average = 'macro'), \
           'auc': make_scorer(roc_auc_score, average = 'macro', multi_class = 'ovo', needs_proba= True)}
    
    for score in scores:
        stride_metrics_mean.append(model.cv_results_['mean_test_'+score][best_index])
        stride_metrics_std.append(model.cv_results_['std_test_'+score][best_index])
    print('Stride-based model performance (mean): ', stride_metrics_mean)
    print('Stride-based model performance (standard deviation): ', stride_metrics_std)
    n_folds = 5
    person_acc, person_p, person_r, person_f1, person_auc = [], [], [], [], []
    #For ROC curves 
    tpr_list = []
    base_fpr = np.linspace(0, 1, 101)

    for i in range(n_folds):
        #For each fold, there are 2 splits: test and train (in order) and we need to retrieve the index 
        #of only test set for required 5 folds (best index)
        temp = test_features.loc[yoriginal_[(best_index*n_folds) + (i)].index] #True labels for the test strides in each fold
        temp['pred'] = ypredicted_[(best_index*n_folds) + (i)] #Predicted labels for the strides in the test set in each fold
#         print ('temp_pred', temp['pred'])
        x = temp.groupby('PID')['pred'].value_counts().unstack()
#         print ('x', x)
        #Input for subject wise AUC is probabilities at columns [0, 1, 2]
        proportion_strides_correct = pd.DataFrame(columns = [0, 1, 2])
        probs_stride_wise = x.divide(x.sum(axis = 1), axis = 0).fillna(0)
        proportion_strides_correct[probs_stride_wise.columns] = probs_stride_wise
        proportion_strides_correct.fillna(0, inplace=True)
        proportion_strides_correct['True Label'] = test_features.groupby('PID').first()
        #Binarizing/getting dummies for the true labels i.e. class 1 is represented as 0, 1, 0
        test_features_binarize = pd.get_dummies(proportion_strides_correct['True Label'].values)  
        #Input for precision, recall and F1 score
        proportion_strides_correct['Predicted Label'] = proportion_strides_correct[[0, 1, 2]].idxmax(axis = 1) 

        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(3): #n_classes = 3
            fpr[i], tpr[i], _ = roc_curve(test_features_binarize.iloc[:, i], proportion_strides_correct.iloc[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(test_features_binarize.values.ravel(), proportion_strides_correct[[0, 1, 2]].values.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        
        #Person wise metrics for each fold 
        person_acc.append(accuracy_score(proportion_strides_correct['Predicted Label'], proportion_strides_correct['True Label']))
        person_p.append(precision_score(proportion_strides_correct['Predicted Label'], proportion_strides_correct['True Label'], \
                                       average = 'macro'))
        person_r.append(recall_score(proportion_strides_correct['Predicted Label'], proportion_strides_correct['True Label'], \
                                    average = 'macro'))
        person_f1.append(f1_score(proportion_strides_correct['Predicted Label'], proportion_strides_correct['True Label'], \
                                  average = 'macro'))
        person_auc.append(roc_auc_score(proportion_strides_correct['True Label'], proportion_strides_correct[[0, 1, 2]], \
                                        multi_class = 'ovo', average= 'macro'))

    #Mean and standard deviation for person-based metrics 
    person_means = [np.mean(person_acc), np.mean(person_p), np.mean(person_r), np.mean(person_f1), np.mean(person_auc)]
    person_stds = [np.std(person_acc), np.std(person_p), np.std(person_r), np.std(person_f1), np.std(person_auc)]
    print('Person-based model performance (mean): ', person_means)
    print('Person-based model performance (standard deviation): ', person_stds)

    return tpr, fpr, roc_auc, [stride_metrics_mean, stride_metrics_std, person_means, person_stds]

In [220]:
def acc(y_true,y_pred):
    global yoriginal, ypredicted
    yoriginal.append(y_true)
    ypredicted.append(y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

In [221]:
#We do not use LDA/QDA since our features are not normally distributed 
def models(X, Y, model_name = 'random_forest'):
    '''
    X, Y, PID groups so that strides of each person are either in training or in testing set
    model: model_name
    '''
    Y_ = Y['label'] #Dropping the PID
    groups_ = Y['PID']
    #We use stratified group K-fold to sample our strides data
    gkf = StratifiedGroupKFold(n_splits=5) 
    scores={'accuracy': make_scorer(acc), 'precision':make_scorer(precision_score, average = 'macro'), \
            'recall':make_scorer(recall_score, average = 'macro'), 'f1': make_scorer(f1_score, average = 'macro'), \
            'auc': make_scorer(roc_auc_score, average = 'macro', multi_class = 'ovo', needs_proba=True)}
    if(model_name == 'random_forest'): #Random Forest
        grid = {
       'randomforestclassifier__n_estimators': [40,45,50],\
       'randomforestclassifier__max_depth' : [15,20,25,None],\
       'randomforestclassifier__class_weight': [None, 'balanced'],\
       'randomforestclassifier__max_features': ['auto','sqrt','log2', None],\
       'randomforestclassifier__min_samples_leaf':[1,2,0.1,0.05]
        }
        #For z-score scaling on training and use calculated coefficients on test set
        rf_grid = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=0))
        grid_search = GridSearchCV(rf_grid, param_grid=grid, scoring=scores\
                           , n_jobs = 1, cv=gkf.split(X, Y_, groups=groups_), refit=False)
    
    if(model_name == 'adaboost'): #Adaboost
        ada_grid = make_pipeline(StandardScaler(), AdaBoostClassifier(random_state=0))
        grid = {
        'adaboostclassifier__n_estimators':[50, 75, 100, 125, 150],\
        'adaboostclassifier__learning_rate':[0.01,.1, 1, 1.5, 2]\
        }
        grid_search = GridSearchCV(ada_grid, param_grid=grid, scoring=scores\
                           , n_jobs = 1, cv=gkf.split(X, Y_, groups=groups_), refit=False)
        
    if(model_name == 'kernel_svm'): #RBF SVM
        svc_grid = make_pipeline(StandardScaler(), SVC(kernel = 'rbf', probability=True, random_state=0))
        grid = {
        'svc__gamma':[0.0001, 0.001, 0.1, 1, 10, ]\
        }
        grid_search = GridSearchCV(svc_grid, param_grid=grid, scoring=scores\
                           , n_jobs = 1, cv=gkf.split(X, Y_, groups=groups_), refit=False)

    if(model_name == 'gbm'): #GBM
        gbm_grid = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=0))
        grid = {
        'gradientboostingclassifier__learning_rate':[0.15,0.1,0.05], \
        'gradientboostingclassifier__n_estimators':[50, 100, 150],\
        'gradientboostingclassifier__max_depth':[2,4,7],\
        'gradientboostingclassifier__min_samples_split':[2,4], \
        'gradientboostingclassifier__min_samples_leaf':[1,3],\
        'gradientboostingclassifier__max_features':['auto','sqrt','log2', None],\
        }
        grid_search = GridSearchCV(gbm_grid, param_grid=grid, scoring=scores\
                           , n_jobs = 1, cv=gkf.split(X, Y_, groups=groups_), refit=False)
    
    if(model_name=='xgboost'): #Xgboost
        xgb_grid = make_pipeline(StandardScaler(), xgboost.XGBClassifier(random_state=0))
        grid = {
            'xgbclassifier__min_child_weight': [1, 5],\
            'xgbclassifier__gamma': [0.1, 0.5, 1, 1.5, 2],\
            'xgbclassifier__subsample': [0.6, 0.8, 1.0],\
            'xgbclassifier__colsample_bytree': [0.6, 0.8, 1.0],\
            'xgbclassifier__max_depth': [5, 7, 8]
        }
        grid_search = GridSearchCV(xgb_grid, param_grid=grid, scoring=scores\
                           , n_jobs = 1, cv=gkf.split(X, Y_, groups=groups_), refit=False)
    
    if(model_name == 'knn'): #KNN
        knn_grid = make_pipeline(StandardScaler(), KNeighborsClassifier())
        grid = {
            'kneighborsclassifier__n_neighbors': [1, 3, 4, 5, 10],\
            'kneighborsclassifier__p': [1, 2, 3, 4, 5]\
        }
        grid_search = GridSearchCV(knn_grid, param_grid=grid, scoring=scores\
                           , n_jobs = 1, cv=gkf.split(X, Y_, groups=groups_), refit=False)
        
    if(model_name == 'decision_tree'): #Decision Tree
        dec_grid = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=0))
        #For z-score scaling on training and use calculated coefficients on test set
        grid = {'decisiontreeclassifier__min_samples_split': range(2, 50)}
        grid_search = GridSearchCV(dec_grid, param_grid=grid, scoring=scores\
                           , n_jobs = 1, cv=gkf.split(X, Y_, groups=groups_), refit=False)

    if(model_name == 'linear_svm'): #Linear SVM
        lsvm_grid = make_pipeline(StandardScaler(), SVC(kernel = 'linear', probability=True, random_state=0)) #LinearSVC(random_state=0, probability= True))
        grid = {
            'svc__gamma':[0.0001, 0.001, 0.1, 1, 10, ]\

        }
        grid_search = GridSearchCV(lsvm_grid, param_grid=grid, scoring=scores\
                           , n_jobs = 1, cv=gkf.split(X, Y_, groups=groups_), refit=False)
    
    if(model_name == 'logistic_regression'): #Logistic regression
        lr_grid = make_pipeline(StandardScaler(), LogisticRegression())
        grid = {
            'logisticregression__random_state': [0]}
            
        grid_search = GridSearchCV(lr_grid, param_grid=grid, scoring=scores\
                           , n_jobs = 1, cv=gkf.split(X, Y_, groups=groups_), refit=False)
    
    if(model_name == 'mlp'):
        mlp_grid = make_pipeline(StandardScaler(), MLPClassifier(random_state = 0, activation='relu', solver='adam',\
                                                       learning_rate = 'adaptive', learning_rate_init=0.001, 
                                                        shuffle=False, max_iter = 200))
        grid = {
            'mlpclassifier__hidden_layer_sizes': [(128, 8, 8, 128, 32), (50, 50, 50, 50, 50, 50, 150, 100, 10), 
                                  (50, 50, 50, 50, 50, 60, 30, 20, 50), (50, 50, 50, 50, 50, 150, 10, 60, 150),
                                  (50, 50, 50, 50, 50, 5, 50, 10, 5), (50, 50, 50, 50, 50, 5, 50, 150, 150),
                                  (50, 50, 50, 50, 50, 5, 30, 50, 20), (50, 50, 50, 50, 10, 150, 20, 20, 30),
                                  (50, 50, 50, 50, 30, 150, 100, 20, 100), (50, 50, 50, 50, 30, 5, 100, 20, 100),
                                  (50, 50, 50, 50, 60, 50, 50, 60, 60), (50, 50, 50, 50, 20, 50, 60, 20, 20),
                                  (50, 50, 50, 10, 50, 10, 150, 60, 150), (50, 50, 50, 10, 50, 150, 30, 150, 5),
                                  (50, 50, 50, 10, 50, 20, 150, 5, 10), (50, 50, 50, 10, 150, 50, 20, 20, 100), 
                                  (50, 50, 50, 30, 100, 5, 30, 150, 30), (50, 50, 50, 50, 100, 150, 100, 200), 
                                  (50, 50, 50, 5, 5, 100, 100, 150), (50, 50, 5, 50, 200, 100, 150, 5), 
                                  (50, 50, 5, 5, 200, 100, 50, 30), (50, 50, 5, 10, 5, 200, 200, 10), 
                                  (50, 50, 5, 30, 5, 5, 50, 10), (50, 50, 5, 200, 50, 5, 5, 50), 
                                  (50, 50,50, 5, 5, 100, 100, 150), (5, 5, 5, 5, 5, 100, 50, 5, 50, 50), 
                                  (5, 5, 5, 5, 5, 100, 20, 100, 30, 30), (5, 5, 5, 5, 5, 20, 20, 5, 30, 100), 
                                  (5, 5, 5, 5, 5, 20, 20, 100, 10, 10), (5, 5, 5, 5, 10, 10, 30, 50, 10, 10), 
                                  (5, 5, 5, 5, 10, 100, 30, 30, 30, 10), (5, 5, 5, 5, 10, 100, 50, 10, 50, 10), 
                                  (5, 5, 5, 5, 10, 100, 20, 100, 30, 5), (5, 5, 5, 5, 30, 5, 20, 30, 100, 50), 
                                  (5, 5, 5, 5, 30, 100, 20, 50, 20, 30), (5, 5, 5, 5, 50, 30, 5, 50, 10, 100), 
                                  (21, 21, 7, 84, 21, 84, 84), (21, 21, 5, 42, 42, 7, 42), (21, 84, 7, 7, 7, 84, 5), 
                                  (21, 7, 84, 5, 5, 21, 120), (42, 5, 21, 21, 21, 5, 120), (42, 5, 42, 84, 7, 120, 84), 
                                  (50, 100, 10, 5, 100, 25), (10, 10, 25, 50, 25, 5), (50, 50, 50, 50, 50, 20, 30, 100, 60)]

        }
        grid_search = GridSearchCV(mlp_grid, param_grid=grid, scoring=scores\
                           , n_jobs = 1, cv=gkf.split(X, Y_, groups=groups_), refit=False)
    
#     print ('X', X)
#     print ('Y', Y_)
#     print ('groups', groups_)
    grid_search.fit(X, Y_, groups=groups_) #Fitting on the training set to find the optimal hyperparameters 
    tpr, fpr, roc_auc, stride_person_metrics = evaluate(grid_search, Y, yoriginal, ypredicted)
    return tpr, fpr, roc_auc, stride_person_metrics

### main()

In [188]:
#Trial W for the first framework of subject generalization
trialW = data[data['scenario']=='W']
print ('Original number of subjects in trial W for cross validation:', len(trialW['PID'].unique()))
print ('Number of subjects in trial W in each cohort:\n', trialW.groupby('PID').first()['cohort'].value_counts())

cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']
#Shuffling the cross validation stride data
trialW = shuffle(trialW, random_state = 0)
#CV for people generalize so no train-test split
X = trialW.drop(cols_to_drop, axis = 1)
Y = trialW[['PID', 'label']]

#Total strides and imbalance of labels in the training and testing set
#Training set 
print('Strides in trial W for cross validation: ', len(trialW))
print ('HOA, MS and PD strides in trial W:\n', trialW['cohort'].value_counts())
print ('Imbalance ratio in trial W (controls:MS:PD)= 1:X:Y\n', trialW['cohort'].value_counts()/trialW['cohort'].value_counts()['HOA'])

Original number of subjects in trial W for cross validation: 32
Number of subjects in trial W in each cohort:
 HOA    14
MS     10
PD      8
Name: cohort, dtype: int64
Strides in trial W for cross validation:  1651
HOA, MS and PD strides in trial W:
 HOA    809
PD     453
MS     389
Name: cohort, dtype: int64
Imbalance ratio in trial W (controls:MS:PD)= 1:X:Y
 HOA    1.000000
PD     0.559951
MS     0.480841
Name: cohort, dtype: float64


In [None]:
metrics = pd.DataFrame(columns = ml_models) #Dataframe to store accuracies for each ML model for raw data 
#For storing predicted probabilities for person (for class 1) to show ROC curves 
tprs_, fprs_, roc_auc_ = pd.DataFrame(columns = ml_models), pd.DataFrame(columns = ml_models), pd.DataFrame(columns = ml_models) 

In [189]:
ml_models = ['mlp'] #['linear_svm', 
#              'logistic_regression', 'mlp'] #'random_forest', 'adaboost', 'kernel_svm', 'gbm', 'xgboost', 'knn', 'decision_tree',  
for ml_model in ml_models:
    print (ml_model)
    yoriginal = []
    ypredicted = []
    tprs, fprs, roc_aucs, stride_person_metrics = models(X, Y, ml_model)
    metrics[ml_model] = sum(stride_person_metrics, [])
    tprs_[ml_model] = list(tprs)
    fprs_[ml_model] = list(fprs)
    roc_auc_[ml_model] = list(roc_aucs)
    print ('********************************')

metrics.index = ['stride_mean_accuracy', 'stride_mean_precision', 'stride_mean_recall', 'stride_mean_F1', \
                     'stride_mean_AUC', 'stride_std_accuracy', 'stride_std_precision', 'stride_std_recall', 'stride_std_F1', \
                     'stride_std_AUC','person_mean_accuracy', 'person_mean_precision', 'person_mean_recall', 'person_mean_F1',\
                     'person_mean_AUC', 'person_std_accuracy', 'person_std_precision', 'person_std_recall', 'person_std_F1',\
                     'person_std_AUC']  
# raw_metrics.to_csv(path+'..//person_generalize//person_generalize_results_raw_data.csv')
# tprs_raw.to_csv(path+'..//person_generalize//person_generalize_ROCresults_raw_data.csv')

linear_svm
best_params:  {'svc__gamma': 0.0001}
Stride-based model performance (mean):  [0.5258975878028396, 0.5085383087808439, 0.5029719633335817, 0.4449302680136472, 0.6567134175311374]
Stride-based model performance (standard deviation):  [0.13193670459748894, 0.0788796048289837, 0.08660771828672537, 0.09436908513168224, 0.11792428221849367]
Person-based model performance (mean):  [0.5, 0.55, 0.4966666666666667, 0.43714285714285717, 0.8319444444444445]
Person-based model performance (standard deviation):  [0.18808016817268097, 0.18708286933869703, 0.19160143817599887, 0.21258845771997387, 0.11439589045541111]
********************************
logistic_regression
best_params:  {'logisticregression__random_state': 0}
Stride-based model performance (mean):  [0.5388611226050963, 0.5006306757918169, 0.5206823928471429, 0.4589347858299558, 0.6649391695053544]
Stride-based model performance (standard deviation):  [0.1575466775687537, 0.09489065084019938, 0.11380131476822485, 0.124195332180

In [191]:
metrics.index = ['stride_mean_accuracy', 'stride_mean_precision', 'stride_mean_recall', 'stride_mean_F1', \
                     'stride_mean_AUC', 'stride_std_accuracy', 'stride_std_precision', 'stride_std_recall', 'stride_std_F1', \
                     'stride_std_AUC','person_mean_accuracy', 'person_mean_precision', 'person_mean_recall', 'person_mean_F1',\
                     'person_mean_AUC', 'person_std_accuracy', 'person_std_precision', 'person_std_recall', 'person_std_F1',\
                     'person_std_AUC']  

In [192]:
metrics

Unnamed: 0,linear_svm,logistic_regression,mlp
stride_mean_accuracy,0.525898,0.538861,0.541543
stride_mean_precision,0.508538,0.500631,0.489918
stride_mean_recall,0.502972,0.520682,0.5143
stride_mean_F1,0.44493,0.458935,0.452616
stride_mean_AUC,0.656713,0.664939,0.670161
stride_std_accuracy,0.131937,0.157547,0.132792
stride_std_precision,0.07888,0.094891,0.062777
stride_std_recall,0.086608,0.113801,0.072965
stride_std_F1,0.094369,0.124195,0.084739
stride_std_AUC,0.117924,0.121804,0.089213


In [None]:
#ROC curves 
base_fpr = np.linspace(0, 1, 101)
ml_models = ['random_forest', 'adaboost', 'gbm', 'decision_tree', 'mlp'] #'knn'
ml_model_names = {'random_forest': 'RF', 'adaboost': 'AdaBoost', 'kernel_svm': 'RBF SVM', 'gbm': 'GBM', \
                  'xgboost': 'Xgboost', 'knn': 'KNN', 'decision_tree': 'DT',  'linear_svm': 'LSVM', 
             'logistic_regression': 'LR', 'mlp':'MLP'}

fig, axes = plt.subplots(1, 1, sharex=True, sharey = True, figsize=(5.2, 3.5))
sns.despine(offset=0)

linestyles = ['-', '-', '-', '-.', '--', '-', '--', '-', '--']
colors = ['b', 'magenta', 'cyan', 'g',  'red', 'violet', 'lime', 'grey', 'pink']

#ROCs
axes.plot([0, 1], [0, 1], linestyle='--', label='Majority (AUC = 0.5)', linewidth = 3, color = 'k')
for idx, ml_model in enumerate(ml_models):
    tprs = tprs_regressN[ml_model] # person-based prediction probabilities
    tprs = np.array(tprs)
    mean_tprs = tprs.mean(axis=0)
    std = tprs.std(axis=0)

    tprs_upper = np.minimum(mean_tprs + std, 1)
    tprs_lower = mean_tprs - std
#     axes[2].fill_between(base_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)
    axes.plot(base_fpr, mean_tprs, label=ml_model_names[ml_model]+' (AUC = '+ str(round(regressN_metrics.loc['person_mean_AUC']
                     [ml_model], 2)) + r'$\pm$' + str(round(regressN_metrics.loc['person_std_AUC']
                     [ml_model], 2)) + ')', linewidth = 3, alpha = 0.8, linestyle = linestyles[idx], color = colors[idx])
axes.set_ylabel('True Positive Rate')
axes.set_title('RegressN data')
axes.legend() #loc='upper center', bbox_to_anchor=(1.27, 1), ncol=1)

axes.set_xlabel('False Positive Rate')
plt.tight_layout()
plt.savefig(path + '..//person_generalize//ROC_person_generalize_onlyregressNdata.png', dpi = 250)
plt.show()

### Subject generalization framework 2: walking while talking (WT) to classify strides and subjects of HOA/MS/PD

In [194]:
#Trial WT for the second framework of subject generalization
trialWT = data[data['scenario']=='WT']
print ('Original number of subjects in trial WT for cross validation:', len(trialWT['PID'].unique()))
print ('Number of subjects in trial WT in each cohort:\n', trialWT.groupby('PID').first()['cohort'].value_counts())

cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']
#Shuffling the cross validation stride data
trialWT = shuffle(trialWT, random_state = 0)
#CV for people generalize so no train-test split
X_WT = trialWT.drop(cols_to_drop, axis = 1)
Y_WT = trialWT[['PID', 'label']]

#Total strides and imbalance of labels in the training and testing set
#Training set 
print('Strides in trial WT for cross validation: ', len(trialWT))
print ('HOA, MS and PD strides in trial WT:\n', trialWT['cohort'].value_counts())
print ('Imbalance ratio in trial WT (controls:MS:PD)= 1:X:Y\n', trialWT['cohort'].value_counts()/trialWT['cohort'].value_counts()['HOA'])

Original number of subjects in trial WT for cross validation: 26
Number of subjects in trial WT in each cohort:
 PD     9
MS     9
HOA    8
Name: cohort, dtype: int64
Strides in trial WT for cross validation:  1176
HOA, MS and PD strides in trial WT:
 PD     493
HOA    351
MS     332
Name: cohort, dtype: int64
Imbalance ratio in trial WT (controls:MS:PD)= 1:X:Y
 PD     1.404558
HOA    1.000000
MS     0.945869
Name: cohort, dtype: float64


In [195]:
metrics_WT = pd.DataFrame(columns = ml_models) #Dataframe to store accuracies for each ML model for raw data 
#For storing predicted probabilities for person (for class 1) to show ROC curves 
tprs_WT, fprs_WT, roc_auc_WT = pd.DataFrame(columns = ml_models), pd.DataFrame(columns = ml_models), pd.DataFrame(columns = ml_models) 

In [196]:
ml_models = ['random_forest', 'adaboost', 'kernel_svm', 'gbm', 'xgboost', 'knn', 'decision_tree',  'linear_svm', 
             'logistic_regression', 'mlp']

for ml_model in ml_models:
    print (ml_model)
    yoriginal = []
    ypredicted = []
    tprs_WT, fprs_WT, roc_aucs_WT, stride_person_metrics_WT = models(X_WT, Y_WT, ml_model)
    metrics_WT[ml_model] = sum(stride_person_metrics_WT, [])
    tprs_WT[ml_model] = list(tprs_WT)
    fprs_WT[ml_model] = list(fprs_WT)
    roc_auc_WT[ml_model] = list(roc_aucs_WT)
    print ('********************************')

metrics_WT.index = ['stride_mean_accuracy', 'stride_mean_precision', 'stride_mean_recall', 'stride_mean_F1', \
                     'stride_mean_AUC', 'stride_std_accuracy', 'stride_std_precision', 'stride_std_recall', 'stride_std_F1', \
                     'stride_std_AUC','person_mean_accuracy', 'person_mean_precision', 'person_mean_recall', 'person_mean_F1',\
                     'person_mean_AUC', 'person_std_accuracy', 'person_std_precision', 'person_std_recall', 'person_std_F1',\
                     'person_std_AUC']  
# raw_metrics.to_csv(path+'..//person_generalize//person_generalize_results_raw_data.csv')
# tprs_raw.to_csv(path+'..//person_generalize//person_generalize_ROCresults_raw_data.csv')

random_forest
best_params:  {'randomforestclassifier__class_weight': None, 'randomforestclassifier__max_depth': 20, 'randomforestclassifier__max_features': None, 'randomforestclassifier__min_samples_leaf': 2, 'randomforestclassifier__n_estimators': 40}
Stride-based model performance (mean):  [0.5822906388317091, 0.5169221813642826, 0.5276326282216708, 0.5043073439241711, 0.7024526455234088]
Stride-based model performance (standard deviation):  [0.11055115837111511, 0.12161322371601872, 0.13677849970584177, 0.12441116118720508, 0.1375921631330036]
Person-based model performance (mean):  [0.6133333333333333, 0.5777777777777777, 0.5944444444444443, 0.5326984126984128, 0.75]
Person-based model performance (standard deviation):  [0.12927146286443544, 0.16703662642636566, 0.1753303759784389, 0.14367876602161003, 0.21318426504833216]
********************************
adaboost
best_params:  {'adaboostclassifier__learning_rate': 1.5, 'adaboostclassifier__n_estimators': 75}
Stride-based model per

In [197]:
metrics_WT

Unnamed: 0,linear_svm,logistic_regression,mlp,random_forest,adaboost,kernel_svm,gbm,xgboost,knn,decision_tree
stride_mean_accuracy,0.563645,0.553918,0.617066,0.582291,0.556667,0.56201,0.598024,0.59255,0.538204,0.53703
stride_mean_precision,0.4979,0.494793,0.562846,0.516922,0.477953,0.525903,0.529623,0.528534,0.493067,0.456237
stride_mean_recall,0.512189,0.5021,0.57146,0.527633,0.489253,0.508145,0.543818,0.543913,0.488647,0.472572
stride_mean_F1,0.490895,0.483078,0.549557,0.504307,0.472963,0.482357,0.523945,0.518429,0.476301,0.455692
stride_mean_AUC,0.68036,0.673718,0.735527,0.702453,0.640184,0.712908,0.697651,0.716898,0.693519,0.62409
stride_std_accuracy,0.101565,0.08766,0.116162,0.110551,0.103508,0.146725,0.121815,0.089941,0.143961,0.09288
stride_std_precision,0.116937,0.107157,0.124945,0.121613,0.118694,0.152261,0.129075,0.100503,0.125059,0.113179
stride_std_recall,0.120481,0.103977,0.145591,0.136778,0.103986,0.139022,0.13556,0.114619,0.146966,0.122038
stride_std_F1,0.112661,0.094243,0.130392,0.124411,0.107574,0.138726,0.126439,0.103231,0.137389,0.110684
stride_std_AUC,0.120813,0.114057,0.134117,0.137592,0.129827,0.13576,0.154913,0.146742,0.116357,0.119345


### Subject generalization framework 3: virtual beam walking (VBW) to classify strides and subjects of HOA/MS/PD

In [222]:
#Trial VBW for the third framework of subject generalization
trialVBW = data[data['scenario']=='SLW']
print ('Original number of subjects in trial VBW for cross validation:', len(trialVBW['PID'].unique()))
print ('Number of subjects in trial VBW in each cohort:\n', trialVBW.groupby('PID').first()['cohort'].value_counts())

cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']
#Shuffling the cross validation stride data
trialVBW = shuffle(trialVBW, random_state = 0)
#CV for people generalize so no train-test split
X_VBW = trialVBW.drop(cols_to_drop, axis = 1)
Y_VBW = trialVBW[['PID', 'label']]

#Total strides and imbalance of labels in the training and testing set
#Training set 
print('Strides in trial VBW for cross validation: ', len(trialVBW))
print ('HOA, MS and PD strides in trial VBW:\n', trialVBW['cohort'].value_counts())
print ('Imbalance ratio in trial VBW (controls:MS:PD)= 1:X:Y\n', trialVBW['cohort'].value_counts()/trialVBW['cohort'].value_counts()['HOA'])

Original number of subjects in trial VBW for cross validation: 22
Number of subjects in trial VBW in each cohort:
 HOA    8
MS     8
PD     6
Name: cohort, dtype: int64
Strides in trial VBW for cross validation:  829
HOA, MS and PD strides in trial VBW:
 HOA    336
MS     283
PD     210
Name: cohort, dtype: int64
Imbalance ratio in trial VBW (controls:MS:PD)= 1:X:Y
 HOA    1.000000
MS     0.842262
PD     0.625000
Name: cohort, dtype: float64


In [224]:
metrics_VBW = pd.DataFrame(columns = ml_models) #Dataframe to store accuracies for each ML model for raw data 
#For storing predicted probabilities for person (for class 1) to show ROC curves 
tprs_VBW, fprs_VBW, roc_auc_VBW = pd.DataFrame(columns = ml_models), pd.DataFrame(columns = ml_models), pd.DataFrame(columns = ml_models) 

In [None]:
ml_models = ['random_forest', 'adaboost', 'kernel_svm', 'gbm', 'xgboost', 'knn', 'decision_tree',  'linear_svm', 
             'logistic_regression', 'mlp']

for ml_model in ml_models:
    print (ml_model)
    yoriginal = []
    ypredicted = []
    tprs_VBW, fprs_VBW, roc_aucs_VBW, stride_person_metrics_VBW = models(X_VBW, Y_VBW, ml_model)
    metrics_VBW[ml_model] = sum(stride_person_metrics_VBW, [])
    tprs_VBW[ml_model] = list(tprs_VBW)
    fprs_VBW[ml_model] = list(fprs_VBW)
    roc_auc_VBW[ml_model] = list(roc_aucs_VBW)
    print ('********************************')

metrics_VBW.index = ['stride_mean_accuracy', 'stride_mean_precision', 'stride_mean_recall', 'stride_mean_F1', \
                     'stride_mean_AUC', 'stride_std_accuracy', 'stride_std_precision', 'stride_std_recall', 'stride_std_F1', \
                     'stride_std_AUC','person_mean_accuracy', 'person_mean_precision', 'person_mean_recall', 'person_mean_F1',\
                     'person_mean_AUC', 'person_std_accuracy', 'person_std_precision', 'person_std_recall', 'person_std_F1',\
                     'person_std_AUC']  
# raw_metrics.to_csv(path+'..//person_generalize//person_generalize_results_raw_data.csv')
# tprs_raw.to_csv(path+'..//person_generalize//person_generalize_ROCresults_raw_data.csv')

random_forest


In [None]:
metrics_VBW

### Subject generalization framework 4: virtual beam walking while talking (VBWT) to classify strides and subjects of HOA/MS/PD

In [None]:
#Trial VBWT for the fourth framework of subject generalization
trialVBW = data[data['scenario']=='SLW']
print ('Original number of subjects in trial VBW for cross validation:', len(trialVBW['PID'].unique()))
print ('Number of subjects in trial VBW in each cohort:\n', trialVBW.groupby('PID').first()['cohort'].value_counts())

cols_to_drop = ['PID', 'key', 'cohort', 'trial', 'scenario', 'video', 'stride_number', 'label']
#Shuffling the cross validation stride data
trialVBW = shuffle(trialVBW, random_state = 0)
#CV for people generalize so no train-test split
X_VBW = trialVBW.drop(cols_to_drop, axis = 1)
Y_VBW = trialVBW[['PID', 'label']]

#Total strides and imbalance of labels in the training and testing set
#Training set 
print('Strides in trial VBW for cross validation: ', len(trialVBW))
print ('HOA, MS and PD strides in trial VBW:\n', trialVBW['cohort'].value_counts())
print ('Imbalance ratio in trial VBW (controls:MS:PD)= 1:X:Y\n', trialVBW['cohort'].value_counts()/trialVBW['cohort'].value_counts()['HOA'])

In [None]:
metrics_VBW = pd.DataFrame(columns = ml_models) #Dataframe to store accuracies for each ML model for raw data 
#For storing predicted probabilities for person (for class 1) to show ROC curves 
tprs_VBW, fprs_VBW, roc_auc_VBW = pd.DataFrame(columns = ml_models), pd.DataFrame(columns = ml_models), pd.DataFrame(columns = ml_models) 

In [None]:
ml_models = ['random_forest', 'adaboost', 'kernel_svm', 'gbm', 'xgboost', 'knn', 'decision_tree',  'linear_svm', 
             'logistic_regression', 'mlp']

for ml_model in ml_models:
    print (ml_model)
    yoriginal = []
    ypredicted = []
    tprs_VBW, fprs_VBW, roc_aucs_VBW, stride_person_metrics_VBW = models(X_VBW, Y_VBW, ml_model)
    metrics_VBW[ml_model] = sum(stride_person_metrics_VBW, [])
    tprs_VBW[ml_model] = list(tprs_VBW)
    fprs_VBW[ml_model] = list(fprs_VBW)
    roc_auc_VBW[ml_model] = list(roc_aucs_VBW)
    print ('********************************')

metrics_VBW.index = ['stride_mean_accuracy', 'stride_mean_precision', 'stride_mean_recall', 'stride_mean_F1', \
                     'stride_mean_AUC', 'stride_std_accuracy', 'stride_std_precision', 'stride_std_recall', 'stride_std_F1', \
                     'stride_std_AUC','person_mean_accuracy', 'person_mean_precision', 'person_mean_recall', 'person_mean_F1',\
                     'person_mean_AUC', 'person_std_accuracy', 'person_std_precision', 'person_std_recall', 'person_std_F1',\
                     'person_std_AUC']  
# raw_metrics.to_csv(path+'..//person_generalize//person_generalize_results_raw_data.csv')
# tprs_raw.to_csv(path+'..//person_generalize//person_generalize_ROCresults_raw_data.csv')

In [None]:
metrics_VBW

In [None]:
#To do!
#ROC
#Confusion matrix
#Saving all results to results folder 
#Comments in utility functions 
