# Analysis of Ladder Walk Videos

BIOF 509 Spring 2021

This notebook contains all the code needed to train and run machine learning models to predict the different scores of interest on the horizontal ladder walk

# Load packages


In [1]:
import pandas as pd
import numpy as np
import glob
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from dateutil import parser
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib, os

# Preprocessing

The function separates the scores for the different limbs and processes the raw output from DeepLabCut. For each video, all features are normalized, frames are made the same length, and flattend.

In [4]:
class Processing:
    def __init__(self):
        #list with a string order of the output of processing
        self.keys = ["dominant_front_hit", "dominant_front_miss", "dominant_front_step", 
 "nondominant_front_hit", "nondominant_front_miss", "nondominant_front_step",
    "dominant_back_hit", "dominant_back_miss", "dominant_back_step", 
 "nondominant_back_hit", "nondominant_back_miss", "nondominant_back_step"]
    def process_with_labels(data_path,label_path,project_name,rung_network,rung_it,max_len,show_skipped):
        '''
        Processes input data that has associated labels for training
        
        Parameters
        ----------
        data_path: string
            String containing path to the DLC output data (.h5 files)
        label_path: string
            String containing path to the manual score csv
        project_name: string
            String containing the name of the DLC project for rat tracking
        rung_network: string
            String containing the project and date info of the DLC project for rung tracking
        rung_it: int
            Integer that shows the number of iterations of the snapshot that was used to analyze the videos for rungs
        max_len: int
            Integer that defines the maximum number of frames of all videos in the dataset
        show_skipped: bool
            If this is set to True, then the script will print out the names of the videos that were skipped
        '''
        
        scores_df = pd.read_csv(label_path)

        #hit scores
        hit_df = scores_df[["subject","date","crossing number","run","limb","avg_human_hit"]]

        #miss
        miss_df = scores_df[["subject","date","crossing number","run","limb","avg_human_miss"]]

        #steps
        step_df = scores_df[["subject","date","crossing number","run","limb","avg_human_steps"]]

        #score extraction
        dom_f_hit = hit_df.loc[hit_df["limb"]=="Dominant Front"]
        dom_f_miss = miss_df.loc[miss_df["limb"]=="Dominant Front"]
        dom_f_step = step_df.loc[step_df["limb"]=="Dominant Front"]

        ndom_f_hit = hit_df.loc[hit_df["limb"]=="Nondominant Front"]
        ndom_f_miss = miss_df.loc[miss_df["limb"]=="Nondominant Front"]
        ndom_f_step = step_df.loc[step_df["limb"]=="Nondominant Front"]

        dom_b_hit = hit_df.loc[hit_df["limb"]=="Dominant Back"]
        dom_b_miss = miss_df.loc[miss_df["limb"]=="Dominant Back"]
        dom_b_step = step_df.loc[step_df["limb"]=="Dominant Back"]

        ndom_b_hit = hit_df.loc[hit_df["limb"]=="Nondominant Back"]
        ndom_b_miss = miss_df.loc[miss_df["limb"]=="Nondominant Back"]
        ndom_b_step = step_df.loc[step_df["limb"]=="Nondominant Back"]
        dom_f_hit_x = []
        dom_f_miss_x = []
        dom_f_step_x =[]

        ndom_f_hit_x = []
        ndom_f_miss_x = []
        ndom_f_step_x = []

        dom_b_hit_x = []
        dom_b_miss_x = []
        dom_b_step_x = []

        ndom_b_hit_x = []
        ndom_b_miss_x = []
        ndom_b_step_x = []

        #sets of labels
        dom_f_hit_y = []
        dom_f_miss_y = []
        dom_f_step_y =[]

        ndom_f_hit_y = []
        ndom_f_miss_y = []
        ndom_f_step_y = []

        dom_b_hit_y = []
        dom_b_miss_y = []
        dom_b_step_y = []

        ndom_b_hit_y = []
        ndom_b_miss_y = []
        ndom_b_step_y = []

        lengths = []
        
        #video_list
        dom_f_hit_videos = []
        dom_f_miss_videos = []
        dom_f_step_videos =[]

        ndom_f_hit_videos = []
        ndom_f_miss_videos = []
        ndom_f_step_videos = []

        dom_b_hit_videos = []
        dom_b_miss_videos = []
        dom_b_step_videos = []

        ndom_b_hit_videos = []
        ndom_b_miss_videos = []
        ndom_b_step_videos = []
        
        rat_folder = glob.glob(data_path)
        
        for file in rat_folder:
            #rung file info
            rung_name_list = file.split("/")[-1].split("_")[0:8]+[rung_network,str(rung_it)+".h5"]
            rung_file = '_'.join(rung_name_list)
            #rat tracking file
            #open the file
            rat_df = pd.read_hdf(file)[project_name]
            #properties of the file
            subject = file.split("/")[0]
            date_raw = rung_name_list[1]
            date = parser.parse(date_raw).date().strftime("%Y-%m-%d")
            run = rung_name_list[2]
            crossing = [int(s) for s in rung_name_list[3] if s.isdigit()][0]
                
            
            #
            dom_f_hit_score = dom_f_hit[(dom_f_hit["subject"]==subject) & (dom_f_hit["date"]==date) & (dom_f_hit["run"]==run)].reset_index()
            dom_f_miss_score = dom_f_miss[(dom_f_miss["subject"]==subject) & (dom_f_miss["date"]==date) & (dom_f_miss["run"]==run)].reset_index()
            dom_f_step_score = dom_f_step[(dom_f_step["subject"]==subject) & (dom_f_step["date"]==date) & (dom_f_step["run"]==run)].reset_index()

            ndom_f_hit_score = ndom_f_hit[(ndom_f_hit["subject"]==subject) & (ndom_f_hit["date"]==date) & (ndom_f_hit["run"]==run)].reset_index()
            ndom_f_miss_score = ndom_f_miss[(ndom_f_miss["subject"]==subject) & (ndom_f_miss["date"]==date) & (ndom_f_miss["run"]==run)].reset_index()
            ndom_f_step_score = ndom_f_step[(ndom_f_step["subject"]==subject) & (ndom_f_step["date"]==date) & (ndom_f_step["run"]==run)].reset_index()

            dom_b_hit_score = dom_b_hit[(dom_b_hit["subject"]==subject) & (dom_b_hit["date"]==date) & (dom_b_hit["run"]==run)].reset_index()
            dom_b_miss_score = dom_b_miss[(dom_b_miss["subject"]==subject) & (dom_b_miss["date"]==date) & (dom_b_miss["run"]==run)].reset_index()
            dom_b_step_score = dom_b_step[(dom_b_step["subject"]==subject) & (dom_b_step["date"]==date) & (dom_b_step["run"]==run)].reset_index()

            ndom_b_hit_score = ndom_b_hit[(ndom_b_hit["subject"]==subject) & (ndom_b_hit["date"]==date) & (ndom_b_hit["run"]==run)].reset_index()
            ndom_b_miss_score = ndom_b_miss[(ndom_b_miss["subject"]==subject) & (ndom_b_miss["date"]==date) & (ndom_b_miss["run"]==run)].reset_index()
            ndom_b_step_score = ndom_b_step[(ndom_b_step["subject"]==subject) & (ndom_b_step["date"]==date) & (ndom_b_step["run"]==run)].reset_index()

            
            video_df = scores_df[["subject","date","crossing number","run"]]
            video_info = video_df[(video_df["subject"]==subject) & (video_df["date"]==date) & (video_df["run"]==run)].reset_index(drop=True)
            
            
            #join the rat and rung dataframes
            df = rat_df
            df_cols = df.columns.tolist()
            df_temp = df

            if df.shape[1] == 0:
                continue
            #if len(dom_f_low_like) >0:
                #print(dom_f_low_like)
            df = df.drop('likelihood', axis=1, level=1)

            #scale data
            #Uses the values function of pandas - converts any dataframe to an array
            data_for_scaling = df.values
            scaler = MinMaxScaler()
            scaled_data = scaler.fit_transform(data_for_scaling)

            if len(scaled_data) < max_len:
                newlength = (max_len-len(scaled_data))
                zero = np.zeros((newlength,6))
                scaled_temp = pd.DataFrame(scaled_data).append(pd.DataFrame(zero),ignore_index=True)
                scaled_temp = scaled_temp.fillna(0)
                scaled_data2 = scaled_temp.values
            else:
                scaled_data2 = scaled_data

            video_data = scaled_data2.flatten()
            if len(dom_f_hit_score) !=0:
                dom_f_hit_y.append(dom_f_hit_score["avg_human_hit"][0])
                dom_f_hit_x.append(video_data)
                dom_f_hit_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Dom Front Hit"+"Missing scores: "+subject + " " + date + " " + run)
            if len(dom_f_miss_score) != 0:
                dom_f_miss_y.append(dom_f_miss_score["avg_human_miss"][0])
                dom_f_miss_x.append(video_data)
                dom_f_miss_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Dom Front Miss"+"Missing scores: "+subject + " " + date + " " + run)
            if len(dom_f_step_score) != 0:
                dom_f_step_y.append(dom_f_step_score["avg_human_steps"][0])
                dom_f_step_x.append(video_data)
                dom_f_step_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Dom Front Step"+"Missing scores: "+subject + " " + date + " " + run)
            if len(ndom_f_hit_score) != 0:
                ndom_f_hit_y.append(ndom_f_hit_score["avg_human_hit"][0])
                ndom_f_hit_x.append(video_data)
                ndom_f_hit_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Nondom Front Hit"+"Missing scores: "+subject + " " + date + " " + run)
            if len(ndom_f_miss_score) != 0:
                ndom_f_miss_y.append(ndom_f_miss_score["avg_human_miss"][0])
                ndom_f_miss_x.append(video_data)
                ndom_f_miss_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Nondom Front Miss"+"Missing scores: "+subject + " " + date + " " + run)
            if len(ndom_f_step_score) != 0:
                ndom_f_step_y.append(ndom_f_step_score["avg_human_steps"][0])
                ndom_f_step_x.append(video_data)
                ndom_f_step_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Nondom Front Step"+"Missing scores: "+subject + " " + date + " " + run)
            if len(dom_b_hit_score) != 0:
                dom_b_hit_y.append(dom_b_hit_score["avg_human_hit"][0])
                dom_b_hit_x.append(video_data)
                dom_b_hit_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Dom Back Hit"+"Missing scores: "+subject + " " + date + " " + run)
            if len(dom_b_miss_score) != 0:
                dom_b_miss_y.append(dom_b_miss_score["avg_human_miss"][0])
                dom_b_miss_x.append(video_data)
                dom_b_miss_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Dom Back Miss"+"Missing scores: "+subject + " " + date + " " + run)
            if len(dom_b_step_score) != 0:
                dom_b_step_y.append(dom_b_step_score["avg_human_steps"][0])
                dom_b_step_x.append(video_data)
                dom_b_step_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Dom Back Step"+"Missing scores: "+subject + " " + date + " " + run)
            if len(ndom_b_hit_score) != 0:
                ndom_b_hit_y.append(ndom_b_hit_score["avg_human_hit"][0])
                ndom_b_hit_x.append(video_data)
                ndom_b_hit_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Nondom Back Hit"+"Missing scores: "+subject + " " + date + " " + run)
            if len(ndom_b_miss_score) != 0:
                ndom_b_miss_y.append(ndom_b_miss_score["avg_human_miss"][0])
                ndom_b_miss_x.append(video_data)
                ndom_b_miss_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Nondom Back Miss"+"Missing scores: "+subject + " " + date + " " + run)
            if len(ndom_b_step_score) != 0:
                ndom_b_step_y.append(ndom_b_step_score["avg_human_steps"][0])
                ndom_b_step_x.append(video_data)
                ndom_b_step_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Nondom Back Step"+"Missing scores: "+subject + " " + date + " " + run)
                continue
        return [dom_f_hit_x, dom_f_miss_x, dom_f_step_x, ndom_f_hit_x, ndom_f_miss_x, ndom_f_step_x,
    dom_b_hit_x, dom_b_miss_x, dom_b_step_x, ndom_b_hit_x, ndom_b_miss_x, ndom_b_step_x],[dom_f_hit_y, dom_f_miss_y, dom_f_step_y, ndom_f_hit_y, ndom_f_miss_y, ndom_f_step_y, dom_b_hit_y,
    dom_b_miss_y, dom_b_step_y, ndom_b_hit_y, ndom_b_miss_y, ndom_b_step_y],[dom_f_hit_videos, dom_f_miss_videos, dom_f_step_videos, ndom_f_hit_videos, ndom_f_miss_videos, ndom_f_step_videos, dom_b_hit_videos,
    dom_b_miss_videos, dom_b_step_videos, ndom_b_hit_videos, ndom_b_miss_videos, ndom_b_step_videos]

    def process_data(data_path,project_name,rung_network,rung_it,max_len,show_skipped):
        '''
        Processes input data that does not have labels. Only for prediction.
        
        Parameters
        ----------
        data_path: string
            String containing path to the DLC output data (.h5 files)
        label_path: string
            String containing path to the manual score csv
        project_name: string
            String containing the name of the DLC project for rat tracking
        rung_network: string
            String containing the project and date info of the DLC project for rung tracking
        rung_it: int
            Integer that shows the number of iterations of the snapshot that was used to analyze the videos for rungs
        max_len: int
            Integer that defines the maximum number of frames of all videos in the dataset
        show_skipped: bool
            If this is set to True, then the script will print out the names of the videos that were skipped
        '''
            
        X = []
        video_list = []

        rat_folder = glob.glob(data_path)
        
        for file in rat_folder:
            #rung file info
            rung_name_list = file.split("/")[-1].split("_")[0:8]+[rung_network,str(rung_it)+".h5"]
            rung_file = '_'.join(rung_name_list)
            #rat tracking file
            #open the file
            rat_df = pd.read_hdf(file)[project_name]
            #properties of the file
            subject = file.split("/")[0]
            date_raw = rung_name_list[1]
            date = parser.parse(date_raw).date().strftime("%Y-%m-%d")
            run = rung_name_list[2]
            crossing = [int(s) for s in rung_name_list[3] if s.isdigit()][0]
                
            video_info = pd.DataFrame({"subject":[subject],"date":[date],"crossing":[crossing],"run":[run]})

            df = rat_df
            df_cols = df.columns.tolist()
            df_temp = df

            if df.shape[1] == 0:
                continue
            #if len(dom_f_low_like) >0:
                #print(dom_f_low_like)
            df = df.drop('likelihood', axis=1, level=1)

            #scale data
            #Uses the values function of pandas - converts any dataframe to an array
            data_for_scaling = df.values
            scaler = MinMaxScaler()
            scaled_data = scaler.fit_transform(data_for_scaling)

            if len(scaled_data) < max_len:
                newlength = (max_len-len(scaled_data))
                zero = np.zeros((newlength,6))
                scaled_temp = pd.DataFrame(scaled_data).append(pd.DataFrame(zero),ignore_index=True)
                scaled_temp = scaled_temp.fillna(0)
                scaled_data2 = scaled_temp.values
            else:
                scaled_data2 = scaled_data

            video_data = scaled_data2.flatten(order='C')
            X.append(video_data)
            video_list.append(video_info)
            
        return X,video_list


# Scoring

This class makes scoring possible with a support vector regressor and random forest regreesor. The result is some metrics of interest and a saved model

In [5]:
class Scoring:
    def support_vector(X,y,name,data_path,project_name,kern,save=True):
        '''
        Trains a support vector machine on regression. Returns accuracy, RMSE, and feature importances
        
        Parameters
        ----------
        X : list
            List containing data of one limb
        y: list
            List containing the labels of the corresponding limb
        name: string
            String containing the name of what the model will be saved as
        data_path: string
            String containing the path to the original .h5 files
        project_name: string
            String containing the name of the DLC project for rat tracking
        kern: string
            String containing the name of the desired kernel
        save: bool, default: True
            If true, then a copy of the trained model will be saved. 
        '''
        data = np.array(X)
        labels = np.array(y)
        
        train_features, test_features, train_labels, test_labels = train_test_split(data, labels, test_size = 0.2, random_state = 31)
        regressor = SVR(kernel=kern)
        regressor.fit(train_features, train_labels)
        predictions = regressor.predict(test_features)
        
        rat_folder = glob.glob(data_path)
        df = pd.read_hdf(rat_folder[0])[project_name]
        df = df.drop('likelihood', axis=1, level=1)
        
        all_feature_importances = np.reshape(regressor.coef_,(356,32))
        
        average_feature_importances = []
        for i in range(all_feature_importances.shape[1]):
            average_value = 0
            for array in all_feature_importances:
                average_value += abs(array[i])
            average_feature_importances.append(average_value)
        
        # dictionary to hold {feature_name: feature_importance}
        feats = {} 
        for feature, importance in zip(df.columns, average_feature_importances):
            #add the name/value pair
            feats[feature] = importance 
        #make it into a dataframe
        importances = pd.DataFrame.from_dict(feats, orient='index')

        # Calculate the absolute errors
        errors = abs(predictions - test_labels)
        # Calculate mean absolute percentage error (MAPE)
        mape = 100 * (errors / test_labels)# Calculate and display accuracy
        rmse = mean_squared_error(test_labels,predictions,squared=True)
        accuracy = 100 - np.mean(mape)
        if save == True:
            joblib.dump(regressor, name+".joblib", compress=0)
        return accuracy,rmse,importances

    def forest(X,y,name,data_path,project_name, njobs=None,save=True):
        '''
        Trains a random forest on regression. Returns accuracy, RMSE, and feature importances
        
        Parameters
        ----------
        X : list
            List containing data of one limb
        y: list
            List containing the labels of the corresponding limb
        name: string
            String containing the name of what the model will be saved as
        data_path: string
            String containing the path to the original .h5 files
        project_name: string
            String containing the name of the DLC project for rat tracking
        njobs: int, default = None
            The number of jobs that will be run in parallel. 
        save: bool, default= True
            If true, then a copy of the trained model will be saved. 
        '''
        data = np.array(X)
        labels = np.array(y)
        train_features, test_features, train_labels, test_labels = train_test_split(data, labels, test_size = 0.2, random_state = 31)
        # Instantiate model with n decision trees
        rf = RandomForestRegressor(n_estimators = 20, random_state = 42,n_jobs=njobs)# Train the model on training data
        rf.fit(train_features, train_labels)
        # Use the forest's predict method on the test data
        # Calculate the absolute errors
        predictions = rf.predict(test_features)
        errors = abs(predictions - test_labels)
        # Calculate mean absolute percentage error (MAPE)
        mape = 100 * (errors / test_labels)
        # Calculate and display accuracy
        accuracy = 100 - np.mean(mape)
        rmse = mean_squared_error(test_labels,predictions,squared=True)
        
        if save == True:
            joblib.dump(rf, name+".joblib", compress=0) 
        
        rat_folder = glob.glob(data_path)
        df = pd.read_hdf(rat_folder[0])[project_name]
        df = df.drop('likelihood', axis=1, level=1)
        # dictionary to hold feature_name: feature_importance
        feats = {} 
        for feature, importance in zip(df.columns, rf.feature_importances_):
            #add the name/value pair 
            feats[feature] = importance 
        importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
        return predictions,errors,mape,accuracy,rmse,importances

# Process the Data

In [6]:
datas = Processing.process_with_labels("*/dlc_output_resnet50/*.h5","LW_Manual_scores_for_ICC_2020-05-20.csv","DLC_resnet50_LadderWalkFeb13shuffle1_450000","LadderWalkMar12shuffle1",rung_it=450000,max_len=356,show_skipped = False)

In [7]:
#split the processing data into data, labels, and video information
data_x = datas[0]
labels_y = datas[1]
videos = datas[2]

In [8]:
#get the range of the labels for each measurement
'''limbs = []
mins = []
maxs = []
for i,x in enumerate(labels_y):
    ind = i
    limb = Processing().keys[i]
    minimum = min(x)
    maximum = max(x)
    limbs.append(limb)
    mins.append(minimum)
    maxs.append(maximum)
pd.DataFrame([limbs,mins,maxs])'''

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,dominant_front_hit,dominant_front_miss,dominant_front_step,nondominant_front_hit,nondominant_front_miss,nondominant_front_step,dominant_back_hit,dominant_back_miss,dominant_back_step,nondominant_back_hit,nondominant_back_miss,nondominant_back_step
1,1,0,2.33333,1.66667,0,2.33333,1.66667,0,2,1.33333,0,2.33333
2,6,7.33333,10.6667,8.33333,4.66667,9.66667,7,4.66667,7.66667,6.33333,2,6.33333


In [9]:
#hide warnings when desired
import warnings

#hide warnings function
def action_with_warnings():
    warnings.warn("should not appear")

# Train and save the support vector machines

In [10]:
with warnings.catch_warnings(record=True):
    #hide warnings for dividing by 0
    action_with_warnings()
    for i,k in enumerate(Processing().keys):
        print(i,k)
        svr_results = Scoring.support_vector(data_x[i],labels_y[i],k+"_svr","*/dlc_output_resnet50/*.h5","DLC_resnet50_LadderWalkFeb13shuffle1_450000",kern="linear",save=True)
        #svr_results[2].to_csv(k+"_svr_importances.csv")
        #print("accuracy: ", svr_results[0]," RMSE: ", svr_results[1])

0 dominant_front_hit
1 dominant_front_miss
2 dominant_front_step
3 nondominant_front_hit
4 nondominant_front_miss
5 nondominant_front_step
6 dominant_back_hit
7 dominant_back_miss
8 dominant_back_step
9 nondominant_back_hit
10 nondominant_back_miss
11 nondominant_back_step


# Train and save random forests

In [11]:
with warnings.catch_warnings(record=True):
    for i,k in enumerate(Processing().keys):
        print(i,k)
        forest_results = Scoring.forest(data_x[i],labels_y[i],k+"_random_forest","*/dlc_output_resnet50/*.h5","DLC_resnet50_LadderWalkFeb13shuffle1_450000",njobs=6,save=True)
        #forest_results[5].to_csv(k+"_rf_importances.csv")
        #print("accuracy: ", forest_results[3]," RMSE: ", forest_results[4])

0 dominant_front_hit
1 dominant_front_miss
2 dominant_front_step
3 nondominant_front_hit
4 nondominant_front_miss
5 nondominant_front_step
6 dominant_back_hit
7 dominant_back_miss
8 dominant_back_step
9 nondominant_back_hit
10 nondominant_back_miss
11 nondominant_back_step


# Validation with KFold Cross-Validation

In [12]:
from sklearn.model_selection import KFold

In [14]:
'''Only 1 model'''

loaded_model = joblib.load("dominant_front_hit_svr.joblib")

temp_data = np.array(data_x[0])
temp_labels = np.array(labels_y[0])
temp_vids = np.array(videos[0])

scores = []

kf = KFold(n_splits=5, shuffle=True, random_state=0)

for train_index,test_index in kf.split(temp_data):
    X_train, X_test, y_train, y_test = temp_data[train_index], temp_data[test_index], temp_labels[train_index], temp_labels[test_index]
    scores.append(loaded_model.score(X_test, y_test))
print(scores,np.mean(scores))

[0.914527913893575, 0.6990113721337498, 0.731303677711002, 0.8060484538644213, 0.9054053665674096] 0.8112593568340316


In [15]:
'''KFold validation for all models; SVR'''
kf = KFold(n_splits=5, shuffle=True, random_state=0)

for i,k in enumerate(Processing().keys):
    scores = []
    print(i,k)
    data = np.array(data_x[i])
    labels = np.array(labels_y[i])
    vids = np.array(videos[i])
    loaded_model = joblib.load(k+"_svr.joblib")
    for train_index,test_index in kf.split(data):
        X_train, X_test, y_train, y_test = data[train_index], data[test_index], labels[train_index], labels[test_index]
        scores.append(abs(loaded_model.score(X_test, y_test)))
    print("Scores: ",scores)

0 dominant_front_hit
Scores:  [0.914527913893575, 0.6990113721337498, 0.731303677711002, 0.8060484538644213, 0.9054053665674096]
1 dominant_front_miss
Scores:  [0.7558111737967048, 0.9683383506506983, 0.9014736823243106, 0.9311350673382512, 0.8983084858355225]
2 dominant_front_step
Scores:  [0.9206718646080475, 0.8850080171775608, 0.6790149000770475, 0.8858613885218157, 0.9359420826192857]
3 nondominant_front_hit
Scores:  [0.8854762792123323, 0.9259004796107195, 0.7447476283760996, 0.9091808409116596, 0.879025728939417]
4 nondominant_front_miss
Scores:  [0.5487701762259453, 0.6130921226216595, 0.49597976157000134, 0.9448666221450881, 0.6592927566119544]
5 nondominant_front_step
Scores:  [0.9206815829191874, 0.9305994005089346, 0.8031316270436214, 0.9180540709643148, 0.9298643492871516]
6 dominant_back_hit
Scores:  [0.8944416801176944, 0.7991984612849072, 0.6699123167949894, 0.9110586040174083, 0.7921049579862843]
7 dominant_back_miss
Scores:  [0.9571108592570481, 0.9621600559858559, 0.

In [16]:
'''KFold validation for all models; Random forest'''

kf = KFold(n_splits=5, shuffle=True, random_state=0)

for i,k in enumerate(Processing().keys):
    scores = []
    print(i,k)
    data = np.array(data_x[i])
    labels = np.array(labels_y[i])
    vids = np.array(videos[i])
    loaded_model = joblib.load(k+"_random_forest.joblib")
    for train_index,test_index in kf.split(data):
        X_train, X_test, y_train, y_test = data[train_index], data[test_index], labels[train_index], labels[test_index]
        scores.append(abs(loaded_model.score(X_test, y_test)))
    print("Scores: ",scores)

0 dominant_front_hit
Scores:  [0.7397889331038934, 0.786734802912775, 0.7848894367629117, 0.7973208656644035, 0.7731755856027422]
1 dominant_front_miss
Scores:  [0.6881090651558075, 0.9081484354585837, 0.7135435452573908, 0.8197753324163228, 0.8216593059936907]
2 dominant_front_step
Scores:  [0.8776161127471603, 0.9043544773678702, 0.6894219994102035, 0.8368514784946238, 0.8386187845303867]
3 nondominant_front_hit
Scores:  [0.8561082632903261, 0.9061222274657857, 0.7285815205509403, 0.9185861742660059, 0.8765194828841953]
4 nondominant_front_miss
Scores:  [0.46930858054825386, 0.6225891877058178, 0.6908428620928622, 0.8343836552478132, 0.6406083815028901]
5 nondominant_front_step
Scores:  [0.8837752619760478, 0.9017599885946466, 0.8266992905937993, 0.915281762295082, 0.9235320382868434]
6 dominant_back_hit
Scores:  [0.7380478046778827, 0.7523338552540014, 0.5846096018249689, 0.8078865944251348, 0.8314729598220494]
7 dominant_back_miss
Scores:  [0.785, 0.8138977918568899, 0.582268110130

# Example of using loaded model

In [17]:
loaded_model = joblib.load("dominant_front_hit_svr.joblib")
temp_data = np.array(data_x[0])
temp_labels = np.array(labels_y[0])
temp_vids = np.array(videos[0])


train_features, test_features, train_labels, test_labels = train_test_split(temp_data, temp_labels, test_size = 0.5, random_state = 1)

result = loaded_model.score(test_features, test_labels)
print(result)

0.7524629831750013


# Example of use

I imagine that this tool can be used to score videos and quickly output a dataframe with the video information and scores

In [18]:
'''Example of using the code to predict the results of a random sample of the dataset without the corresponding labels'''
import random

#process data without labels
datas = Processing.process_data("*/dlc_output_resnet50/*.h5","DLC_resnet50_LadderWalkFeb13shuffle1_450000","LadderWalkMar12shuffle1",rung_it=450000,max_len=356,show_skipped = False)

data_x = datas[0]
videos = datas[1]

temp_data = np.array(data_x)
temp_vids = np.array(videos)


#get a random sample of the dataset
random.seed(23)
random.shuffle(temp_data)
indexes = []
values = []
#just take 1/10th of the dataset
for idx, val in random.sample(list(enumerate(temp_data)), (len(temp_data)//10)):
    indexes.append(idx)
    values.append(val)

#get video info
vids = temp_vids[indexes]

#load model
loaded_model = joblib.load("dominant_front_hit_svr.joblib")
pred_labels = loaded_model.predict(values)

ex_df = pd.DataFrame(np.concatenate(vids),columns=["Subject","Date","Crossing","Run"])
ex_df["predictions"] = pred_labels
print(ex_df)

   Subject        Date Crossing Run  predictions
0     MC70  2019-04-09        7  L4     4.899937
1     MC30  2019-12-04        3  R2     4.353295
2     MC87  2018-12-10        3  R2     3.433569
3     MC78  2019-03-28        7  R4     5.233646
4     MC61  2019-05-22        7  R4     4.100091
5     MC78  2019-04-23        2  L1     3.433569
6     MC61  2019-05-21        5  R3     3.182038
7     MC30  2019-12-04        7  R4     3.433464
8     MC87  2018-12-12        1  R1     4.568946
9     MC78  2019-04-22        6  L3     3.899816
10    MC30  2019-11-06        6  L3     3.099924
11    MC70  2019-03-14        1  R1     3.726849
12    MC87  2018-12-12        7  R4     3.791385
13    MC30  2019-11-04        7  R4     4.899755
14    MC78  2019-03-21        6  L3     3.371577
15    MC70  2019-03-14        2  L1     3.566815
16    MC61  2019-05-28        1  R1     3.099924
17    MC87  2018-12-14        4  L2     2.433230
18    MC30  2019-12-03        4  L2     4.766538
19    MC30  2019-11-