# Analysis of Ladder Walk Videos

BIOF 509 Spring 2021

This notebook contains all the code needed to train and run machine learning models to predict the different scores of interest

In [1]:
import pandas as pd
import numpy as np
import glob
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from dateutil import parser
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib, os

# Preprocessing

The function separates the scores for the different limbs and processes the raw output from DeepLabCut. For each video, all features are normalized, frames are made the same length, and flattend.

In [51]:
class Processing:
    def __init__(self):
        #list with a string order of the output of processing
        self.keys = ["dominant_front_hit", "dominant_front_miss", "dominant_front_step", 
 "nondominant_front_hit", "nondominant_front_miss", "nondominant_front_step",
    "dominant_back_hit", "dominant_back_miss", "dominant_back_step", 
 "nondominant_back_hit", "nondominant_back_miss", "nondominant_back_step"]
    def process_with_labels(data_path,label_path,project_name,rung_network,rung_it,show_skipped):
        scores_df = pd.read_csv(label_path)

        #hit scores
        hit_df = scores_df[["subject","date","crossing number","run","limb","avg_human_hit"]]

        #miss
        miss_df = scores_df[["subject","date","crossing number","run","limb","avg_human_miss"]]

        #steps
        step_df = scores_df[["subject","date","crossing number","run","limb","avg_human_steps"]]

        #score extraction
        dom_f_hit = hit_df.loc[hit_df["limb"]=="Dominant Front"]
        dom_f_miss = miss_df.loc[miss_df["limb"]=="Dominant Front"]
        dom_f_step = step_df.loc[step_df["limb"]=="Dominant Front"]

        ndom_f_hit = hit_df.loc[hit_df["limb"]=="Nondominant Front"]
        ndom_f_miss = miss_df.loc[miss_df["limb"]=="Nondominant Front"]
        ndom_f_step = step_df.loc[step_df["limb"]=="Nondominant Front"]

        dom_b_hit = hit_df.loc[hit_df["limb"]=="Dominant Back"]
        dom_b_miss = miss_df.loc[miss_df["limb"]=="Dominant Back"]
        dom_b_step = step_df.loc[step_df["limb"]=="Dominant Back"]

        ndom_b_hit = hit_df.loc[hit_df["limb"]=="Nondominant Back"]
        ndom_b_miss = miss_df.loc[miss_df["limb"]=="Nondominant Back"]
        ndom_b_step = step_df.loc[step_df["limb"]=="Nondominant Back"]
        dom_f_hit_x = []
        dom_f_miss_x = []
        dom_f_step_x =[]

        ndom_f_hit_x = []
        ndom_f_miss_x = []
        ndom_f_step_x = []

        dom_b_hit_x = []
        dom_b_miss_x = []
        dom_b_step_x = []

        ndom_b_hit_x = []
        ndom_b_miss_x = []
        ndom_b_step_x = []

        #sets of labels
        dom_f_hit_y = []
        dom_f_miss_y = []
        dom_f_step_y =[]

        ndom_f_hit_y = []
        ndom_f_miss_y = []
        ndom_f_step_y = []

        dom_b_hit_y = []
        dom_b_miss_y = []
        dom_b_step_y = []

        ndom_b_hit_y = []
        ndom_b_miss_y = []
        ndom_b_step_y = []

        lengths = []
        
        #video_list
        dom_f_hit_videos = []
        dom_f_miss_videos = []
        dom_f_step_videos =[]

        ndom_f_hit_videos = []
        ndom_f_miss_videos = []
        ndom_f_step_videos = []

        dom_b_hit_videos = []
        dom_b_miss_videos = []
        dom_b_step_videos = []

        ndom_b_hit_videos = []
        ndom_b_miss_videos = []
        ndom_b_step_videos = []
        
        rat_folder = glob.glob(data_path)
        
        for file in rat_folder:
            #rung file info
            rung_name_list = file.split("/")[-1].split("_")[0:8]+[rung_network,str(rung_it)+".h5"]
            rung_file = '_'.join(rung_name_list)
            #rat tracking file
            #open the file
            rat_df = pd.read_hdf(file)[project_name]
            #properties of the file
            subject = file.split("/")[0]
            date_raw = rung_name_list[1]
            date = parser.parse(date_raw).date().strftime("%Y-%m-%d")
            run = rung_name_list[2]
            crossing = [int(s) for s in rung_name_list[3] if s.isdigit()][0]
                
            
            #
            dom_f_hit_score = dom_f_hit[(dom_f_hit["subject"]==subject) & (dom_f_hit["date"]==date) & (dom_f_hit["run"]==run)].reset_index()
            dom_f_miss_score = dom_f_miss[(dom_f_miss["subject"]==subject) & (dom_f_miss["date"]==date) & (dom_f_miss["run"]==run)].reset_index()
            dom_f_step_score = dom_f_step[(dom_f_step["subject"]==subject) & (dom_f_step["date"]==date) & (dom_f_step["run"]==run)].reset_index()

            ndom_f_hit_score = ndom_f_hit[(ndom_f_hit["subject"]==subject) & (ndom_f_hit["date"]==date) & (ndom_f_hit["run"]==run)].reset_index()
            ndom_f_miss_score = ndom_f_miss[(ndom_f_miss["subject"]==subject) & (ndom_f_miss["date"]==date) & (ndom_f_miss["run"]==run)].reset_index()
            ndom_f_step_score = ndom_f_step[(ndom_f_step["subject"]==subject) & (ndom_f_step["date"]==date) & (ndom_f_step["run"]==run)].reset_index()

            dom_b_hit_score = dom_b_hit[(dom_b_hit["subject"]==subject) & (dom_b_hit["date"]==date) & (dom_b_hit["run"]==run)].reset_index()
            dom_b_miss_score = dom_b_miss[(dom_b_miss["subject"]==subject) & (dom_b_miss["date"]==date) & (dom_b_miss["run"]==run)].reset_index()
            dom_b_step_score = dom_b_step[(dom_b_step["subject"]==subject) & (dom_b_step["date"]==date) & (dom_b_step["run"]==run)].reset_index()

            ndom_b_hit_score = ndom_b_hit[(ndom_b_hit["subject"]==subject) & (ndom_b_hit["date"]==date) & (ndom_b_hit["run"]==run)].reset_index()
            ndom_b_miss_score = ndom_b_miss[(ndom_b_miss["subject"]==subject) & (ndom_b_miss["date"]==date) & (ndom_b_miss["run"]==run)].reset_index()
            ndom_b_step_score = ndom_b_step[(ndom_b_step["subject"]==subject) & (ndom_b_step["date"]==date) & (ndom_b_step["run"]==run)].reset_index()

            
            video_df = scores_df[["subject","date","crossing number","run"]]
            video_info = video_df[(video_df["subject"]==subject) & (video_df["date"]==date) & (video_df["run"]==run)].reset_index(drop=True)
            
            
            #join the rat and rung dataframes
            df = rat_df
            df_cols = df.columns.tolist()
            df_temp = df

            #remove where median likelihood is low
            #df_low_like = df.columns[-df_temp[df_temp.columns.get_level_values(0).unique()].median().ge(0.2)].get_level_values(0).tolist()
            #df_cols_up = [x for x in df_cols if x not in df_low_like ]

            #df = rat_df[df_cols_up]
            if df.shape[1] == 0:
                continue
            #if len(dom_f_low_like) >0:
                #print(dom_f_low_like)
            df = df.drop('likelihood', axis=1, level=1)

            #scale data
            ####Uses the values function of pandas - converts any dataframe to an array
            data_for_scaling = df.values
            #### the scaling object
            scaler = MinMaxScaler()
            #We will use fit_transform here - we want to actually scale this data, not use the scaler on a different dataset
            scaled_data = scaler.fit_transform(data_for_scaling)

            if len(scaled_data) < 356:
                newlength = (356-len(scaled_data))
                zero = np.zeros((newlength,6))
                scaled_temp = pd.DataFrame(scaled_data).append(pd.DataFrame(zero),ignore_index=True)
                scaled_temp = scaled_temp.fillna(0)
                scaled_data2 = scaled_temp.values
            else:
                scaled_data2 = scaled_data

            video_data = scaled_data2.flatten()
            #lengths.append([len(video_data)])
            if len(dom_f_hit_score) !=0:
                dom_f_hit_y.append(dom_f_hit_score["avg_human_hit"][0])
                dom_f_hit_x.append(video_data)
                dom_f_hit_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Dom Front Hit"+"Missing scores: "+subject + " " + date + " " + run)
            if len(dom_f_miss_score) != 0:
                dom_f_miss_y.append(dom_f_miss_score["avg_human_miss"][0])
                dom_f_miss_x.append(video_data)
                dom_f_miss_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Dom Front Miss"+"Missing scores: "+subject + " " + date + " " + run)
            if len(dom_f_step_score) != 0:
                dom_f_step_y.append(dom_f_step_score["avg_human_steps"][0])
                dom_f_step_x.append(video_data)
                dom_f_step_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Dom Front Step"+"Missing scores: "+subject + " " + date + " " + run)
            if len(ndom_f_hit_score) != 0:
                ndom_f_hit_y.append(ndom_f_hit_score["avg_human_hit"][0])
                ndom_f_hit_x.append(video_data)
                ndom_f_hit_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Nondom Front Hit"+"Missing scores: "+subject + " " + date + " " + run)
            if len(ndom_f_miss_score) != 0:
                ndom_f_miss_y.append(ndom_f_miss_score["avg_human_miss"][0])
                ndom_f_miss_x.append(video_data)
                ndom_f_miss_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Nondom Front Miss"+"Missing scores: "+subject + " " + date + " " + run)
            if len(ndom_f_step_score) != 0:
                ndom_f_step_y.append(ndom_f_step_score["avg_human_steps"][0])
                ndom_f_step_x.append(video_data)
                ndom_f_step_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Nondom Front Step"+"Missing scores: "+subject + " " + date + " " + run)
            if len(dom_b_hit_score) != 0:
                dom_b_hit_y.append(dom_b_hit_score["avg_human_hit"][0])
                dom_b_hit_x.append(video_data)
                dom_b_hit_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Dom Back Hit"+"Missing scores: "+subject + " " + date + " " + run)
            if len(dom_b_miss_score) != 0:
                dom_b_miss_y.append(dom_b_miss_score["avg_human_miss"][0])
                dom_b_miss_x.append(video_data)
                dom_b_miss_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Dom Back Miss"+"Missing scores: "+subject + " " + date + " " + run)
            if len(dom_b_step_score) != 0:
                dom_b_step_y.append(dom_b_step_score["avg_human_steps"][0])
                dom_b_step_x.append(video_data)
                dom_b_step_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Dom Back Step"+"Missing scores: "+subject + " " + date + " " + run)
            if len(ndom_b_hit_score) != 0:
                ndom_b_hit_y.append(ndom_b_hit_score["avg_human_hit"][0])
                ndom_b_hit_x.append(video_data)
                ndom_b_hit_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Nondom Back Hit"+"Missing scores: "+subject + " " + date + " " + run)
            if len(ndom_b_miss_score) != 0:
                ndom_b_miss_y.append(ndom_b_miss_score["avg_human_miss"][0])
                ndom_b_miss_x.append(video_data)
                ndom_b_miss_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Nondom Back Miss"+"Missing scores: "+subject + " " + date + " " + run)
            if len(ndom_b_step_score) != 0:
                ndom_b_step_y.append(ndom_b_step_score["avg_human_steps"][0])
                ndom_b_step_x.append(video_data)
                ndom_b_step_videos.append(video_info.iloc[0])
            else:
                if show_skipped:
                    print("Nondom Back Step"+"Missing scores: "+subject + " " + date + " " + run)
                continue
        return [dom_f_hit_x, dom_f_miss_x, dom_f_step_x, ndom_f_hit_x, ndom_f_miss_x, ndom_f_step_x,
    dom_b_hit_x, dom_b_miss_x, dom_b_step_x, ndom_b_hit_x, ndom_b_miss_x, ndom_b_step_x],[dom_f_hit_y, dom_f_miss_y, dom_f_step_y, ndom_f_hit_y, ndom_f_miss_y, ndom_f_step_y, dom_b_hit_y,
    dom_b_miss_y, dom_b_step_y, ndom_b_hit_y, ndom_b_miss_y, ndom_b_step_y],[dom_f_hit_videos, dom_f_miss_videos, dom_f_step_videos, ndom_f_hit_videos, ndom_f_miss_videos, ndom_f_step_videos, dom_b_hit_videos,
    dom_b_miss_videos, dom_b_step_videos, ndom_b_hit_videos, ndom_b_miss_videos, ndom_b_step_videos]

    def process_data(data_path,project_name,rung_network,rung_it,max_len,show_skipped):
        
        X = []
        video_list = []

        rat_folder = glob.glob(data_path)
        
        for file in rat_folder:
            #rung file info
            rung_name_list = file.split("/")[-1].split("_")[0:8]+[rung_network,str(rung_it)+".h5"]
            rung_file = '_'.join(rung_name_list)
            #rat tracking file
            #open the file
            rat_df = pd.read_hdf(file)[project_name]
            #properties of the file
            subject = file.split("/")[0]
            date_raw = rung_name_list[1]
            date = parser.parse(date_raw).date().strftime("%Y-%m-%d")
            run = rung_name_list[2]
            crossing = [int(s) for s in rung_name_list[3] if s.isdigit()][0]
                
            video_info = pd.DataFrame({"subject":[subject],"date":[date],"crossing":[crossing],"run":[run]})

            df = rat_df
            df_cols = df.columns.tolist()
            df_temp = df

            #remove where median likelihood is low
            #df_low_like = df.columns[-df_temp[df_temp.columns.get_level_values(0).unique()].median().ge(0.2)].get_level_values(0).tolist()
            #df_cols_up = [x for x in df_cols if x not in df_low_like ]

            #df = rat_df[df_cols_up]
            if df.shape[1] == 0:
                continue
            #if len(dom_f_low_like) >0:
                #print(dom_f_low_like)
            df = df.drop('likelihood', axis=1, level=1)

            #scale data
            ####Uses the values function of pandas - converts any dataframe to an array
            data_for_scaling = df.values
            #### the scaling object
            scaler = MinMaxScaler()
            #We will use fit_transform here - we want to actually scale this data, not use the scaler on a different dataset
            scaled_data = scaler.fit_transform(data_for_scaling)

            if len(scaled_data) < max_len:
                newlength = (max_len-len(scaled_data))
                zero = np.zeros((newlength,6))
                scaled_temp = pd.DataFrame(scaled_data).append(pd.DataFrame(zero),ignore_index=True)
                scaled_temp = scaled_temp.fillna(0)
                scaled_data2 = scaled_temp.values
            else:
                scaled_data2 = scaled_data

            video_data = scaled_data2.flatten()
            X.append(video_data)
            video_list.append(video_info)
            
        return X,video_list


# Scoring

This class makes scoring possible with a support vector regressor and random forest regreesor. The result is some metrics of interest and a saved model

In [3]:
class Scoring:
    def support_vector(X,y,name,kern,save=True):
        data = np.array(X)
        labels = np.array(y)
        train_features, test_features, train_labels, test_labels = train_test_split(data, labels, test_size = 0.2, random_state = 42)
        regressor = SVR(kernel=kern)
        regressor.fit(train_features, train_labels)
        predictions = regressor.predict(test_features)# Calculate the absolute errors
        errors = abs(predictions - test_labels)# Print out the mean absolute error (mae)
        # Calculate mean absolute percentage error (MAPE)
        mape = 100 * (errors / test_labels)# Calculate and display accuracy
        rmse = mean_squared_error(test_labels,predictions,squared=True)
        accuracy = 100 - np.mean(mape)
        if save == True:
            joblib.dump(regressor, name+".joblib", compress=0)
        return accuracy,rmse
    def forest(X,y,name,data_path,project_name, njobs=None,save=True):
        data = np.array(X)
        labels = np.array(y)
        train_features, test_features, train_labels, test_labels = train_test_split(data, labels, test_size = 0.2, random_state = 42)
        # Instantiate model with n decision trees
        rf = RandomForestRegressor(n_estimators = 1000, random_state = 42,n_jobs=njobs)# Train the model on training data
        rf.fit(train_features, train_labels)
        # Use the forest's predict method on the test data
        predictions = rf.predict(test_features)# Calculate the absolute errors
        errors = abs(predictions - test_labels)# Print out the mean absolute error (mae)
        # Calculate mean absolute percentage error (MAPE)
        mape = 100 * (errors / test_labels)# Calculate and display accuracy
        accuracy = 100 - np.mean(mape)
        rmse = mean_squared_error(test_labels,predictions,squared=True)
        
        if save == True:
            joblib.dump(rf, name+".joblib", compress=0) 
            #print(f"Uncompressed Random Forest: {np.round(os.path.getsize(rf_name+'.joblib') / 1024 / 1024, 2) } MB")
        
        rat_folder = glob.glob(data_path)
        df = pd.read_hdf(rat_folder[0])[project_name]
        df = df.drop('likelihood', axis=1, level=1)
        feats = {} # a dict to hold feature_name: feature_importance
        for feature, importance in zip(df.columns, rf.feature_importances_):
            feats[feature] = importance #add the name/value pair 
        importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
        return predictions,errors,mape,accuracy,rmse,importances.sort_values(by="Gini-importance",ascending=False)

# Process the Data

In [4]:
datas = Processing.process_with_labels("*/dlc_output_resnet50/*.h5","LW_Manual_scores_for_ICC_2020-05-20.csv","DLC_resnet50_LadderWalkFeb13shuffle1_450000","LadderWalkMar12shuffle1",rung_it=450000,show_skipped = False)

In [5]:
data_x = datas[0]
labels_y = datas[1]
videos = datas[2]

# Train and save the support vector machines

In [6]:
for i,k in enumerate(Processing().keys):
    print(i,k)
    svr_results = Scoring.support_vector(data_x[i],labels_y[i],k+"_svr",kern="rbf",save=True)
    print("accuracy: ", svr_results[0]," RMSE: ", svr_results[1])

0 dominant_front_hit
accuracy:  81.49738841675024  RMSE:  0.6367471567401566
1 dominant_front_miss


  mape = 100 * (errors / test_labels)# Calculate and display accuracy


accuracy:  -inf  RMSE:  0.851113772828861
2 dominant_front_step
accuracy:  87.7935220047853  RMSE:  0.7090797674513794
3 nondominant_front_hit
accuracy:  86.53867990781295  RMSE:  0.6345549453646989
4 nondominant_front_miss


  mape = 100 * (errors / test_labels)# Calculate and display accuracy


accuracy:  -inf  RMSE:  0.18857045264878267
5 nondominant_front_step
accuracy:  85.02256467467588  RMSE:  0.7360277948512086
6 dominant_back_hit
accuracy:  86.97608356360936  RMSE:  0.3196878367470701
7 dominant_back_miss


  mape = 100 * (errors / test_labels)# Calculate and display accuracy


accuracy:  -inf  RMSE:  0.2710642785596339
8 dominant_back_step
accuracy:  89.76351535316701  RMSE:  0.2539401459284037
9 nondominant_back_hit
accuracy:  88.0646789214716  RMSE:  0.2904149005470281
10 nondominant_back_miss


  mape = 100 * (errors / test_labels)# Calculate and display accuracy


accuracy:  -inf  RMSE:  0.22521397195566492
11 nondominant_back_step
accuracy:  89.60608060900736  RMSE:  0.27027278691849765


# Train and save random forests

In [7]:
for i,k in enumerate(Processing().keys):
    print(i,k)
    forest_results = Scoring.forest(data_x[i],labels_y[i],k+"_random_forest","*/dlc_output_resnet50/*.h5","DLC_resnet50_LadderWalkFeb13shuffle1_450000",njobs=6,save=True)
    print("accuracy: ", forest_results[3]," RMSE: ", forest_results[4]," Top Features: \n",forest_results[5].head())

0 dominant_front_hit
accuracy:  82.11270034929609  RMSE:  0.6114119290780143  Top Features: 
                    Gini-importance
(left fingers, y)         0.002786
(right ankle, y)          0.002783
(left wrist, y)           0.001687
(hip, y)                  0.001400
(left ankle, y)           0.001380
1 dominant_front_miss


  mape = 100 * (errors / test_labels)# Calculate and display accuracy


accuracy:  -inf  RMSE:  0.769741510638298  Top Features: 
                    Gini-importance
(right eye, y)            0.002610
(left fingers, y)         0.002061
(nose, y)                 0.001460
(left wrist, y)           0.001326
(right ankle, y)          0.000507
2 dominant_front_step
accuracy:  87.82283696608636  RMSE:  0.7351758203309677  Top Features: 
                    Gini-importance
(left fingers, y)         0.001347
(base of tail, y)         0.000822
(left ankle, y)           0.000809
(right elbow, y)          0.000674
(right ankle, y)          0.000559
3 nondominant_front_hit
accuracy:  85.79727826375145  RMSE:  0.7382999834515391  Top Features: 
                  Gini-importance
(hip, y)                0.000896
(left elbow, y)         0.000519
(right toes, y)         0.000510
(left wrist, y)         0.000491
(nose, y)               0.000476
4 nondominant_front_miss


  mape = 100 * (errors / test_labels)# Calculate and display accuracy


accuracy:  -inf  RMSE:  0.25520445153664334  Top Features: 
                   Gini-importance
(left eye, y)            0.006461
(left toes, y)           0.001739
(right ankle, y)         0.001710
(nose, y)                0.001166
(right eye, y)           0.001145
5 nondominant_front_step
accuracy:  85.96109723547757  RMSE:  0.7996752553191462  Top Features: 
                    Gini-importance
(left elbow, y)           0.000896
(hip, y)                  0.000424
(left fingers, y)         0.000391
(left wrist, y)           0.000244
(right ankle, y)          0.000191
6 dominant_back_hit
accuracy:  86.13756133936978  RMSE:  0.3726776997635938  Top Features: 
                    Gini-importance
(hip, y)                  0.004001
(left elbow, y)           0.002062
(left fingers, y)         0.001907
(right wrist, y)          0.001287
(right ankle, y)          0.001271
7 dominant_back_miss


  mape = 100 * (errors / test_labels)# Calculate and display accuracy


accuracy:  -inf  RMSE:  0.28605895271867604  Top Features: 
                    Gini-importance
(left toes, y)            0.002155
(right wrist, y)          0.000676
(left fingers, y)         0.000553
(hip, y)                  0.000510
(right ankle, y)          0.000467
8 dominant_back_step
accuracy:  89.81079459842756  RMSE:  0.26266828841607737  Top Features: 
                    Gini-importance
(right ankle, y)          0.003611
(left elbow, y)           0.002479
(left fingers, y)         0.001531
(right eye, y)            0.000946
(left toes, y)            0.000882
9 nondominant_back_hit
accuracy:  88.17773108097578  RMSE:  0.2800198416075649  Top Features: 
                    Gini-importance
(left ankle, y)           0.001845
(right ankle, y)          0.001254
(left elbow, y)           0.001226
(nose, y)                 0.001218
(base of tail, y)         0.000871
10 nondominant_back_miss


  mape = 100 * (errors / test_labels)# Calculate and display accuracy


accuracy:  -inf  RMSE:  0.22126353191489354  Top Features: 
                     Gini-importance
(left toes, y)             0.000333
(right ankle, y)           0.000273
(right fingers, y)         0.000218
(left eye, x)              0.000203
(right toes, y)            0.000150
11 nondominant_back_step
accuracy:  89.97073172099769  RMSE:  0.25062372340425493  Top Features: 
                   Gini-importance
(left toes, y)           0.002809
(right ankle, y)         0.001439
(left elbow, y)          0.001239
(hip, y)                 0.001229
(left ankle, y)          0.000704


# Validation with KFold Cross-Validation

In [8]:
from sklearn.model_selection import KFold

loaded_model = joblib.load("dominant_front_hit_random_forest.joblib")


temp_data = np.array(data_x[0])
temp_labels = np.array(labels_y[0])
temp_vids = np.array(videos[0])

scores = []

kf = KFold(n_splits=4, shuffle=True, random_state=0)

for train_index,test_index in kf.split(temp_data):
    X_train, X_test, y_train, y_test = temp_data[train_index], temp_data[test_index], temp_labels[train_index], temp_labels[test_index]
    scores.append(loaded_model.score(X_test, y_test))
print(scores)

[0.7094229827122307, 0.7535589655288377, 0.7872374389131778, 0.7680611759984091]


# Example of using loaded model

In [9]:
loaded_model = joblib.load("dominant_front_hit_random_forest.joblib")
temp_data = np.array(data_x[0])
temp_labels = np.array(labels_y[0])
temp_vids = np.array(videos[0])


train_features, test_features, train_labels, test_labels = train_test_split(temp_data, temp_labels, test_size = 0.5, random_state = 1)

result = loaded_model.score(test_features, test_labels)
print(result)

0.8000538634784186


# Example of use

I imagine that this tool can be used to score videos and quickly output a dataframe with the video information and scores

In [62]:
import random

#process data without labels
datas = Processing.process_data("*/dlc_output_resnet50/*.h5","DLC_resnet50_LadderWalkFeb13shuffle1_450000","LadderWalkMar12shuffle1",rung_it=450000,max_len=356,show_skipped = False)

data_x = datas[0]
videos = datas[1]

temp_data = np.array(data_x)
temp_vids = np.array(videos)


#get a random sample of the dataset
random.seed(23)
random.shuffle(temp_data)
indexes = []
values = []
for idx, val in random.sample(list(enumerate(temp_data)), (len(temp_data)//10)):
    indexes.append(idx)
    values.append(val)

#get video info
vids = temp_vids[indexes]

#load model
loaded_model = joblib.load("dominant_front_hit_random_forest.joblib")
pred_labels = loaded_model.predict(values)

ex_df = pd.DataFrame(np.concatenate(vids),columns=["Subject","Date","Crossing","Run"])
ex_df["predictions"] = pred_labels
print(ex_df[["predictions"]])

    predictions
0      4.432667
1      4.807000
2      3.361000
3      4.685000
4      4.095000
5      3.361000
6      3.241000
7      3.469333
8      4.619333
9      4.017333
10     3.520000
11     3.652333
12     3.385333
13     3.559333
14     3.789667
15     3.619000
16     3.520000
17     2.829000
18     4.096667
19     3.558667
20     3.357667
21     2.871333
22     2.978333
