# RandomForest classifier with cross validation

In former runs we saw extremely good performance on the train and validation data, while performance on the test data was worse (but still over 80%). Since we are unsure whether this is related to potential overfitting (from the data in the train and validation sets being too similar) or whether it stems from certain characteristics of the test data (which was hand-picked), we want to use cross-validation to get a better feeling for the generalizability of our models.

We start with RandomForest on single frames, since its performance seemed okay and the model is much faster to fit than the other architectures.

In [9]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import preprocessing, metrics
import scikitplot as skplt
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import pickle
from sklearn.tree import DecisionTreeClassifier

We import the labelled and normalized frames. However, for now we remove all unlabelled frames, so that we don't end up with a huge imbalance of `not_defined` labels in our data set.

In [3]:
# Load the merged frame table from the HDF5 file.
df = pd.read_hdf("merged_20210405.h5")

# Keep only rows whose behavior label is something other than "not_defined",
# so the unlabelled frames do not dominate the class distribution.
df = df[df['behavior'] != "not_defined"]

# Bare expression: lets the notebook render the filtered table.
df

bodyparts,head,head,head,beak,beak,beak,left_neck,left_neck,left_neck,right_neck,...,body,tail,tail,tail,middle_neck,middle_neck,rotation_angle,file_name,frame,behavior
coords,x,y,likelihood,x,y,likelihood,x,y,likelihood,x,...,likelihood,x,y,likelihood,x,y,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34600,-6.256115,133.140698,0.998600,28.458820,157.263363,0.974056,39.766114,99.337532,1.000000,-39.766114,...,0.999989,16.614607,-106.037636,0.999988,-6.559721e-15,98.676032,-127.844964,233_1000ms_gopro_cut_day1DLC_resnet50_goprovrJ...,34600,head_shake
34601,3.769024,145.079564,0.992194,1.435601,168.635165,0.998704,39.091074,102.566764,0.999997,-39.091074,...,0.999955,17.690069,-106.504741,0.999975,6.983719e-15,101.558517,-127.805934,233_1000ms_gopro_cut_day1DLC_resnet50_goprovrJ...,34601,head_shake
34602,0.213448,148.213782,0.997358,-2.950560,176.568780,0.999940,39.161616,105.930661,0.999996,-39.161616,...,0.999971,19.303123,-104.565719,0.999963,1.584214e-14,104.192925,-128.614585,233_1000ms_gopro_cut_day1DLC_resnet50_goprovrJ...,34602,head_shake
34603,2.762456,147.657923,0.999761,525.309970,604.796841,0.860005,38.959007,106.304114,0.999999,-38.959007,...,0.999973,16.542397,-105.026859,0.999965,1.052610e-14,102.874054,-127.998885,233_1000ms_gopro_cut_day1DLC_resnet50_goprovrJ...,34603,head_shake
34604,2.196279,143.526230,0.997774,34.527160,149.529552,0.999838,39.876856,103.607613,1.000000,-39.876856,...,0.999987,19.000453,-110.547172,0.999992,-1.431422e-14,100.011360,-128.414942,233_1000ms_gopro_cut_day1DLC_resnet50_goprovrJ...,34604,head_shake
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38031,4.897417,177.600705,0.999900,30.771665,203.767842,0.999990,40.332658,120.383899,0.999997,-40.332658,...,0.999998,30.366276,-111.999133,0.999990,2.690406e-14,125.305453,-133.738577,892_500ms_gopro_cut_day2DLC_resnet50_goprovrJu...,38031,head_shake
38032,1.005224,169.487090,0.999758,16.386927,209.473704,0.999982,40.241152,121.762393,0.999998,-40.241152,...,0.999998,26.714623,-110.071315,0.999994,2.121088e-14,125.353381,-132.919632,892_500ms_gopro_cut_day2DLC_resnet50_goprovrJu...,38032,head_shake
38033,1.033190,170.307134,0.999995,11.928434,205.935297,0.999999,41.282461,123.487411,1.000000,-41.282461,...,0.999998,29.245919,-108.853585,0.999989,3.990525e-15,128.161332,-133.700335,892_500ms_gopro_cut_day2DLC_resnet50_goprovrJu...,38033,head_shake
38034,-1.618751,173.363199,0.999999,16.374546,204.249601,1.000000,40.873566,124.888961,0.999999,-40.873566,...,0.999997,26.189131,-106.770526,0.999983,4.046852e-14,130.237987,-133.262292,892_500ms_gopro_cut_day2DLC_resnet50_goprovrJu...,38034,head_shake


In [4]:
# Feature columns: the (bodypart, coordinate) pairs used as model input.
# Only the x/y coordinates of the listed body parts are used; the order is
# head, tail, both neck points, then the three left and three right wing
# points, each as x followed by y.
features = [
    (bodypart, axis)
    for bodypart in (
        'head', 'tail',
        'left_neck', 'right_neck',
        'left_up_wing', 'left_middle_wing', 'left_down_wing',
        'right_up_wing', 'right_middle_wing', 'right_down_wing',
    )
    for axis in ('x', 'y')
]

# Target labels and the feature matrix as a plain array.
y = df['behavior']
x = df[features].values

# Random forest with a fixed seed for reproducibility; verbose=1 makes the
# estimator report progress while fitting.
clf = RandomForestClassifier(verbose=1, random_state=0)

# Cross-validate with scikit-learn's default splitter (five folds, as the
# printed scores show; unshuffled stratified folds for classifiers according
# to the scikit-learn docs), using all available cores.
scores = cross_val_score(estimator=clf, X=x, y=y, n_jobs=-1)
print(scores)

[0.82273121 0.76566345 0.67487725 0.8269934  0.79107961]


## Shuffle Stratified KFold

In [7]:
# Five stratified folds, this time with the rows shuffled (seeded) before
# they are split.
# NOTE(review): the rows appear to be consecutive video frames per file (see
# the `file_name` / `frame` columns in the table above). Shuffling single
# frames therefore probably puts near-identical neighbouring frames into both
# the training and the test fold, so much higher scores here may reflect that
# leakage rather than better generalization. Consider a group-aware splitter
# keyed on `file_name` (e.g. GroupKFold) -- TODO confirm.
kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Same random forest and data as before, only the splitter differs.
scores_shuffle = cross_val_score(estimator=clf, X=x, y=y, cv=kfold, n_jobs=-1)
print(scores_shuffle)

[0.98787476 0.98757437 0.98768413 0.98840037 0.98773027]


# Decision Tree

Trying out a decision tree to visualize something we might be able to understand.

In [11]:
# Single decision tree with a fixed seed, evaluated with the default
# cross-validation splitter on the same features and labels.
dt = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(estimator=dt, X=x, y=y, n_jobs=-1)
print(scores)

[0.62520363 0.69361678 0.56003697 0.67853202 0.76070568]


In [12]:
# Decision tree evaluated on the shuffled stratified folds defined earlier
# (`kfold`), for comparison with the default, unshuffled split.
scores_shuffle = cross_val_score(estimator=dt, X=x, y=y, cv=kfold, n_jobs=-1)
print(scores_shuffle)

[0.95771475 0.95825776 0.95769164 0.95871387 0.95815353]
