# Feature selection as basis of sensor selection
Starting with the 70x3k mutual information dataset, I first try to see how far I can reduce the number of features without sacrificing accuracy. Later (in the next notebook) I use that to rank the reduction per sensor.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import gc
import pickle
import pathlib
import tqdm
import numpy as np
import numpy.random
import pandas as pd
from sklearn import svm
import sklearn.metrics as metr
import sklearn.preprocessing as pp
import sklearn.linear_model as lm
import sklearn.neighbors.nearest_centroid as nc
import sklearn.neighbors as ne
import sklearn.naive_bayes as nb
import sklearn.ensemble as em
import sklearn.discriminant_analysis as da
import sklearn.gaussian_process as gp
from sklearn import tree
from sklearn.metrics import f1_score
from sklearn.base import clone
import sklearn.model_selection as ms
import gestureanalysis.svm_helpers as svmhelper
import sklearn.feature_selection as fs
import sklearn.utils
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Root of the data directory.
# NOTE(review): absolute, machine-specific path; adjust before running on another machine.
base_path = "/home/jsimon/Documents/thesis/gesture-analysis/data/"
# Not referenced in the visible cells of this notebook.
time_groups_path_corrected_pickl = base_path+"transformed/time_added/all/time-and-groups-corrected-all.pkl"
# Directory that the pickles read and written below live in.
stats_added_base_path = base_path+"transformed/stats_added/all/"
# The two paths below are not referenced in the visible cells of this notebook.
stats_added_path_pickl = stats_added_base_path+"raw_stats-added-all.pkl"
gyro_calibration_path = base_path+'../scripts/gestureanalysis/gyro_offset.txt'

In [4]:
# Display the kernel's current working directory.
import os
os.getcwd()

'/home/jsimon/Documents/thesis/gesture-analysis/scripts'

In [5]:
# Load the input dataset dict from the mutual-information pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open( stats_added_base_path+'train-data-scaled-mutual-inf60.pkl', "rb" ) as users_pickle_file:
    data = pickle.load(users_pickle_file)

In [6]:
# Unpack the loaded dict into the names used by the rest of the notebook.
X, y = data['X'], data['y']                # training features / labels
Xval, yval = data['Xval'], data['yval']    # validation features / labels
gestures, features = data['gestures'], data['headers']

In [7]:
# Shuffle samples and labels with one shared permutation. The fixed seed keeps the
# row order identical across kernel restarts, so later fits see the same data order.
X, y = sklearn.utils.shuffle(X, y, random_state=42)

In [8]:
# Drop the loaded dict, then rebind `data` to an empty frame, collecting garbage after each step.
del data
gc.collect()
data = pd.DataFrame()
gc.collect()

0

## RFE with Lasso, $\alpha$ = 0.1

In [9]:
# Recursive feature elimination (plain RFE, no cross-validation) ranked by a Lasso model.
# NOTE(review): Lasso is a regressor, while y is scored with f1_score further down, so the
# labels appear to be class ids that are regressed on as numbers here. Confirm this is intended.
linc = sklearn.linear_model.Lasso(alpha = 0.1)
print('svc start')  # the log text says "svc" although the estimator is a Lasso
# NOTE(review): RFE is documented to fit a clone of its estimator, so this standalone fit
# presumably only serves as a smoke/timing check. Confirm before relying on `linc` being fitted.
linc.fit(X, y)
print('svc fittet')
# step=0.1: the recorded log shows 382 features (10% of 3821) dropped per round.
# Despite its name, `rfecv` holds a plain RFE object in this cell.
rfecv = fs.RFE(estimator=linc, step=0.1, verbose=2)
print('rfe start')
rfecv.fit(X, y)
print('rfe fittet')
# Integer column indices of the features the selector kept.
col_index = rfecv.get_support(indices=True)

svc start




svc fittet
rfe start
Fitting estimator with 3821 features.




Fitting estimator with 3439 features.




Fitting estimator with 3057 features.




Fitting estimator with 2675 features.




Fitting estimator with 2293 features.




Fitting estimator with 1911 features.




rfe fittet




In [10]:
# Restrict the training and validation matrices to the selected columns.
Xrfe, XValrfe = X[:, col_index], Xval[:, col_index]

In [11]:
# Number of selected feature indices.
col_index.shape

(1910,)

In [12]:
# Names of the selected features; `features` is indexed on its second axis, i.e. it is 2-D.
rfe_features = features[:,col_index]
rfe_features

array([['0_10_Thumb_base_kurtosis', '0_11_Thumb_base_mode',
        '0_12_Thumb_base_fft_mean', ..., '7197_fftPval_8_10',
        '7198_diff_8_10', '7200_xcorr_8_10']], dtype='<U73')

In [13]:
# Persist the reduced data under the same dict keys the input pickle was read with
# ('X', 'y', 'Xval', 'yval', 'gestures', 'headers').
with open( stats_added_base_path+'train-data-rfe.pkl', "wb" ) as users_pickle_file:
    pickle.dump({
        'X': Xrfe, 'y': y, 'Xval' : XValrfe, 'yval': yval, 
        'gestures' : gestures, 'headers': rfe_features
    }, users_pickle_file)

## RFE-CV with Lasso, $\alpha$ = 0.1

In [15]:
# Cross-validated recursive feature elimination (RFECV) ranked by a Lasso model.
# NOTE(review): Lasso is a regressor, while y is scored with f1_score further down, so the
# labels appear to be class ids that are regressed on as numbers here. Confirm this is intended.
linc = sklearn.linear_model.Lasso(alpha = 0.1)
print('svc start')  # the log text says "svc" although the estimator is a Lasso
# NOTE(review): RFECV is documented to fit clones of its estimator, so this standalone fit
# presumably only serves as a smoke/timing check. Confirm.
linc.fit(X, y)
print('svc fittet')
# step=0.05: the recorded log shows 191 features dropped per round. `cv` is left at the library default.
# This rebinds `rfecv`/`col_index`, replacing the plain-RFE results from the previous section.
rfecv = fs.RFECV(estimator=linc, step=0.05, verbose=2)
print('rfe start')
rfecv.fit(X, y)
print('rfe fittet')
# Integer column indices of the features the selector kept.
col_index = rfecv.get_support(indices=True)

svc start




svc fittet
rfe start
Fitting estimator with 3821 features.




Fitting estimator with 3630 features.




Fitting estimator with 3439 features.




Fitting estimator with 3248 features.




Fitting estimator with 3057 features.




Fitting estimator with 2866 features.




Fitting estimator with 2675 features.




Fitting estimator with 2484 features.




Fitting estimator with 2293 features.




Fitting estimator with 2102 features.




Fitting estimator with 1911 features.




Fitting estimator with 1720 features.




Fitting estimator with 1529 features.




Fitting estimator with 1338 features.




Fitting estimator with 1147 features.




Fitting estimator with 956 features.




Fitting estimator with 765 features.




Fitting estimator with 574 features.




Fitting estimator with 383 features.




Fitting estimator with 192 features.




Fitting estimator with 3821 features.




Fitting estimator with 3630 features.




Fitting estimator with 3439 features.




Fitting estimator with 3248 features.




Fitting estimator with 3057 features.




Fitting estimator with 2866 features.




Fitting estimator with 2675 features.




Fitting estimator with 2484 features.




Fitting estimator with 2293 features.




Fitting estimator with 2102 features.




Fitting estimator with 1911 features.




Fitting estimator with 1720 features.




Fitting estimator with 1529 features.




Fitting estimator with 1338 features.




Fitting estimator with 1147 features.




Fitting estimator with 956 features.




Fitting estimator with 765 features.




Fitting estimator with 574 features.




Fitting estimator with 383 features.




Fitting estimator with 192 features.




Fitting estimator with 3821 features.




Fitting estimator with 3630 features.




Fitting estimator with 3439 features.




Fitting estimator with 3248 features.




Fitting estimator with 3057 features.




Fitting estimator with 2866 features.




Fitting estimator with 2675 features.




Fitting estimator with 2484 features.




Fitting estimator with 2293 features.




Fitting estimator with 2102 features.




Fitting estimator with 1911 features.




Fitting estimator with 1720 features.




Fitting estimator with 1529 features.




Fitting estimator with 1338 features.




Fitting estimator with 1147 features.




Fitting estimator with 956 features.




Fitting estimator with 765 features.




Fitting estimator with 574 features.




Fitting estimator with 383 features.




Fitting estimator with 192 features.
Fitting estimator with 3821 features.




Fitting estimator with 3630 features.




Fitting estimator with 3439 features.




Fitting estimator with 3248 features.




Fitting estimator with 3057 features.




Fitting estimator with 2866 features.




Fitting estimator with 2675 features.




Fitting estimator with 2484 features.




Fitting estimator with 2293 features.




Fitting estimator with 2102 features.




Fitting estimator with 1911 features.




Fitting estimator with 1720 features.




Fitting estimator with 1529 features.




Fitting estimator with 1338 features.




Fitting estimator with 1147 features.




Fitting estimator with 956 features.




Fitting estimator with 765 features.




Fitting estimator with 574 features.




rfe fittet




In [16]:
# Restrict the training and validation matrices to the columns RFECV kept.
Xrfe, XValrfe = X[:, col_index], Xval[:, col_index]

In [17]:
# Number of selected feature indices.
col_index.shape

(383,)

In [18]:
# Names of the selected features (second-axis indexing into the 2-D `features` array).
rfe_features = features[:,col_index]
rfe_features

array([['0_22_Thumb_base_fft_kurtosis', '0_57_Thumb_base_peak_max',
        '2_50_Angle_between_thumb_and_hand_cwt_range',
        '3_25_Finger_1_base_spectral_entropy',
        '4_21_Finger_1_tip_fft_skew',
        '4_24_Finger_1_tip_spectral_centroid',
        '4_25_Finger_1_tip_spectral_entropy', '4_27_Finger_1_tip_ff2',
        '4_33_Finger_1_tip_cwt_sums_0', '4_51_Finger_1_tip_cwt_var',
        '5_1_Finger_2_base_std', '5_32_Finger_2_base_bandwith',
        '5_40_Finger_2_base_cwt_sums_7', '6_26_Finger_2_tip_ff1',
        '6_33_Finger_2_tip_cwt_sums_0', '6_44_Finger_2_tip_cwt_std',
        '6_55_Finger_2_tip_num_peaks', '7_33_Finger_3_base_cwt_sums_0',
        '8_5_Finger_3_tip_75q', '8_28_Finger_3_tip_ff3',
        '8_44_Finger_3_tip_cwt_std', '9_32_Finger_4_base_bandwith',
        '10_17_Finger_4_tip_fft_75q', '10_19_Finger_4_tip_fft_range',
        '10_20_Finger_4_tip_fft_var', '10_21_Finger_4_tip_fft_skew',
        '10_27_Finger_4_tip_ff2', '10_29_Finger_4_tip_ff4',
        '1

In [19]:
# Persist the RFECV-reduced data under the same dict keys the input pickle was read with.
with open( stats_added_base_path+'train-data-rfecv1.pkl', "wb" ) as users_pickle_file:
    pickle.dump({
        'X': Xrfe, 'y': y, 'Xval' : XValrfe, 'yval': yval, 
        'gestures' : gestures, 'headers': rfe_features
    }, users_pickle_file)

## RFE-CV with Lasso, $\alpha$ = 0.01

In [9]:
# 5-fold cross-validated recursive feature elimination (RFECV) ranked by a weakly regularised Lasso.
# NOTE(review): Lasso is a regressor, while y is scored with f1_score further down, so the
# labels appear to be class ids that are regressed on as numbers here. Confirm this is intended.
linc2 = sklearn.linear_model.Lasso(alpha = 0.01, max_iter=2000, tol=1e-3)
print('svc start')  # the log text says "svc" although the estimator is a Lasso
# NOTE(review): RFECV is documented to fit clones of its estimator, so this standalone fit
# presumably only serves as a smoke/timing check. Confirm.
linc2.fit(X, y)
print('svc fittet')
# step=0.05: the recorded log shows 191 features dropped per round.
rfecv2 = fs.RFECV(estimator=linc2, cv=5, step=0.05, verbose=2)
print('rfe start')
rfecv2.fit(X, y)
print('rfe fittet')
# Integer column indices of the features the selector kept.
col_index2 = rfecv2.get_support(indices=True)

svc start




svc fittet
rfe start
Fitting estimator with 3821 features.




Fitting estimator with 3630 features.




Fitting estimator with 3439 features.




Fitting estimator with 3248 features.




Fitting estimator with 3057 features.




Fitting estimator with 2866 features.




Fitting estimator with 2675 features.




Fitting estimator with 2484 features.




Fitting estimator with 2293 features.




Fitting estimator with 2102 features.




Fitting estimator with 1911 features.




Fitting estimator with 1720 features.




Fitting estimator with 1529 features.




Fitting estimator with 1338 features.




Fitting estimator with 1147 features.




Fitting estimator with 956 features.




Fitting estimator with 765 features.




Fitting estimator with 574 features.




Fitting estimator with 383 features.




Fitting estimator with 192 features.




Fitting estimator with 3821 features.




Fitting estimator with 3630 features.




Fitting estimator with 3439 features.




Fitting estimator with 3248 features.




Fitting estimator with 3057 features.




Fitting estimator with 2866 features.




Fitting estimator with 2675 features.




Fitting estimator with 2484 features.




Fitting estimator with 2293 features.




Fitting estimator with 2102 features.




Fitting estimator with 1911 features.




Fitting estimator with 1720 features.




Fitting estimator with 1529 features.




Fitting estimator with 1338 features.




Fitting estimator with 1147 features.




Fitting estimator with 956 features.




Fitting estimator with 765 features.




Fitting estimator with 574 features.




Fitting estimator with 383 features.




Fitting estimator with 192 features.
Fitting estimator with 3821 features.




Fitting estimator with 3630 features.




Fitting estimator with 3439 features.




Fitting estimator with 3248 features.




Fitting estimator with 3057 features.




Fitting estimator with 2866 features.




Fitting estimator with 2675 features.




Fitting estimator with 2484 features.




Fitting estimator with 2293 features.




Fitting estimator with 2102 features.




Fitting estimator with 1911 features.




Fitting estimator with 1720 features.




Fitting estimator with 1529 features.




Fitting estimator with 1338 features.




Fitting estimator with 1147 features.




Fitting estimator with 956 features.




Fitting estimator with 765 features.




Fitting estimator with 574 features.




Fitting estimator with 383 features.




Fitting estimator with 192 features.




Fitting estimator with 3821 features.




Fitting estimator with 3630 features.




Fitting estimator with 3439 features.




Fitting estimator with 3248 features.




Fitting estimator with 3057 features.




Fitting estimator with 2866 features.




Fitting estimator with 2675 features.




Fitting estimator with 2484 features.




Fitting estimator with 2293 features.




Fitting estimator with 2102 features.




Fitting estimator with 1911 features.




Fitting estimator with 1720 features.




Fitting estimator with 1529 features.




Fitting estimator with 1338 features.




Fitting estimator with 1147 features.




Fitting estimator with 956 features.




Fitting estimator with 765 features.




Fitting estimator with 574 features.




Fitting estimator with 383 features.




Fitting estimator with 192 features.




Fitting estimator with 3821 features.




Fitting estimator with 3630 features.




Fitting estimator with 3439 features.




Fitting estimator with 3248 features.




Fitting estimator with 3057 features.




Fitting estimator with 2866 features.




Fitting estimator with 2675 features.




Fitting estimator with 2484 features.




Fitting estimator with 2293 features.




Fitting estimator with 2102 features.




Fitting estimator with 1911 features.




Fitting estimator with 1720 features.




Fitting estimator with 1529 features.




Fitting estimator with 1338 features.




Fitting estimator with 1147 features.




Fitting estimator with 956 features.




Fitting estimator with 765 features.




Fitting estimator with 574 features.




Fitting estimator with 383 features.




Fitting estimator with 192 features.
Fitting estimator with 3821 features.




Fitting estimator with 3630 features.




Fitting estimator with 3439 features.




Fitting estimator with 3248 features.




Fitting estimator with 3057 features.




Fitting estimator with 2866 features.




Fitting estimator with 2675 features.




Fitting estimator with 2484 features.




Fitting estimator with 2293 features.




Fitting estimator with 2102 features.




Fitting estimator with 1911 features.




Fitting estimator with 1720 features.




Fitting estimator with 1529 features.




Fitting estimator with 1338 features.




Fitting estimator with 1147 features.




Fitting estimator with 956 features.




rfe fittet




NameError: name 'rfecv' is not defined

In [11]:
# Repeats the get_support call from the fitting cell, then restricts both matrices
# to the selected columns.
col_index2 = rfecv2.get_support(indices=True)
Xrfe = X[:,col_index2]
XValrfe = Xval[:,col_index2]

In [12]:
# Number of selected feature indices.
col_index2.shape

(765,)

In [13]:
# Names of the selected features (second-axis indexing into the 2-D `features` array).
rfe_features = features[:,col_index2]
rfe_features

array([['0_31_Thumb_base_freq_5sum', '0_46_Thumb_base_cwt_25q',
        '0_58_Thumb_base_peak_mean', '1_26_Thumb_pressure_ff1',
        '2_1_Angle_between_thumb_and_hand_std',
        '2_12_Angle_between_thumb_and_hand_fft_mean',
        '2_17_Angle_between_thumb_and_hand_fft_75q',
        '2_23_Angle_between_thumb_and_hand_fft_mode',
        '2_24_Angle_between_thumb_and_hand_spectral_centroid',
        '2_27_Angle_between_thumb_and_hand_ff2',
        '2_32_Angle_between_thumb_and_hand_bandwith',
        '2_38_Angle_between_thumb_and_hand_cwt_sums_5',
        '2_43_Angle_between_thumb_and_hand_cwt_mean',
        '2_44_Angle_between_thumb_and_hand_cwt_std',
        '2_45_Angle_between_thumb_and_hand_cwt_min',
        '2_50_Angle_between_thumb_and_hand_cwt_range',
        '2_55_Angle_between_thumb_and_hand_num_peaks',
        '2_57_Angle_between_thumb_and_hand_peak_max',
        '2_58_Angle_between_thumb_and_hand_peak_mean',
        '3_4_Finger_1_base_median', '3_5_Finger_1_base_75q',
 

In [14]:
# Persist the reduced data under the same dict keys the input pickle was read with.
# NOTE(review): this file name ends in '.pk', whereas the earlier dumps in this notebook
# use '.pkl'. Confirm which suffix downstream loaders expect.
with open( stats_added_base_path+'train-data-rfecv2.pk', "wb" ) as users_pickle_file:
    pickle.dump({
        'X': Xrfe, 'y': y, 'Xval' : XValrfe, 'yval': yval, 
        'gestures' : gestures, 'headers': rfe_features
    }, users_pickle_file)

## RFE-CV with Lasso, $\alpha$ = 0.5

In [16]:
# 5-fold cross-validated recursive feature elimination (RFECV) ranked by a Lasso with alpha = 0.5.
# NOTE(review): Lasso is a regressor, while y is scored with f1_score further down, so the
# labels appear to be class ids that are regressed on as numbers here. Confirm this is intended.
linc3 = sklearn.linear_model.Lasso(alpha = 0.5, max_iter=2000, tol=1e-3)
print('svc start')  # the log text says "svc" although the estimator is a Lasso
# NOTE(review): RFECV is documented to fit clones of its estimator, so this standalone fit
# presumably only serves as a smoke/timing check. Confirm.
linc3.fit(X, y)
print('svc fittet')
# step=0.05: the recorded log shows 191 features dropped per round.
rfecv3 = fs.RFECV(estimator=linc3, cv=5, step=0.05, verbose=2)
print('rfe start')
rfecv3.fit(X, y)
print('rfe fittet')
# Integer column indices of the features the selector kept.
col_index3 = rfecv3.get_support(indices=True)

svc start
svc fittet
rfe start
Fitting estimator with 3821 features.
Fitting estimator with 3630 features.
Fitting estimator with 3439 features.
Fitting estimator with 3248 features.
Fitting estimator with 3057 features.
Fitting estimator with 2866 features.
Fitting estimator with 2675 features.
Fitting estimator with 2484 features.
Fitting estimator with 2293 features.
Fitting estimator with 2102 features.
Fitting estimator with 1911 features.
Fitting estimator with 1720 features.
Fitting estimator with 1529 features.
Fitting estimator with 1338 features.
Fitting estimator with 1147 features.
Fitting estimator with 956 features.
Fitting estimator with 765 features.
Fitting estimator with 574 features.
Fitting estimator with 383 features.
Fitting estimator with 192 features.
Fitting estimator with 3821 features.
Fitting estimator with 3630 features.
Fitting estimator with 3439 features.
Fitting estimator with 3248 features.
Fitting estimator with 3057 features.
Fitting estimator with 2

In [22]:
# Repeats the get_support call from the fitting cell, then restricts both matrices
# to the selected columns.
# NOTE(review): the recorded output of this cell is a NotFittedError, i.e. rfecv3 was
# not fitted when this cell was executed. Re-run after the fit has completed.
col_index3 = rfecv3.get_support(indices=True)
Xrfe = X[:,col_index3]
XValrfe = Xval[:,col_index3]

NotFittedError: This RFECV instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [None]:
# Number of selected feature indices (this cell has no recorded execution).
col_index3.shape

In [18]:
# Names of the selected features (second-axis indexing into the 2-D `features` array).
rfe_features = features[:,col_index3]
rfe_features

array([['4_21_Finger_1_tip_fft_skew', '4_27_Finger_1_tip_ff2',
        '6_26_Finger_2_tip_ff1', '7_1_Finger_3_base_std',
        '7_2_Finger_3_base_min', '8_5_Finger_3_tip_75q',
        '8_19_Finger_3_tip_fft_range', '8_44_Finger_3_tip_cwt_std',
        '9_32_Finger_4_base_bandwith', '9_33_Finger_4_base_cwt_sums_0',
        '10_17_Finger_4_tip_fft_75q', '10_19_Finger_4_tip_fft_range',
        '10_20_Finger_4_tip_fft_var', '10_27_Finger_4_tip_ff2',
        '10_46_Finger_4_tip_cwt_25q', '10_56_Finger_4_tip_peak_min',
        '11_4_Thumb_tip_median', '11_5_Thumb_tip_75q',
        '12_26_Finger_1_pressure_ff1', '12_49_Finger_1_pressure_cwt_max',
        '16_10_Wrist_extension_kurtosis',
        '16_12_Wrist_extension_fft_mean',
        '16_14_Wrist_extension_fft_min', '16_17_Wrist_extension_fft_75q',
        '16_19_Wrist_extension_fft_range',
        '16_20_Wrist_extension_fft_var',
        '16_21_Wrist_extension_fft_skew', '16_26_Wrist_extension_ff1',
        '16_33_Wrist_extension_cwt_su

In [20]:
# Persist the reduced data under the same dict keys the input pickle was read with.
# NOTE(review): this file name ends in '.pk', whereas the first two dumps in this notebook
# use '.pkl'. Confirm which suffix downstream loaders expect.
with open( stats_added_base_path+'train-data-rfecv3.pk', "wb" ) as users_pickle_file:
    pickle.dump({
        'X': Xrfe, 'y': y, 'Xval' : XValrfe, 'yval': yval, 
        'gestures' : gestures, 'headers': rfe_features
    }, users_pickle_file)

## RFE-CV with Lasso, $\alpha$ = 0.9

In [9]:
# 5-fold cross-validated recursive feature elimination (RFECV) ranked by a Lasso with alpha = 0.9.
# This cell reuses the names linc3 / rfecv3 / col_index3 from the alpha = 0.5 section and
# overwrites those objects. It also uses tol=1e-4, whereas the alpha = 0.5 cell uses tol=1e-3.
# NOTE(review): Lasso is a regressor, while y is scored with f1_score further down, so the
# labels appear to be class ids that are regressed on as numbers here. Confirm this is intended.
linc3 = sklearn.linear_model.Lasso(alpha = 0.9, max_iter=2000, tol=1e-4)
print('svc start')  # the log text says "svc" although the estimator is a Lasso
# NOTE(review): RFECV is documented to fit clones of its estimator, so this standalone fit
# presumably only serves as a smoke/timing check. Confirm.
linc3.fit(X, y)
print('svc fittet')
# step=0.05: the recorded log shows 191 features dropped per round.
rfecv3 = fs.RFECV(estimator=linc3, cv=5, step=0.05, verbose=2)
print('rfe start')
rfecv3.fit(X, y)
print('rfe fittet')
# Integer column indices of the features the selector kept.
col_index3 = rfecv3.get_support(indices=True)

svc start
svc fittet
rfe start
Fitting estimator with 3821 features.
Fitting estimator with 3630 features.
Fitting estimator with 3439 features.
Fitting estimator with 3248 features.
Fitting estimator with 2866 features.
Fitting estimator with 2675 features.
Fitting estimator with 2484 features.
Fitting estimator with 2293 features.
Fitting estimator with 2102 features.
Fitting estimator with 1911 features.
Fitting estimator with 1720 features.
Fitting estimator with 1529 features.
Fitting estimator with 1338 features.
Fitting estimator with 1147 features.
Fitting estimator with 956 features.
Fitting estimator with 765 features.
Fitting estimator with 574 features.
Fitting estimator with 383 features.
Fitting estimator with 192 features.
Fitting estimator with 3821 features.
Fitting estimator with 3630 features.
Fitting estimator with 3439 features.
Fitting estimator with 3248 features.
Fitting estimator with 3057 features.
Fitting estimator with 2866 features.
Fitting estimator with 2



Fitting estimator with 3630 features.




Fitting estimator with 3439 features.




Fitting estimator with 3248 features.




Fitting estimator with 3057 features.




Fitting estimator with 2866 features.




Fitting estimator with 2675 features.




Fitting estimator with 2484 features.




Fitting estimator with 2293 features.




Fitting estimator with 2102 features.




Fitting estimator with 1911 features.




Fitting estimator with 1720 features.




Fitting estimator with 1529 features.




Fitting estimator with 1338 features.




Fitting estimator with 1147 features.




Fitting estimator with 956 features.




Fitting estimator with 765 features.




Fitting estimator with 574 features.




Fitting estimator with 383 features.




Fitting estimator with 192 features.




Fitting estimator with 3821 features.




Fitting estimator with 3630 features.




Fitting estimator with 3439 features.




Fitting estimator with 3248 features.




Fitting estimator with 3057 features.




Fitting estimator with 2866 features.




Fitting estimator with 2675 features.




Fitting estimator with 2484 features.




Fitting estimator with 2293 features.




Fitting estimator with 2102 features.




Fitting estimator with 1911 features.




Fitting estimator with 1720 features.




Fitting estimator with 1529 features.




Fitting estimator with 1338 features.




Fitting estimator with 1147 features.




Fitting estimator with 956 features.




Fitting estimator with 765 features.




Fitting estimator with 574 features.




Fitting estimator with 383 features.




Fitting estimator with 192 features.




Fitting estimator with 3821 features.
Fitting estimator with 3630 features.
Fitting estimator with 3439 features.
Fitting estimator with 3248 features.
Fitting estimator with 3057 features.
Fitting estimator with 2866 features.
Fitting estimator with 2675 features.
Fitting estimator with 2484 features.
Fitting estimator with 2293 features.
Fitting estimator with 2102 features.
Fitting estimator with 1911 features.
Fitting estimator with 1720 features.
Fitting estimator with 1529 features.
Fitting estimator with 1338 features.
Fitting estimator with 1147 features.
Fitting estimator with 956 features.
Fitting estimator with 765 features.
Fitting estimator with 574 features.
Fitting estimator with 383 features.
Fitting estimator with 192 features.
Fitting estimator with 3821 features.
Fitting estimator with 3630 features.
Fitting estimator with 3439 features.
Fitting estimator with 3248 features.
Fitting estimator with 3057 features.
Fitting estimator with 2866 features.
Fitting estimator

In [10]:
# Repeats the get_support call from the fitting cell, then restricts both matrices
# to the selected columns.
col_index3 = rfecv3.get_support(indices=True)
Xrfe = X[:,col_index3]
XValrfe = Xval[:,col_index3]

In [11]:
# Number of selected feature indices.
col_index3.shape

(192,)

In [12]:
# Names of the selected features (second-axis indexing into the 2-D `features` array).
rfe_features = features[:,col_index3]
rfe_features

array([['4_21_Finger_1_tip_fft_skew', '6_48_Finger_2_tip_cwt_75q',
        '7_1_Finger_3_base_std', '7_2_Finger_3_base_min',
        '8_5_Finger_3_tip_75q', '8_19_Finger_3_tip_fft_range',
        '8_44_Finger_3_tip_cwt_std', '9_33_Finger_4_base_cwt_sums_0',
        '10_17_Finger_4_tip_fft_75q', '10_19_Finger_4_tip_fft_range',
        '10_27_Finger_4_tip_ff2', '10_46_Finger_4_tip_cwt_25q',
        '10_56_Finger_4_tip_peak_min', '11_4_Thumb_tip_median',
        '12_26_Finger_1_pressure_ff1', '12_49_Finger_1_pressure_cwt_max',
        '16_10_Wrist_extension_kurtosis',
        '16_14_Wrist_extension_fft_min', '16_17_Wrist_extension_fft_75q',
        '16_19_Wrist_extension_fft_range',
        '16_21_Wrist_extension_fft_skew',
        '16_33_Wrist_extension_cwt_sums_0', '17_9_Wrist_flexion_skew',
        '17_18_Wrist_flexion_fft_max',
        '17_22_Wrist_flexion_fft_kurtosis', '17_29_Wrist_flexion_ff4',
        '17_32_Wrist_flexion_bandwith', '17_36_Wrist_flexion_cwt_sums_3',
        '17_38

In [13]:
# Persist the reduced data under the same dict keys the input pickle was read with.
# The file is numbered 'rfecv4' although the selector variables in this section are named *3.
# NOTE(review): this file name ends in '.pk', whereas the first two dumps in this notebook
# use '.pkl'. Confirm which suffix downstream loaders expect.
with open( stats_added_base_path+'train-data-rfecv4.pk', "wb" ) as users_pickle_file:
    pickle.dump({
        'X': Xrfe, 'y': y, 'Xval' : XValrfe, 'yval': yval, 
        'gestures' : gestures, 'headers': rfe_features
    }, users_pickle_file)

# Try all the classifiers for all reduced data sets

## RFE with Lasso, $\alpha$ = 0.1

In [5]:
# Reload the RFE-reduced dataset from disk and unpack it into the working names.
with open(stats_added_base_path + 'train-data-rfe.pkl', "rb") as users_pickle_file:
    data = pickle.load(users_pickle_file)
Xrfe, y = data['X'], data['y']
XValrfe, yval = data['Xval'], data['yval']
gestures, rfe_features = data['gestures'], data['headers']

In [6]:
# Drop the loaded dict, then rebind `data` to an empty frame, collecting garbage after each step.
del data
gc.collect()
data = pd.DataFrame()
gc.collect()

0

In [7]:
# Candidate classifiers as (display name, unfitted estimator) pairs.
# Start with a broad list and remove entries whose training takes too long.
classifiers = [
    # verbose is left off for the two linear models: too much output
    ("passive agressive", lm.PassiveAggressiveClassifier(max_iter=1000, tol=1e-3, class_weight='balanced', n_jobs=3)),
    ("sgd[hinge]", lm.SGDClassifier(loss="hinge", penalty="l2", max_iter=1000, tol=1e-3, class_weight='balanced', n_jobs=3)),
    ("LDA", da.LinearDiscriminantAnalysis()),
    ("QDA", da.QuadraticDiscriminantAnalysis()),
    # NearestCentroid is taken from the public sklearn.neighbors namespace (alias `ne`)
    # rather than the private sklearn.neighbors.nearest_centroid module path.
    ("nn", ne.NearestCentroid()),
    ("gaussian navie bayse", nb.GaussianNB()),
    # Author's note: timed out. NOTE(review): the recorded 1910-feature run did produce
    # a score for this entry, so the note may refer to a larger feature set.
    ("decision tree", tree.DecisionTreeClassifier(class_weight='balanced')),
    ("random forrest", em.RandomForestClassifier(n_estimators=10, class_weight='balanced', n_jobs=3, verbose=True)),
    ("extra trees", em.ExtraTreesClassifier(n_estimators=10, class_weight='balanced', n_jobs=3, verbose=True)),
    ("ada boost", em.AdaBoostClassifier(n_estimators=100)),
]

In [8]:
def fit_classifier(classifiers, X, y, Xval, yval):
    """Fit a fresh clone of every candidate classifier and score it.

    Parameters
    ----------
    classifiers : iterable of (str, estimator)
        Display name and estimator template. The template itself is never
        fitted; a clone is fitted instead, so the list can be reused.
    X, y
        Training samples and labels.
    Xval, yval
        Validation samples and labels.

    Returns
    -------
    list of tuple
        One ``(name, val_f1, train_f1, confusion_matrix, fitted_clone)``
        entry per classifier, in input order. Both F1 scores are
        micro-averaged; the confusion matrix is computed on the validation
        predictions.
    """
    results = []
    progress = tqdm.tqdm_notebook(classifiers)
    for clf_name, clf_template in progress:
        progress.set_description(clf_name)
        clf_candidate = clone(clf_template)
        gc.collect()
        progress.set_description(clf_name + ' cloned')
        clf_candidate.fit(X, y)
        progress.set_description(clf_name + ' fittet')

        # Score on the held-out validation data ...
        p = clf_candidate.predict(Xval)
        s = f1_score(yval, p, average='micro')
        # ... and on the training data itself, to expose overfitting.
        p_t = clf_candidate.predict(X)
        s_t = f1_score(y, p_t, average='micro')
        # The second number is the training score (it was mislabelled "test").
        print(f"val score {s} for {clf_name} (train: {s_t})")

        cmatrix = metr.confusion_matrix(yval, p)
        results.append((clf_name, s, s_t, cmatrix, clf_candidate))
        gc.collect()

    return results

In [9]:
# Evaluate every candidate on the RFE-reduced data (1910 features).
gc.collect()
results = fit_classifier(classifiers, X=Xrfe, y=y, Xval=XValrfe, yval=yval)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

val score 0.3289686388619463 for passive agressive (test: 0.429854400206159)
val score 0.06983511154219205 for sgd[hinge] (test: 0.25370441953356526)




val score 0.9335596508244423 for LDA (test: 0.9300347893312717)




val score 0.6058842547688329 for QDA (test: 0.9105914186316196)
val score 0.05835758163595215 for nn (test: 0.5099858265687411)
val score 0.37714193339799545 for gaussian navie bayse (test: 0.590310526993944)
val score 0.9164241836404785 for decision tree (test: 1.0)


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:   14.1s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.3s finished


val score 0.965082444228904 for random forrest (test: 0.9976034016235021)


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    2.1s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.2s finished


val score 0.9624959586162302 for extra trees (test: 1.0)
val score 0.16359521500161656 for ada boost (test: 0.5719237211699523)



In [10]:
# Store the per-classifier results (scores, confusion matrices, fitted models).
with open(stats_added_base_path + 'rfe-a01cls.pkl', "wb") as users_pickle_file:
    pickle.dump(results, users_pickle_file)

## RFE-CV with Lasso, $\alpha$ = 0.1

In [11]:
# Read the pickled, feature-reduced data set back in.
# pickle.load can execute arbitrary code: only open files written by this project.
with open(stats_added_base_path + 'train-data-rfecv1.pkl', "rb") as users_pickle_file:
    data = pickle.load(users_pickle_file)
# Unpack the dict into the names used by the following cells.
Xrfe, y = data['X'], data['y']
XValrfe, yval = data['Xval'], data['yval']
gestures, rfe_features = data['gestures'], data['headers']

In [12]:
# Shape of the kept-feature header array (presumably 1 x number of selected features).
np.shape(rfe_features)

(1, 383)

In [13]:
# Drop the reference to the loaded dict and collect garbage, then rebind
# `data` to an empty DataFrame placeholder and collect once more.
del data
gc.collect()
data = pd.DataFrame()
gc.collect()

0

In [14]:
# Candidate classifiers as (display name, unfitted estimator) pairs.
# Start with a broad list and remove entries when a classifier takes too long.
classifiers = [
    (
        "passive agressive",
        lm.PassiveAggressiveClassifier(
            max_iter=1000, tol=1e-3, class_weight='balanced', n_jobs=3
        ),
    ),
    (
        "sgd[hinge]",
        lm.SGDClassifier(
            loss="hinge", penalty="l2", max_iter=1000, tol=1e-3,
            class_weight='balanced', n_jobs=3
        ),
    ),
    ("LDA", da.LinearDiscriminantAnalysis()),
    ("QDA", da.QuadraticDiscriminantAnalysis()),
    ("nn", nc.NearestCentroid()),
    ("knn", ne.KNeighborsClassifier(n_jobs=3)),
    ("gaussian navie bayse", nb.GaussianNB()),
    # original note on the decision tree: timed out
    ("decision tree", tree.DecisionTreeClassifier(class_weight='balanced')),
    (
        "random forrest",
        em.RandomForestClassifier(
            n_estimators=10, class_weight='balanced', n_jobs=3, verbose=True
        ),
    ),
    (
        "extra trees",
        em.ExtraTreesClassifier(
            n_estimators=10, class_weight='balanced', n_jobs=3, verbose=True
        ),
    ),
    ("ada boost", em.AdaBoostClassifier(n_estimators=100)),
    # original note on gradient boost: timed out, after 1h of running it
    # still reported 400 open
    (
        "gradient boost",
        em.GradientBoostingClassifier(
            n_estimators=100, learning_rate=1.0, max_depth=1,
            random_state=0, verbose=True
        ),
    ),
]

In [15]:
# Evaluate every candidate on the currently loaded reduced data set.
gc.collect()
results = fit_classifier(classifiers, X=Xrfe, y=y, Xval=XValrfe, yval=yval)

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

val score 0.8640478499838344 for passive agressive (test: 0.8402138899626337)
val score 0.8679275784028452 for sgd[hinge] (test: 0.8898853240561784)
val score 0.860491432266408 for LDA (test: 0.87862388867414)




val score 0.9547365017782089 for QDA (test: 0.9499935575312459)
val score 0.31037827352085356 for nn (test: 0.5547738693467337)
val score 0.8642095053346266 for knn (test: 0.9319546450199716)
val score 0.6665050113158746 for gaussian navie bayse (test: 0.7584589614740368)
val score 0.9212738441642419 for decision tree (test: 1.0)


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    6.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.2s finished


val score 0.9542515357258325 for random forrest (test: 0.997319932998325)


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    1.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.2s finished


val score 0.9553831231813773 for extra trees (test: 1.0)
val score 0.08098933074684772 for ada boost (test: 0.5457286432160804)
      Iter       Train Loss   Remaining Time 
         1 148777134548465481356780213633024.0000           28.50m
         2 148777134548465481356780213633024.0000           29.83m
         3 148777134548465481356780213633024.0000           30.00m
         4 148777134548465481356780213633024.0000           29.77m
         5 148777134548465481356780213633024.0000           29.55m
         6 148777134548465481356780213633024.0000           29.29m
         7 148777134548465481356780213633024.0000           29.04m
         8 148777134548465481356780213633024.0000           28.81m
         9 148777134548465481356780213633024.0000           28.52m
        10 148777134548465481356780213633024.0000           28.27m
        20 148777134548465481356780213633024.0000           25.25m
        30 148777134548465481356780213633024.0000           22.04m
        40 14877713454

In [16]:
# Store the per-classifier results (scores, confusion matrices, fitted models).
with open(stats_added_base_path + 'rfecv-a01cls.pkl', "wb") as users_pickle_file:
    pickle.dump(results, users_pickle_file)

## RFE-CV with Lasso, $\alpha$ = 0.01

In [17]:
# Read the pickled, feature-reduced data set back in (file name ends in '.pk').
# pickle.load can execute arbitrary code: only open files written by this project.
with open(stats_added_base_path + 'train-data-rfecv2.pk', "rb") as users_pickle_file:
    data = pickle.load(users_pickle_file)
# Unpack the dict into the names used by the following cells.
Xrfe, y = data['X'], data['y']
XValrfe, yval = data['Xval'], data['yval']
gestures, rfe_features = data['gestures'], data['headers']

In [18]:
# Shape of the kept-feature header array (presumably 1 x number of selected features).
np.shape(rfe_features)

(1, 765)

In [19]:
# Drop the reference to the loaded dict and collect garbage, then rebind
# `data` to an empty DataFrame placeholder and collect once more.
del data
gc.collect()
data = pd.DataFrame()
gc.collect()

0

In [20]:
# Evaluate every candidate on the currently loaded reduced data set.
gc.collect()
results = fit_classifier(classifiers, X=Xrfe, y=y, Xval=XValrfe, yval=yval)

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

val score 0.9335596508244423 for passive agressive (test: 0.9095735085684834)
val score 0.9085030714516651 for sgd[hinge] (test: 0.9169050380105657)




val score 0.918525703200776 for LDA (test: 0.918683159386677)




val score 0.9159392175881022 for QDA (test: 0.7456513335910321)
val score 0.24264468153895893 for nn (test: 0.5305244169565778)
val score 0.9398642095053347 for knn (test: 0.9577116350985698)
val score 0.6753960556094407 for gaussian navie bayse (test: 0.7645019971653138)
val score 0.9295182670546395 for decision tree (test: 1.0)


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    8.8s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.2s finished


val score 0.9569996766892984 for random forrest (test: 0.9975132070609457)


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    1.4s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.2s finished


val score 0.9591011962495959 for extra trees (test: 1.0)
val score 0.06417717426446816 for ada boost (test: 0.5249194691405746)
      Iter       Train Loss   Remaining Time 
         1 156526536824715692912342015105701964652319330845701401737519169536.0000           54.24m
         2 156526536824715692912342015105701964652319330845701401737519169536.0000           55.46m
         3 156526536824715692912342015105701964652319330845701401737519169536.0000           55.14m
         4 156526536824715692912342015105701964652319330845701401737519169536.0000           54.67m
         5 156526536824715692912342015105701964652319330845701401737519169536.0000           54.16m
         6 156526536824715692912342015105701964652319330845701401737519169536.0000           53.60m
         7 156526536824715692912342015105701964652319330845701401737519169536.0000           53.18m
         8 156526536824715692912342015105701964652319330845701401737519169536.0000           52.64m
         9 156526536824715

In [21]:
# Store the per-classifier results (scores, confusion matrices, fitted models).
with open(stats_added_base_path + 'rfecv-a001cls.pkl', "wb") as users_pickle_file:
    pickle.dump(results, users_pickle_file)

## RFE-CV with Lasso, $\alpha$ = 0.5

In [22]:
# Read the pickled, feature-reduced data set back in (file name ends in '.pk').
# pickle.load can execute arbitrary code: only open files written by this project.
with open(stats_added_base_path + 'train-data-rfecv3.pk', "rb") as users_pickle_file:
    data = pickle.load(users_pickle_file)
# Unpack the dict into the names used by the following cells.
Xrfe, y = data['X'], data['y']
XValrfe, yval = data['Xval'], data['yval']
gestures, rfe_features = data['gestures'], data['headers']

In [23]:
# Shape of the kept-feature header array (presumably 1 x number of selected features).
np.shape(rfe_features)

(1, 192)

In [24]:
# Drop the reference to the loaded dict and collect garbage, then rebind
# `data` to an empty DataFrame placeholder and collect once more.
del data
gc.collect()
data = pd.DataFrame()
gc.collect()

0

In [25]:
# Evaluate every candidate on the currently loaded reduced data set.
gc.collect()
results = fit_classifier(classifiers, X=Xrfe, y=y, Xval=XValrfe, yval=yval)

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

val score 0.5640155189136761 for passive agressive (test: 0.65095992784435)
val score 0.3942774005819593 for sgd[hinge] (test: 0.5442082205901302)




val score 0.7620433236340123 for LDA (test: 0.8215565004509727)




val score 0.871968962172648 for QDA (test: 0.6622084782888803)
val score 0.11024894924021987 for nn (test: 0.08577502899110939)
val score 0.5918202392499192 for knn (test: 0.8134389898208993)
val score 0.6138053669576463 for gaussian navie bayse (test: 0.7314907872696819)
val score 0.9139993533785968 for decision tree (test: 1.0)


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    4.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.


val score 0.9474620109925639 for random forrest (test: 0.9972555083107847)


[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.8s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.2s finished


val score 0.9429356611703847 for extra trees (test: 1.0)
val score 0.10006466214031684 for ada boost (test: 0.5253833268908645)
      Iter       Train Loss   Remaining Time 
         1 6602015407899320117018377891688703017680184815464363796367646607254270465528076589142113573227195027418678670216657366941696.0000           16.46m
         2 6602015407899320117018377891688703017680184815464363796367646607254270465528076589142113573227195027418678670216657366941696.0000           17.46m
         3 6602015407899320117018377891688703017680184815464363796367646607254270465528076589142113573227195027418678670216657366941696.0000           17.70m
         4 6602015407899320117018377891688703017680184815464363796367646607254270465528076589142113573227195027418678670216657366941696.0000           17.79m
         5 6602015407899320117018377891688703017680184815464363796367646607254270465528076589142113573227195027418678670216657366941696.0000           17.69m
         6 6602015407899320117018377

In [26]:
# Store the per-classifier results (scores, confusion matrices, fitted models).
with open(stats_added_base_path + 'rfecv-a05cls.pkl', "wb") as users_pickle_file:
    pickle.dump(results, users_pickle_file)

## RFE-CV with Lasso, $\alpha$ = 0.9

In [27]:
# Read the pickled, feature-reduced data set back in (file name ends in '.pk').
# pickle.load can execute arbitrary code: only open files written by this project.
with open(stats_added_base_path + 'train-data-rfecv4.pk', "rb") as users_pickle_file:
    data = pickle.load(users_pickle_file)
# Unpack the dict into the names used by the following cells.
Xrfe, y = data['X'], data['y']
XValrfe, yval = data['Xval'], data['yval']
gestures, rfe_features = data['gestures'], data['headers']

In [28]:
# Shape of the kept-feature header array (presumably 1 x number of selected features).
np.shape(rfe_features)

(1, 192)

In [29]:
# Drop the reference to the loaded dict and collect garbage, then rebind
# `data` to an empty DataFrame placeholder and collect once more.
del data
gc.collect()
data = pd.DataFrame()
gc.collect()

0

In [30]:
# Evaluate every candidate on the currently loaded reduced data set.
gc.collect()
results = fit_classifier(classifiers, X=Xrfe, y=y, Xval=XValrfe, yval=yval)

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

val score 0.14807630132557387 for passive agressive (test: 0.3236954000773096)
val score 0.06902683478823149 for sgd[hinge] (test: 0.2505604947816003)




val score 0.6768509537665697 for LDA (test: 0.7818708929261693)




val score 0.7586485612673779 for QDA (test: 0.6123566550702229)
val score 0.06805690268347882 for nn (test: 0.5228707640767942)
val score 0.3027804720336243 for knn (test: 0.6806339389253963)
val score 0.3992887164565147 for gaussian navie bayse (test: 0.35454194047158877)
val score 0.8937924345295828 for decision tree (test: 1.0)


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    4.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.


val score 0.9367927578402845 for random forrest (test: 0.9970364643731477)


[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.9s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.2s finished


val score 0.9311348205625606 for extra trees (test: 1.0)
val score 0.11994827028774653 for ada boost (test: 0.5564617961602886)
      Iter       Train Loss   Remaining Time 
         1 313676269965149321367281407589102590509028946418998141834681898922597290180135658221980801621160886272.0000           15.46m
         2 313676269965149321367281407589102590509028946418998141834681898922597290180135658221980801621160886272.0000           15.56m
         3 313676269965149321367281407589102590509028946418998141834681898922597290180135658221980801621160886272.0000           15.41m
         4 313676269965149321367281407589102590509028946418998141834681898922597290180135658221980801621160886272.0000           15.41m
         5 313676269965149321367281407589102590509028946418998141834681898922597290180135658221980801621160886272.0000           15.26m
         6 313676269965149321367281407589102590509028946418998141834681898922597290180135658221980801621160886272.0000           15.07m
         7

In [31]:
# Store the per-classifier results (scores, confusion matrices, fitted models).
with open(stats_added_base_path + 'rfecv-a09cls.pkl', "wb") as users_pickle_file:
    pickle.dump(results, users_pickle_file)