In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [4]:
data = pd.read_csv('data.csv')
data.head(2)

Unnamed: 0.1,Unnamed: 0,no_strokes_st,no_strokes_dy,speed_st,speed_dy,magnitude_vel_st,magnitude_horz_vel_st,magnitude_vert_vel_st,magnitude_vel_dy,magnitude_horz_vel_dy,...,magnitude_horz_jerk_dy,magnitude_vert_jerk_dy,ncv_st,ncv_dy,nca_st,nca_dy,in_air_stcp,on_surface_st,on_surface_dy,target
0,0,12.0,2.0,0.000293,0.000431,0.061342,0.038319,0.03905,0.084891,0.053885,...,6e-06,6e-06,185.25,412.857143,61.833333,470.0,0.0,3678.0,4852.0,1.0
1,1,4.0,6.0,0.000286,0.000281,0.119159,0.077012,0.074216,0.160497,0.10136,...,1e-05,9e-06,192.777778,173.875,102.5,54.0,0.0,1688.0,1587.0,1.0


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target', 'Unnamed: 0'], axis=1),
    data['target'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((53, 29), (24, 29))

In [6]:
sfs1 = SFS(RandomForestClassifier(n_jobs=4), 
           k_features=10, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='roc_auc',
           cv=3)

sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  29 out of  29 | elapsed:   18.4s finished

[2021-07-08 22:10:52] Features: 1/10 -- score: 0.943452380952381[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  28 out of  28 | elapsed:   12.0s finished

[2021-07-08 22:11:04] Features: 2/10 -- score: 0.9672619047619048[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   12.5s finished

[2021-07-08 22:11:16] Features: 3/10 -- score: 0.9702380952380952[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   

In [13]:
selected_feat= X_train.columns[list(sfs1.k_feature_idx_)]
selected_feat

Index(['no_strokes_dy', 'speed_st', 'speed_dy', 'magnitude_horz_vel_dy',
       'magnitude_vert_acc_dy', 'magnitude_vert_jerk_st', 'magnitude_jerk_dy',
       'ncv_dy', 'nca_st', 'in_air_stcp'],
      dtype='object')

In [14]:
sfs1.k_score_

0.9751984126984127

In [19]:
pd.DataFrame.from_dict(sfs1.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(15,)","[0.9375, 0.9642857142857143, 0.9285714285714286]",0.943452,"(15,)",0.0341502,0.0151757,0.0107308
2,"(1, 15)","[0.9375, 0.9642857142857143, 1.0]",0.967262,"(1, 15)",0.0576133,0.0256022,0.0181035
3,"(1, 15, 26)","[0.9821428571428572, 0.9285714285714286, 1.0]",0.970238,"(1, 15, 26)",0.0683004,0.0303513,0.0214616
4,"(1, 15, 19, 26)","[1.0, 1.0, 0.9047619047619049]",0.968254,"(1, 15, 19, 26)",0.10103,0.0448957,0.031746
5,"(1, 15, 19, 23, 26)","[0.9821428571428572, 0.9821428571428572, 1.0]",0.988095,"(1, 15, 19, 23, 26)",0.0189431,0.00841794,0.00595238
6,"(1, 3, 15, 19, 23, 26)","[1.0, 0.9821428571428572, 0.9761904761904763]",0.986111,"(1, 3, 15, 19, 23, 26)",0.0227668,0.0101171,0.00715387
7,"(1, 3, 15, 18, 19, 23, 26)","[1.0, 1.0, 0.9642857142857143]",0.988095,"(1, 3, 15, 18, 19, 23, 26)",0.0378863,0.0168359,0.0119048
8,"(1, 3, 8, 15, 18, 19, 23, 26)","[1.0, 0.9642857142857143, 0.9880952380952381]",0.984127,"(1, 3, 8, 15, 18, 19, 23, 26)",0.0334125,0.0148478,0.010499
9,"(1, 3, 8, 15, 18, 19, 23, 24, 26)","[1.0, 1.0, 0.9404761904761905]",0.980159,"(1, 3, 8, 15, 18, 19, 23, 24, 26)",0.0631438,0.0280598,0.0198413
10,"(1, 2, 3, 8, 15, 18, 19, 23, 24, 26)","[1.0, 0.9732142857142857, 0.9523809523809524]",0.975198,"(1, 2, 3, 8, 15, 18, 19, 23, 24, 26)",0.0438611,0.019491,0.0137822


In [20]:
sfs1 = SFS(RandomForestClassifier(n_jobs=4), 
           k_features=(1,26), 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='roc_auc',
           cv=3)

sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  29 out of  29 | elapsed:   22.4s finished

[2021-07-08 22:30:58] Features: 1/26 -- score: 0.9315476190476191[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  28 out of  28 | elapsed:   12.8s finished

[2021-07-08 22:31:11] Features: 2/26 -- score: 0.9642857142857143[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   11.4s finished

[2021-07-08 22:31:22] Features: 3/26 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 |

In [21]:
sfs1.k_score_

1.0

In [22]:
selected_feat= X_train.columns[list(sfs1.k_feature_idx_)]
selected_feat

Index(['magnitude_vert_acc_dy', 'magnitude_jerk_dy', 'in_air_stcp'], dtype='object')

In [25]:
features=data[['magnitude_vert_acc_dy','magnitude_jerk_dy','in_air_stcp',]]
features.head(1)

Unnamed: 0,magnitude_vert_acc_dy,magnitude_jerk_dy,in_air_stcp
0,0.000282,1e-05,0.0


In [27]:
y=data[['target']]


In [28]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler((-1,1))
x=scaler.fit_transform(features)

In [29]:
x_train,x_test,y_train,y_test=train_test_split(x, y, test_size=0.2, random_state=5)

In [32]:
from sklearn.metrics import accuracy_score,mean_squared_error
model=RandomForestClassifier()
model.fit(x_train,y_train)

RandomForestClassifier()

In [33]:
y_predtr=model.predict(x_train)
print(accuracy_score(y_train,y_predtr)*100)

100.0


In [34]:
y_pred=model.predict(x_test)
print(accuracy_score(y_test, y_pred)*100)

93.75
