In [92]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [93]:
data = pd.read_csv('data.csv')
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = list(data.select_dtypes(include=numerics).columns)
data = data[numerical_vars]

In [94]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target', 'Unnamed: 0'], axis=1),
    data['target'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((28, 29), (12, 29))

In [95]:
# find and remove correlated features
# in order to reduce the feature space a bit
# so that the algorithm takes shorter

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  17


In [96]:
# removed correlated  features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((28, 12), (12, 12))

In [97]:
# step forward feature selection
# Select 10 features based on optimal ROC_AUC scoring criteria

sfs1 = SFS(RandomForestClassifier(n_jobs=4), 
           k_features=10, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='roc_auc',
           cv=3)

sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    8.5s finished

[2021-02-10 14:07:26] Features: 1/10 -- score: 0.9837962962962964[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    5.0s finished

[2021-02-10 14:07:31] Features: 2/10 -- score: 0.976851851851852[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.5s finished

[2021-02-10 14:07:36] Features: 3/10 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | 

In [98]:
selected_feat= X_train.columns[list(sfs1.k_feature_idx_)]
selected_feat

Index(['no_strokes_st', 'no_strokes_dy', 'speed_st', 'magnitude_vel_st',
       'magnitude_jerk_st', 'ncv_st', 'nca_st', 'nca_dy', 'in_air_stcp',
       'on_surface_st'],
      dtype='object')

In [99]:
def run_randomForests(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [100]:
# evaluate performance of algorithm built
# using selected features

run_randomForests(X_train[selected_feat].fillna(0),
                  X_test[selected_feat].fillna(0),
                  y_train, y_test)

Train set
auc: 1.0
Test set
auc: 0.8857142857142857


In [101]:
# step forward feature selection
# Select 10 features based on optimal ROC_AUC scoring criteria
from sklearn.linear_model import LogisticRegression
sfs1 = SFS(LogisticRegression(n_jobs=4), 
           k_features=10, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='roc_auc',
           cv=3)

sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.2s finished

[2021-02-10 14:07:56] Features: 1/10 -- score: 0.875[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.2s finished

[2021-02-10 14:07:56] Features: 2/10 -- score: 0.875[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished

[2021-02-10 14:07:57] Features: 3/10 -- score: 0.7268518518518517[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  

In [102]:
selected_feat= X_train.columns[list(sfs1.k_feature_idx_)]
selected_feat

Index(['no_strokes_st', 'no_strokes_dy', 'speed_st', 'magnitude_vel_st',
       'magnitude_jerk_st', 'ncv_dy', 'nca_dy', 'in_air_stcp', 'on_surface_st',
       'on_surface_dy'],
      dtype='object')

In [103]:

def run_logisticRegression(X_train, X_test, y_train, y_test):
    rf = LogisticRegression()
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [104]:
run_logisticRegression(X_train[selected_feat].fillna(0),
                  X_test[selected_feat].fillna(0),
                  y_train, y_test)

Train set
auc: 0.8722222222222222
Test set
auc: 0.8857142857142857


In [105]:
from sklearn.tree import DecisionTreeClassifier
sfs1 = SFS(DecisionTreeClassifier(), 
           k_features=10, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='roc_auc',
           cv=3)

sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s finished

[2021-02-10 14:08:00] Features: 1/10 -- score: 0.9837962962962964[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.0s finished

[2021-02-10 14:08:00] Features: 2/10 -- score: 0.9629629629629631[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished

[2021-02-10 14:08:00] Features: 3/10 -- score: 0.8935185185185186[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

In [106]:
selected_feat= X_train.columns[list(sfs1.k_feature_idx_)]
selected_feat

Index(['no_strokes_st', 'no_strokes_dy', 'magnitude_vel_st',
       'magnitude_jerk_st', 'ncv_st', 'ncv_dy', 'nca_st', 'nca_dy',
       'in_air_stcp', 'on_surface_dy'],
      dtype='object')

In [107]:

def run_decisionTreeClassifier(X_train, X_test, y_train, y_test):
    rf = DecisionTreeClassifier()
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [108]:
run_decisionTreeClassifier(X_train[selected_feat].fillna(0),
                  X_test[selected_feat].fillna(0),
                  y_train, y_test)

Train set
auc: 1.0
Test set
auc: 0.6285714285714286


In [109]:
from xgboost import XGBClassifier
sfs1 = SFS(XGBClassifier(), 
           k_features=10, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='roc_auc',
           cv=3)

sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s







[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    1.6s finished

[2021-02-10 14:08:02] Features: 1/10 -- score: 0.8541666666666666[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s






[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    1.5s finished

[2021-02-10 14:08:04] Features: 2/10 -- score: 0.9444444444444445[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.4s finished

[2021-02-10 14:08:05] Features: 3/10 -- score: 0.9814814814814815[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s





[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.2s finished

[2021-02-10 14:08:07] Features: 4/10 -- score: 0.9537037037037037[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.1s finished

[2021-02-10 14:08:08] Features: 5/10 -- score: 0.9629629629629631[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.9s finished

[2021-02-10 14:08:09] Features: 6/10 -- score: 0.9629629629629631[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.8s finished

[2021-02-10 14:08:10] Features: 7/10 -- score: 0.925925925925926[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.6s finished

[2021-02-10 14:08:10] Features: 8/10 -- score: 0.9398148148148149[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.5s finished

[2021-02-10 14:08:11] Features: 9/10 -- score: 0.9490740740740741[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s finished

[2021-02-10 14:08:11] Features: 10/10 -- score: 0.9907407407407408

In [110]:
selected_feat= X_train.columns[list(sfs1.k_feature_idx_)]
selected_feat

Index(['no_strokes_st', 'speed_st', 'magnitude_vel_st', 'magnitude_jerk_st',
       'ncv_dy', 'nca_st', 'nca_dy', 'in_air_stcp', 'on_surface_st',
       'on_surface_dy'],
      dtype='object')

In [111]:

def run_xgb(X_train, X_test, y_train, y_test):
    rf = XGBClassifier()
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [112]:
run_xgb(X_train[selected_feat].fillna(0),
                  X_test[selected_feat].fillna(0),
                  y_train, y_test)

Train set
auc: 1.0
Test set
auc: 0.9142857142857143


In [113]:
from sklearn.ensemble import ExtraTreesClassifier
sfs1 = SFS(XGBClassifier(), 
           k_features=10, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='roc_auc',
           cv=3)

sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s







[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    1.7s finished

[2021-02-10 14:08:14] Features: 1/10 -- score: 0.8541666666666666[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s






[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    1.4s finished

[2021-02-10 14:08:15] Features: 2/10 -- score: 0.9444444444444445[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s







[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.3s finished

[2021-02-10 14:08:17] Features: 3/10 -- score: 0.9814814814814815[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.2s finished

[2021-02-10 14:08:18] Features: 4/10 -- score: 0.9537037037037037[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.1s finished

[2021-02-10 14:08:19] Features: 5/10 -- score: 0.9629629629629631[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.9s finished

[2021-02-10 14:08:20] Features: 6/10 -- score: 0.9629629629629631[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.8s finished

[2021-02-10 14:08:21] Features: 7/10 -- score: 0.925925925925926[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.7s finished

[2021-02-10 14:08:22] Features: 8/10 -- score: 0.9398148148148149[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.5s finished

[2021-02-10 14:08:22] Features: 9/10 -- score: 0.9490740740740741[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s




[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s finished

[2021-02-10 14:08:23] Features: 10/10 -- score: 0.9907407407407408

In [114]:
selected_feat= X_train.columns[list(sfs1.k_feature_idx_)]
selected_feat

Index(['no_strokes_st', 'speed_st', 'magnitude_vel_st', 'magnitude_jerk_st',
       'ncv_dy', 'nca_st', 'nca_dy', 'in_air_stcp', 'on_surface_st',
       'on_surface_dy'],
      dtype='object')

In [115]:

def run_extra(X_train, X_test, y_train, y_test):
    rf = ExtraTreesClassifier()
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [116]:
run_extra(X_train[selected_feat].fillna(0),
                  X_test[selected_feat].fillna(0),
                  y_train, y_test)

Train set
auc: 1.0
Test set
auc: 0.8857142857142857
