In [54]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [55]:
# Read the dataset from the relative path 'data.csv' and show its
# (rows, columns) shape as the cell output.
data = pd.read_csv('data.csv')
data.shape

(40, 31)

In [56]:
# Keep only the columns whose dtype is one of the listed integer/float
# widths; every other column is discarded from `data`.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = data.select_dtypes(include=numerics).columns.tolist()
data = data.loc[:, numerical_vars]

In [57]:
# Separate the label column from the predictors ('Unnamed: 0' is dropped
# along with the label), then hold out 30% of the rows for testing.
# The fixed random_state keeps the split repeatable between runs.
target = data['target']
predictors = data.drop(columns=['target', 'Unnamed: 0'])
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((28, 29), (12, 29))

In [58]:
# Find and remove correlated features to shrink the feature space,
# so that the sequential feature selection below has fewer candidate
# columns to evaluate and finishes sooner.

def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``DataFrame.corr()`` and only its strict lower triangle is inspected:
    a column is reported when the absolute coefficient between it and any
    column positioned before it is greater than ``threshold``. The first
    column of a correlated group is therefore never reported by that group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute correlation above which a column is flagged (strict ``>``).

    Returns
    -------
    set
        Names of the flagged columns.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    return {
        names[row]
        for row in range(len(names))
        # compare only against columns that come before `row`
        if any(abs(corr_matrix.iloc[row, col]) > threshold for col in range(row))
    }

# Flag every training column whose absolute correlation with an earlier
# column exceeds 0.8, and report how many were flagged.
corr_features = correlation(dataset=X_train, threshold=0.8)
print('correlated features: ', len(corr_features))

correlated features:  17


In [59]:
# Remove the correlated features from both splits.
# The frames are reassigned instead of being modified with inplace=True,
# so the objects returned by train_test_split are not mutated, and
# errors='ignore' keeps this cell re-runnable once the columns are
# already gone (the in-place version raised KeyError on a second run).
X_train = X_train.drop(columns=list(corr_features), errors='ignore')
X_test = X_test.drop(columns=list(corr_features), errors='ignore')

X_train.shape, X_test.shape

((28, 12), (12, 12))

In [60]:
# Step backward feature selection:
# start from every remaining column and remove features until 10 are
# left, scoring candidate subsets by 3-fold cross-validated ROC-AUC.
# The forest is seeded so that repeated runs select the same features.

sfs1 = SFS(RandomForestClassifier(n_jobs=4, random_state=0),
           k_features=10,
           forward=False,
           floating=False,
           verbose=2,
           scoring='roc_auc',
           cv=3)

# The selector is fitted on a plain array, with missing values set to 0.
sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    5.6s finished

[2021-02-10 14:09:38] Features: 11/10 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    5.4s finished

[2021-02-10 14:09:44] Features: 10/10 -- score: 1.0

In [61]:
def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and print ROC-AUC for both splits.

    The forest uses 200 trees, a maximum depth of 4 and a fixed seed (39).
    It is fitted on ``(X_train, y_train)`` and scored with the probability
    found in column 1 of ``predict_proba``. Results are printed; nothing
    is returned.
    """
    forest = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=39)
    forest.fit(X_train, y_train)

    splits = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for heading, features, labels in splits:
        print(heading)
        class1_proba = forest.predict_proba(features)[:, 1]
        print('auc: {}'.format(roc_auc_score(labels, class1_proba)))

In [62]:
# Translate the positional indices kept by the selector back into the
# column names of X_train.
selected_idx = list(sfs1.k_feature_idx_)
selected_feat = X_train.columns[selected_idx]
selected_feat

Index(['no_strokes_st', 'no_strokes_dy', 'speed_st', 'magnitude_vel_st',
       'magnitude_jerk_st', 'ncv_st', 'nca_st', 'nca_dy', 'in_air_stcp',
       'on_surface_dy'],
      dtype='object')

In [63]:
# Refit a random forest on the selected columns only and print its
# train/test ROC-AUC; missing values are replaced with 0 beforehand.
run_randomForests(
    X_train=X_train[selected_feat].fillna(0),
    X_test=X_test[selected_feat].fillna(0),
    y_train=y_train,
    y_test=y_test,
)

Train set
auc: 1.0
Test set
auc: 0.8285714285714285


In [64]:
from sklearn.linear_model import LogisticRegression

# Backward sequential selection driven by logistic regression: features
# are removed until 10 remain, with candidate subsets scored by 3-fold
# cross-validated ROC-AUC. Missing values are set to 0 before fitting.
sfs1 = SFS(
    estimator=LogisticRegression(n_jobs=4),
    k_features=10,
    forward=False,
    floating=False,
    verbose=2,
    scoring='roc_auc',
    cv=3,
).fit(np.array(X_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.8s finished

[2021-02-10 14:09:46] Features: 11/10 -- score: 0.7083333333333334[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.7s finished

[2021-02-10 14:09:46] Features: 10/10 -- score: 0.763888888888889

In [65]:

def run_logisticRegression(X_train, X_test, y_train, y_test):
    """Fit a default logistic regression and print ROC-AUC on both splits.

    The model is trained on ``(X_train, y_train)``; each split is scored
    with the probability in column 1 of ``predict_proba``. Output goes to
    stdout and the function returns nothing.
    """
    logit = LogisticRegression()
    logit.fit(X_train, y_train)

    print('Train set')
    train_scores = logit.predict_proba(X_train)[:, 1]
    print('auc: {}'.format(roc_auc_score(y_train, train_scores)))

    print('Test set')
    test_scores = logit.predict_proba(X_test)[:, 1]
    print('auc: {}'.format(roc_auc_score(y_test, test_scores)))

In [66]:
# Column names for the feature positions retained by the
# logistic-regression selector.
kept_positions = list(sfs1.k_feature_idx_)
selected_feat = X_train.columns[kept_positions]
selected_feat

Index(['no_strokes_st', 'no_strokes_dy', 'speed_st', 'magnitude_vel_st',
       'magnitude_jerk_st', 'ncv_dy', 'nca_dy', 'in_air_stcp', 'on_surface_st',
       'on_surface_dy'],
      dtype='object')

In [67]:
# Score logistic regression on the selected columns (NaN -> 0).
run_logisticRegression(
    X_train=X_train[selected_feat].fillna(0),
    X_test=X_test[selected_feat].fillna(0),
    y_train=y_train,
    y_test=y_test,
)

Train set
auc: 0.8722222222222222
Test set
auc: 0.8857142857142857


In [68]:
from sklearn.tree import DecisionTreeClassifier

# Step backward feature selection with a decision tree as the scoring
# model: features are removed until 10 remain, using 3-fold
# cross-validated ROC-AUC. The tree is seeded so that repeated runs of
# this cell select the same features.
sfs1 = SFS(DecisionTreeClassifier(random_state=0),
           k_features=10,
           forward=False,
           floating=False,
           verbose=2,
           scoring='roc_auc',
           cv=3)

# The selector is fitted on a plain array, with missing values set to 0.
sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s finished

[2021-02-10 14:09:47] Features: 11/10 -- score: 0.8611111111111112[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.0s finished

[2021-02-10 14:09:47] Features: 10/10 -- score: 0.8611111111111112

In [69]:

def run_decisionTreeClassifier(X_train, X_test, y_train, y_test, random_state=39):
    """Fit a decision tree on the training split and print ROC-AUC for both splits.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting and for held-out evaluation.
    y_train, y_test
        Labels aligned with the matrices above; column 1 of
        ``predict_proba`` is the score passed to ``roc_auc_score``.
    random_state : int, default 39
        Seed forwarded to ``DecisionTreeClassifier`` so that the printed
        scores are repeatable from run to run.

    Returns
    -------
    None
        The scores are printed rather than returned.
    """
    tree = DecisionTreeClassifier(random_state=random_state)
    tree.fit(X_train, y_train)

    for heading, features, labels in (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    ):
        print(heading)
        pred = tree.predict_proba(features)
        print('auc: {}'.format(roc_auc_score(labels, pred[:, 1])))

In [70]:
# Column names for the feature positions retained by the
# decision-tree selector.
tree_positions = list(sfs1.k_feature_idx_)
selected_feat = X_train.columns[tree_positions]
selected_feat

Index(['no_strokes_st', 'no_strokes_dy', 'speed_st', 'magnitude_vel_st',
       'ncv_st', 'ncv_dy', 'nca_st', 'nca_dy', 'in_air_stcp', 'on_surface_st'],
      dtype='object')

In [71]:
# Score a decision tree on the selected columns (NaN -> 0).
run_decisionTreeClassifier(
    X_train=X_train[selected_feat].fillna(0),
    X_test=X_test[selected_feat].fillna(0),
    y_train=y_train,
    y_test=y_test,
)

Train set
auc: 1.0
Test set
auc: 0.6571428571428573


In [72]:
from xgboost import XGBClassifier

# Backward sequential selection with an XGBoost classifier: features are
# removed until 10 remain, judged by 3-fold cross-validated ROC-AUC.
selector_options = dict(
    k_features=10,
    forward=False,
    floating=False,
    verbose=2,
    scoring='roc_auc',
    cv=3,
)
sfs1 = SFS(XGBClassifier(), **selector_options)

# Fit on a plain array with missing values set to 0.
sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.





[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s






[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    1.7s finished

[2021-02-10 14:09:49] Features: 11/10 -- score: 0.9629629629629631[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s






[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    1.5s finished

[2021-02-10 14:09:51] Features: 10/10 -- score: 0.9907407407407408

In [73]:

def run_xgb(X_train, X_test, y_train, y_test):
    """Fit a default ``XGBClassifier`` and print ROC-AUC on both splits.

    The booster is trained on ``(X_train, y_train)``. For each split the
    probability in column 1 of ``predict_proba`` is passed to
    ``roc_auc_score`` and the result is printed. Nothing is returned.
    """
    booster = XGBClassifier()
    booster.fit(X_train, y_train)

    def _report(heading, features, labels):
        # Print the split name followed by its ROC-AUC.
        print(heading)
        proba = booster.predict_proba(features)
        print('auc: {}'.format(roc_auc_score(labels, proba[:, 1])))

    _report('Train set', X_train, y_train)
    _report('Test set', X_test, y_test)

In [74]:
# Column names for the feature positions retained by the XGBoost selector.
xgb_positions = list(sfs1.k_feature_idx_)
selected_feat = X_train.columns[xgb_positions]
selected_feat

Index(['no_strokes_st', 'speed_st', 'magnitude_vel_st', 'magnitude_jerk_st',
       'ncv_dy', 'nca_st', 'nca_dy', 'in_air_stcp', 'on_surface_st',
       'on_surface_dy'],
      dtype='object')

In [75]:
# Score XGBoost on the selected columns (NaN -> 0).
run_xgb(
    X_train=X_train[selected_feat].fillna(0),
    X_test=X_test[selected_feat].fillna(0),
    y_train=y_train,
    y_test=y_test,
)

Train set
auc: 1.0
Test set
auc: 0.9142857142857143


In [76]:
from sklearn.ensemble import ExtraTreesClassifier

# Step backward feature selection with an extra-trees ensemble as the
# scoring model: features are removed until 10 remain, using 3-fold
# cross-validated ROC-AUC. The ensemble is seeded so that repeated runs
# of this cell select the same features.
sfs1 = SFS(ExtraTreesClassifier(random_state=0),
           k_features=10,
           forward=False,
           floating=False,
           verbose=2,
           scoring='roc_auc',
           cv=3)

# The selector is fitted on a plain array, with missing values set to 0.
sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    3.7s finished

[2021-02-10 14:09:56] Features: 11/10 -- score: 0.9629629629629629[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    3.3s finished

[2021-02-10 14:09:59] Features: 10/10 -- score: 0.9861111111111112

In [77]:

def run_extra(X_train, X_test, y_train, y_test, random_state=39):
    """Fit an extra-trees ensemble on the training split and print ROC-AUC for both splits.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting and for held-out evaluation.
    y_train, y_test
        Labels aligned with the matrices above; column 1 of
        ``predict_proba`` is the score passed to ``roc_auc_score``.
    random_state : int, default 39
        Seed forwarded to ``ExtraTreesClassifier`` so that the printed
        scores are repeatable from run to run.

    Returns
    -------
    None
        The scores are printed rather than returned.
    """
    ensemble = ExtraTreesClassifier(random_state=random_state)
    ensemble.fit(X_train, y_train)

    for heading, features, labels in (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    ):
        print(heading)
        pred = ensemble.predict_proba(features)
        print('auc: {}'.format(roc_auc_score(labels, pred[:, 1])))

In [78]:
# Column names for the feature positions retained by the extra-trees selector.
extra_positions = list(sfs1.k_feature_idx_)
selected_feat = X_train.columns[extra_positions]
selected_feat

Index(['no_strokes_st', 'no_strokes_dy', 'speed_st', 'magnitude_vel_st',
       'magnitude_jerk_st', 'ncv_st', 'ncv_dy', 'nca_st', 'nca_dy',
       'on_surface_st'],
      dtype='object')

In [79]:
# Score extra trees on the selected columns (NaN -> 0).
run_extra(
    X_train=X_train[selected_feat].fillna(0),
    X_test=X_test[selected_feat].fillna(0),
    y_train=y_train,
    y_test=y_test,
)

Train set
auc: 1.0
Test set
auc: 0.8285714285714285
