In [43]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [44]:
# Load the raw table and keep only the columns whose dtype is one of the
# numeric dtypes listed below.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
data = pd.read_csv('data.csv')
numerical_vars = [col for col in data.select_dtypes(include=numerics).columns]
data = data[numerical_vars]

In [45]:
# Hold out 30% of the rows for testing (fixed seed for a repeatable split).
# 'target' is the label; 'Unnamed: 0' is removed from the features as well
# (presumably a saved row index from the CSV - confirm).
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target', 'Unnamed: 0']),
    data['target'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((28, 29), (12, 29))

In [46]:
# Find and remove highly correlated features
# to shrink the feature space a bit,
# so that the exhaustive search below runs faster.

def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``dataset.corr()``. A column is flagged when the absolute value of its
    correlation with at least one column positioned before it is strictly
    greater than ``threshold``. The first column of each correlated pair is
    therefore never flagged on account of that pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared.
    threshold : float
        Absolute correlation above which a column is flagged.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    # Only the strictly-lower triangle is inspected: row > col.
    return {
        names[row]
        for row in range(len(names))
        if any(abs(corr_matrix.iloc[row, col]) > threshold for col in range(row))
    }

# Flag every training column whose absolute correlation with an earlier
# column exceeds 0.8, and report how many were flagged.
corr_features = correlation(X_train, threshold=0.8)
print('correlated features: ', len(corr_features))

correlated features:  17


In [47]:
# Remove the correlated features from both splits.
# Reassigning (instead of inplace=True) avoids mutating the frames returned by
# train_test_split, and errors='ignore' keeps this cell safe to re-run once
# the columns are already gone (the in-place version raised KeyError then).
X_train = X_train.drop(columns=list(corr_features), errors='ignore')
X_test = X_test.drop(columns=list(corr_features), errors='ignore')

X_train.shape, X_test.shape

((28, 12), (12, 12))

In [48]:
# Peek at the first ten column names that survived the correlation filter.
X_train.columns[:10]

Index(['no_strokes_st', 'no_strokes_dy', 'speed_st', 'magnitude_vel_st',
       'magnitude_jerk_st', 'ncv_st', 'ncv_dy', 'nca_st', 'nca_dy',
       'in_air_stcp'],
      dtype='object')

In [49]:
# Exhaustive feature selection with a random forest, scored by ROC-AUC.
# Only the first 4 columns of X_train are searched, so every subset of
# size 1 to 4 is evaluated: 4 + 6 + 4 + 1 = 15 candidate subsets.

efs1 = EFS(
    RandomForestClassifier(n_jobs=4, random_state=0),
    min_features=1,
    max_features=4,
    scoring='roc_auc',
    print_progress=True,
    cv=2,
)

# The selector receives a plain array (NaNs zero-filled), so best_idx_
# refers to column positions within these 4 columns, not to column names.
efs1 = efs1.fit(np.array(X_train[X_train.columns[:4]].fillna(0)), y_train)

Features: 15/15

In [50]:
def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a random forest and print its ROC-AUC on the train and test splits.

    The forest uses 200 trees of depth at most 4 and a fixed seed. The AUC is
    computed from the second column of ``predict_proba``. Nothing is returned;
    the scores are only printed.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    forest.fit(X_train, y_train)
    splits = (('Train set', X_train, y_train), ('Test set', X_test, y_test))
    for heading, features, target in splits:
        print(heading)
        scores = forest.predict_proba(features)[:, 1]
        print('auc: {}'.format(roc_auc_score(target, scores)))

In [51]:

# Show the winning subset as positions within the array passed to efs1.fit.
efs1.best_idx_

(0, 1, 2, 3)

In [52]:
# Translate the winning positions back into column names. This works because
# the array given to EFS was built from the leading columns of X_train.
best_positions = list(efs1.best_idx_)
selected_feat = X_train.columns[best_positions]
selected_feat

Index(['no_strokes_st', 'no_strokes_dy', 'speed_st', 'magnitude_vel_st'], dtype='object')

In [53]:
# Score the random forest on the selected columns only; NaNs are zero-filled
# the same way as in the selection step.
run_randomForests(
    X_train[selected_feat].fillna(0),
    X_test[selected_feat].fillna(0),
    y_train,
    y_test,
)

Train set
auc: 1.0
Test set
auc: 0.6


In [54]:
from sklearn.linear_model import LogisticRegression
# Exhaustive search over the first 4 columns of X_train (all subsets of
# size 1 to 4), this time scored with a logistic regression.
efs1 = EFS(
    LogisticRegression(n_jobs=4, random_state=0),
    min_features=1,
    max_features=4,
    scoring='roc_auc',
    print_progress=True,
    cv=2,
)

# NaNs are zero-filled and the frame is handed over as a plain array.
efs1 = efs1.fit(np.array(X_train[X_train.columns[:4]].fillna(0)), y_train)

Features: 15/15

In [55]:
# Translate the winning positions (relative to the leading X_train columns
# that were passed to EFS) back into column names, and display them.
# A bare `efs1.best_idx_` on a non-final line is never shown, so it is dropped.
selected_feat = X_train.columns[list(efs1.best_idx_)]
selected_feat


In [56]:

def run_logisticRegression(X_train, X_test, y_train, y_test):
    """Fit a default logistic regression and print ROC-AUC for both splits.

    The AUC is computed from the second column of ``predict_proba``. Nothing
    is returned; the scores are only printed.
    """
    model = LogisticRegression()
    model.fit(X_train, y_train)
    splits = (('Train set', X_train, y_train), ('Test set', X_test, y_test))
    for heading, features, target in splits:
        print(heading)
        scores = model.predict_proba(features)[:, 1]
        print('auc: {}'.format(roc_auc_score(target, scores)))

In [57]:
# Score the logistic regression on the selected columns only (NaNs zero-filled).
run_logisticRegression(
    X_train[selected_feat].fillna(0),
    X_test[selected_feat].fillna(0),
    y_train,
    y_test,
)

Train set
auc: 0.8833333333333333
Test set
auc: 0.5714285714285714


In [58]:
from sklearn.tree import DecisionTreeClassifier
# Exhaustive search over the first 4 columns of X_train (all subsets of
# size 1 to 4), scored with a seeded decision tree.
efs1 = EFS(
    DecisionTreeClassifier(random_state=0),
    min_features=1,
    max_features=4,
    scoring='roc_auc',
    print_progress=True,
    cv=2,
)

# NaNs are zero-filled and the frame is handed over as a plain array.
efs1 = efs1.fit(np.array(X_train[X_train.columns[:4]].fillna(0)), y_train)

Features: 15/15

In [59]:
# Translate the winning positions (relative to the leading X_train columns
# that were passed to EFS) back into column names, and display them.
# A bare `efs1.best_idx_` on a non-final line is never shown, so it is dropped.
selected_feat = X_train.columns[list(efs1.best_idx_)]
selected_feat


In [60]:

def run_decisionTreeClassifier(X_train, X_test, y_train, y_test, random_state=0):
    """Fit a decision tree and print its ROC-AUC on the train and test splits.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for held-out evaluation.
    y_train, y_test : array-like
        Labels matching ``X_train`` and ``X_test``.
    random_state : int, default 0
        Seed forwarded to ``DecisionTreeClassifier``. The tree was previously
        built unseeded, so the printed scores could change between runs.

    Returns
    -------
    None
        The scores are only printed.
    """
    tree = DecisionTreeClassifier(random_state=random_state)
    tree.fit(X_train, y_train)
    splits = (('Train set', X_train, y_train), ('Test set', X_test, y_test))
    for heading, features, target in splits:
        print(heading)
        # Column 1 of predict_proba is used as the score for the AUC.
        scores = tree.predict_proba(features)[:, 1]
        print('auc: {}'.format(roc_auc_score(target, scores)))

In [61]:
# Score the decision tree on the selected columns only (NaNs zero-filled).
run_decisionTreeClassifier(
    X_train[selected_feat].fillna(0),
    X_test[selected_feat].fillna(0),
    y_train,
    y_test,
)

Train set
auc: 1.0
Test set
auc: 0.6571428571428573


In [62]:
from xgboost import XGBClassifier
# Exhaustive search over the first 4 columns of X_train (all subsets of
# size 1 to 4), scored with a seeded XGBoost classifier.
efs1 = EFS(
    XGBClassifier(random_state=0),
    min_features=1,
    max_features=4,
    scoring='roc_auc',
    print_progress=True,
    cv=2,
)

# NaNs are zero-filled and the frame is handed over as a plain array.
efs1 = efs1.fit(np.array(X_train[X_train.columns[:4]].fillna(0)), y_train)



Features: 15/15



In [63]:
# Translate the winning positions (relative to the leading X_train columns
# that were passed to EFS) back into column names, and display them.
# A bare `efs1.best_idx_` on a non-final line is never shown, so it is dropped.
selected_feat = X_train.columns[list(efs1.best_idx_)]
selected_feat

In [64]:

def run_xgb(X_train, X_test, y_train, y_test):
    """Fit a default XGBoost classifier and print ROC-AUC for both splits.

    The AUC is computed from the second column of ``predict_proba``. Nothing
    is returned; the scores are only printed.
    """
    booster = XGBClassifier()
    booster.fit(X_train, y_train)
    splits = (('Train set', X_train, y_train), ('Test set', X_test, y_test))
    for heading, features, target in splits:
        print(heading)
        scores = booster.predict_proba(features)[:, 1]
        print('auc: {}'.format(roc_auc_score(target, scores)))

In [65]:
# Score the XGBoost classifier on the selected columns only (NaNs zero-filled).
run_xgb(
    X_train[selected_feat].fillna(0),
    X_test[selected_feat].fillna(0),
    y_train,
    y_test,
)

Train set
auc: 0.9861111111111112
Test set
auc: 0.6857142857142857


In [66]:
from sklearn.ensemble import ExtraTreesClassifier
# Exhaustive search over the first 4 columns of X_train (all subsets of
# size 1 to 4), scored with a seeded extra-trees ensemble.
efs1 = EFS(
    ExtraTreesClassifier(random_state=0),
    min_features=1,
    max_features=4,
    scoring='roc_auc',
    print_progress=True,
    cv=2,
)

# NaNs are zero-filled and the frame is handed over as a plain array.
efs1 = efs1.fit(np.array(X_train[X_train.columns[:4]].fillna(0)), y_train)

Features: 15/15

In [67]:
# Translate the winning positions (relative to the leading X_train columns
# that were passed to EFS) back into column names, and display them.
# A bare `efs1.best_idx_` on a non-final line is never shown, so it is dropped.
selected_feat = X_train.columns[list(efs1.best_idx_)]
selected_feat

In [68]:

def run_extra(X_train, X_test, y_train, y_test, random_state=0):
    """Fit an extra-trees ensemble and print its ROC-AUC on both splits.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for held-out evaluation.
    y_train, y_test : array-like
        Labels matching ``X_train`` and ``X_test``.
    random_state : int, default 0
        Seed forwarded to ``ExtraTreesClassifier``. The ensemble was
        previously built unseeded, so the printed scores changed from run
        to run.

    Returns
    -------
    None
        The scores are only printed.
    """
    ensemble = ExtraTreesClassifier(random_state=random_state)
    ensemble.fit(X_train, y_train)
    splits = (('Train set', X_train, y_train), ('Test set', X_test, y_test))
    for heading, features, target in splits:
        print(heading)
        # Column 1 of predict_proba is used as the score for the AUC.
        scores = ensemble.predict_proba(features)[:, 1]
        print('auc: {}'.format(roc_auc_score(target, scores)))

In [69]:
# Score the extra-trees ensemble on the selected columns only (NaNs zero-filled).
run_extra(
    X_train[selected_feat].fillna(0),
    X_test[selected_feat].fillna(0),
    y_train,
    y_test,
)

Train set
auc: 1.0
Test set
auc: 0.5428571428571428
