# 1. Execute imports

In [1]:
import pandas as pd
import numpy as np
from genetic_selection import GeneticSelectionCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
import warnings
warnings.filterwarnings("ignore")
data_glass = pd.read_csv('datasets/accent-mfcc-data-1.csv', delimiter=',')
X = pd.read_csv('datasets/speaker_formula_c.csv', delimiter=',')
y = LabelEncoder().fit_transform(data_glass['language'])
features = list(X.columns.values)
y = pd.DataFrame(y)

## KNN

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from numpy import mean
from numpy import std

### Information gain

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', KNeighborsClassifier())])

search_space = [{'selector__k': ['all']},
                {'classifier': [KNeighborsClassifier()]}]

In [6]:
%%time
knn_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
knn_info.fit(X_train, y_train)

Wall time: 10.3 s


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x000002C1061A8AF0>)),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [KNeighborsClassifier()]}])

In [7]:
print(cross_val_score(knn_info, X, y, cv=10).mean())

0.7297348484848485


In [8]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', KNeighborsClassifier())])
    # define search space
    search_space = [{'selector__k': ['all']}, {'classifier': [KNeighborsClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
    # evaluate model on the hold out dataset
    yhat = best_model.predict(X_test)
    # evaluate the model
    acc = accuracy_score(y_test, yhat)
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f' % (acc, result.best_score_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (mean(outer_results), std(outer_results)))

>acc=0.803, est=0.640
>acc=0.742, est=0.673
>acc=0.697, est=0.612
>acc=0.803, est=0.597
>acc=0.769, est=0.644
Accuracy: 0.763 (0.040)
Wall time: 44.3 s


### Variance threshold

In [9]:
from sklearn.feature_selection import VarianceThreshold

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', KNeighborsClassifier())])

search_space = [{'classifier': [KNeighborsClassifier()]}]

In [11]:
%%time
knn_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
knn_variance.fit(X_train, y_train)

Wall time: 127 ms


GridSearchCV(estimator=Pipeline(steps=[('selector', VarianceThreshold()),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid=[{'classifier': [KNeighborsClassifier()]}],
             scoring='accuracy')

In [12]:
print(cross_val_score(knn_variance, X, y, cv=10).mean())

0.7418560606060607


In [13]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', KNeighborsClassifier())])

    search_space = [{'classifier': [KNeighborsClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = KNeighborsClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.773, est=0.658, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
>acc=0.758, est=0.662, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
>acc=0.727, est=0.639, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
>acc=0.833, est=0.612, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
>acc=0.785, est=0.640, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
Accuracy: 0.775 (0.035)
Wall time: 586 ms


### Chi-Square

In [14]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', KNeighborsClassifier())])

In [16]:
search_space = [{'classifier': [KNeighborsClassifier()]}]

In [17]:
%%time
knn_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
knn_chi.fit(X_train, y_train)

Wall time: 140 ms


GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x000002C105F70F70>)),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid=[{'classifier': [KNeighborsClassifier()]}])

In [18]:
print(cross_val_score(knn_chi, X, y, cv=10).mean())

0.7206439393939394


In [19]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', KNeighborsClassifier())])
    # define search space
    search_space = [{'classifier': [KNeighborsClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = KNeighborsClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.682, est=0.582, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', KNeighborsClassifier())])
>acc=0.636, est=0.551, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', KNeighborsClassifier())])
>acc=0.682, est=0.517, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', KNeighborsClassifier())])
>acc=0.606, est=0.566, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', KNeighborsClassifier())])
>acc=0.692, est=0.553, cfg=Pipeline(steps=[('sca

### Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

### Information gain

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', LogisticRegression())])

search_space = [{'selector__k': ['all']},
                {'classifier': [LogisticRegression()]}]

In [23]:
%%time
logistic_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
logistic_info.fit(X_train, y_train)

Wall time: 8.96 s


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x000002C1061A8AF0>)),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [LogisticRegression()]}])

In [24]:
print(cross_val_score(logistic_info, X, y, cv=10).mean())

0.7203598484848486


In [25]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=10)),
                 ('classifier', LogisticRegression())])
    # define search space
    search_space = [{'selector__k': [10]}, {'classifier': [LogisticRegression()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = LogisticRegression()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.652, est=0.579, cfg={'selector__k': 10}
>acc=0.667, est=0.547, cfg={'selector__k': 10}
>acc=0.515, est=0.564, cfg={'selector__k': 10}
>acc=0.682, est=0.582, cfg={'selector__k': 10}
>acc=0.615, est=0.542, cfg={'selector__k': 10}
Accuracy: 0.626 (0.060)
Wall time: 48.3 s


### Variance threshold

In [26]:
from sklearn.feature_selection import VarianceThreshold

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', LogisticRegression())])

search_space = [{'classifier': [LogisticRegression()]}]

In [28]:
%%time
logistic_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
logistic_variance.fit(X_train, y_train)

Wall time: 390 ms


GridSearchCV(estimator=Pipeline(steps=[('selector', VarianceThreshold()),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'classifier': [LogisticRegression()]}],
             scoring='accuracy')

In [29]:
print(cross_val_score(logistic_variance, X, y, cv=10).mean())

0.6442234848484849


In [30]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', LogisticRegression())])

    search_space = [{'classifier': [LogisticRegression()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = LogisticRegression()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.788, est=0.631, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
>acc=0.788, est=0.615, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
>acc=0.682, est=0.658, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
>acc=0.833, est=0.669, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
>acc=0.769, est=0.614, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
Accuracy: 0.772 (0.050)
Wall time: 2.11 s


### Chi-Square

In [31]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', LogisticRegression())])

In [33]:
search_space = [{'classifier': [LogisticRegression()]}]

In [34]:
%%time
logistic_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
logistic_chi.fit(X_train, y_train)

Wall time: 368 ms


GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x000002C105F70F70>)),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'classifier': [LogisticRegression()]}])

In [35]:
print(cross_val_score(logistic_chi, X, y, cv=10).mean())

0.669034090909091


In [36]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', LogisticRegression())])
    # define search space
    search_space = [{'classifier': [LogisticRegression()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = LogisticRegression()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.682, est=0.559, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', LogisticRegression())])
>acc=0.636, est=0.563, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', LogisticRegression())])
>acc=0.545, est=0.586, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', LogisticRegression())])
>acc=0.621, est=0.567, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', LogisticRegression())])
>acc=0.677, est=0.565, cfg=Pipeline(steps=[('scaler', Mi

## SVM

In [37]:
from sklearn.svm import SVC

### Information gain

In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', SVC(kernel='linear'))])

search_space = [{'selector__k': ['all']},
                {'classifier': [SVC(kernel='linear')]}]

In [40]:
%%time
svm_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
svm_info.fit(X_train, y_train)

Wall time: 7.48 s


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x000002C1061A8AF0>)),
                                       ('classifier', SVC(kernel='linear'))]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [SVC(kernel='linear')]}])

In [41]:
print(cross_val_score(svm_info, X, y, cv=10).mean())

0.65625


In [42]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=9)),
                 ('classifier', SVC(kernel='linear'))])
    # define search space
    search_space = [{'selector__k': [9]}, {'classifier': [SVC(kernel='linear')]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = SVC(kernel='linear')
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.712, est=0.575, cfg={'selector__k': 9}
>acc=0.652, est=0.574, cfg={'selector__k': 9}
>acc=0.515, est=0.571, cfg={'selector__k': 9}
>acc=0.667, est=0.560, cfg={'selector__k': 9}
>acc=0.600, est=0.538, cfg={'selector__k': 9}
Accuracy: 0.629 (0.067)
Wall time: 43.2 s


### Variance threshold

In [43]:
from sklearn.feature_selection import VarianceThreshold

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', SVC(kernel='linear'))])

search_space = [{'classifier': [SVC(kernel='linear')]}]

In [45]:
%%time
svm_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
svm_variance.fit(X_train, y_train)

Wall time: 234 ms


GridSearchCV(estimator=Pipeline(steps=[('selector', VarianceThreshold()),
                                       ('classifier', SVC(kernel='linear'))]),
             param_grid=[{'classifier': [SVC(kernel='linear')]}],
             scoring='accuracy')

In [46]:
print(cross_val_score(svm_variance, X, y, cv=10).mean())

0.6170454545454546


In [47]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold(3)),
                 ('classifier', SVC(kernel='linear'))])

    search_space = [{'classifier': [SVC(kernel='linear')]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = SVC(kernel='linear')
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.697, est=0.597, cfg=Pipeline(steps=[('selector', VarianceThreshold(threshold=3)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.803, est=0.600, cfg=Pipeline(steps=[('selector', VarianceThreshold(threshold=3)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.742, est=0.597, cfg=Pipeline(steps=[('selector', VarianceThreshold(threshold=3)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.758, est=0.566, cfg=Pipeline(steps=[('selector', VarianceThreshold(threshold=3)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.769, est=0.512, cfg=Pipeline(steps=[('selector', VarianceThreshold(threshold=3)),
                ('classifier', SVC(kernel='linear'))])
Accuracy: 0.754 (0.035)
Wall time: 742 ms


### Chi-Square

In [48]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', SVC())])

In [50]:
search_space = [{'classifier': [SVC()]}]

In [51]:
%%time
svm_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
svm_chi.fit(X_train, y_train)

Wall time: 124 ms


GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x000002C105F70F70>)),
                                       ('classifier', SVC())]),
             param_grid=[{'classifier': [SVC()]}])

In [52]:
print(cross_val_score(svm_chi, X, y, cv=10).mean())

0.7205492424242423


In [53]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', SVC(kernel='linear'))])
    # define search space
    search_space = [{'classifier': [SVC(kernel='linear')]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = SVC(kernel='linear')
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.667, est=0.559, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.682, est=0.555, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.682, est=0.559, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.652, est=0.552, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.708, est=0.557, cfg=Pipeline(steps=[('scaler', Mi

## Naive Bayes

In [54]:
from sklearn.naive_bayes import GaussianNB

### Information gain

In [55]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', GaussianNB())])

search_space = [{'selector__k': ['all']},
                {'classifier': [GaussianNB()]}]

In [57]:
%%time
nb_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_info.fit(X_train, y_train)

Wall time: 8.72 s


GridSearchCV(estimator=Pipeline(steps=[('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x000002C1061A8AF0>)),
                                       ('classifier', GaussianNB())]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [GaussianNB()]}])

In [58]:
print(cross_val_score(nb_info, X, y, cv=10).mean())

0.5295454545454545


In [59]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=10)),
                 ('classifier', GaussianNB())])
    # define search space
    search_space = [{'selector__k': [10]}, {'classifier': [GaussianNB()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = GaussianNB()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.470, est=0.426, cfg={'selector__k': 10}
>acc=0.530, est=0.445, cfg={'selector__k': 10}
>acc=0.409, est=0.460, cfg={'selector__k': 10}
>acc=0.439, est=0.411, cfg={'selector__k': 10}
>acc=0.446, est=0.390, cfg={'selector__k': 10}
Accuracy: 0.459 (0.041)
Wall time: 37.3 s


### Variance threshold

In [60]:
from sklearn.feature_selection import VarianceThreshold

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', GaussianNB())])

search_space = [{'classifier': [GaussianNB()]}]

In [62]:
nb_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
nb_variance.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('selector', VarianceThreshold()),
                                       ('classifier', GaussianNB())]),
             param_grid=[{'classifier': [GaussianNB()]}], scoring='accuracy')

In [63]:
print(cross_val_score(nb_variance, X, y, cv=10).mean())

0.5295454545454545


In [64]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', GaussianNB())])

    search_space = [{'classifier': [GaussianNB()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = GaussianNB()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.606, est=0.510, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
>acc=0.545, est=0.525, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
>acc=0.470, est=0.536, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
>acc=0.636, est=0.456, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
>acc=0.569, est=0.485, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
Accuracy: 0.565 (0.057)
Wall time: 370 ms


### Chi-Square

In [65]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', GaussianNB())])

In [67]:
search_space = [{'classifier': [GaussianNB()]}]

In [68]:
%%time
nb_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_chi.fit(X_train, y_train)

Wall time: 111 ms


GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x000002C105F70F70>)),
                                       ('classifier', GaussianNB())]),
             param_grid=[{'classifier': [GaussianNB()]}])

In [69]:
print(cross_val_score(nb_chi, X, y, cv=10).mean())

0.5295454545454545


In [70]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', GaussianNB())])
    # define search space
    search_space = [{'classifier': [GaussianNB()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = GaussianNB()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.439, est=0.430, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', GaussianNB())])
>acc=0.591, est=0.460, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', GaussianNB())])
>acc=0.409, est=0.449, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', GaussianNB())])
>acc=0.439, est=0.468, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', GaussianNB())])
>acc=0.492, est=0.470, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                (

In [71]:
## Random Forest

from sklearn.ensemble import RandomForestRegressor

### Information gain

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', RandomForestRegressor())])

search_space = [{'selector__k': ['all']},
                {'classifier': [RandomForestRegressor()]}]

nb_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_info.fit(X_train, y_train)
print('Random Forest -> Information gain')
print(cross_val_score(nb_info, X, y, cv=10).mean())

### Chi-Square

from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', RandomForestRegressor())])

search_space = [{'classifier': [RandomForestRegressor()]}]

nb_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_chi.fit(X_train, y_train)

print('Random Forest -> Chi-Square')
print(cross_val_score(nb_chi, X, y, cv=10).mean())


from sklearn.preprocessing import MinMaxScaler

from sklearn.neural_network import MLPClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', MLPClassifier())])

search_space = [{'selector__k': ['all']},
                {'classifier': [MLPClassifier()]}]

nb_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_info.fit(X_train, y_train)

print('MLPClassifier -> Info')
print(cross_val_score(nb_info, X, y, cv=10).mean())

### Variance threshold

from sklearn.feature_selection import VarianceThreshold

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', VarianceThreshold()),
                 ('classifier', MLPClassifier())])

search_space = [{'classifier': [MLPClassifier()]}]

nb_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
nb_variance.fit(X_train, y_train)

print('MLPClassifier -> Variance')
print(cross_val_score(nb_variance, X, y, cv=10).mean())


### Chi-Square

from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', MLPClassifier())])

search_space = [{'classifier': [MLPClassifier()]}]

nb_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_chi.fit(X_train, y_train)

print('MLPClassifier -> Chi')
print(cross_val_score(nb_chi, X, y, cv=10).mean())

Random Forest -> Information gain
-24.27688190655531
Random Forest -> Chi-Square
-22.824261936290092
MLPClassifier -> Info
0.7298295454545455
MLPClassifier -> Variance
0.6569128787878789
MLPClassifier -> Chi
0.6660984848484849


In [72]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier

In [73]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=10)),
                 ('classifier', MLPClassifier())])
    # define search space
    search_space = [{'selector__k': [10]}, {'classifier': [MLPClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = MLPClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.636, est=0.586, cfg={'classifier': MLPClassifier()}
>acc=0.727, est=0.585, cfg={'classifier': MLPClassifier()}
>acc=0.530, est=0.613, cfg={'classifier': MLPClassifier()}
>acc=0.697, est=0.605, cfg={'classifier': MLPClassifier()}
>acc=0.585, est=0.576, cfg={'classifier': MLPClassifier()}
Accuracy: 0.635 (0.072)
Wall time: 1min 14s


In [74]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', MLPClassifier())])

    search_space = [{'classifier': [MLPClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = MLPClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.667, est=0.563, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
>acc=0.788, est=0.555, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
>acc=0.788, est=0.673, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
>acc=0.773, est=0.608, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
>acc=0.785, est=0.569, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
Accuracy: 0.760 (0.047)
Wall time: 14.6 s


In [75]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', MLPClassifier())])
    # define search space
    search_space = [{'classifier': [MLPClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = MLPClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.636, est=0.567, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', MLPClassifier())])
>acc=0.682, est=0.567, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', MLPClassifier())])
>acc=0.591, est=0.570, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', MLPClassifier())])
>acc=0.621, est=0.548, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000002C105F70F70>)),
                ('classifier', MLPClassifier())])
>acc=0.723, est=0.561, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
     