## 1. Execute imports

In [1]:
import pandas as pd
import numpy as np
from genetic_selection import GeneticSelectionCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
import warnings
warnings.filterwarnings("ignore")
data_glass = pd.read_csv('datasets/Algerian_forest_fires.csv', delimiter=',')
X = pd.read_csv('datasets/algerian_formula_c.csv', delimiter=',')
y = LabelEncoder().fit_transform(data_glass['Classes'])
features = list(X.columns.values)
y = pd.DataFrame(y)

## KNN

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

### Information gain

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', KNeighborsClassifier())])

search_space = [{'selector__k': ['all']},
                {'classifier': [KNeighborsClassifier()]}]

In [7]:
%%time
knn_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
knn_info.fit(X_train, y_train)



Wall time: 6.18 s


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x0000025FC31CCB80>)),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [KNeighborsClassifier()]}])

In [8]:
print(cross_val_score(knn_info, X, y, cv=10).mean())



0.7044999999999999


In [5]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=10)),
                 ('classifier', KNeighborsClassifier())])
    # define search space
    search_space = [{'selector__k': [10]}, {'classifier': [KNeighborsClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = KNeighborsClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.898, est=0.923, cfg={'selector__k': 10}
>acc=0.939, est=0.908, cfg={'selector__k': 10}
>acc=0.980, est=0.897, cfg={'selector__k': 10}
>acc=0.857, est=0.923, cfg={'selector__k': 10}
>acc=0.958, est=0.918, cfg={'selector__k': 10}
Accuracy: 0.926 (0.044)
Wall time: 29.8 s


### Variance threshold

In [12]:
from sklearn.feature_selection import VarianceThreshold

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', KNeighborsClassifier())])

search_space = [{'classifier': [KNeighborsClassifier()]}]

In [11]:
%%time
knn_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
knn_variance.fit(X_train, y_train)

Wall time: 86.8 ms




GridSearchCV(estimator=Pipeline(steps=[('selector', VarianceThreshold()),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid=[{'classifier': [KNeighborsClassifier()]}],
             scoring='accuracy')

In [12]:
print(cross_val_score(knn_variance, X, y, cv=10).mean())



0.8083333333333332




In [13]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', KNeighborsClassifier())])

    search_space = [{'classifier': [KNeighborsClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = KNeighborsClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.755, est=0.821, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
>acc=0.878, est=0.800, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
>acc=0.898, est=0.785, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
>acc=0.857, est=0.831, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
>acc=0.833, est=0.801, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
Accuracy: 0.844 (0.049)
Wall time: 581 ms


### Chi-Square

In [14]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', KNeighborsClassifier())])

In [15]:
search_space = [{'classifier': [KNeighborsClassifier()]}]

In [16]:
%%time
knn_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
knn_chi.fit(X_train, y_train)

Wall time: 131 ms




GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x0000025FC1FDE040>)),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid=[{'classifier': [KNeighborsClassifier()]}])

In [17]:
print(cross_val_score(knn_chi, X, y, cv=10).mean())



0.6508333333333334


In [15]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', KNeighborsClassifier())])
    # define search space
    search_space = [{'classifier': [KNeighborsClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = KNeighborsClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.878, est=0.851, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', KNeighborsClassifier())])
>acc=0.898, est=0.867, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', KNeighborsClassifier())])
>acc=0.939, est=0.851, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', KNeighborsClassifier())])
>acc=0.878, est=0.872, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', KNeighborsClassifier())])
>acc=0.875, est=0.826, cfg=Pipeline(steps=[('sca

### Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

### Information gain

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', LogisticRegression())])

search_space = [{'selector__k': ['all']},
                {'classifier': [LogisticRegression()]}]

In [21]:
%%time
logistic_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
logistic_info.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Wall time: 5.89 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x0000025FC31CCB80>)),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [LogisticRegression()]}])

In [22]:
print(cross_val_score(logistic_info, X, y, cv=10).mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8695


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=10)),
                 ('classifier', LogisticRegression())])
    # define search space
    search_space = [{'selector__k': [10]}, {'classifier': [LogisticRegression()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = LogisticRegression()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.898, est=0.908, cfg={'selector__k': 10}
>acc=0.939, est=0.903, cfg={'selector__k': 10}
>acc=0.878, est=0.887, cfg={'selector__k': 10}
>acc=0.898, est=0.938, cfg={'selector__k': 10}
>acc=0.875, est=0.918, cfg={'selector__k': 10}
Accuracy: 0.897 (0.023)
Wall time: 31.4 s


### Variance threshold

In [23]:
from sklearn.feature_selection import VarianceThreshold

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', LogisticRegression())])

search_space = [{'classifier': [LogisticRegression()]}]

In [25]:
%%time
logistic_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
logistic_variance.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Wall time: 503 ms


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(estimator=Pipeline(steps=[('selector', VarianceThreshold()),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'classifier': [LogisticRegression()]}],
             scoring='accuracy')

In [26]:
print(cross_val_score(logistic_variance, X, y, cv=10).mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8733333333333334


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', LogisticRegression())])

    search_space = [{'classifier': [LogisticRegression()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = LogisticRegression()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.837, est=0.877, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
>acc=0.878, est=0.851, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
>acc=0.939, est=0.836, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
>acc=0.837, est=0.887, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
>acc=0.875, est=0.852, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
Accuracy: 0.873 (0.037)
Wall time: 2.29 s


### Chi-Square

In [27]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', LogisticRegression())])

In [29]:
search_space = [{'classifier': [LogisticRegression()]}]

In [30]:
%%time
logistic_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
logistic_chi.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Wall time: 500 ms


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x0000025FC1FDE040>)),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'classifier': [LogisticRegression()]}])

In [31]:
print(cross_val_score(logistic_chi, X, y, cv=10).mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8325000000000001


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', LogisticRegression())])
    # define search space
    search_space = [{'classifier': [LogisticRegression()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = LogisticRegression()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.918, est=0.887, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', LogisticRegression())])
>acc=0.939, est=0.877, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', LogisticRegression())])
>acc=0.878, est=0.867, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', LogisticRegression())])
>acc=0.898, est=0.908, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', LogisticRegression())])
>acc=0.875, est=0.862, cfg=Pipeline(steps=[('scaler', Mi

## SVM

In [22]:
from sklearn.svm import SVC

### Information gain

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', SVC(kernel='linear'))])

search_space = [{'selector__k': ['all']},
                {'classifier': [SVC(kernel='linear')]}]

In [35]:
%%time
svm_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
svm_info.fit(X_train, y_train)



Wall time: 6.44 s


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x0000025FC31CCB80>)),
                                       ('classifier', SVC(kernel='linear'))]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [SVC(kernel='linear')]}])

In [36]:
print(cross_val_score(svm_info, X, y, cv=10).mean())



0.8365000000000002


In [24]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=15)),
                 ('classifier', SVC(kernel='linear'))])
    # define search space
    search_space = [{'selector__k': [15]}, {'classifier': [SVC(kernel='linear')]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = SVC(kernel='linear')
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.837, est=0.877, cfg={'selector__k': 15}
>acc=0.939, est=0.897, cfg={'classifier': SVC(kernel='linear')}
>acc=0.918, est=0.897, cfg={'selector__k': 15}
>acc=0.857, est=0.923, cfg={'selector__k': 15}
>acc=0.875, est=0.883, cfg={'selector__k': 15}
Accuracy: 0.885 (0.038)
Wall time: 30.5 s


### Variance threshold

In [37]:
from sklearn.feature_selection import VarianceThreshold

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', SVC(kernel='linear'))])

search_space = [{'classifier': [SVC(kernel='linear')]}]

In [39]:
%%time
svm_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
svm_variance.fit(X_train, y_train)

Wall time: 149 ms




GridSearchCV(estimator=Pipeline(steps=[('selector', VarianceThreshold()),
                                       ('classifier', SVC(kernel='linear'))]),
             param_grid=[{'classifier': [SVC(kernel='linear')]}],
             scoring='accuracy')

In [40]:
print(cross_val_score(svm_variance, X, y, cv=10).mean())



0.8494999999999999


In [25]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold(3)),
                 ('classifier', SVC(kernel='linear'))])

    search_space = [{'classifier': [SVC(kernel='linear')]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = SVC(kernel='linear')
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.816, est=0.810, cfg=Pipeline(steps=[('selector', VarianceThreshold(threshold=3)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.918, est=0.826, cfg=Pipeline(steps=[('selector', VarianceThreshold(threshold=3)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.918, est=0.846, cfg=Pipeline(steps=[('selector', VarianceThreshold(threshold=3)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.857, est=0.897, cfg=Pipeline(steps=[('selector', VarianceThreshold(threshold=3)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.917, est=0.821, cfg=Pipeline(steps=[('selector', VarianceThreshold(threshold=3)),
                ('classifier', SVC(kernel='linear'))])
Accuracy: 0.885 (0.042)
Wall time: 2.71 s


### Chi-Square

In [41]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', SVC())])

In [43]:
search_space = [{'classifier': [SVC()]}]

In [44]:
%%time
svm_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
svm_chi.fit(X_train, y_train)

Wall time: 141 ms




GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x0000025FC1FDE040>)),
                                       ('classifier', SVC())]),
             param_grid=[{'classifier': [SVC()]}])

In [45]:
print(cross_val_score(svm_chi, X, y, cv=10).mean())



0.7495


In [26]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=15)),
                 ('classifier', SVC(kernel='linear'))])
    # define search space
    search_space = [{'classifier': [SVC(kernel='linear')]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = SVC(kernel='linear')
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.837, est=0.862, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(k=15,
                             score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.918, est=0.877, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(k=15,
                             score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.918, est=0.867, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(k=15,
                             score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.837, est=0.862, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(k=15,
                             score_func=<function ch

## Naive Bayes

In [27]:
from sklearn.naive_bayes import GaussianNB

### Information gain

In [47]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', GaussianNB())])

search_space = [{'selector__k': ['all']},
                {'classifier': [GaussianNB()]}]

In [49]:
%%time
nb_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_info.fit(X_train, y_train)



Wall time: 5.57 s


GridSearchCV(estimator=Pipeline(steps=[('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x0000025FC31CCB80>)),
                                       ('classifier', GaussianNB())]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [GaussianNB()]}])

In [50]:
print(cross_val_score(nb_info, X, y, cv=10).mean())



0.8275


In [28]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=15)),
                 ('classifier', GaussianNB())])
    # define search space
    search_space = [{'selector__k': [15]}, {'classifier': [GaussianNB()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = GaussianNB()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.673, est=0.646, cfg={'classifier': GaussianNB()}
>acc=0.286, est=0.467, cfg={'selector__k': 15}
>acc=0.306, est=0.359, cfg={'selector__k': 15}
>acc=0.776, est=0.815, cfg={'selector__k': 15}
>acc=0.479, est=0.501, cfg={'classifier': GaussianNB()}
Accuracy: 0.504 (0.195)
Wall time: 30 s


### Variance threshold

In [51]:
from sklearn.feature_selection import VarianceThreshold

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', GaussianNB())])

search_space = [{'classifier': [GaussianNB()]}]

In [53]:
nb_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
nb_variance.fit(X_train, y_train)



GridSearchCV(estimator=Pipeline(steps=[('selector', VarianceThreshold()),
                                       ('classifier', GaussianNB())]),
             param_grid=[{'classifier': [GaussianNB()]}], scoring='accuracy')

In [54]:
print(cross_val_score(nb_variance, X, y, cv=10).mean())



0.8275


In [29]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', GaussianNB())])

    search_space = [{'classifier': [GaussianNB()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = GaussianNB()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.816, est=0.872, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
>acc=0.837, est=0.836, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
>acc=0.796, est=0.831, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
>acc=0.857, est=0.836, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
>acc=0.875, est=0.841, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
Accuracy: 0.836 (0.028)
Wall time: 639 ms


### Chi-Square

In [55]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', GaussianNB())])

In [57]:
search_space = [{'classifier': [GaussianNB()]}]

In [58]:
%%time
nb_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_chi.fit(X_train, y_train)

Wall time: 108 ms




GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x0000025FC1FDE040>)),
                                       ('classifier', GaussianNB())]),
             param_grid=[{'classifier': [GaussianNB()]}])

In [59]:
print(cross_val_score(nb_chi, X, y, cv=10).mean())



0.7575000000000001


In [30]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', GaussianNB())])
    # define search space
    search_space = [{'classifier': [GaussianNB()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = GaussianNB()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.429, est=0.559, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', GaussianNB())])
>acc=0.367, est=0.400, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', GaussianNB())])
>acc=0.388, est=0.359, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', GaussianNB())])
>acc=0.776, est=0.790, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', GaussianNB())])
>acc=0.500, est=0.460, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                (

## Random Forest

In [31]:
from sklearn.ensemble import RandomForestRegressor

### Information gain

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', RandomForestRegressor())])

search_space = [{'selector__k': ['all']},
                {'classifier': [RandomForestRegressor()]}]

In [9]:
%%time
nb_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_info.fit(X_train, y_train)

Wall time: 6.2 s


GridSearchCV(estimator=Pipeline(steps=[('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x0000024B65865A60>)),
                                       ('classifier',
                                        RandomForestRegressor())]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [RandomForestRegressor()]}])

In [10]:
print(cross_val_score(nb_info, X, y, cv=10).mean())

0.8513298086553723


In [32]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', RandomForestRegressor())])

    search_space = [{'selector__k': ['all']},
                {'classifier': [RandomForestRegressor()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = RandomForestRegressor()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

### Chi-Square

In [15]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', RandomForestRegressor())])

In [17]:
search_space = [{'classifier': [RandomForestRegressor()]}]

In [18]:
%%time
nb_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_chi.fit(X_train, y_train)

Wall time: 1.42 s


GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x0000024B6465BEE0>)),
                                       ('classifier',
                                        RandomForestRegressor())]),
             param_grid=[{'classifier': [RandomForestRegressor()]}])

In [19]:
print(cross_val_score(nb_chi, X, y, cv=10).mean())

0.8523868173889513


In [None]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', RandomForestRegressor())])
    # define search space
    search_space = [{'classifier': [RandomForestRegressor()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
    # execute search
    result = _info.fit(X_train, y_train)
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = RandomForestRegressor()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

In [35]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier

In [6]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', MLPClassifier())])

search_space = [{'selector__k': ['all']},
                {'classifier': [MLPClassifier()]}]

nb_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_info.fit(X_train, y_train)

print(cross_val_score(nb_info, X, y, cv=10).mean())





0.8278333333333334








0.8366666666666667








0.8161666666666665




In [36]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=10)),
                 ('classifier', MLPClassifier())])
    # define search space
    search_space = [{'selector__k': [10]}, {'classifier': [MLPClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = MLPClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.837, est=0.882, cfg={'classifier': MLPClassifier()}
>acc=0.918, est=0.903, cfg={'selector__k': 10}
>acc=0.857, est=0.877, cfg={'selector__k': 10}
>acc=0.694, est=0.903, cfg={'selector__k': 10}
>acc=0.896, est=0.893, cfg={'selector__k': 10}
Accuracy: 0.840 (0.079)
Wall time: 47.4 s


In [None]:

### Variance threshold

from sklearn.feature_selection import VarianceThreshold

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', VarianceThreshold()),
                 ('classifier', MLPClassifier())])

search_space = [{'classifier': [MLPClassifier()]}]

nb_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
nb_variance.fit(X_train, y_train)

print(cross_val_score(nb_variance, X, y, cv=10).mean())

In [37]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', MLPClassifier())])

    search_space = [{'classifier': [MLPClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = MLPClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.837, est=0.800, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
>acc=0.898, est=0.821, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
>acc=0.898, est=0.831, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
>acc=0.837, est=0.867, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
>acc=0.792, est=0.837, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
Accuracy: 0.852 (0.041)
Wall time: 13.7 s


In [None]:
### Chi-Square

from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', MLPClassifier())])

search_space = [{'classifier': [MLPClassifier()]}]

nb_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_chi.fit(X_train, y_train)

print(cross_val_score(nb_chi, X, y, cv=10).mean())

In [38]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', MLPClassifier())])
    # define search space
    search_space = [{'classifier': [MLPClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = MLPClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.776, est=0.887, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', MLPClassifier())])
>acc=0.939, est=0.867, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', MLPClassifier())])
>acc=0.898, est=0.851, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', MLPClassifier())])
>acc=0.857, est=0.856, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x0000011BE2A1EF70>)),
                ('classifier', MLPClassifier())])
>acc=0.875, est=0.857, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
     