## 1. Execute imports

In [5]:
import pandas as pd
import numpy as np
from genetic_selection import GeneticSelectionCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [6]:
import warnings
warnings.filterwarnings("ignore")
data_glass = pd.read_csv('datasets/user_knowledge.csv', delimiter=',')
X = pd.read_csv('datasets/user_knowledge_formula_a.csv', delimiter=',')
y = LabelEncoder().fit_transform(data_glass['UNS'])
features = list(X.columns.values)
y = pd.DataFrame(y)

## KNN

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from numpy import mean
from numpy import std

### Information gain

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', KNeighborsClassifier())])

search_space = [{'selector__k': ['all']},
                {'classifier': [KNeighborsClassifier()]}]

In [6]:
%%time
knn_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
knn_info.fit(X_train, y_train)

Wall time: 1.36 s


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x0000029514B31AF0>)),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [KNeighborsClassifier()]}])

In [7]:
print(cross_val_score(knn_info, X, y, cv=10).mean())

0.8069512195121951


In [9]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', KNeighborsClassifier())])
    # define search space
    search_space = [{'selector__k': ['all']}, {'classifier': [KNeighborsClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
    # evaluate model on the hold out dataset
    yhat = best_model.predict(X_test)
    # evaluate the model
    acc = accuracy_score(y_test, yhat)
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f' % (acc, result.best_score_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (mean(outer_results), std(outer_results)))

>acc=0.741, est=0.779
>acc=0.840, est=0.773
>acc=0.864, est=0.755
>acc=0.812, est=0.762
>acc=0.725, est=0.759
Accuracy: 0.796 (0.055)
Wall time: 10.2 s


### Variance threshold

In [10]:
from sklearn.feature_selection import VarianceThreshold

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', KNeighborsClassifier())])

search_space = [{'classifier': [KNeighborsClassifier()]}]

In [10]:
%%time
knn_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
knn_variance.fit(X_train, y_train)

Wall time: 113 ms


GridSearchCV(estimator=Pipeline(steps=[('selector', VarianceThreshold()),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid=[{'classifier': [KNeighborsClassifier()]}],
             scoring='accuracy')

In [11]:
print(cross_val_score(knn_variance, X, y, cv=10).mean())

0.8318292682926829


In [11]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', KNeighborsClassifier())])

    search_space = [{'classifier': [KNeighborsClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = KNeighborsClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.815, est=0.801, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
>acc=0.864, est=0.826, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
>acc=0.901, est=0.783, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
>acc=0.812, est=0.793, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
>acc=0.800, est=0.792, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier())])
Accuracy: 0.839 (0.038)
Wall time: 551 ms


### Chi-Square

In [12]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', KNeighborsClassifier())])

In [14]:
search_space = [{'classifier': [KNeighborsClassifier()]}]

In [15]:
%%time
knn_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
knn_chi.fit(X_train, y_train)

Wall time: 62.1 ms


GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x00000295148F9F70>)),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid=[{'classifier': [KNeighborsClassifier()]}])

In [16]:
print(cross_val_score(knn_chi, X, y, cv=10).mean())

0.8293902439024391


In [13]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', KNeighborsClassifier())])
    # define search space
    search_space = [{'classifier': [KNeighborsClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = KNeighborsClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.889, est=0.851, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', KNeighborsClassifier())])
>acc=0.852, est=0.873, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', KNeighborsClassifier())])
>acc=0.926, est=0.814, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', KNeighborsClassifier())])
>acc=0.863, est=0.855, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', KNeighborsClassifier())])
>acc=0.875, est=0.858, cfg=Pipeline(steps=[('sca

### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

### Information gain

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', LogisticRegression())])

search_space = [{'selector__k': ['all']},
                {'classifier': [LogisticRegression()]}]

In [20]:
%%time
logistic_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
logistic_info.fit(X_train, y_train)

Wall time: 1.23 s


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x0000029514B31AF0>)),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [LogisticRegression()]}])

In [21]:
print(cross_val_score(logistic_info, X, y, cv=10).mean())

0.9107926829268294


In [15]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=10)),
                 ('classifier', LogisticRegression())])
    # define search space
    search_space = [{'selector__k': [10]}, {'classifier': [LogisticRegression()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = LogisticRegression()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.840, est=0.891, cfg={'selector__k': 10}
>acc=0.765, est=0.907, cfg={'selector__k': 10}
>acc=0.864, est=0.891, cfg={'classifier': LogisticRegression()}
>acc=0.812, est=0.892, cfg={'classifier': LogisticRegression()}
>acc=0.863, est=0.892, cfg={'classifier': LogisticRegression()}
Accuracy: 0.829 (0.037)
Wall time: 11.9 s


### Variance threshold

In [16]:
from sklearn.feature_selection import VarianceThreshold

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', LogisticRegression())])

search_space = [{'classifier': [LogisticRegression()]}]

In [24]:
%%time
logistic_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
logistic_variance.fit(X_train, y_train)

Wall time: 234 ms


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(estimator=Pipeline(steps=[('selector', VarianceThreshold()),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'classifier': [LogisticRegression()]}],
             scoring='accuracy')

In [25]:
print(cross_val_score(logistic_variance, X, y, cv=10).mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8439024390243901


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', LogisticRegression())])

    search_space = [{'classifier': [LogisticRegression()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = LogisticRegression()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.827, est=0.832, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
>acc=0.765, est=0.792, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
>acc=0.877, est=0.777, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
>acc=0.812, est=0.821, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
>acc=0.863, est=0.792, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', LogisticRegression())])
Accuracy: 0.829 (0.039)
Wall time: 2.23 s


### Chi-Square

In [18]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', LogisticRegression())])

In [28]:
search_space = [{'classifier': [LogisticRegression()]}]

In [29]:
%%time
logistic_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
logistic_chi.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Wall time: 269 ms


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x00000295148F9F70>)),
                                       ('classifier', LogisticRegression())]),
             param_grid=[{'classifier': [LogisticRegression()]}])

In [30]:
print(cross_val_score(logistic_chi, X, y, cv=10).mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8339024390243903


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [19]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', LogisticRegression())])
    # define search space
    search_space = [{'classifier': [LogisticRegression()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = LogisticRegression()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.840, est=0.811, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', LogisticRegression())])
>acc=0.753, est=0.774, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', LogisticRegression())])
>acc=0.877, est=0.774, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', LogisticRegression())])
>acc=0.787, est=0.802, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', LogisticRegression())])
>acc=0.863, est=0.783, cfg=Pipeline(steps=[('scaler', Mi

## SVM

In [20]:
from sklearn.svm import SVC

### Information gain

In [32]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', SVC(kernel='linear'))])

search_space = [{'selector__k': ['all']},
                {'classifier': [SVC(kernel='linear')]}]

In [34]:
%%time
svm_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
svm_info.fit(X_train, y_train)

Wall time: 984 ms


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x0000029514B31AF0>)),
                                       ('classifier', SVC(kernel='linear'))]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [SVC(kernel='linear')]}])

In [35]:
print(cross_val_score(svm_info, X, y, cv=10).mean())

0.9232317073170732


In [21]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=9)),
                 ('classifier', SVC(kernel='linear'))])
    # define search space
    search_space = [{'selector__k': [9]}, {'classifier': [SVC(kernel='linear')]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = SVC(kernel='linear')
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.889, est=0.923, cfg={'classifier': SVC(kernel='linear')}
>acc=0.864, est=0.932, cfg={'selector__k': 9}
>acc=0.938, est=0.913, cfg={'selector__k': 9}
>acc=0.838, est=0.935, cfg={'selector__k': 9}
>acc=0.925, est=0.913, cfg={'selector__k': 9}
Accuracy: 0.891 (0.037)
Wall time: 9.31 s


### Variance threshold

In [22]:
from sklearn.feature_selection import VarianceThreshold

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', SVC(kernel='linear'))])

search_space = [{'classifier': [SVC(kernel='linear')]}]

In [38]:
%%time
svm_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
svm_variance.fit(X_train, y_train)

Wall time: 77.4 ms


GridSearchCV(estimator=Pipeline(steps=[('selector', VarianceThreshold()),
                                       ('classifier', SVC(kernel='linear'))]),
             param_grid=[{'classifier': [SVC(kernel='linear')]}],
             scoring='accuracy')

In [39]:
print(cross_val_score(svm_variance, X, y, cv=10).mean())

0.9083536585365852


In [25]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', SVC(kernel='linear'))])

    search_space = [{'classifier': [SVC(kernel='linear')]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = SVC(kernel='linear')
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.901, est=0.888, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', SVC(kernel='linear'))])
>acc=0.889, est=0.863, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', SVC(kernel='linear'))])
>acc=0.951, est=0.866, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', SVC(kernel='linear'))])
>acc=0.850, est=0.907, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', SVC(kernel='linear'))])
>acc=0.912, est=0.895, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', SVC(kernel='linear'))])
Accuracy: 0.901 (0.033)
Wall time: 424 ms


### Chi-Square

In [26]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', SVC())])

In [42]:
search_space = [{'classifier': [SVC()]}]

In [43]:
%%time
svm_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
svm_chi.fit(X_train, y_train)

Wall time: 99.3 ms


GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x00000295148F9F70>)),
                                       ('classifier', SVC())]),
             param_grid=[{'classifier': [SVC()]}])

In [44]:
print(cross_val_score(svm_chi, X, y, cv=10).mean())

0.9108536585365854


In [27]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', SVC(kernel='linear'))])
    # define search space
    search_space = [{'classifier': [SVC(kernel='linear')]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = SVC(kernel='linear')
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.901, est=0.867, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.864, est=0.842, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.938, est=0.851, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.838, est=0.892, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', SVC(kernel='linear'))])
>acc=0.925, est=0.882, cfg=Pipeline(steps=[('scaler', Mi

## Naive Bayes

In [28]:
from sklearn.naive_bayes import GaussianNB

### Information gain

In [46]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', GaussianNB())])

search_space = [{'selector__k': ['all']},
                {'classifier': [GaussianNB()]}]

In [48]:
%%time
nb_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_info.fit(X_train, y_train)

Wall time: 1.35 s


GridSearchCV(estimator=Pipeline(steps=[('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function mutual_info_classif at 0x0000029514B31AF0>)),
                                       ('classifier', GaussianNB())]),
             param_grid=[{'selector__k': ['all']},
                         {'classifier': [GaussianNB()]}])

In [49]:
print(cross_val_score(nb_info, X, y, cv=10).mean())

0.821890243902439


In [29]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=10)),
                 ('classifier', GaussianNB())])
    # define search space
    search_space = [{'selector__k': [10]}, {'classifier': [GaussianNB()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = GaussianNB()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.914, est=0.858, cfg={'selector__k': 10}
>acc=0.840, est=0.845, cfg={'selector__k': 10}
>acc=0.877, est=0.833, cfg={'classifier': GaussianNB()}
>acc=0.850, est=0.845, cfg={'selector__k': 10}
>acc=0.863, est=0.845, cfg={'classifier': GaussianNB()}
Accuracy: 0.868 (0.026)
Wall time: 10.7 s


### Variance threshold

In [50]:
from sklearn.feature_selection import VarianceThreshold

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', GaussianNB())])

search_space = [{'classifier': [GaussianNB()]}]

In [52]:
nb_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
nb_variance.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('selector', VarianceThreshold()),
                                       ('classifier', GaussianNB())]),
             param_grid=[{'classifier': [GaussianNB()]}], scoring='accuracy')

In [53]:
print(cross_val_score(nb_variance, X, y, cv=10).mean())

0.821890243902439


In [30]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', GaussianNB())])

    search_space = [{'classifier': [GaussianNB()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = GaussianNB()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.889, est=0.817, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
>acc=0.790, est=0.817, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
>acc=0.840, est=0.792, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
>acc=0.838, est=0.799, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
>acc=0.787, est=0.811, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', GaussianNB())])
Accuracy: 0.829 (0.037)
Wall time: 420 ms


### Chi-Square

In [54]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', GaussianNB())])

In [56]:
search_space = [{'classifier': [GaussianNB()]}]

In [57]:
%%time
nb_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_chi.fit(X_train, y_train)

Wall time: 59.8 ms


GridSearchCV(estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('selector',
                                        SelectKBest(k='all',
                                                    score_func=<function chi2 at 0x00000295148F9F70>)),
                                       ('classifier', GaussianNB())]),
             param_grid=[{'classifier': [GaussianNB()]}])

In [58]:
print(cross_val_score(nb_chi, X, y, cv=10).mean())

0.821890243902439


In [31]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', GaussianNB())])
    # define search space
    search_space = [{'classifier': [GaussianNB()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = GaussianNB()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.901, est=0.854, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', GaussianNB())])
>acc=0.840, est=0.854, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', GaussianNB())])
>acc=0.852, est=0.833, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', GaussianNB())])
>acc=0.875, est=0.848, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', GaussianNB())])
>acc=0.850, est=0.833, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                (

In [59]:
## Random Forest

from sklearn.ensemble import RandomForestRegressor

### Information gain

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', RandomForestRegressor())])

search_space = [{'selector__k': ['all']},
                {'classifier': [RandomForestRegressor()]}]

nb_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_info.fit(X_train, y_train)
print('Random Forest -> Information gain')
print(cross_val_score(nb_info, X, y, cv=10).mean())

### Chi-Square

from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', RandomForestRegressor())])

search_space = [{'classifier': [RandomForestRegressor()]}]

nb_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_chi.fit(X_train, y_train)

print('Random Forest -> Chi-Square')
print(cross_val_score(nb_chi, X, y, cv=10).mean())


from sklearn.preprocessing import MinMaxScaler

from sklearn.neural_network import MLPClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([
                 ('selector', SelectKBest(mutual_info_classif, k='all')),
                 ('classifier', MLPClassifier())])

search_space = [{'selector__k': ['all']},
                {'classifier': [MLPClassifier()]}]

nb_info = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_info.fit(X_train, y_train)

print('MLPClassifier -> Info')
print(cross_val_score(nb_info, X, y, cv=10).mean())

### Variance threshold

from sklearn.feature_selection import VarianceThreshold

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', VarianceThreshold()),
                 ('classifier', MLPClassifier())])

search_space = [{'classifier': [MLPClassifier()]}]

nb_variance = GridSearchCV(pipe, search_space, cv=None, verbose=0, scoring= 'accuracy')
nb_variance.fit(X_train, y_train)

print('MLPClassifier -> Variance')
print(cross_val_score(nb_variance, X, y, cv=10).mean())


### Chi-Square

from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k='all')),
                 ('classifier', MLPClassifier())])

search_space = [{'classifier': [MLPClassifier()]}]

nb_chi = GridSearchCV(pipe, search_space, cv=None, verbose=0)
nb_chi.fit(X_train, y_train)

print('MLPClassifier -> Chi')
print(cross_val_score(nb_chi, X, y, cv=10).mean())

Random Forest -> Information gain
0.838656406457251
Random Forest -> Chi-Square
0.8354181794508936




MLPClassifier -> Info










0.8785365853658537




MLPClassifier -> Variance






0.8711585365853658




MLPClassifier -> Chi






0.8686585365853657




In [32]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier

In [33]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=10)),
                 ('classifier', MLPClassifier())])
    # define search space
    search_space = [{'selector__k': [10]}, {'classifier': [MLPClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = MLPClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.852, est=0.888, cfg={'classifier': MLPClassifier()}
>acc=0.877, est=0.888, cfg={'selector__k': 10}
>acc=0.914, est=0.873, cfg={'classifier': MLPClassifier()}
>acc=0.863, est=0.883, cfg={'classifier': MLPClassifier()}
>acc=0.900, est=0.873, cfg={'classifier': MLPClassifier()}
Accuracy: 0.881 (0.023)
Wall time: 1min 1s


In [34]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix].loc[:, features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('selector', VarianceThreshold()),
                 ('classifier', MLPClassifier())])

    search_space = [{'classifier': [MLPClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    select_indexes = np.arange(len(X_train.columns)).reshape(1, -1)
    select_indexes = result.best_estimator_.named_steps['selector'].transform(select_indexes)
    selected_features = X.iloc[:,select_indexes.reshape(-1)]
    x_t = X_test[selected_features.columns.values]
    
    clf = MLPClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.840, est=0.870, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
>acc=0.864, est=0.848, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
>acc=0.901, est=0.839, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
>acc=0.838, est=0.855, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
>acc=0.912, est=0.867, cfg=Pipeline(steps=[('selector', VarianceThreshold()),
                ('classifier', MLPClassifier())])
Accuracy: 0.871 (0.031)
Wall time: 30.5 s


In [35]:
%%time
features = list(X.columns.values)
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# enumerate splits
outer_results = list()
for train_ix, test_ix in kf.split(X):
    # split data
    X_train, X_test = X.iloc[train_ix][features], X.iloc[test_ix][features]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    # define the model
    pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('selector', SelectKBest(chi2, k=10)),
                 ('classifier', MLPClassifier())])
    # define search space
    search_space = [{'classifier': [MLPClassifier()]}]
    # define search
    _info = GridSearchCV(pipe, search_space, scoring='accuracy', cv=None, refit=True)
    # execute search
    result = _info.fit(X_train, y_train)
    
    selected = result.best_estimator_.named_steps['selector'].get_support(indices=True)
    selected_features = X.iloc[:,selected]
    x_t = X_test[selected_features.columns.values]
    
    clf = MLPClassifier()
    clf.fit(X_train[selected_features.columns.values], y_train)
    # evaluate the model
    acc = accuracy_score(y_test, clf.predict(x_t))
    # store the result
    outer_results.append(acc)
    # report progress
    print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_estimator_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (np.mean(outer_results), np.std(outer_results)))

>acc=0.852, est=0.839, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', MLPClassifier())])
>acc=0.827, est=0.820, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', MLPClassifier())])
>acc=0.901, est=0.823, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', MLPClassifier())])
>acc=0.812, est=0.833, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('selector',
                 SelectKBest(score_func=<function chi2 at 0x000001BB4FF5EF70>)),
                ('classifier', MLPClassifier())])
>acc=0.900, est=0.830, cfg=Pipeline(steps=[('scaler', MinMaxScaler()),
     