In [52]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array, as_float_array
from sklearn.datasets import make_regression

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier

from bayes_opt import BayesianOptimization

from sklearn.feature_selection import RFE
from mlxtend.feature_selection import SequentialFeatureSelector, ExhaustiveFeatureSelector

In [40]:
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides any convergence or deprecation warnings the estimators
# fitted later might raise -- consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

# RF classifier optimization code

In [48]:
def bayesian_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    """Tune a LightGBM random-forest classifier with Bayesian optimization.

    The objective is the mean cross-validated accuracy of an
    ``LGBMClassifier(boosting_type='rf')`` evaluated with an unshuffled
    ``KFold``. The optimizer is seeded with 10 random points and then
    runs ``max_iter_opt`` guided iterations.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``cross_val_score``.
    y : array-like
        Target values; ``y.squeeze()`` is applied before scoring.
    cv : int, default 6
        Number of ``KFold`` splits.
    max_iter_opt : int, default 15
        Number of optimization iterations after the initial random points.

    Returns
    -------
    dict
        Best parameters found, with keys ``feature_fraction``,
        ``bagging_fraction``, ``bagging_freq`` and ``n_estimators``. The last
        two are returned as ``int`` (truncated the same way the objective
        truncates them), so the dict can be unpacked straight into
        ``LGBMClassifier``.
    """
    target = y.squeeze()

    def cv_accuracy(feature_fraction, bagging_freq, bagging_fraction, n_estimators):
        # The optimizer proposes floats; integer-valued hyperparameters are truncated.
        model = LGBMClassifier(
            boosting_type='rf',
            feature_fraction=feature_fraction,
            bagging_freq=int(bagging_freq),
            bagging_fraction=bagging_fraction,
            n_estimators=int(n_estimators)
        )
        # The split generator is consumed by one evaluation, so build a new one each call.
        folds = KFold(n_splits=cv).split(X)
        return cross_val_score(model, X, target, cv=folds, scoring='accuracy').mean()

    optimizer = BayesianOptimization(
        cv_accuracy,
        {'feature_fraction': (0.05, 0.95),
         'bagging_fraction': (0.05, 0.95),
         'bagging_freq': (1, 50),
         'n_estimators': (5, 50)},
        verbose=0
    )

    # `init()` only exists in pre-1.0 bayes_opt; passing `init_points` to
    # `maximize` requests the same 10 random starting points in old and new APIs.
    optimizer.maximize(init_points=10, n_iter=max_iter_opt)

    # Pre-1.0 bayes_opt stores results in a dict (`res['max']['max_params']`);
    # later versions expose the best point through the `max` property instead.
    results = optimizer.res
    if isinstance(results, dict):
        best_params = results['max']['max_params']
    else:
        best_params = optimizer.max['params']

    # Copy before editing so the optimizer's own record is left untouched.
    best_params = dict(best_params)
    for name in ('bagging_freq', 'n_estimators'):
        best_params[name] = int(best_params[name])
    return best_params

In [49]:
# Load the breast-cancer table from the project's datasets folder.
df = pd.read_csv('./classification/datasets/cancer/breast_cancer.csv')
# Encode the diagnosis label: 'M' -> 0, 'B' -> 1. The explicit int cast keeps `y`
# numeric even when `replace` leaves the column with object dtype.
y = df.diagnosis.replace({'M': 0, 'B': 1}).astype(int).values
# Everything except the label, the record id and the 'Unnamed: 32' column is a feature.
# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` returns the same ndarray.
X = df.drop(['diagnosis', 'id', 'Unnamed: 32'], axis=1).values

In [53]:
# Search the LightGBM random-forest hyperparameters using 4 CV folds and
# 10 optimization iterations; the result is the best parameter dict found.
params_opt = bayesian_optimization_lightgbm(X, y, cv=4, max_iter_opt=10)

In [57]:
# Make sure the two integer-valued hyperparameters are plain ints before the
# dict is unpacked into the estimator.
params_opt.update(
    bagging_freq=int(params_opt['bagging_freq']),
    n_estimators=int(params_opt['n_estimators']),
)

In [58]:
# Candidate estimators for the feature-selection experiments, keyed by a short name.
models = {
    # LightGBM in random-forest mode with the tuned hyperparameters.
    'rf': LGBMClassifier(boosting_type='rf', **params_opt),
    'lr': LogisticRegression(random_state=42, multi_class='ovr', solver='liblinear', C=10000, tol=1e-2),
    # Seeded like the logistic regression so its fits are reproducible across runs.
    'svc': LinearSVC(random_state=42, multi_class='ovr', C=10000, tol=1e-2, max_iter=250),
}

# SequentialFeatureSelection

In [69]:
# Sequential feature selection down to 10 features, once per candidate model.
for model_name, estimator in models.items():
    print('model type: {}'.format(model_name))
    sfs = SequentialFeatureSelector(estimator, k_features=10, scoring='accuracy')
    sfs.fit(X, y)
    # `subsets_` is keyed by subset size; 10 matches `k_features` above.
    best_subset = sfs.subsets_[10]
    print(best_subset['feature_names'])
    print(best_subset['avg_score'])

model type: rf
('1', '2', '6', '14', '15', '20', '21', '24', '27', '28')
0.943886110042324
model type: lr
('0', '6', '9', '14', '15', '17', '19', '21', '22', '25')
0.9631704501731434
model type: svc
('5', '6', '7', '9', '10', '11', '14', '16', '18', '26')
0.945686802616391


# ExhaustiveFeatureSelection

In [60]:
# Exhaustive search over feature subsets of at most 5 features, scored by
# accuracy with the logistic-regression model.
efs = ExhaustiveFeatureSelector(
    estimator=models['lr'],
    max_features=5,
    scoring='accuracy',
    n_jobs=-1,
    print_progress=False,
)

In [61]:
# Run the exhaustive search on the full dataset; the selector's repr is the cell output.
efs.fit(X, y)

ExhaustiveFeatureSelector(clone_estimator=True, cv=5,
             estimator=LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=42, solver='liblinear',
          tol=0.01, verbose=0, warm_start=False),
             max_features=5, min_features=1, n_jobs=-1,
             pre_dispatch='2*n_jobs', print_progress=False,
             scoring='accuracy')

In [62]:
# Report the best feature subset found and its score, one per line.
print(efs.best_feature_names_, efs.best_score_, sep='\n')

('6', '20', '21', '27', '28')
0.9737283570604077


# RecursiveFeatureElimination

In [67]:
# Recursive feature elimination down to 10 features, once per candidate model.
for model_name, estimator in models.items():
    print('model type: {}'.format(model_name))
    rfe = RFE(estimator=estimator, n_features_to_select=10)
    rfe.fit(X, y)
    # Column indices of the features RFE kept.
    print(np.flatnonzero(rfe.support_))
    # NOTE(review): this scores on the same X, y used for fitting, so it is a
    # training-set accuracy rather than a cross-validated one.
    print(rfe.score(X, y))

model type: rf
[ 0  1  6  7 10 13 21 22 23 27]
0.9718804920913884
model type: lr
[ 0  1  2  3 13 20 21 22 23 26]
0.9244288224956063
model type: svc
[ 0  2  3 13 20 21 22 23 25 26]
0.9261862917398945
