In [24]:
import numpy as np
import pandas as pd
from importlib import reload
from copy import deepcopy as copy

from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array, as_float_array
from sklearn.datasets import make_regression

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier

from bayes_opt import BayesianOptimization

from sklearn.feature_selection import RFE
from mlxtend.feature_selection import SequentialFeatureSelector, ExhaustiveFeatureSelector

In [12]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
import sys

# Make the parent directory importable so the project-local `mlrank` package
# can be found. The entry is relative, so it depends on the kernel's working
# directory. The membership guard keeps the cell idempotent: re-running it no
# longer appends a duplicate '../' entry each time.
if '../' not in sys.path:
    sys.path.append('../')

In [19]:
# Project-local hyper-parameter search helpers.
from mlrank import hyperparams_opt

# Re-execute the module so that edits made on disk take effect without
# restarting the kernel.
reload(hyperparams_opt)

# Short alias for the tuning entry point, taken from the freshly reloaded module.
bayesian_optimization_lightgbm = hyperparams_opt.bayesian_optimization_lightgbm

In [15]:
# Load the breast-cancer table; the path is relative to the kernel's working directory.
df = pd.read_csv('./classification/datasets/cancer/breast_cancer.csv')

# Binary target: 'M' -> 0, 'B' -> 1 (presumably malignant / benign -- confirm
# against the dataset description). An explicit mapping plus a check makes an
# unexpected label fail loudly instead of leaking a string into `y`.
labels = df.diagnosis.map({'M': 0, 'B': 1})
assert labels.notna().all(), "diagnosis column contains labels other than 'M'/'B'"
y = labels.astype('int64').values

# Feature matrix: everything except the target, the row identifier and the
# 'Unnamed: 32' column (presumably an empty artefact of the CSV export -- confirm).
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
X = df.drop(['diagnosis', 'id', 'Unnamed: 32'], axis=1).values

In [20]:
# Tune LightGBM hyper-parameters on the full data set. The helper lives in
# mlrank.hyperparams_opt and is not shown here; cv=4 / max_iter_opt=10
# presumably mean 4-fold CV and 10 optimisation rounds -- TODO confirm.
# The result is used as a keyword mapping when the models are built.
params_opt = bayesian_optimization_lightgbm(
    X,
    y,
    cv=4,
    max_iter_opt=10,
)

In [21]:
# These two settings are cast to int before being passed to LGBMClassifier;
# presumably the optimiser returns them as floats (confirm). int() truncates
# toward zero.
for int_param in ('bagging_freq', 'n_estimators'):
    params_opt[int_param] = int(params_opt[int_param])

In [22]:
# Estimators compared by the feature-selection methods below, keyed by a short name.
models = {
    # LightGBM in random-forest mode, configured with the tuned hyper-parameters.
    # NOTE(review): no seed is passed here unless params_opt carries one -- confirm.
    'rf': LGBMClassifier(boosting_type='rf', **params_opt),
    'lr': LogisticRegression(random_state=42, multi_class='ovr', solver='liblinear', C=10000, tol=1e-2),
    # Seeded like the logistic regression so that repeated runs select the same
    # features; LinearSVC's default dual solver shuffles the data with this RNG.
    'svc': LinearSVC(random_state=42, multi_class='ovr', C=10000, tol=1e-2, max_iter=250),
}

# SequentialFeatureSelection

In [69]:
# Size of the feature subset requested from the sequential selector; the same
# number is used to look up the result in `subsets_`.
n_selected = 10

for model_name, estimator in models.items():
    print('model type: {}'.format(model_name))
    sfs = SequentialFeatureSelector(estimator, k_features=n_selected, scoring='accuracy')
    sfs.fit(X, y)
    # Entry for the requested subset size: selected feature names and their
    # average accuracy score.
    best_subset = sfs.subsets_[n_selected]
    print(best_subset['feature_names'])
    print(best_subset['avg_score'])

model type: rf
('1', '2', '6', '14', '15', '20', '21', '24', '27', '28')
0.943886110042324
model type: lr
('0', '6', '9', '14', '15', '17', '19', '21', '22', '25')
0.9631704501731434
model type: svc
('5', '6', '7', '9', '10', '11', '14', '16', '18', '26')
0.945686802616391


# ExhaustiveFeatureSelection

In [60]:
# Exhaustive subset search, run for the logistic-regression model only.
# Subsets of up to 5 features are scored by accuracy, using all available cores.
efs = ExhaustiveFeatureSelector(
    estimator=models['lr'],
    max_features=5,
    scoring='accuracy',
    n_jobs=-1,
    print_progress=False,
)

In [61]:
# Scores every feature subset of size 1 to 5 (min_features=1 and cv=5 per the
# repr printed by this cell). With the 30 feature columns used here that is
# 174,436 candidate subsets, so this is an expensive cell to re-run.
efs.fit(X, y)

ExhaustiveFeatureSelector(clone_estimator=True, cv=5,
             estimator=LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=42, solver='liblinear',
          tol=0.01, verbose=0, warm_start=False),
             max_features=5, min_features=1, n_jobs=-1,
             pre_dispatch='2*n_jobs', print_progress=False,
             scoring='accuracy')

In [62]:
# Best subset found by the exhaustive search and its score, one per line.
print(efs.best_feature_names_, efs.best_score_, sep='\n')

('6', '20', '21', '27', '28')
0.9737283570604077


# RecursiveFeatureElimination

In [67]:
for model_name, estimator in models.items():
    print('model type: {}'.format(model_name))
    rfe = RFE(estimator=estimator, n_features_to_select=10)
    rfe.fit(X, y)
    # Column indices of the features RFE kept (positions where the mask is True).
    print(np.flatnonzero(rfe.support_))
    # NOTE(review): this accuracy is computed on the same X, y used for fitting,
    # so it is not directly comparable with the 'accuracy' scores reported by
    # the sequential / exhaustive selectors.
    print(rfe.score(X, y))

model type: rf
[ 0  1  6  7 10 13 21 22 23 27]
0.9718804920913884
model type: lr
[ 0  1  2  3 13 20 21 22 23 26]
0.9244288224956063
model type: svc
[ 0  2  3 13 20 21 22 23 25 26]
0.9261862917398945


# Logistic Regression coefficients

In [36]:
# Fit a deep copy so the shared estimator stored in `models` is left untouched.
c = copy(models['lr'])
c.fit(X, y)

# Per-feature importance proxy: absolute coefficient, summed over the rows of
# coef_, giving one value per feature column.
# NOTE(review): X is not standardised in the visible cells, so these magnitudes
# depend on each feature's units.
model_coefs = np.abs(c.coef_).sum(axis=0)

In [43]:
# Rank features by coefficient magnitude, largest first.
# Each entry is (magnitude, column index); ties keep ascending column order.
sorted(
    ((magnitude, column) for column, magnitude in enumerate(model_coefs.tolist())),
    key=lambda pair: pair[0],
    reverse=True,
)

[(0.12941427519545518, 2),
 (0.07119361736812418, 22),
 (0.06080937819631187, 21),
 (0.03744949299196023, 13),
 (0.03315573040365625, 20),
 (0.03177794924708716, 0),
 (0.02979118764768479, 23),
 (0.024930010785713594, 1),
 (0.011327217384643661, 3),
 (0.008216080697687216, 26),
 (0.006367041609851984, 25),
 (0.005346895828013415, 12),
 (0.002776771506181147, 6),
 (0.0021540964297854896, 27),
 (0.0017498220089531818, 5),
 (0.0010915943591670958, 7),
 (0.0009399048060519815, 28),
 (0.000629659446417253, 16),
 (0.0005007572838046502, 10),
 (0.00047737120665777713, 15),
 (0.00038205966600363334, 29),
 (0.0002827708890818849, 24),
 (0.00023122859256097418, 11),
 (0.00014470228130750016, 17),
 (8.64503355601112e-05, 9),
 (6.216562923148185e-05, 18),
 (3.833914090999487e-05, 4),
 (3.512145239055656e-05, 19),
 (2.3220175208675074e-05, 14),
 (1.1279364862935998e-05, 8)]