In [6]:
import numpy as np
import pandas as pd
from importlib import reload
from copy import deepcopy as copy

from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array, as_float_array
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier

from bayes_opt import BayesianOptimization

from sklearn.feature_selection import RFE
from mlxtend.feature_selection import SequentialFeatureSelector, ExhaustiveFeatureSelector

In [17]:
from importlib import reload

import mlrank.hyperparams_opt as hyperparams_opt

reload(hyperparams_opt)

bayesian_optimization_lightgbm = hyperparams_opt.bayesian_optimization_lightgbm

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
import sys
sys.path.append('../')

In [9]:
n_holdout_interations = 100

In [11]:
df = pd.read_csv('./classification/datasets/breast_cancer.csv')
y = df.diagnosis.replace('M', 0).replace('B', 1).values
X = np.asarray(df.drop(['diagnosis', 'id', 'Unnamed: 32'], axis=1).as_matrix())

In [12]:
X = StandardScaler().fit_transform(X)

In [20]:
params_opt = bayesian_optimization_lightgbm(X, y, cv=4, max_iter_opt=10)

In [21]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

In [45]:
models = {
    'rf': LGBMClassifier(boosting_type='rf', min_child_samples=10, **params_opt),
    'lr': LogisticRegression(random_state=42, multi_class='ovr', solver='liblinear', C=10000, tol=1e-2),
    'svc': LinearSVC(multi_class='ovr', C=10000, tol=1e-2, max_iter=250),
}

# SequentialFeatureSelection

In [61]:
accur_score = list()

for i in range(n_holdout_interations):
    record = dict()
    for name, model in models.items():
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5)#, random_state=1)
        
        sfs = SequentialFeatureSelector(model, k_features=5, scoring='accuracy')
        sfs.fit(X_train, y_train)
        
        model.fit(X_train[:, [int(i) for i in sfs.subsets_[5]['feature_names']]], y_train.squeeze())
        record[name] = accuracy_score(model.predict(X_val[:, [int(i) for i in sfs.subsets_[5]['feature_names']]]), y_val.squeeze())
    accur_score.append(record)

print(np.mean([i['svc'] for i in accur_score]))
print(np.mean([i['lr'] for i in accur_score]))
print(np.mean([i['rf'] for i in accur_score]))

0.9263859649122808
0.9604561403508773
0.9388070175438595


# ExhaustiveFeatureSelection

In [24]:
efs = ExhaustiveFeatureSelector(estimator=models['lr'], max_features=5, scoring='accuracy', n_jobs=-1, print_progress=False)
efs.fit(X, y)

ExhaustiveFeatureSelector(clone_estimator=True, cv=5,
             estimator=LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=42, solver='liblinear',
          tol=0.01, verbose=0, warm_start=False),
             max_features=5, min_features=1, n_jobs=-1,
             pre_dispatch='2*n_jobs', print_progress=False,
             scoring='accuracy')

In [25]:
efs.best_feature_names_

('17', '21', '23', '24', '28')

In [29]:
accur_score = list()

for i in range(n_holdout_interations):
    record = dict()
    X_train, X_val, y_train, y_val = train_test_split(X[:, [int(i) for i in efs.best_feature_names_]], y, test_size=0.5)#, random_state=1)
        
    models['lr'].fit(X_train, y_train.squeeze())
    record['lr'] = accuracy_score(models['lr'].predict(X_val), y_val.squeeze())
    accur_score.append(record)

print(np.mean([i['lr'] for i in accur_score]))

0.9732280701754387


# RecursiveFeatureElimination

In [49]:
model_feats = dict()

for k, v in models.items():
    model_feats[k] = [int(i) for i in sfs.subsets_[5]['feature_names']]
    rfe = RFE(estimator=v, n_features_to_select=5)
    rfe.fit(X, y)
    model_feats[k] = np.array(list(range(X.shape[1])))[rfe.support_]

In [50]:
model_feats

{'lr': array([10, 20, 21, 23, 27]),
 'rf': array([ 9, 15, 16, 17, 19]),
 'svc': array([ 0,  2,  7, 13, 23])}

In [51]:
accur_score = list()

for i in range(n_holdout_interations):
    record = dict()
    for name, model in models.items():
        X_train, X_val, y_train, y_val = train_test_split(X[:, model_feats[name]], y, test_size=0.5)#, random_state=1)
        
        model.fit(X_train, y_train.squeeze())
        record[name] = accuracy_score(model.predict(X_val), y_val.squeeze())
    accur_score.append(record)

print(np.mean([i['svc'] for i in accur_score]))
print(np.mean([i['lr'] for i in accur_score]))
print(np.mean([i['rf'] for i in accur_score]))

0.9082105263157895
0.9637543859649124
0.6942456140350877


# Logistic Regression coefficents

In [52]:
model_coefs = []

c = copy(models['lr'])
c.fit(X, y)
    
model_coefs = np.abs(c.coef_).sum(0)

In [53]:
sorted(zip(model_coefs.tolist(), range(model_coefs.shape[0])), key = lambda x: -x[0])[:5]

[(1.8956650592717343, 10),
 (1.7403622083957953, 21),
 (1.3205567864813037, 20),
 (1.3197392480935997, 28),
 (1.2445662612696022, 13)]

In [54]:
accur_score = list()

for i in range(n_holdout_interations):
    record = dict()
    for name, model in models.items():
        X_train, X_val, y_train, y_val = train_test_split(X[:, [10, 21, 20, 28, 13]], y, test_size=0.5)#, random_state=1)
        
        model.fit(X_train, y_train.squeeze())
        record[name] = accuracy_score(model.predict(X_val), y_val.squeeze())
    accur_score.append(record)

print(np.mean([i['svc'] for i in accur_score]))
print(np.mean([i['lr'] for i in accur_score]))
print(np.mean([i['rf'] for i in accur_score]))

0.9244210526315788
0.9571578947368421
0.9078947368421052


# Feature Importance

In [55]:
model_coefs = []

c = copy(models['rf'])
c.fit(X, y)
    
print(c.feature_importances_)
print(np.argsort(c.feature_importances_)[::-1])#[:5]

[37 41 15 46 37 10  9 25 83 24 19 13  4 57 45 23 73 31  0 38 24 25 17  8
 49 18 31 32 27 51]
[ 8 16 13 29 24  3 14  1 19  0  4 27 17 26 28  7 21 20  9 15 10 25 22  2
 11  5  6 23 12 18]


In [56]:
accur_score = list()

for i in range(n_holdout_interations):
    record = dict()
    for name, model in models.items():
        X_train, X_val, y_train, y_val = train_test_split(X[:, [8, 16, 13,  3, 24]], y, test_size=0.5)#, random_state=1)
        
        model.fit(X_train, y_train.squeeze())
        record[name] = accuracy_score(model.predict(X_val), y_val.squeeze())
    accur_score.append(record)

print(np.mean([i['svc'] for i in accur_score]))
print(np.mean([i['lr'] for i in accur_score]))
print(np.mean([i['rf'] for i in accur_score]))

0.8868070175438596
0.9376842105263159
0.901017543859649
