In [1]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
import pandas as pd
import numpy as np
import struct

from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array, as_float_array
from sklearn.decomposition import FastICA

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
# https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMModel
from lightgbm import LGBMClassifier 

from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, train_test_split

from hyperopt import fmin, tpe, hp
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

# RF regressor optimization code

In [3]:
# hyperopt for lightgbm shows terrible results
def hyperopt_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    space = hp.choice('clr_type', [
        {
            'type': 'lightgbm',
            'feature_fraction': hp.uniform('feature_fraction', 0.05, 0.95),
            'bagging_fraction': hp.uniform('bagging_fraction', 0.05, 0.95),
            'bagging_freq': hp.uniform('bagging_freq', 1, 50),
            'n_estimators': hp.uniform('n_estimators', 5, 50),
            #'max_bin': hp.uniform('max_bin', )
        }
    ])

    best = fmin(
        fn=lambda args: cross_val_score(
            LGBMClassifier(
                boosting_type='rf', 
                feature_fraction=args['feature_fraction'], 
                bagging_freq=int(args['bagging_freq']), 
                bagging_fraction=args['bagging_fraction'],
                n_estimators=int(args['n_estimators'])
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        ).mean(),
        space=space,
        algo=tpe.suggest,
        max_evals=max_iter_opt
    )
    
    return best

def bayesian_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    svr_opt = BayesianOptimization(
        lambda feature_fraction, bagging_freq, bagging_fraction, n_estimators: cross_val_score(
            LGBMClassifier(
                boosting_type='rf', 
                feature_fraction=feature_fraction, 
                bagging_freq=int(bagging_freq), 
                bagging_fraction=bagging_fraction,
                n_estimators=int(n_estimators)
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        ).mean(),
        {'feature_fraction': (0.05, 0.95),
         'bagging_fraction': (0.05, 0.95),
         'bagging_freq': (1, 50),
         'n_estimators': (5, 50) },
        verbose=0
    )
    
    svr_opt.init(10)
    svr_opt.maximize(n_iter=max_iter_opt)
    
    return svr_opt.res['max']['max_params']#['C']

# MNIST

In [14]:
def read_idx(filename):
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        shape = tuple(struct.unpack('>I', f.read(4))[0] for d in range(dims))
        return np.fromstring(f.read(), dtype=np.uint8).reshape(shape)

In [15]:
X = read_idx('./datasets/mnist/train-images.idx3-ubyte')
y = read_idx('./datasets/mnist/train-labels.idx1-ubyte')

In [16]:
X = X / np.max(X)

In [17]:
X = X.reshape(X.shape[0], -1)

In [18]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [19]:
print('dataset size: {}'.format(X.shape))

dataset size: (60000, 784)


### Training model with ICA-processed data

In [20]:
# number of components is fixed to 10
ica = FastICA(n_components=10)
X_ica = ica.fit_transform(X_train)
X_val_ica = ica.transform(X_val)

In [21]:
X_ica, y_train = shuffle(X_ica, y_train)

### * Estimating LightGBM params

In [22]:
params_opt = bayesian_optimization_lightgbm(X_ica, y_train, cv=6, max_iter_opt=15)

In [23]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [41]:
models = {
    'rf': LGBMClassifier(boosting_type='rf', **params_opt),
    'lr': LogisticRegression(random_state=42, multi_class='ovr', solver='liblinear', C=10000, tol=1e-2),
    'svc': LinearSVC(multi_class='ovr', C=10000, tol=1e-2),
}

In [27]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_ica, y_train):
        X_crossval_train, X_crossval_test = X_ica[train_ix], X_ica[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))

In [28]:
print('for ICA data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for ICA data (averate scores):
rf, Accuracy: 0.8332592592592591
lr, Accuracy: 0.7409074074074075
svc, Accuracy: 0.637888888888889


In [29]:
print('for ICA data (validation scores):')
for name, model in models.items():
    model.fit(X_ica, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val_ica), y_val.squeeze())))

for ICA data (validation scores):
rf, Accuracy: 0.8321666666666667
lr, Accuracy: 0.7451666666666666
svc, Accuracy: 0.6686666666666666


# Credit Card Fraud

In [30]:
df = pd.read_csv('./datasets/creditcard/creditcard.csv')

In [31]:
X = df[np.setdiff1d(df.columns, ["Class", "Time"])].values
y = df.Class.values

In [32]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [33]:
print('dataset size: {}'.format(X.shape))

dataset size: (284807, 29)


### Training model with ICA-processed data

In [35]:
# number of components is fixed to 10
ica = FastICA(n_components=10)
X_ica = ica.fit_transform(X_train)
X_val_ica = ica.transform(X_val)

In [36]:
X_ica, y_train = shuffle(X_ica, y_train)

### * Estimating LightGBM params

In [38]:
params_opt = bayesian_optimization_lightgbm(X_ica, y_train, cv=6, max_iter_opt=15)

In [39]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [42]:
models = {
    'rf': LGBMClassifier(boosting_type='rf', **params_opt),
    'lr': LogisticRegression(random_state=42, multi_class='ovr', solver='liblinear', C=10000, tol=1e-2),
    'svc': LinearSVC(multi_class='ovr', C=10000, tol=1e-2),
}

In [43]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_ica, y_train):
        X_crossval_train, X_crossval_test = X_ica[train_ix], X_ica[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))

In [44]:
print('for ICA data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for ICA data (averate scores):
rf, Accuracy: 0.9993523879857662
lr, Accuracy: 0.9988920351922305
svc, Accuracy: 0.9988881329666137


In [45]:
print('for ICA data (validation scores):')
for name, model in models.items():
    model.fit(X_ica, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val_ica), y_val.squeeze())))

for ICA data (validation scores):
rf, Accuracy: 0.9995435553526912
lr, Accuracy: 0.9989115550718023
svc, Accuracy: 0.9990871107053826


# Breast Cancer

In [46]:
df = pd.read_csv('./datasets/cancer/breast_cancer.csv')
y = df.diagnosis.replace('M', 0).replace('B', 1).values
X = np.asarray(df.drop(['diagnosis', 'id', 'Unnamed: 32'], axis=1).as_matrix())

In [47]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [48]:
print('dataset size: {}'.format(X.shape))

dataset size: (569, 30)


### Training model with ICA-processed data

In [50]:
# number of components is fixed to 10
ica = FastICA(n_components=10)
X_ica = ica.fit_transform(X_train)
X_val_ica = ica.transform(X_val)

In [51]:
X_ica, y_train = shuffle(X_ica, y_train)

### * Estimating LightGBM params

In [53]:
params_opt = bayesian_optimization_lightgbm(X_ica, y_train, cv=6, max_iter_opt=15)

In [54]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [56]:
models = {
    'rf': LGBMClassifier(boosting_type='rf', **params_opt),
    'lr': LogisticRegression(random_state=42, multi_class='ovr', solver='liblinear', C=10000, tol=1e-2),
    'svc': LinearSVC(multi_class='ovr', C=10000, tol=1e-2),
}

In [57]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_ica, y_train):
        X_crossval_train, X_crossval_test = X_ica[train_ix], X_ica[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))

In [58]:
print('for ICA data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for ICA data (averate scores):
rf, Accuracy: 0.857421875
lr, Accuracy: 0.94921875
svc, Accuracy: 0.92578125


In [59]:
print('for ICA data (validation scores):')
for name, model in models.items():
    model.fit(X_ica, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val_ica), y_val.squeeze())))

for ICA data (validation scores):
rf, Accuracy: 0.8771929824561403
lr, Accuracy: 0.9298245614035088
svc, Accuracy: 0.9298245614035088
