In [3]:
import warnings
# NOTE: with no category/module filter this silences *every* warning for the
# rest of the notebook, including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np
import struct

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier 

from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, train_test_split

from hyperopt import fmin, tpe, hp
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

import matplotlib.pyplot as plt

%matplotlib inline

# RF classifier optimization code

In [5]:
# hyperopt *minimizes* its objective, so the cross-validated accuracy is negated
# below; feeding the raw accuracy to fmin makes it search for the worst model.
def hyperopt_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    """Tune a LightGBM random-forest classifier with hyperopt's TPE search.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``cross_val_score``.
    y : array-like
        Target values; ``y.squeeze()`` is applied before scoring.
    cv : int
        Number of (unshuffled) ``KFold`` splits used for each evaluation.
    max_iter_opt : int
        Number of hyperopt evaluations (``max_evals``).

    Returns
    -------
    dict
        The value returned by ``hyperopt.fmin`` for the best trial.
    """
    space = hp.choice('clr_type', [
        {
            'type': 'lightgbm',
            'feature_fraction': hp.uniform('feature_fraction', 0.05, 0.95),
            'bagging_fraction': hp.uniform('bagging_fraction', 0.05, 0.95),
            'bagging_freq': hp.uniform('bagging_freq', 1, 50),
            'n_estimators': hp.uniform('n_estimators', 5, 50),
            #'max_bin': hp.uniform('max_bin', )
        }
    ])

    def objective(args):
        # Integer-valued LightGBM parameters are sampled as floats and truncated here.
        classifier = LGBMClassifier(
            boosting_type='rf',
            feature_fraction=args['feature_fraction'],
            bagging_freq=int(args['bagging_freq']),
            bagging_fraction=args['bagging_fraction'],
            n_estimators=int(args['n_estimators'])
        )
        scores = cross_val_score(
            classifier, X, y.squeeze(),
            cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        )
        # Return a loss: higher accuracy must map to a lower objective value.
        return -scores.mean()

    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=max_iter_opt
    )

    return best

def bayesian_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    """Tune a LightGBM random-forest classifier with Bayesian optimization.

    Each candidate is scored by the mean accuracy of ``cross_val_score`` over
    ``cv`` unshuffled ``KFold`` splits of ``X`` and ``y.squeeze()``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values.
    cv : int
        Number of folds per evaluation.
    max_iter_opt : int
        Number of optimization iterations run after the 10 initial points.

    Returns
    -------
    dict
        ``res['max']['max_params']`` of the optimizer. ``bagging_freq`` and
        ``n_estimators`` are only cast to int inside the objective; callers in
        this notebook cast them again on the returned dict.
    """
    def mean_cv_accuracy(feature_fraction, bagging_freq, bagging_fraction, n_estimators):
        classifier = LGBMClassifier(
            boosting_type='rf',
            feature_fraction=feature_fraction,
            bagging_freq=int(bagging_freq),
            bagging_fraction=bagging_fraction,
            n_estimators=int(n_estimators)
        )
        folds = KFold(n_splits=cv).split(X)
        fold_scores = cross_val_score(classifier, X, y.squeeze(), cv=folds, scoring='accuracy')
        return fold_scores.mean()

    search_bounds = {
        'feature_fraction': (0.05, 0.95),
        'bagging_fraction': (0.05, 0.95),
        'bagging_freq': (1, 50),
        'n_estimators': (5, 50),
    }

    optimizer = BayesianOptimization(mean_cv_accuracy, search_bounds, verbose=0)
    optimizer.init(10)
    optimizer.maximize(n_iter=max_iter_opt)

    return optimizer.res['max']['max_params']

# MNIST

In [70]:
def read_idx(filename):
    """Read an IDX-format file (the container used by MNIST) into a NumPy array.

    The header is two zero bytes, a one-byte element type code, a one-byte
    dimension count, followed by one big-endian uint32 per dimension.

    Parameters
    ----------
    filename : str
        Path of the IDX file.

    Returns
    -------
    numpy.ndarray
        Writable array in native byte order with the shape stored in the header.

    Raises
    ------
    ValueError
        If the magic bytes are not zero or the element type code is unknown.
    """
    # IDX element type code -> dtype as stored on disk (multi-byte types are big-endian).
    idx_dtypes = {
        0x08: np.dtype('uint8'),
        0x09: np.dtype('int8'),
        0x0B: np.dtype('>i2'),
        0x0C: np.dtype('>i4'),
        0x0D: np.dtype('>f4'),
        0x0E: np.dtype('>f8'),
    }
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        if zero != 0 or data_type not in idx_dtypes:
            raise ValueError(
                'not a valid IDX file: magic={!r}, type code={!r}'.format(zero, data_type)
            )
        shape = struct.unpack('>' + 'I' * dims, f.read(4 * dims))
        stored_dtype = idx_dtypes[data_type]
        # frombuffer replaces the deprecated binary mode of np.fromstring.
        data = np.frombuffer(f.read(), dtype=stored_dtype)
    # astype copies, so the result is writable (frombuffer views are read-only)
    # and converted to the machine's native byte order.
    return data.reshape(shape).astype(stored_dtype.newbyteorder('='))

In [71]:
# Load the MNIST training images (X) and labels (y) from their IDX files.
X, y = map(read_idx, (
    './datasets/mnist/train-images.idx3-ubyte',
    './datasets/mnist/train-labels.idx1-ubyte',
))

In [72]:
# Scale the pixel values by the largest value found in the dataset.
X = X / X.max()

In [73]:
# Flatten every sample into a single feature row.
X = X.reshape(len(X), -1)

In [74]:
# Hold out 10% for validation. A fixed seed makes the split reproducible and
# stratifying keeps the class proportions equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [75]:
# Report the shape of the full (pre-split) feature matrix.
print('dataset size: {}'.format(X.shape))

dataset size: (60000, 784)


## Training plain model

In [76]:
# Permute features and labels together so the rows stay aligned.
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [77]:
# Search LightGBM hyperparameters on the training split only.
params_opt = bayesian_optimization_lightgbm(
    X_train,
    y_train,
    cv=4,
    max_iter_opt=10,
)

In [78]:
# The optimizer works on continuous values; LightGBM needs these two as integers.
params_opt.update({
    key: int(params_opt[key]) for key in ('bagging_freq', 'n_estimators')
})

### * Evaluate with CV

In [79]:
# Classifiers compared on the raw features, keyed by the short name used in the reports.
models = dict(
    rf=LGBMClassifier(boosting_type='rf', **params_opt),
    lr=LogisticRegression(
        random_state=42,
        multi_class='ovr',
        solver='liblinear',
        C=10000,
        tol=1e-2,
    ),
    svc=LinearSVC(multi_class='ovr', C=10000, tol=1e-2),
)

In [80]:
# Collect per-fold accuracies of every model with 6-fold shuffled cross-validation
# on the training split.
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)

    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO: per-fold hyperparameter optimization should happen here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred), in that order
        stats[k].append(accuracy_score(y_crossval_test, model.predict(X_crossval_test)))
    print(k + ' is ready')

rf is ready
lr is ready
svc is ready


In [81]:
# Mean cross-validated accuracy of each model.
print('for pure data (averate scores):')
for model in stats:
    model_stats = stats[model]
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
rf, Accuracy: 0.9244629629629629
lr, Accuracy: 0.9156296296296297
svc, Accuracy: 0.8632962962962963


In [82]:
# Refit every model on the whole training split and score it on the held-out validation split.
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred), in that order
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val))))

for pure data (validation scores):
rf, Accuracy: 0.9288333333333333
lr, Accuracy: 0.9158333333333334
svc, Accuracy: 0.8576666666666667


## Training model with PCA-processed data

In [83]:
# number of components is fixed to 10
# The projection is fitted on the training split only and then applied unchanged
# to the validation split, so validation rows do not influence the components.
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

In [84]:
# Permute the projected features and the labels together.
# NOTE: y_train is overwritten here, so from this point on it is aligned with
# X_pca and no longer with X_train; do not re-run the plain-model cells afterwards.
X_pca, y_train = shuffle(X_pca, y_train)

### * Estimating LightGBM params

In [85]:
# Search LightGBM hyperparameters again, this time on the PCA-projected training data.
params_opt = bayesian_optimization_lightgbm(
    X_pca,
    y_train,
    cv=4,
    max_iter_opt=10,
)

In [86]:
# The optimizer works on continuous values; LightGBM needs these two as integers.
params_opt.update({
    key: int(params_opt[key]) for key in ('bagging_freq', 'n_estimators')
})

In [87]:
# Classifiers compared on the PCA features. Note that the 'svr' key labels a
# LinearSVC classifier; the key is kept because it is printed in the reports.
models = dict(
    svr=LinearSVC(C=10000, dual=False, max_iter=100, tol=1e-2),
    linear=LogisticRegression(
        random_state=42,
        multi_class='ovr',
        solver='sag',
        C=10000,
        tol=1e-2,
        n_jobs=10,
    ),
    forest=LGBMClassifier(boosting_type='rf', **params_opt),
)

### * Evaluate with CV

In [88]:
# Collect per-fold accuracies of every model with 6-fold shuffled cross-validation
# on the PCA-projected training data.
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)

    for train_ix, test_ix in kfold.split(X_pca, y_train):
        X_crossval_train, X_crossval_test = X_pca[train_ix], X_pca[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO: per-fold hyperparameter optimization should happen here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred), in that order
        stats[k].append(accuracy_score(y_crossval_test, model.predict(X_crossval_test)))
    print(k + ' is ready')

svr is ready
linear is ready
forest is ready


In [89]:
# Mean cross-validated accuracy of each model.
print('for PCA data (averate scores):')
for model in stats:
    model_stats = stats[model]
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for PCA data (averate scores):
svr, Accuracy: 0.7684074074074073
linear, Accuracy: 0.7775555555555554
forest, Accuracy: 0.8348333333333334


In [90]:
# Refit every model on all PCA-projected training data and score it on the projected validation split.
print('for decorrelated data using PCA (validation scores):')
for name, model in models.items():
    model.fit(X_pca, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred), in that order
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val_pca))))

for decorrelated data using PCA (validation scores):
svr, Accuracy: 0.7608333333333334
linear, Accuracy: 0.7696666666666667
forest, Accuracy: 0.8371666666666666


# Credit Card Fraud

In [57]:
# Load the credit-card transactions table from CSV.
df = pd.read_csv('./datasets/creditcard/creditcard.csv')

In [58]:
# Use every column except "Class" (the target) and "Time" as features.
# np.setdiff1d returns the remaining names sorted, so the feature columns end up
# in sorted order rather than in the CSV's original column order.
X = df[np.setdiff1d(df.columns, ["Class", "Time"])].values
# Target vector taken from the "Class" column.
y = df.Class.values

In [59]:
# Hold out 10% for validation. A fixed seed makes the split reproducible and
# stratifying keeps the class proportions equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [60]:
# Report the shape of the full (pre-split) feature matrix.
print('dataset size: {}'.format(X.shape))

dataset size: (284807, 29)


## Training plain model

In [61]:
# Permute features and labels together so the rows stay aligned.
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [62]:
# Search LightGBM hyperparameters on the training split only.
params_opt = bayesian_optimization_lightgbm(
    X_train,
    y_train,
    cv=4,
    max_iter_opt=10,
)

In [63]:
# The optimizer works on continuous values; LightGBM needs these two as integers.
params_opt.update({
    key: int(params_opt[key]) for key in ('bagging_freq', 'n_estimators')
})

### * Evaluate with CV

In [64]:
# Classifiers compared on the raw features. Note that the 'svr' key labels a
# LinearSVC classifier; the key is kept because it is printed in the reports.
models = dict(
    svr=LinearSVC(C=10000, dual=False, max_iter=100, tol=1e-2),
    linear=LogisticRegression(
        random_state=42,
        multi_class='ovr',
        solver='sag',
        C=10000,
        tol=1e-2,
        n_jobs=10,
    ),
    forest=LGBMClassifier(boosting_type='rf', **params_opt),
)

In [65]:
# Collect per-fold accuracies of every model with 6-fold shuffled cross-validation
# on the training split.
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)

    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO: per-fold hyperparameter optimization should happen here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred), in that order
        stats[k].append(accuracy_score(y_crossval_test, model.predict(X_crossval_test)))
    print(k + ' is ready')

svr is ready
linear is ready
forest is ready


In [66]:
# Mean cross-validated accuracy of each model.
print('for pure data (averate scores):')
for model in stats:
    model_stats = stats[model]
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
svr, Accuracy: 0.9991768295061757
linear, Accuracy: 0.9981859038880176
forest, Accuracy: 0.9994772282171921


In [67]:
# Refit every model on the whole training split and score it on the held-out validation split.
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred), in that order
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val))))

for pure data (validation scores):
svr, Accuracy: 0.9992977774656788
linear, Accuracy: 0.9981742214107651
forest, Accuracy: 0.9996488887328394


## Training model with PCA-processed data

In [48]:
# number of components is fixed to 10
# The projection is fitted on the training split only and then applied unchanged
# to the validation split, so validation rows do not influence the components.
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

In [49]:
# Permute the projected features and the labels together.
# NOTE: y_train is overwritten here, so from this point on it is aligned with
# X_pca and no longer with X_train; do not re-run the plain-model cells afterwards.
X_pca, y_train = shuffle(X_pca, y_train)

### * Estimating LightGBM params

In [50]:
# Search LightGBM hyperparameters again, this time on the PCA-projected training data.
params_opt = bayesian_optimization_lightgbm(
    X_pca,
    y_train,
    cv=4,
    max_iter_opt=10,
)

In [51]:
# The optimizer works on continuous values; LightGBM needs these two as integers.
params_opt.update({
    key: int(params_opt[key]) for key in ('bagging_freq', 'n_estimators')
})

In [52]:
# Classifiers compared on the PCA features. Note that the 'svr' key labels a
# LinearSVC classifier; the key is kept because it is printed in the reports.
models = dict(
    linear=LogisticRegression(),
    svr=LinearSVC(C=100, dual=False, max_iter=200),
    forest=LGBMClassifier(boosting_type='rf', **params_opt),
)

In [53]:
# Collect per-fold accuracies of every model with 6-fold shuffled cross-validation
# on the PCA-projected training data.
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)

    for train_ix, test_ix in kfold.split(X_pca, y_train):
        X_crossval_train, X_crossval_test = X_pca[train_ix], X_pca[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO: per-fold hyperparameter optimization should happen here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred), in that order
        stats[k].append(accuracy_score(y_crossval_test, model.predict(X_crossval_test)))

In [54]:
# Mean cross-validated accuracy of each model.
print('for PCA data (averate scores):')
for model in stats:
    model_stats = stats[model]
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for PCA data (averate scores):
linear, Accuracy: 0.9989817654081131
svr, Accuracy: 0.9989232461786943
forest, Accuracy: 0.9992665589912845


In [55]:
# Refit every model on all PCA-projected training data and score it on the projected validation split.
print('for decorrelated data using PCA (validation scores):')
for name, model in models.items():
    model.fit(X_pca, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred), in that order
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val_pca))))

for decorrelated data using PCA (validation scores):
linear, Accuracy: 0.9990519995786665
svr, Accuracy: 0.9988413328183702
forest, Accuracy: 0.9993328885923949


# Breast Cancer

In [39]:
# Load the breast-cancer table from CSV.
df = pd.read_csv('./datasets/cancer/breast_cancer.csv')
# Encode the diagnosis as the binary target: 'M' -> 0, 'B' -> 1.
y = df.diagnosis.map({'M': 0, 'B': 1}).values
# Everything except the target, the row id and the 'Unnamed: 32' column is a feature.
# DataFrame.as_matrix() was removed from pandas; .values is the supported
# equivalent and matches the Credit Card section above.
X = df.drop(['diagnosis', 'id', 'Unnamed: 32'], axis=1).values

In [40]:
# Hold out 10% for validation. A fixed seed makes the split reproducible and
# stratifying keeps the class proportions equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [41]:
# Report the shape of the full (pre-split) feature matrix.
print('dataset size: {}'.format(X.shape))

dataset size: (569, 30)


## Training plain model

In [42]:
# Permute features and labels together so the rows stay aligned.
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [43]:
# Search LightGBM hyperparameters on the training split only. Tuning on the full
# X, y would let the held-out validation rows leak into model selection.
params_opt = bayesian_optimization_lightgbm(X_train, y_train, cv=4, max_iter_opt=10)

In [44]:
# The optimizer works on continuous values; LightGBM needs these two as integers.
params_opt.update({
    key: int(params_opt[key]) for key in ('bagging_freq', 'n_estimators')
})

### * Evaluate with CV

In [45]:
# Classifiers compared on the raw features, keyed by the short name used in the reports.
models = dict(
    rf=LGBMClassifier(boosting_type='rf', **params_opt),
    lr=LogisticRegression(
        random_state=42,
        multi_class='ovr',
        solver='liblinear',
        C=10000,
        tol=1e-2,
    ),
    svc=LinearSVC(multi_class='ovr', C=10000, tol=1e-2),
)

In [46]:
# Collect per-fold accuracies of every model with 6-fold shuffled cross-validation
# on the training split.
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)

    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO: per-fold hyperparameter optimization should happen here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred), in that order
        stats[k].append(accuracy_score(y_crossval_test, model.predict(X_crossval_test)))
    print(k + ' is ready')

rf is ready
lr is ready
svc is ready


In [47]:
# Mean cross-validated accuracy of each model.
print('for pure data (averate scores):')
for model in stats:
    model_stats = stats[model]
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
rf, Accuracy: 0.955061559507524
lr, Accuracy: 0.9414728682170543
svc, Accuracy: 0.8336297309621523


In [48]:
# Refit every model on the whole training split and score it on the held-out validation split.
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred), in that order
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val))))

for pure data (validation scores):
rf, Accuracy: 0.8947368421052632
lr, Accuracy: 0.8771929824561403
svc, Accuracy: 0.8245614035087719


## Training model with PCA-processed data

In [49]:
# number of components is fixed to 10
# The projection is fitted on the training split only and then applied unchanged
# to the validation split, so validation rows do not influence the components.
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

In [50]:
# Permute the projected features and the labels together.
# NOTE: y_train is overwritten here, so from this point on it is aligned with
# X_pca and no longer with X_train; do not re-run the plain-model cells afterwards.
X_pca, y_train = shuffle(X_pca, y_train)

### * Estimating LightGBM params

In [51]:
# Search LightGBM hyperparameters again, this time on the PCA-projected training data.
params_opt = bayesian_optimization_lightgbm(
    X_pca,
    y_train,
    cv=4,
    max_iter_opt=10,
)

In [52]:
# The optimizer works on continuous values; LightGBM needs these two as integers.
params_opt.update({
    key: int(params_opt[key]) for key in ('bagging_freq', 'n_estimators')
})

In [53]:
# Classifiers compared on the PCA features, keyed by the short name used in the reports.
models = dict(
    rf=LGBMClassifier(boosting_type='rf', **params_opt),
    lr=LogisticRegression(
        random_state=42,
        multi_class='ovr',
        solver='liblinear',
        C=10000,
        tol=1e-2,
    ),
    svc=LinearSVC(multi_class='ovr', C=10000, tol=1e-2),
)

In [54]:
# Collect per-fold accuracies of every model with 6-fold shuffled cross-validation
# on the PCA-projected training data.
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)

    for train_ix, test_ix in kfold.split(X_pca, y_train):
        X_crossval_train, X_crossval_test = X_pca[train_ix], X_pca[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO: per-fold hyperparameter optimization should happen here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred), in that order
        stats[k].append(accuracy_score(y_crossval_test, model.predict(X_crossval_test)))

In [55]:
# Mean cross-validated accuracy of each model.
print('for PCA data (averate scores):')
for model in stats:
    model_stats = stats[model]
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for PCA data (averate scores):
rf, Accuracy: 0.9453944368445053
lr, Accuracy: 0.9355905152758779
svc, Accuracy: 0.9102143182854538


In [56]:
# Refit every model on all PCA-projected training data and score it on the projected validation split.
print('for decorrelated data using PCA (validation scores):')
for name, model in models.items():
    model.fit(X_pca, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred), in that order
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val_pca))))

for decorrelated data using PCA (validation scores):
rf, Accuracy: 0.9122807017543859
lr, Accuracy: 0.8421052631578947
svc, Accuracy: 0.8596491228070176
