In [1]:
import warnings
# Suppress every warning category for the rest of the session (this also hides
# deprecation and convergence warnings raised by the libraries used below).
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import struct

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier 

from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, train_test_split

from hyperopt import fmin, tpe, hp
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

import matplotlib.pyplot as plt

%matplotlib inline

# RF classifier hyperparameter optimization code

In [3]:
# hyperopt's fmin MINIMISES its objective, so the CV accuracy is negated below
# (returning the raw accuracy makes the search converge on the worst settings).
def hyperopt_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    """Tune a LightGBM random-forest classifier with hyperopt's TPE search.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``cross_val_score``.
    y : array-like
        Labels; ``y.squeeze()`` is applied before scoring.
    cv : int, default 6
        Number of (unshuffled) ``KFold`` splits per evaluation.
    max_iter_opt : int, default 15
        Number of objective evaluations (``max_evals`` of ``fmin``).

    Returns
    -------
    dict
        The value returned by ``fmin``. ``bagging_freq`` and ``n_estimators``
        are sampled with ``hp.uniform`` and therefore come back as floats;
        cast them with ``int()`` before building a model from them.
    """
    space = hp.choice('clr_type', [
        {
            'type': 'lightgbm',
            'feature_fraction': hp.uniform('feature_fraction', 0.05, 0.95),
            'bagging_fraction': hp.uniform('bagging_fraction', 0.05, 0.95),
            'bagging_freq': hp.uniform('bagging_freq', 1, 50),
            'n_estimators': hp.uniform('n_estimators', 5, 50),
        }
    ])

    def negative_cv_accuracy(args):
        # The integer-valued parameters are sampled as floats, hence int().
        forest = LGBMClassifier(
            boosting_type='rf',
            feature_fraction=args['feature_fraction'],
            bagging_freq=int(args['bagging_freq']),
            bagging_fraction=args['bagging_fraction'],
            n_estimators=int(args['n_estimators'])
        )
        fold_scores = cross_val_score(
            forest, X, y.squeeze(),
            cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        )
        # Higher accuracy is better, fmin looks for the lowest value: flip the sign.
        return -fold_scores.mean()

    best = fmin(
        fn=negative_cv_accuracy,
        space=space,
        algo=tpe.suggest,
        max_evals=max_iter_opt
    )

    return best

def bayesian_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    """Tune a LightGBM random-forest classifier with ``BayesianOptimization``.

    The objective is the mean accuracy of ``cross_val_score`` over ``cv``
    unshuffled ``KFold`` splits; the optimiser maximises it.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Labels; ``y.squeeze()`` is applied before scoring.
    cv : int, default 6
        Number of ``KFold`` splits per evaluation.
    max_iter_opt : int, default 15
        Passed to ``maximize`` as ``n_iter``.

    Returns
    -------
    dict
        ``res['max']['max_params']`` of the optimiser. The values are
        continuous, so callers cast ``bagging_freq`` / ``n_estimators`` to int.
    """
    search_bounds = {
        'feature_fraction': (0.05, 0.95),
        'bagging_fraction': (0.05, 0.95),
        'bagging_freq': (1, 50),
        'n_estimators': (5, 50),
    }

    def mean_cv_accuracy(feature_fraction, bagging_freq, bagging_fraction, n_estimators):
        # Parameter names must match the keys of ``search_bounds``.
        forest = LGBMClassifier(
            boosting_type='rf',
            feature_fraction=feature_fraction,
            bagging_freq=int(bagging_freq),
            bagging_fraction=bagging_fraction,
            n_estimators=int(n_estimators),
        )
        fold_scores = cross_val_score(
            forest, X, y.squeeze(),
            cv=KFold(n_splits=cv).split(X), scoring='accuracy',
        )
        return fold_scores.mean()

    optimizer = BayesianOptimization(mean_cv_accuracy, search_bounds, verbose=0)

    # NOTE(review): init() / res['max'] look like the legacy bayes_opt API --
    # confirm against the installed version before upgrading the package.
    optimizer.init(10)
    optimizer.maximize(n_iter=max_iter_opt)

    return optimizer.res['max']['max_params']

# MNIST

In [56]:
def read_idx(filename):
    """Load an IDX-format file (the container used by MNIST) as a numpy array.

    The header is two zero bytes, one element-type code, one dimension count,
    then one big-endian uint32 size per dimension; the payload follows.

    Parameters
    ----------
    filename : str or path-like
        Path of the uncompressed IDX file.

    Returns
    -------
    numpy.ndarray
        Writable array with the shape given in the header. The dtype follows
        the header's type code (``uint8`` for the MNIST files); multi-byte
        types keep the file's big-endian byte order.

    Raises
    ------
    ValueError
        If the header is truncated, the magic bytes are wrong, the type code
        is unknown, or the payload size does not match the declared shape.
    """
    # IDX element-type code -> numpy dtype (IDX stores data MSB first).
    idx_dtypes = {
        0x08: np.dtype('u1'),
        0x09: np.dtype('i1'),
        0x0B: np.dtype('>i2'),
        0x0C: np.dtype('>i4'),
        0x0D: np.dtype('>f4'),
        0x0E: np.dtype('>f8'),
    }

    with open(filename, 'rb') as f:
        header = f.read(4)
        if len(header) != 4:
            raise ValueError('{!r} is too short to be an IDX file'.format(filename))
        zero, data_type, dims = struct.unpack('>HBB', header)
        if zero != 0 or data_type not in idx_dtypes:
            raise ValueError('{!r} does not start with a valid IDX header'.format(filename))

        sizes = f.read(4 * dims)
        if len(sizes) != 4 * dims:
            raise ValueError('{!r} has a truncated IDX dimension table'.format(filename))
        shape = struct.unpack('>{}I'.format(dims), sizes)
        payload = f.read()

    # frombuffer replaces the deprecated binary mode of np.fromstring; it yields
    # a read-only view of the bytes, so copy to hand back a writable array.
    return np.frombuffer(payload, dtype=idx_dtypes[data_type]).reshape(shape).copy()

In [57]:
# Read the MNIST training images (X) and their labels (y) from local IDX files.
X = read_idx('./datasets/mnist/train-images.idx3-ubyte')
y = read_idx('./datasets/mnist/train-labels.idx1-ubyte')

In [58]:
# Divide by the global maximum so the uint8 pixel values become floats in [0, 1].
X = X / np.max(X)

In [59]:
# Flatten each sample into one feature row: (n_samples, ...) -> (n_samples, n_features).
X = X.reshape(X.shape[0], -1)

In [60]:
# Hold out 10% of the samples for validation. No random_state is given, so the
# split (and every score below) changes from run to run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [61]:
# Shape of the full feature matrix X (train and validation rows together).
print('dataset size: {}'.format(X.shape))

dataset size: (60000, 784)


## Training plain model

In [62]:
# Shuffle features and labels with the same permutation (unseeded).
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [63]:
# Tune the LightGBM random forest on the training split: 4-fold CV, n_iter=10.
params_opt = bayesian_optimization_lightgbm(X_train, y_train, cv=4, max_iter_opt=10)

In [64]:
# The optimiser returns continuous values; truncate the two count-like
# parameters to int, as the optimiser's own objective does before fitting.
params_opt.update(
    bagging_freq=int(params_opt['bagging_freq']),
    n_estimators=int(params_opt['n_estimators']),
)

### * Evaluate with CV

In [65]:
# Classifiers under comparison. Note that the 'svr' key holds a linear SVM
# *classifier* (LinearSVC); 'forest' is LightGBM in random-forest mode built
# from the tuned parameters.
models = dict(
    svr=LinearSVC(C=10000, dual=False, max_iter=100, tol=1e-2),
    linear=LogisticRegression(
        random_state=42, multi_class='ovr', solver='sag',
        C=10000, tol=1e-2, n_jobs=10,
    ),
    forest=LGBMClassifier(boosting_type='rf', **params_opt),
)

In [66]:
# Per-model accuracy on each of 6 shuffled CV folds of the training split.
stats = {}

for k, model in models.items():
    stats[k] = []
    # A fresh unseeded shuffle is drawn for every model, so the models are not
    # scored on identical folds.
    kfold = KFold(n_splits=6, shuffle=True)

    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO (original note): some sort of per-fold optimization belongs here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred); normalize=True is its default
        stats[k].append(accuracy_score(y_crossval_test, model.predict(X_crossval_test)))
    print(k + ' is ready')

svr is ready
linear is ready
forest is ready


In [67]:
# Report the mean of the per-fold accuracies collected for every model.
print('for pure data (averate scores):')
for model_name in stats:
    mean_accuracy = np.mean(stats[model_name])
    print('{}, Accuracy: {}'.format(model_name, mean_accuracy))

for pure data (averate scores):
svr, Accuracy: 0.9128703703703703
linear, Accuracy: 0.9156481481481481
forest, Accuracy: 0.920388888888889


In [68]:
# Refit every model on the whole training split and score it on the held-out
# validation split.
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred)
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val))))

for pure data (validation scores):
svr, Accuracy: 0.9056666666666666
linear, Accuracy: 0.9071666666666667
forest, Accuracy: 0.9176666666666666


### Training model with PCA-processed data

In [69]:
# Project onto the first 10 principal components (the count is hard-coded).
# The projection is fitted on the training split only and then applied
# unchanged to the validation split.
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

In [70]:
# Shuffle the projected features and labels with the same permutation.
# Caution: y_train is rebound here, so from this cell on it no longer lines up
# row-for-row with X_train; re-running earlier cells that pair X_train with
# y_train would use mismatched labels.
X_pca, y_train = shuffle(X_pca, y_train)

### * Estimating LightGBM params

In [71]:
# Re-tune the LightGBM random forest on the PCA features: 4-fold CV, n_iter=10.
params_opt = bayesian_optimization_lightgbm(X_pca, y_train, cv=4, max_iter_opt=10)

In [72]:
# The optimiser returns continuous values; truncate the two count-like
# parameters to int, as the optimiser's own objective does before fitting.
params_opt.update(
    bagging_freq=int(params_opt['bagging_freq']),
    n_estimators=int(params_opt['n_estimators']),
)

In [73]:
# Fresh instances of the same three classifiers for the PCA features. The
# 'svr' key holds a linear SVM *classifier* (LinearSVC); 'forest' uses the
# parameters tuned on the PCA data.
models = dict(
    svr=LinearSVC(C=10000, dual=False, max_iter=100, tol=1e-2),
    linear=LogisticRegression(
        random_state=42, multi_class='ovr', solver='sag',
        C=10000, tol=1e-2, n_jobs=10,
    ),
    forest=LGBMClassifier(boosting_type='rf', **params_opt),
)

### * Evaluate with CV

In [74]:
# Per-model accuracy on each of 6 shuffled CV folds of the PCA-projected
# training split.
stats = {}

for k, model in models.items():
    stats[k] = []
    # A fresh unseeded shuffle is drawn for every model, so the models are not
    # scored on identical folds.
    kfold = KFold(n_splits=6, shuffle=True)

    for train_ix, test_ix in kfold.split(X_pca, y_train):
        X_crossval_train, X_crossval_test = X_pca[train_ix], X_pca[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO (original note): some sort of per-fold optimization belongs here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred); normalize=True is its default
        stats[k].append(accuracy_score(y_crossval_test, model.predict(X_crossval_test)))
    print(k + ' is ready')

svr is ready
linear is ready
forest is ready


In [75]:
# Report the mean of the per-fold accuracies collected for every model.
print('for PCA data (averate scores):')
for model_name in stats:
    mean_accuracy = np.mean(stats[model_name])
    print('{}, Accuracy: {}'.format(model_name, mean_accuracy))

for PCA data (averate scores):
svr, Accuracy: 0.7669444444444444
linear, Accuracy: 0.7760370370370371
forest, Accuracy: 0.8283148148148148


In [76]:
# Refit every model on all PCA-projected training rows and score it on the
# PCA-projected validation split.
print('for decorrelated data using PCA (validation scores):')
for name, model in models.items():
    model.fit(X_pca, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred)
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val_pca))))

for decorrelated data using PCA (validation scores):
svr, Accuracy: 0.7715
linear, Accuracy: 0.7815
forest, Accuracy: 0.8345


# Credit Card Fraud

In [37]:
# Load the credit-card dataset from the local CSV file.
df = pd.read_csv('./datasets/creditcard/creditcard.csv')

In [38]:
# Every column except the label ("Class") and "Time" is used as a feature.
# np.setdiff1d returns the remaining names sorted, so the feature columns end
# up in alphabetical order rather than file order.
feature_columns = np.setdiff1d(df.columns, ["Class", "Time"])
X = df[feature_columns].values
y = df["Class"].values

In [39]:
# Hold out 10% of the rows for validation (unseeded, so it differs per run).
# NOTE(review): the split is not stratified on y; if the positive class is rare
# (presumably the case for fraud labels) consider stratify=y -- confirm.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [40]:
# Shape of the full feature matrix X (train and validation rows together).
print('dataset size: {}'.format(X.shape))

dataset size: (284807, 29)


## Training plain model

In [41]:
# Shuffle features and labels with the same permutation (unseeded).
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [42]:
# Tune the LightGBM random forest on the training split: 4-fold CV, n_iter=10.
params_opt = bayesian_optimization_lightgbm(X_train, y_train, cv=4, max_iter_opt=10)

In [43]:
# The optimiser returns continuous values; truncate the two count-like
# parameters to int, as the optimiser's own objective does before fitting.
params_opt.update(
    bagging_freq=int(params_opt['bagging_freq']),
    n_estimators=int(params_opt['n_estimators']),
)

### * Evaluate with CV

In [44]:
# Classifiers under comparison. Note that the 'svr' key holds a linear SVM
# *classifier* (LinearSVC); 'forest' is LightGBM in random-forest mode built
# from the tuned parameters.
models = dict(
    svr=LinearSVC(C=10000, dual=False, max_iter=100, tol=1e-2),
    linear=LogisticRegression(
        random_state=42, multi_class='ovr', solver='sag',
        C=10000, tol=1e-2, n_jobs=10,
    ),
    forest=LGBMClassifier(boosting_type='rf', **params_opt),
)

In [45]:
# Per-model accuracy on each of 6 shuffled CV folds of the training split.
# NOTE(review): if the positive class is rare (presumably so for fraud labels),
# plain accuracy is dominated by the majority class -- confirm the metric.
stats = {}

for k, model in models.items():
    stats[k] = []
    # A fresh unseeded shuffle is drawn for every model, so the models are not
    # scored on identical folds.
    kfold = KFold(n_splits=6, shuffle=True)

    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO (original note): some sort of per-fold optimization belongs here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred); normalize=True is its default
        stats[k].append(accuracy_score(y_crossval_test, model.predict(X_crossval_test)))
    print(k + ' is ready')

svr is ready
linear is ready
forest is ready


In [46]:
# Report the mean of the per-fold accuracies collected for every model.
print('for pure data (averate scores):')
for model_name in stats:
    mean_accuracy = np.mean(stats[model_name])
    print('{}, Accuracy: {}'.format(model_name, mean_accuracy))

for pure data (averate scores):
svr, Accuracy: 0.9991300141226408
linear, Accuracy: 0.9982093115797852
forest, Accuracy: 0.9994577218073859


In [47]:
# Refit every model on the whole training split and score it on the held-out
# validation split.
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred)
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val))))

for pure data (validation scores):
svr, Accuracy: 0.9991573329588147
linear, Accuracy: 0.9978933323970366
forest, Accuracy: 0.9992275552122467


### Training model with PCA-processed data

In [48]:
# Project onto the first 10 principal components (the count is hard-coded).
# The projection is fitted on the training split only and then applied
# unchanged to the validation split.
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

In [49]:
# Shuffle the projected features and labels with the same permutation.
# Caution: y_train is rebound here, so from this cell on it no longer lines up
# row-for-row with X_train; re-running earlier cells that pair X_train with
# y_train would use mismatched labels.
X_pca, y_train = shuffle(X_pca, y_train)

### * Estimating LightGBM params

In [50]:
# Re-tune the LightGBM random forest on the PCA features: 4-fold CV, n_iter=10.
params_opt = bayesian_optimization_lightgbm(X_pca, y_train, cv=4, max_iter_opt=10)

In [51]:
# The optimiser returns continuous values; truncate the two count-like
# parameters to int, as the optimiser's own objective does before fitting.
params_opt.update(
    bagging_freq=int(params_opt['bagging_freq']),
    n_estimators=int(params_opt['n_estimators']),
)

In [52]:
# Classifiers for the PCA features. Unlike the other model tables in this
# notebook, LogisticRegression runs with library defaults and LinearSVC uses
# C=100 / max_iter=200. The 'svr' key holds a linear SVM *classifier*.
models = dict(
    linear=LogisticRegression(),
    svr=LinearSVC(C=100, dual=False, max_iter=200),
    forest=LGBMClassifier(boosting_type='rf', **params_opt),
)

In [53]:
# Per-model accuracy on each of 6 shuffled CV folds of the PCA-projected
# training split.
stats = {}

for k, model in models.items():
    stats[k] = []
    # A fresh unseeded shuffle is drawn for every model, so the models are not
    # scored on identical folds.
    kfold = KFold(n_splits=6, shuffle=True)

    for train_ix, test_ix in kfold.split(X_pca, y_train):
        X_crossval_train, X_crossval_test = X_pca[train_ix], X_pca[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO (original note): some sort of per-fold optimization belongs here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred); normalize=True is its default
        stats[k].append(accuracy_score(y_crossval_test, model.predict(X_crossval_test)))

In [54]:
# Report the mean of the per-fold accuracies collected for every model.
print('for PCA data (averate scores):')
for model_name in stats:
    mean_accuracy = np.mean(stats[model_name])
    print('{}, Accuracy: {}'.format(model_name, mean_accuracy))

for PCA data (averate scores):
linear, Accuracy: 0.9989817654081131
svr, Accuracy: 0.9989232461786943
forest, Accuracy: 0.9992665589912845


In [55]:
# Refit every model on all PCA-projected training rows and score it on the
# PCA-projected validation split.
print('for decorrelated data using PCA (validation scores):')
for name, model in models.items():
    model.fit(X_pca, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred)
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val_pca))))

for decorrelated data using PCA (validation scores):
linear, Accuracy: 0.9990519995786665
svr, Accuracy: 0.9988413328183702
forest, Accuracy: 0.9993328885923949
