In [32]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import struct

from sklearn.manifold import LocallyLinearEmbedding as LLE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier 

from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, train_test_split

from hyperopt import fmin, tpe, hp
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

import matplotlib.pyplot as plt

%matplotlib inline

# RF regressor optimization code

In [21]:
# hyperopt for lightgbm shows terrible results
def hyperopt_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    space = hp.choice('clr_type', [
        {
            'type': 'lightgbm',
            'feature_fraction': hp.uniform('feature_fraction', 0.05, 0.95),
            'bagging_fraction': hp.uniform('bagging_fraction', 0.05, 0.95),
            'bagging_freq': hp.uniform('bagging_freq', 1, 50),
            'n_estimators': hp.uniform('n_estimators', 5, 50),
            #'max_bin': hp.uniform('max_bin', )
        }
    ])

    best = fmin(
        fn=lambda args: cross_val_score(
            LGBMClassifier(
                boosting_type='rf', 
                feature_fraction=args['feature_fraction'], 
                bagging_freq=int(args['bagging_freq']), 
                bagging_fraction=args['bagging_fraction'],
                n_estimators=int(args['n_estimators'])
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        ).mean(),
        space=space,
        algo=tpe.suggest,
        max_evals=max_iter_opt
    )
    
    return best

def bayesian_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    svr_opt = BayesianOptimization(
        lambda feature_fraction, bagging_freq, bagging_fraction, n_estimators: cross_val_score(
            LGBMClassifier(
                boosting_type='rf', 
                feature_fraction=feature_fraction, 
                bagging_freq=int(bagging_freq), 
                bagging_fraction=bagging_fraction,
                n_estimators=int(n_estimators)
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        ).mean(),
        {'feature_fraction': (0.05, 0.95),
         'bagging_fraction': (0.05, 0.95),
         'bagging_freq': (1, 50),
         'n_estimators': (5, 50) },
        verbose=0
    )
    
    svr_opt.init(10)
    svr_opt.maximize(init_points=10, n_iter=max_iter_opt)
    
    return svr_opt.res['max']['max_params']#['C']

# MNIST

In [4]:
def read_idx(filename):
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        shape = tuple(struct.unpack('>I', f.read(4))[0] for d in range(dims))
        return np.fromstring(f.read(), dtype=np.uint8).reshape(shape)

In [5]:
X = read_idx('../../datasets/mnist/train-images.idx3-ubyte')
y = read_idx('../../datasets/mnist/train-labels.idx1-ubyte')

In [6]:
X = X / np.max(X)

In [7]:
X = X.reshape(X.shape[0], -1)

In [None]:
X = X[:5000, :]
y = y[:5000]

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [11]:
print('dataset size: {}'.format(X.shape))

dataset size: (450, 784)


## Training plain model

In [12]:
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [23]:
params_opt = hyperopt_optimization_lightgbm(X_train[:100, :], y_train[:100], cv=4, max_iter_opt=10)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [27]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [28]:
models = {
    'rf': LGBMClassifier(boosting_type='rf', **params_opt),
    'lr': LogisticRegression(random_state=42, multi_class='ovr', solver='liblinear', C=10000, tol=1e-2),
    'svc': LinearSVC(multi_class='ovr', C=10000, tol=1e-2),
}

In [29]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))
    print(k + ' is ready')

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


rf is ready
lr is ready
svc is ready


In [30]:
print('for pure data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
rf, Accuracy: 0.10444444444444445
lr, Accuracy: 0.08666666666666667
svc, Accuracy: 0.07777777777777777


In [33]:
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val), y_val.squeeze())))

for pure data (validation scores):
rf, Accuracy: 0.2
lr, Accuracy: 0.06
svc, Accuracy: 0.08


### Training model with LLE-processed data

In [18]:
# number of components is fixed to 10
lle = LLE(n_components=10)
X_lle = lle.fit_transform(X_train)
X_val_lle = lle.transform(X_val)

In [19]:
X_lle, y_train = shuffle(X_lle, y_train)

### * Estimating LightGBM params

In [34]:
params_opt = hyperopt_optimization_lightgbm(X_lle, y_train, cv=4, max_iter_opt=10)

In [35]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

In [36]:
models = {
    'svr': LinearSVC(C=10000, dual=False, max_iter=100, tol=1e-2),
    'linear': LogisticRegression(random_state=42, multi_class='ovr', solver='sag', C=10000, tol=1e-2, n_jobs=10),
    'forest': LGBMClassifier(boosting_type='rf', **params_opt)
}

### * Evaluate with CV

In [37]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_lle, y_train):
        X_crossval_train, X_crossval_test = X_lle[train_ix], X_lle[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))
    print(k + ' is ready')

svr is ready
linear is ready
forest is ready


In [38]:
print('for LLE data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for LLE data (averate scores):
svr, Accuracy: 0.7977777777777778
linear, Accuracy: 0.8022222222222223
forest, Accuracy: 0.29777777777777775


In [39]:
print('for decorrelated data using LLE (validation scores):')
for name, model in models.items():
    model.fit(X_lle, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val_lle), y_val.squeeze())))

for decorrelated data using LLE (validation scores):
svr, Accuracy: 0.88
linear, Accuracy: 0.86
forest, Accuracy: 0.44


# Credit Card Fraud

In [42]:
df = pd.read_csv('../../datasets/creditcard/creditcard.csv')

In [51]:
X = df[np.setdiff1d(df.columns, ["Class", "Time"])].values
y = df.Class.values

In [54]:
X_train, X_val, y_train, y_val = train_test_split(X[:500, :], y[:500], test_size=0.1)

In [55]:
print('dataset size: {}'.format(X_train.shape))

dataset size: (900, 29)


## Training plain model

In [65]:
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [66]:
params_opt = hyperopt_optimization_lightgbm(X_train, y_train, cv=4, max_iter_opt=10)

In [67]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [68]:
models = {
    'svr': LinearSVC(C=10000, dual=False, max_iter=100, tol=1e-2),
    'linear': LogisticRegression(random_state=42, multi_class='ovr', solver='sag', C=10000, tol=1e-2, n_jobs=10),
    'forest': LGBMClassifier(boosting_type='rf', **params_opt)
}

In [69]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))
    print(k + ' is ready')

svr is ready
linear is ready
forest is ready


In [70]:
print('for pure data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
svr, Accuracy: 0.9944444444444445
linear, Accuracy: 0.9966666666666666
forest, Accuracy: 0.9977777777777778


In [71]:
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val), y_val.squeeze())))

for pure data (validation scores):
svr, Accuracy: 1.0
linear, Accuracy: 1.0
forest, Accuracy: 1.0


### Training model with LLE-processed data

In [72]:
# number of components is fixed to 10
lle = LLE(n_components=10)
X_lle = lle.fit_transform(X_train)
X_val_lle = lle.transform(X_val)

In [73]:
X_lle, y_train = shuffle(X_lle, y_train)

### * Estimating LightGBM params

In [None]:
params_opt = hyperopt_optimization_lightgbm(X_lle, y_train, cv=4, max_iter_opt=10)

In [None]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

In [None]:
models = {
    'linear': LogisticRegression(),
    'svr': LinearSVC(C=100, dual=False, max_iter=200),
    'forest': LGBMClassifier(boosting_type='rf', **params_opt)
}

In [None]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_lle, y_train):
        X_crossval_train, X_crossval_test = X_lle[train_ix], X_lle[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))

In [None]:
print('for LLE data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

In [None]:
print('for decorrelated data using LLE (validation scores):')
for name, model in models.items():
    model.fit(X_lle, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val_lle), y_val.squeeze())))

# Breast Cancer

In [76]:
df = pd.read_csv('../../datasets/cancer/breast_cancer.csv')
y = df.diagnosis.replace('M', 0).replace('B', 1).values
X = np.asarray(df.drop(['diagnosis', 'id', 'Unnamed: 32'], axis=1).as_matrix())

In [77]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [78]:
print('dataset size: {}'.format(X.shape))

dataset size: (569, 30)


## Training plain model

In [79]:
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [80]:
params_opt = hyperopt_optimization_lightgbm(X, y, cv=4, max_iter_opt=10)

In [81]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [82]:
models = {
    'rf': LGBMClassifier(boosting_type='rf', **params_opt),
    'lr': LogisticRegression(random_state=42, multi_class='ovr', solver='liblinear', C=10000, tol=1e-2),
    'svc': LinearSVC(multi_class='ovr', C=10000, tol=1e-2),
}

In [83]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))
    print(k + ' is ready')

rf is ready
lr is ready
svc is ready


In [84]:
print('for pure data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
rf, Accuracy: 0.36529867761057916
lr, Accuracy: 0.9315321477428181
svc, Accuracy: 0.9238030095759234


In [85]:
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val), y_val.squeeze())))

for pure data (validation scores):
rf, Accuracy: 0.43859649122807015
lr, Accuracy: 0.8947368421052632
svc, Accuracy: 0.9298245614035088


### Training model with LLE-processed data

In [86]:
# number of components is fixed to 10
lle = LLE(n_components=10)
X_lle = lle.fit_transform(X_train)
X_val_lle = lle.transform(X_val)

In [87]:
X_lle, y_train = shuffle(X_lle, y_train)

### * Estimating LightGBM params

In [88]:
params_opt = hyperopt_optimization_lightgbm(X_lle, y_train, cv=4, max_iter_opt=10)

In [89]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

In [90]:
models = {
    'rf': LGBMClassifier(boosting_type='rf', **params_opt),
    'lr': LogisticRegression(random_state=42, multi_class='ovr', solver='liblinear', C=10000, tol=1e-2),
    'svc': LinearSVC(multi_class='ovr', C=10000, tol=1e-2),
}

In [91]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_lle, y_train):
        X_crossval_train, X_crossval_test = X_lle[train_ix], X_lle[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))

In [92]:
print('for LLE data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for LLE data (averate scores):
rf, Accuracy: 0.8264021887824896
lr, Accuracy: 0.9238486092111264
svc, Accuracy: 0.7266529867761058


In [93]:
print('for decorrelated data using LLE (validation scores):')
for name, model in models.items():
    model.fit(X_lle, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val_lle), y_val.squeeze())))

for decorrelated data using LLE (validation scores):
rf, Accuracy: 0.8947368421052632
lr, Accuracy: 0.8596491228070176
svc, Accuracy: 0.42105263157894735
