In [1]:
import pandas as pd
import numpy as np
import struct

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier 

from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, train_test_split

from hyperopt import fmin, tpe, hp
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

import matplotlib.pyplot as plt

%matplotlib inline

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
import warnings
warnings.filterwarnings('ignore')

# RF regressor optimization code

In [3]:
# hyperopt for lightgbm shows terrible results
def hyperopt_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    space = hp.choice('clr_type', [
        {
            'type': 'lightgbm',
            'feature_fraction': hp.uniform('feature_fraction', 0.05, 0.95),
            'bagging_fraction': hp.uniform('bagging_fraction', 0.05, 0.95),
            'bagging_freq': hp.uniform('bagging_freq', 1, 50),
            'n_estimators': hp.uniform('n_estimators', 5, 50),
            #'max_bin': hp.uniform('max_bin', )
        }
    ])

    best = fmin(
        fn=lambda args: cross_val_score(
            LGBMClassifier(
                boosting_type='rf', 
                feature_fraction=args['feature_fraction'], 
                bagging_freq=int(args['bagging_freq']), 
                bagging_fraction=args['bagging_fraction'],
                n_estimators=int(args['n_estimators'])
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        ).mean(),
        space=space,
        algo=tpe.suggest,
        max_evals=max_iter_opt
    )
    
    return best

def bayesian_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    svr_opt = BayesianOptimization(
        lambda feature_fraction, bagging_freq, bagging_fraction, n_estimators: cross_val_score(
            LGBMClassifier(
                boosting_type='rf', 
                feature_fraction=feature_fraction, 
                bagging_freq=int(bagging_freq), 
                bagging_fraction=bagging_fraction,
                n_estimators=int(n_estimators)
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        ).mean(),
        {'feature_fraction': (0.05, 0.95),
         'bagging_fraction': (0.05, 0.95),
         'bagging_freq': (1, 50),
         'n_estimators': (5, 50) },
        verbose=0
    )
    
    svr_opt.init(10)
    svr_opt.maximize(n_iter=max_iter_opt)
    
    return svr_opt.res['max']['max_params']#['C']

# MNIST

In [4]:
def read_idx(filename):
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        shape = tuple(struct.unpack('>I', f.read(4))[0] for d in range(dims))
        return np.fromstring(f.read(), dtype=np.uint8).reshape(shape)

In [5]:
X = read_idx('../../datasets/mnist/train-images.idx3-ubyte')
y = read_idx('../../datasets/mnist/train-labels.idx1-ubyte')

In [6]:
X = X / np.max(X)

In [7]:
X = X.reshape(X.shape[0], -1)

In [8]:
X = X[:1000, :]
y = y[:1000]

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [10]:
print('dataset size: {}'.format(X.shape))

dataset size: (1000, 784)


## Training plain model

In [11]:
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [12]:
params_opt = hyperopt_optimization_lightgbm(X_train, y_train, cv=4, max_iter_opt=10)

In [13]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [14]:
models = {
    'rf': LGBMClassifier(boosting_type='rf', **params_opt),
    'lr': LogisticRegression(random_state=42, multi_class='ovr', solver='liblinear', C=10000, tol=1e-2),
    'svc': LinearSVC(multi_class='ovr', C=10000, tol=1e-2),
}

In [15]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))
    print(k + ' is ready')

rf is ready
lr is ready
svc is ready


In [16]:
print('for pure data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
rf, Accuracy: 0.33555555555555555
lr, Accuracy: 0.8455555555555555
svc, Accuracy: 0.8444444444444444


In [17]:
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val), y_val.squeeze())))

for pure data (validation scores):
rf, Accuracy: 0.42
lr, Accuracy: 0.85
svc, Accuracy: 0.85


### Training model with T-SNE-processed data

In [18]:
# number of components is fixed to 10
tsne = TSNE(n_components=10, n_iter=300, method='exact')
X = tsne.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [19]:
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [21]:
params_opt = hyperopt_optimization_lightgbm(X_train, y_train, cv=4, max_iter_opt=10)

In [22]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

In [23]:
models = {
    'svr': LinearSVC(C=10000, dual=False, max_iter=100, tol=1e-2),
    'linear': LogisticRegression(random_state=42, multi_class='ovr', solver='sag', C=10000, tol=1e-2, n_jobs=10),
    'forest': LGBMClassifier(boosting_type='rf', **params_opt)
}

### * Evaluate with CV

In [24]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))
    print(k + ' is ready')

svr is ready
linear is ready
forest is ready


In [25]:
print('for T-SNE data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for T-SNE data (averate scores):
svr, Accuracy: 0.6144444444444445
linear, Accuracy: 0.64
forest, Accuracy: 0.27555555555555555


In [26]:
print('for T-SNE data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val), y_val.squeeze())))

for T-SNE data (validation scores):
svr, Accuracy: 0.72
linear, Accuracy: 0.73
forest, Accuracy: 0.3


# Credit Card Fraud

In [28]:
df = pd.read_csv('../../datasets/creditcard/creditcard.csv')

In [55]:
X = df[np.setdiff1d(df.columns, ["Class", "Time"])].values
y = df.Class.values

In [56]:
X = X[:5000, :]
y = y[:5000]

In [57]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [58]:
print('dataset size: {}'.format(X.shape))

dataset size: (5000, 29)


## Training plain model

In [59]:
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [60]:
params_opt = hyperopt_optimization_lightgbm(X_train, y_train, cv=4, max_iter_opt=10)

In [61]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [62]:
models = {
    'svr': LinearSVC(C=10000, dual=False, max_iter=100, tol=1e-2),
    'linear': LogisticRegression(random_state=42, multi_class='ovr', solver='sag', C=10000, tol=1e-2, n_jobs=10),
    'forest': LGBMClassifier(boosting_type='rf', **params_opt)
}

In [63]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))
    print(k + ' is ready')

svr is ready
linear is ready
forest is ready


In [64]:
print('for pure data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
svr, Accuracy: 0.9986666666666667
linear, Accuracy: 0.9991111111111112
forest, Accuracy: 0.9993333333333334


In [65]:
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val), y_val.squeeze())))

for pure data (validation scores):
svr, Accuracy: 0.998
linear, Accuracy: 1.0
forest, Accuracy: 1.0


### Training model with T-SNE-processed data

In [66]:
# number of components is fixed to 10
tsne = TSNE(n_components=10, n_iter=300, method='exact')
X = tsne.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

KeyboardInterrupt: 

In [None]:
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [None]:
params_opt = hyperopt_optimization_lightgbm(X_train, y_train, cv=4, max_iter_opt=10)

In [None]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

In [None]:
models = {
    'svr': LinearSVC(C=10000, dual=False, max_iter=100, tol=1e-2),
    'linear': LogisticRegression(random_state=42, multi_class='ovr', solver='sag', C=10000, tol=1e-2, n_jobs=10),
    'forest': LGBMClassifier(boosting_type='rf', **params_opt)
}

### * Evaluate with CV

In [None]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))
    print(k + ' is ready')

In [None]:
print('for T-SNE data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

In [None]:
print('for T-SNE data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val), y_val.squeeze())))

# Breast Cancer

In [68]:
df = pd.read_csv('../../datasets/cancer/breast_cancer.csv')
y = df.diagnosis.replace('M', 0).replace('B', 1).values
X = np.asarray(df.drop(['diagnosis', 'id', 'Unnamed: 32'], axis=1).as_matrix())

In [69]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [70]:
print('dataset size: {}'.format(X.shape))

dataset size: (569, 30)


## Training plain model

In [71]:
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [73]:
params_opt = hyperopt_optimization_lightgbm(X, y, cv=4, max_iter_opt=10)

In [74]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [75]:
models = {
    'rf': LGBMClassifier(boosting_type='rf', **params_opt),
    'lr': LogisticRegression(random_state=42, multi_class='ovr', solver='liblinear', C=10000, tol=1e-2),
    'svc': LinearSVC(multi_class='ovr', C=10000, tol=1e-2),
}

In [76]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))
    print(k + ' is ready')

rf is ready
lr is ready
svc is ready


In [77]:
print('for pure data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
rf, Accuracy: 0.3729822161422709
lr, Accuracy: 0.923825809393525
svc, Accuracy: 0.908253533971728


In [78]:
print('for pure data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val), y_val.squeeze())))

for pure data (validation scores):
rf, Accuracy: 0.9473684210526315
lr, Accuracy: 0.9298245614035088
svc, Accuracy: 0.9298245614035088


### Training model with T-SNE-processed data

In [79]:
# number of components is fixed to 10
tsne = TSNE(n_components=10, n_iter=300, method='exact')
X = tsne.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [80]:
X_train, y_train = shuffle(X_train, y_train)

### * Estimating LightGBM params

In [81]:
params_opt = hyperopt_optimization_lightgbm(X_train, y_train, cv=4, max_iter_opt=10)

In [82]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

In [83]:
models = {
    'svr': LinearSVC(C=10000, dual=False, max_iter=100, tol=1e-2),
    'linear': LogisticRegression(random_state=42, multi_class='ovr', solver='sag', C=10000, tol=1e-2, n_jobs=10),
    'forest': LGBMClassifier(boosting_type='rf', **params_opt)
}

### * Evaluate with CV

In [84]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=6, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_train, y_train):
        X_crossval_train, X_crossval_test = X_train[train_ix], X_train[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))
    print(k + ' is ready')

svr is ready
linear is ready
forest is ready


In [85]:
print('for T-SNE data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for T-SNE data (averate scores):
svr, Accuracy: 0.7674190606475149
linear, Accuracy: 0.7968308253533972
forest, Accuracy: 0.3713178294573643


In [86]:
print('for T-SNE data (validation scores):')
for name, model in models.items():
    model.fit(X_train, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val), y_val.squeeze())))

for T-SNE data (validation scores):
svr, Accuracy: 0.7543859649122807
linear, Accuracy: 0.7719298245614035
forest, Accuracy: 0.7368421052631579
