In [60]:
import warnings
warnings.filterwarnings('ignore')

In [61]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import FastICA

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
# https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMModel
from lightgbm import LGBMClassifier 

from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, train_test_split

from hyperopt import fmin, tpe, hp
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

In [62]:
class MedianBinarizer(object):
    def __init__(self, n_bins: int):
        self.n_bins = n_bins
        self.x_bins = None
        self.encoder = None
    
    def _get_feature_bins(self, X):
        #extra_bin = 0
        #if X.shape[0] % self.n_bins != 0:
        #    extra_bin = 1
        #    print('number of bins does not fit to dataset'
        #          ', extending number of bins to extra one')
        #
        #n_batch = X.shape[0] // self.n_bins
        #X_sorted = np.sort(X)
        #x_bins = [X[i*n_batch:(i+1)*n_batch] for i in range(self.n_bins + extra_bin)]
        
        iterations = np.log2(self.n_bins)
        if iterations - int(iterations) != 0:
            raise Exception('bins should be a power of 2')
        
        # получаем вариационный ряд
        x_bins = [np.sort(X)]
        for i in range(int(iterations)):
            new_bins = list()
            for j in x_bins:
                # середина среза вариационного ряда
                index = j.shape[0] // 2
                new_bins += [j[:index], j[index:]]
            x_bins = new_bins
        return x_bins
    
    def _lookup(self, x):
        for k, _bin in enumerate(self.x_bins):
            if x <= _bin[-1]:
                return k
        return k
    
    def _real_to_category(self, X):
        # X should not be a column vector
        X_new = []
        for x in X:
            i = self._lookup(x)
            
            if i != -1:
                X_new.append(i)
            else:
                raise Exception('something went wrong')

        return np.array(X_new).reshape(-1, 1)
    
    def fit(self, X):
        self.x_bins = self._get_feature_bins(X)
        X_cat = self._real_to_category(X)

        self.encoder = OneHotEncoder(n_values=self.n_bins, sparse=False)
        self.encoder.fit(X_cat)
        
        return self
    
    def transform_categorical(self, X):
        return self._real_to_category(X)
    
    def transform_onehot(self, X):
        x_cat = self._real_to_category(X)
        return self.encoder.transform(x_cat)
    
    def onehot_to_categorical(self, X):
        return np.argwhere(X)[:, 1]
    
    def categorical_to_onehot(self, X):
        return self.encoder.transform(X)

# RF regressor optimization code

In [63]:
# hyperopt for lightgbm shows terrible results
def hyperopt_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    space = hp.choice('clr_type', [
        {
            'type': 'lightgbm',
            'feature_fraction': hp.uniform('feature_fraction', 0.05, 0.95),
            'bagging_fraction': hp.uniform('bagging_fraction', 0.05, 0.95),
            'bagging_freq': hp.uniform('bagging_freq', 1, 50),
            'n_estimators': hp.uniform('n_estimators', 5, 50),
            #'max_bin': hp.uniform('max_bin', )
        }
    ])

    best = fmin(
        fn=lambda args: cross_val_score(
            LGBMClassifier(
                boosting_type='rf', 
                feature_fraction=args['feature_fraction'], 
                bagging_freq=int(args['bagging_freq']), 
                bagging_fraction=args['bagging_fraction'],
                n_estimators=int(args['n_estimators'])
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        ).mean(),
        space=space,
        algo=tpe.suggest,
        max_evals=max_iter_opt
    )
    
    return best

def bayesian_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    svr_opt = BayesianOptimization(
        lambda feature_fraction, bagging_freq, bagging_fraction, n_estimators: cross_val_score(
            LGBMClassifier(
                boosting_type='rf', 
                feature_fraction=feature_fraction, 
                bagging_freq=int(bagging_freq), 
                bagging_fraction=bagging_fraction,
                n_estimators=int(n_estimators)
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        ).mean(),
        {'feature_fraction': (0.05, 0.95),
         'bagging_fraction': (0.05, 0.95),
         'bagging_freq': (1, 50),
         'n_estimators': (5, 50) },
        verbose=0
    )
    
    svr_opt.init(10)
    svr_opt.maximize(n_iter=max_iter_opt)
    
    return svr_opt.res['max']['max_params']#['C']

## Facebook Comment Volume Dataset

In [64]:
df = pd.read_csv('../datasets/facebook_comments.csv', index_col=0)

In [65]:
print('dataset size: {}'.format(df.shape))

dataset size: (40949, 54)


In [66]:
y = MedianBinarizer(n_bins=32).fit(df.Target.values.reshape(-1, 1)).transform_categorical(df.Target.values.reshape(-1, 1))
df.drop(['Target'], axis=1, inplace=True)

categorical_features = ['H Local', 'Post Promotion Status', 'Base Time', 'Page Category']


features_categorical = [
    OneHotEncoder().fit_transform(df[i].values.reshape(-1, 1)).todense() for i in categorical_features
]

features_real = [
    MedianBinarizer(n_bins=32).fit(df[i].values.reshape(-1, 1)).transform_onehot(df[i].values.reshape(-1, 1))
    for i in np.setdiff1d(ar1=df.columns, ar2=categorical_features)
]

X = np.concatenate(features_categorical + features_real, axis=1)

In [67]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [68]:
print('dataset size after preprocessing: {}'.format(X.shape))

dataset size after preprocessing: (40949, 1747)


### Training model with ICA-processed data

In [69]:
# number of components is fixed to 10
ica = FastICA(n_components=10)
X_ica = ica.fit_transform(X_train)
X_val_ica = ica.transform(X_val)

In [70]:
X_ica, y_train = shuffle(X_ica, y_train)

### * Estimating LightGBM params

In [72]:
params_opt = bayesian_optimization_lightgbm(X_ica, y_train, cv=6, max_iter_opt=15)

In [73]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [74]:
models = {
    'linear': LogisticRegression(),
    'svr': LinearSVC(C=100, dual=False, max_iter=200),
    'forest': LGBMClassifier(boosting_type='rf', **params_opt)
}

In [75]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_ica, y_train):
        X_crossval_train, X_crossval_test = X_ica[train_ix], X_ica[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))

In [76]:
print('for ICA data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for pure data (averate scores):
linear, Accuracy: 0.5512014179935929
svr, Accuracy: 0.6526295942731336
forest, Accuracy: 0.6601722694259458


In [77]:
print('for ICA data (validation scores):')
for name, model in models.items():
    model.fit(X_ica, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val_ica), y_val.squeeze())))

for ICA data (validation scores):
linear, Accuracy: 0.5531135531135531
svr, Accuracy: 0.6490842490842491
forest, Accuracy: 0.6615384615384615


## Parkinsons Telemonitoring Data Set

In [86]:
df = pd.read_csv('../datasets/parkinsons_updrs.data')

In [87]:
print('dataset size: {}'.format(df.shape))

dataset size: (5875, 22)


In [88]:
y = MedianBinarizer(n_bins=32).fit(df.total_UPDRS.values.reshape(-1, 1)).transform_categorical(df.total_UPDRS.values.reshape(-1, 1))
df.drop(['total_UPDRS'], axis=1, inplace=True)

categorical_features = ['subject#']


features_categorical = [
    OneHotEncoder().fit_transform(df[i].values.reshape(-1, 1)).todense() for i in categorical_features
]

features_real = [
    MedianBinarizer(n_bins=32).fit(df[i].values.reshape(-1, 1)).transform_onehot(df[i].values.reshape(-1, 1))
    for i in np.setdiff1d(ar1=df.columns, ar2=categorical_features)
]

X = np.concatenate(features_categorical + features_real, axis=1)

In [89]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [90]:
print('dataset size after preprocessing: {}'.format(X.shape))

dataset size after preprocessing: (5875, 682)


### Training model with ICA-processed data

In [91]:
# number of components is fixed to 10
ica = FastICA(n_components=10)
X_ica = ica.fit_transform(X_train)
X_val_ica = ica.transform(X_val)

In [92]:
X_ica, y_train = shuffle(X_ica, y_train)

### * Estimating LightGBM params

In [93]:
params_opt = bayesian_optimization_lightgbm(X_ica, y_train, cv=6, max_iter_opt=15)

In [94]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [95]:
models = {
    'linear': LogisticRegression(),
    'svr': LinearSVC(C=100, dual=False, max_iter=200),
    'forest': LGBMClassifier(boosting_type='rf', **params_opt)
}

In [96]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_ica, y_train):
        X_crossval_train, X_crossval_test = X_ica[train_ix], X_ica[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))

In [97]:
print('for ICA data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for ICA data (averate scores):
linear, Accuracy: 0.4787188259294916
svr, Accuracy: 0.5057692087287398
forest, Accuracy: 0.6731536239856966


In [99]:
print('for decorrelated data using ICA (validation scores):')
for name, model in models.items():
    model.fit(X_ica, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val_ica), y_val.squeeze())))

for decorrelated data using ICA (validation scores):
linear, Accuracy: 0.4642857142857143
svr, Accuracy: 0.4880952380952381
forest, Accuracy: 0.6564625850340136


## Energy efficiency Data Set

In [100]:
df = pd.read_excel('../datasets/ENB2012_data.xlsx')

In [101]:
print('dataset size: {}'.format(df.shape))

dataset size: (768, 10)


In [102]:
y = MedianBinarizer(n_bins=32).fit(df.Y1.values.reshape(-1, 1)).transform_categorical(df.Y1.values.reshape(-1, 1))
df.drop(['Y1', 'Y2'], axis=1, inplace=True)

categorical_features = ['X6']


features_categorical = [
    OneHotEncoder().fit_transform(df[i].values.reshape(-1, 1)).todense() for i in categorical_features
]

features_real = [
    MedianBinarizer(n_bins=32).fit(df[i].values.reshape(-1, 1)).transform_onehot(df[i].values.reshape(-1, 1))
    for i in np.setdiff1d(ar1=df.columns, ar2=categorical_features)
]

X = np.concatenate(features_categorical + features_real, axis=1)

In [103]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

In [104]:
print('dataset size after preprocessing: {}'.format(X.shape))

dataset size after preprocessing: (768, 228)


### Training model with ICA-processed data

In [105]:
# number of components is fixed to 4
ica = FastICA(n_components=4)
X_ica = ica.fit_transform(X_train)
X_val_ica = ica.transform(X_val)

In [106]:
X_ica, y_train = shuffle(X_ica, y_train)

### * Estimating LightGBM params

In [107]:
params_opt = bayesian_optimization_lightgbm(X_ica, y_train, cv=6, max_iter_opt=15)

In [108]:
params_opt['bagging_freq'] = int(params_opt['bagging_freq'])
params_opt['n_estimators'] = int(params_opt['n_estimators'])

### * Evaluate with CV

In [109]:
models = {
    'linear': LogisticRegression(),
    'svr': LinearSVC(C=100, dual=False, max_iter=200),
    'forest': LGBMClassifier(boosting_type='rf', **params_opt)
}

In [110]:
stats = {}

for k, model in models.items():
    stats[k] = []
    kfold = KFold(n_splits=8, shuffle=True)
    
    for train_ix, test_ix in kfold.split(X_ica, y_train):
        X_crossval_train, X_crossval_test = X_ica[train_ix], X_ica[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]
        
        # here must be sume sort of optimization
        model.fit(X_crossval_train, y_crossval_train.ravel())
        stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))

In [111]:
print('for ICA data (averate scores):')
for model, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))

for ICA data (averate scores):
linear, Accuracy: 0.5398623362737236
svr, Accuracy: 0.7220662924351777
forest, Accuracy: 0.7611601176156109


In [113]:
print('for decorrelated data using ICA (validation scores):')
for name, model in models.items():
    model.fit(X_ica, y_train.squeeze())
    print('{}, Accuracy: {}'.format(name, accuracy_score(model.predict(X_val_ica), y_val.squeeze())))

for decorrelated data using ICA (validation scores):
linear, Accuracy: 0.5714285714285714
svr, Accuracy: 0.7662337662337663
forest, Accuracy: 0.7922077922077922
