In [1]:
import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including any convergence or deprecation warnings the libraries below may emit.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array, as_float_array
from sklearn.decomposition import FastICA

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
# https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMModel
from lightgbm import LGBMClassifier 

from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, train_test_split

from hyperopt import fmin, tpe, hp
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

In [3]:
class MaxentropyDichtomizationTransformer(BaseEstimator, TransformerMixin):
    """Discretise each numeric column into ordinal bins by recursive bisection.

    Every column is sorted and cut in two at the position that best balances
    the number of samples on both sides; each half is cut again until
    ``log2(n_splits)`` levels have been produced. ``transform`` replaces every
    value by the index of the bin it falls into.

    Parameters
    ----------
    n_splits : int
        Requested number of bins per feature; must be a power of two. Fewer
        bins are produced when a block cannot be split any further.
    verbose : bool, default False
        Stored so scikit-learn's parameter introspection works; otherwise unused.
    """

    def __init__(self, n_splits, verbose=False):
        self.n_splits = n_splits
        # BaseEstimator.get_params()/repr() read every __init__ argument back
        # from an attribute of the same name, so it has to be stored.
        self.verbose = verbose

        self.n_samples = None
        self.n_features = None
        self._splits = None
        self._splits_indices = None

    def _check_X(self, X, n_features=None):
        """Validate ``X`` as a 2-D array and check its column count.

        The width is compared with ``n_features`` when given, otherwise with
        the width seen during ``fit`` (no check before the first fit).

        Raises
        ------
        ValueError
            If the number of columns differs from the expected one.
        """
        if not hasattr(X, 'dtype'):
            # plain Python containers are converted to a float array first
            X = as_float_array(X)
        _X = check_array(X)

        expected = self.n_features if n_features is None else n_features
        if expected and _X.shape[1] != expected:
            raise ValueError('X has {} columns while {} are expected'.format(_X.shape[1], expected))
        return _X

    def _get_maxentropy_split(self, X):
        """Find the interior position of 1-D ``X`` giving the most balanced cut.

        For every position ``i`` in ``1 .. len(X) - 2`` the block is divided
        into values ``< X[i]`` and values ``>= X[i]`` and scored with
        ``-sum(p * log(1 + p))`` over the two fractions ``p``. This is not the
        Shannon entropy but, like it, is largest for the most even cut. The
        first position with the best score wins.

        Returns
        -------
        (score, index)
            ``(0, -1)`` when the block has fewer than three elements or the
            best cut would leave one side empty.
        """
        X = np.asarray(X)
        block_size = X.shape[0]
        if block_size < 3:
            return 0, -1

        # Number of elements strictly below each interior candidate, computed
        # for all candidates at once instead of one full scan per candidate.
        below = np.searchsorted(np.sort(X), X[1:block_size - 1], side='left')
        p_low = below / block_size
        p_high = (block_size - below) / block_size
        scores = -(np.log(p_low + 1) * p_low + np.log(p_high + 1) * p_high)

        best = int(np.argmax(scores))  # argmax keeps the first of equal maxima
        if p_low[best] == 0 or p_high[best] == 0:
            return 0, -1

        return scores[best], best + 1

    def _dichtomize(self, X):
        """Recursively bisect sorted 1-D ``X`` into index ranges.

        Returns a list of ``(start, stop)`` positions into ``X``; neighbouring
        ranges share their boundary and the last one stops at ``len(X) - 1``.

        Raises
        ------
        ValueError
            If ``n_splits`` is not a positive power of two.
        """
        n_splits = self.n_splits
        if n_splits < 1 or int(n_splits) != n_splits or int(n_splits) & (int(n_splits) - 1):
            raise ValueError('number of bins should be of a power of 2')
        levels = int(n_splits).bit_length() - 1

        last = X.shape[0] - 1
        if levels == 0:
            # a single requested bin: nothing to split
            return [(0, last)]

        # make first maxentropy split
        _, initial_bin = self._get_maxentropy_split(X)
        if initial_bin < 0:
            # constant or too short column: keep everything in one bin
            # instead of building ranges from the -1 sentinel
            return [(0, last)]

        splits_current_feature = [(0, initial_bin), (initial_bin, last)]
        for _ in range(levels - 1):
            refined = list()
            for start, stop in splits_current_feature:
                _, index = self._get_maxentropy_split(X[start:stop])
                if index < 0:
                    # block cannot be divided any further
                    refined.append((start, stop))
                else:
                    refined.extend([(start, start + index), (start + index, stop)])
            splits_current_feature = refined

        return splits_current_feature

    def _convert(self, X, ix):
        """Map column ``ix`` of ``X`` to bin indices, returned with shape (n, 1).

        A value ``x`` belongs to bin ``k`` when ``low_k <= x < high_k``. The
        bins built by ``fit`` are contiguous and ordered, so that bin is the
        first one whose upper edge is strictly greater than ``x``.
        """
        X = np.asarray(X)
        column = X[:, ix] if X.ndim == 2 else X.ravel()
        upper_edges = np.asarray(self._splits[ix], dtype=float)[:, 1]
        bins = np.searchsorted(upper_edges, column, side='right')
        # the last upper edge is +inf; clip only guards against x == +inf
        bins = np.minimum(bins, len(upper_edges) - 1)
        return bins.reshape(-1, 1)

    def fit(self, X, y=None):
        """Learn the bin edges of every column of ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        # forget a previous fit so data with another width can be fitted
        self.n_samples = None
        self.n_features = None

        X = self._check_X(X)
        self.n_samples, self.n_features = X.shape

        splits = list()
        splits_indices = list()
        for ix in range(self.n_features):
            x = np.sort(X[:, ix])
            _indices = self._dichtomize(x)

            edges = np.array([[x[start], x[stop]] for start, stop in _indices], dtype=float)
            # open the outermost bins so unseen values always land somewhere
            edges[0, 0] = -np.inf
            edges[-1, 1] = np.inf

            splits_indices.append(_indices)
            splits.append(edges)

        # Kept as a list with one (n_bins, 2) array per feature: features may
        # end up with different bin counts, and the list must stay appendable
        # for the whole loop.
        self._splits = splits
        self._splits_indices = splits_indices

        return self

    def transform(self, X):
        """Replace every value of ``X`` by the index of its bin.

        Returns an integer array with the same shape as ``X``.
        """
        if self._splits is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('this transformer has not been fitted yet; call fit first')

        X = self._check_X(X, self.n_features)
        return np.hstack([self._convert(X, ix) for ix in range(self.n_features)])

# LightGBM random-forest classifier hyper-parameter optimization code

In [4]:
def hyperopt_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    """Tune a LightGBM random-forest classifier with hyperopt's TPE search.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : array-like of class labels; flattened to 1-D before use
    cv : int
        Number of (unshuffled) K-fold splits used to score one candidate.
    max_iter_opt : int
        Number of candidates hyperopt evaluates.

    Returns
    -------
    dict
        The best point exactly as ``fmin`` reports it. NOTE(review): the
        integer-valued parameters are sampled with ``hp.uniform``, so they are
        presumably returned as floats and need an ``int()`` cast by the caller.
    """
    space = hp.choice('clr_type', [
        {
            'type': 'lightgbm',
            'feature_fraction': hp.uniform('feature_fraction', 0.05, 0.95),
            'bagging_fraction': hp.uniform('bagging_fraction', 0.05, 0.95),
            'bagging_freq': hp.uniform('bagging_freq', 1, 50),
            'n_estimators': hp.uniform('n_estimators', 5, 50),
            #'max_bin': hp.uniform('max_bin', )
        }
    ])

    def loss(args):
        scores = cross_val_score(
            LGBMClassifier(
                boosting_type='rf',
                feature_fraction=args['feature_fraction'],
                bagging_freq=int(args['bagging_freq']),
                bagging_fraction=args['bagging_fraction'],
                n_estimators=int(args['n_estimators'])
            ),
            X, np.ravel(y), cv=KFold(n_splits=cv), scoring='accuracy'
        )
        # fmin MINIMISES its objective, so the accuracy has to be negated;
        # returning the raw accuracy made the search pick the worst model.
        return -scores.mean()

    best = fmin(
        fn=loss,
        space=space,
        algo=tpe.suggest,
        max_evals=max_iter_opt
    )

    return best

def bayesian_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    """Tune a LightGBM random-forest classifier with Bayesian optimisation.

    The objective is the mean accuracy over ``cv`` unshuffled K-fold splits.
    Ten random points seed the optimiser, followed by ``max_iter_opt``
    guided iterations.

    Returns
    -------
    dict
        Best ``feature_fraction``, ``bagging_fraction``, ``bagging_freq`` and
        ``n_estimators``. The last two are searched over continuous ranges,
        so callers must cast them to ``int``.
    """
    def mean_cv_accuracy(feature_fraction, bagging_freq, bagging_fraction, n_estimators):
        return cross_val_score(
            LGBMClassifier(
                boosting_type='rf',
                feature_fraction=feature_fraction,
                bagging_freq=int(bagging_freq),
                bagging_fraction=bagging_fraction,
                n_estimators=int(n_estimators)
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        ).mean()

    optimizer = BayesianOptimization(
        mean_cv_accuracy,
        {'feature_fraction': (0.05, 0.95),
         'bagging_fraction': (0.05, 0.95),
         'bagging_freq': (1, 50),
         'n_estimators': (5, 50) },
        verbose=0
    )

    # `init_points` is accepted by both the legacy and the current bayes_opt
    # API, whereas the separate `init()` call exists only in the legacy one.
    optimizer.maximize(init_points=10, n_iter=max_iter_opt)

    # Current bayes_opt exposes the optimum as `optimizer.max`; legacy
    # releases keep it under `optimizer.res['max']`.
    best = getattr(optimizer, 'max', None)
    if isinstance(best, dict) and 'params' in best:
        return dict(best['params'])
    return optimizer.res['max']['max_params']

## Facebook Comment Volume Dataset

In [5]:
# The path is relative to the notebook's working directory; index_col=0 turns
# the first CSV column into the row index instead of a feature.
df = pd.read_csv('../datasets/facebook_comments.csv', index_col=0)

In [6]:
# (rows, columns) of the table as loaded, before any encoding
print('dataset size: {}'.format(df.shape))

dataset size: (40949, 54)


In [7]:
# encoding categorical variables
# H Local - category
# Post Promotion Status - category
# Base Time - time variable
# Page Category - category

# .toarray() returns a plain ndarray. .todense() would return the deprecated
# np.matrix type, which leaks into X through np.hstack and is rejected by
# recent scikit-learn releases.
H_Local = OneHotEncoder().fit_transform(df['H Local'].values.reshape(-1, 1)).toarray()
Post_Promotion_Status = OneHotEncoder().fit_transform(df['Post Promotion Status'].values.reshape(-1, 1)).toarray()
Base_Time = OneHotEncoder().fit_transform(df['Base Time'].values.reshape(-1, 1)).toarray()
Page_Category = OneHotEncoder().fit_transform(df['Page Category'].values.reshape(-1, 1)).toarray()

In [8]:
# Turn the continuous target into class labels (up to 32 ordinal bins):
# learn the bin edges first, then map every target value to its bin index.
dichtomizer = MaxentropyDichtomizationTransformer(n_splits=32)
dichtomizer.fit(df.Target.values.reshape(-1, 1))
y = dichtomizer.transform(df.Target.values.reshape(-1, 1))

In [9]:
# Feature matrix: the remaining numeric columns followed by the one-hot
# blocks that replace the four categorical columns (the target is excluded).
X = np.concatenate(
    [
        df.drop(['H Local', 'Post Promotion Status', 'Base Time', 'Page Category', 'Target'], axis=1).values,
        H_Local,
        Post_Promotion_Status,
        Base_Time,
        Page_Category,
    ],
    axis=1,
)

In [10]:
# Hold out 10% of the rows for validation. The fixed seed makes the split,
# and therefore every score computed from it, repeatable across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [11]:
# Shape of the full feature matrix X (before the train/validation split)
print('dataset size after preprocessing: {}'.format(X.shape))

dataset size after preprocessing: (40949, 228)


### Training model with ICA-processed data

In [12]:
# number of components is fixed to 10
# The unmixing is learnt on the training rows only and then applied unchanged
# to the validation rows. FastICA starts from a random initialisation, so the
# seed is fixed to make the components reproducible.
ica = FastICA(n_components=10, random_state=42)
X_ica = ica.fit_transform(X_train)
X_val_ica = ica.transform(X_val)

In [13]:
# Rows and labels are permuted together; seeded so the row order (and the
# unshuffled K-fold splits used during hyper-parameter search) is repeatable.
X_ica, y_train = shuffle(X_ica, y_train, random_state=42)

### * Estimating LightGBM params

In [14]:
# Search LightGBM random-forest hyper-parameters on the ICA features
# (6-fold CV per candidate, 15 optimiser iterations).
params_opt = bayesian_optimization_lightgbm(X_ica, y_train, cv=6, max_iter_opt=15)

In [15]:
# The optimiser searches continuous ranges; LightGBM needs these two as integers.
params_opt.update(
    bagging_freq=int(params_opt['bagging_freq']),
    n_estimators=int(params_opt['n_estimators']),
)

### * Evaluate with CV

In [16]:
# Candidate classifiers keyed by the short name used in the printed reports.
# Note that the 'svr' entry is a linear support-vector *classifier*.
models = dict(
    linear=LogisticRegression(),
    svr=LinearSVC(C=100, dual=False, max_iter=200),
    forest=LGBMClassifier(boosting_type='rf', **params_opt),
)

In [17]:
stats = {}

for k, model in models.items():
    stats[k] = []
    # Same seed for every model: all of them are scored on identical,
    # reproducible folds, which makes their averages comparable.
    kfold = KFold(n_splits=8, shuffle=True, random_state=42)

    for train_ix, test_ix in kfold.split(X_ica, y_train):
        X_crossval_train, X_crossval_test = X_ica[train_ix], X_ica[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO: per-fold hyper-parameter optimization belongs here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred), both as 1-D label arrays
        stats[k].append(accuracy_score(y_crossval_test.ravel(), model.predict(X_crossval_test), normalize=True))

In [18]:
# Mean accuracy over the cross-validation folds, one line per model.
print('for ICA data (average scores):')
# `name` instead of `model`: the loop variable is the dictionary key (a
# string) and should not overwrite the estimator bound to `model`.
for name, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(name, np.mean(model_stats)))

for ICA data (averate scores):
linear, Accuracy: 0.550197864338481
svr, Accuracy: 0.5646610493141277
forest, Accuracy: 0.5771152890770817


In [19]:
# Refit every model on the whole training part and score it on the held-out rows.
print('for ICA data (validation scores):')
for name, model in models.items():
    model.fit(X_ica, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred)
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val_ica))))

for ICA data (validation scores):
linear, Accuracy: 0.5623931623931624
svr, Accuracy: 0.5758241758241758
forest, Accuracy: 0.5885225885225885


## Parkinsons Telemonitoring Data Set

In [27]:
# The path is relative to the notebook's working directory. No index_col is
# given, so the frame keeps pandas' default 0..n-1 row index.
df = pd.read_csv('../datasets/parkinsons_updrs.data')

In [28]:
# (rows, columns) of the table as loaded, before any encoding
print('dataset size: {}'.format(df.shape))

dataset size: (5875, 22)


In [29]:
# One-hot encode the subject id: one indicator column per distinct subject,
# columns ordered by ascending id. Each id is mapped to its rank, so the ids
# need not be exactly 1..N, and rows are addressed by position, so the
# frame's index labels do not matter.
subject_ids, subject_codes = np.unique(df['subject#'].values, return_inverse=True)
subject = subject_ids.tolist()
subject_binary = np.zeros((df.shape[0], len(subject)))
subject_binary[np.arange(df.shape[0]), subject_codes] = 1

In [30]:
# Turn the continuous total_UPDRS score into class labels (up to 32 ordinal
# bins): learn the bin edges first, then map every value to its bin index.
dichtomizer = MaxentropyDichtomizationTransformer(n_splits=32)
dichtomizer.fit(df.total_UPDRS.values.reshape(-1, 1))
y = dichtomizer.transform(df.total_UPDRS.values.reshape(-1, 1))

In [31]:
# Feature matrix: the subject indicator columns first, then every remaining
# column except the two UPDRS scores and the raw subject id.
X = np.hstack([
    subject_binary,
    df.drop(['motor_UPDRS', 'total_UPDRS', 'subject#'], axis=1).values,
])

In [32]:
# Hold out 10% of the rows for validation. The fixed seed makes the split,
# and therefore every score computed from it, repeatable across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [33]:
# Shape of the full feature matrix X (before the train/validation split)
print('dataset size after preprocessing: {}'.format(X.shape))

dataset size after preprocessing: (5875, 61)


### Training model with ICA-processed data

In [34]:
# number of components is fixed to 10
# The unmixing is learnt on the training rows only and then applied unchanged
# to the validation rows. FastICA starts from a random initialisation, so the
# seed is fixed to make the components reproducible.
ica = FastICA(n_components=10, random_state=42)
X_ica = ica.fit_transform(X_train)
X_val_ica = ica.transform(X_val)

In [35]:
# Rows and labels are permuted together; seeded so the row order (and the
# unshuffled K-fold splits used during hyper-parameter search) is repeatable.
X_ica, y_train = shuffle(X_ica, y_train, random_state=42)

### * Estimating LightGBM params

In [36]:
# Search LightGBM random-forest hyper-parameters on the ICA features
# (6-fold CV per candidate, 15 optimiser iterations).
params_opt = bayesian_optimization_lightgbm(X_ica, y_train, cv=6, max_iter_opt=15)

In [37]:
# The optimiser searches continuous ranges; LightGBM needs these two as integers.
params_opt.update(
    bagging_freq=int(params_opt['bagging_freq']),
    n_estimators=int(params_opt['n_estimators']),
)

### * Evaluate with CV

In [38]:
# Candidate classifiers keyed by the short name used in the printed reports.
# Note that the 'svr' entry is a linear support-vector *classifier*.
models = dict(
    linear=LogisticRegression(),
    svr=LinearSVC(C=100, dual=False, max_iter=200),
    forest=LGBMClassifier(boosting_type='rf', **params_opt),
)

In [39]:
stats = {}

for k, model in models.items():
    stats[k] = []
    # Same seed for every model: all of them are scored on identical,
    # reproducible folds, which makes their averages comparable.
    kfold = KFold(n_splits=8, shuffle=True, random_state=42)

    for train_ix, test_ix in kfold.split(X_ica, y_train):
        X_crossval_train, X_crossval_test = X_ica[train_ix], X_ica[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO: per-fold hyper-parameter optimization belongs here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred), both as 1-D label arrays
        stats[k].append(accuracy_score(y_crossval_test.ravel(), model.predict(X_crossval_test), normalize=True))

In [40]:
# Mean accuracy over the cross-validation folds, one line per model.
print('for ICA data (average scores):')
# `name` instead of `model`: the loop variable is the dictionary key (a
# string) and should not overwrite the estimator bound to `model`.
for name, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(name, np.mean(model_stats)))

for ICA data (averate scores):
linear, Accuracy: 0.13183050245266584
svr, Accuracy: 0.22470143950854993
forest, Accuracy: 0.6487616329711641


In [41]:
# Refit every model on the whole training part and score it on the held-out rows.
print('for decorrelated data using ICA (validation scores):')
for name, model in models.items():
    model.fit(X_ica, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred)
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val_ica))))

for decorrelated data using ICA (validation scores):
linear, Accuracy: 0.13945578231292516
svr, Accuracy: 0.23979591836734693
forest, Accuracy: 0.7278911564625851


## Energy efficiency Data Set

In [44]:
# The path is relative to the notebook's working directory. No index_col is
# given, so the frame keeps pandas' default 0..n-1 row index.
df = pd.read_excel('../datasets/ENB2012_data.xlsx')

In [45]:
# (rows, columns) of the table as loaded, before any encoding
print('dataset size: {}'.format(df.shape))

dataset size: (768, 10)


In [46]:
# One-hot encode the categorical column X6: one indicator column per distinct
# value, in order of first appearance. The variable names are carried over
# from the previous section; here they hold the X6 levels.
subject = df['X6'].unique().tolist()
subject_map = dict(zip(subject, range(len(subject))))
subject_binary = np.zeros((df.shape[0], len(subject)))
# Rows are addressed by position rather than by index label, and all of them
# are written in a single vectorised assignment.
subject_binary[np.arange(df.shape[0]), df['X6'].map(subject_map).values] = 1

In [47]:
# Turn the continuous Y1 target into class labels (up to 32 ordinal bins):
# learn the bin edges first, then map every value to its bin index.
dichtomizer = MaxentropyDichtomizationTransformer(n_splits=32)
dichtomizer.fit(df.Y1.values.reshape(-1, 1))
y = dichtomizer.transform(df.Y1.values.reshape(-1, 1))

In [48]:
# Feature matrix: the X6 indicator columns first, then every remaining
# column except the two targets and the raw X6 column.
X = np.hstack([
    subject_binary,
    df.drop(['Y1', 'Y2', 'X6'], axis=1).values,
])

In [49]:
# Hold out 10% of the rows for validation. The fixed seed makes the split,
# and therefore every score computed from it, repeatable across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [50]:
# Shape of the full feature matrix X (before the train/validation split)
print('dataset size after preprocessing: {}'.format(X.shape))

dataset size after preprocessing: (768, 11)


### Training model with ICA-processed data

In [51]:
# number of components is fixed to 4
# The unmixing is learnt on the training rows only and then applied unchanged
# to the validation rows. FastICA starts from a random initialisation, so the
# seed is fixed to make the components reproducible.
ica = FastICA(n_components=4, random_state=42)
X_ica = ica.fit_transform(X_train)
X_val_ica = ica.transform(X_val)

In [52]:
# Rows and labels are permuted together; seeded so the row order (and the
# unshuffled K-fold splits used during hyper-parameter search) is repeatable.
X_ica, y_train = shuffle(X_ica, y_train, random_state=42)

### * Estimating LightGBM params

In [53]:
# Search LightGBM random-forest hyper-parameters on the ICA features
# (6-fold CV per candidate, 15 optimiser iterations).
params_opt = bayesian_optimization_lightgbm(X_ica, y_train, cv=6, max_iter_opt=15)

In [54]:
# The optimiser searches continuous ranges; LightGBM needs these two as integers.
params_opt.update(
    bagging_freq=int(params_opt['bagging_freq']),
    n_estimators=int(params_opt['n_estimators']),
)

### * Evaluate with CV

In [55]:
# Candidate classifiers keyed by the short name used in the printed reports.
# Note that the 'svr' entry is a linear support-vector *classifier*.
models = dict(
    linear=LogisticRegression(),
    svr=LinearSVC(C=100, dual=False, max_iter=200),
    forest=LGBMClassifier(boosting_type='rf', **params_opt),
)

In [56]:
stats = {}

for k, model in models.items():
    stats[k] = []
    # Same seed for every model: all of them are scored on identical,
    # reproducible folds, which makes their averages comparable.
    kfold = KFold(n_splits=8, shuffle=True, random_state=42)

    for train_ix, test_ix in kfold.split(X_ica, y_train):
        X_crossval_train, X_crossval_test = X_ica[train_ix], X_ica[test_ix]
        y_crossval_train, y_crossval_test = y_train[train_ix], y_train[test_ix]

        # TODO: per-fold hyper-parameter optimization belongs here
        model.fit(X_crossval_train, y_crossval_train.ravel())
        # accuracy_score expects (y_true, y_pred), both as 1-D label arrays
        stats[k].append(accuracy_score(y_crossval_test.ravel(), model.predict(X_crossval_test), normalize=True))

In [57]:
# Mean accuracy over the cross-validation folds, one line per model.
print('for ICA data (average scores):')
# `name` instead of `model`: the loop variable is the dictionary key (a
# string) and should not overwrite the estimator bound to `model`.
for name, model_stats in stats.items():
    print('{}, Accuracy: {}'.format(name, np.mean(model_stats)))

for ICA data (averate scores):
linear, Accuracy: 0.03331328521785619
svr, Accuracy: 0.11003074044373162
forest, Accuracy: 0.1476209569633788


In [58]:
# Refit every model on the whole training part and score it on the held-out rows.
print('for decorrelated data using ICA (validation scores):')
for name, model in models.items():
    model.fit(X_ica, y_train.squeeze())
    # accuracy_score expects (y_true, y_pred)
    print('{}, Accuracy: {}'.format(name, accuracy_score(y_val.squeeze(), model.predict(X_val_ica))))

for decorrelated data using ICA (validation scores):
linear, Accuracy: 0.025974025974025976
svr, Accuracy: 0.12987012987012986
forest, Accuracy: 0.19480519480519481
