In [22]:
import warnings
warnings.filterwarnings('ignore')

In [29]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
# https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMModel
from lightgbm import LGBMClassifier 

from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, train_test_split

from hyperopt import fmin, tpe, hp
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

from scipy.stats import entropy
from copy import deepcopy as copy

# Класс дискретизации по принципу деления по медиане

In [3]:
class MedianBinarizer(object):
    def __init__(self, n_bins: int):
        self.n_bins = n_bins
        self.x_bins = None
        self.encoder = None
    
    def _get_feature_bins(self, X):
        #extra_bin = 0
        #if X.shape[0] % self.n_bins != 0:
        #    extra_bin = 1
        #    print('number of bins does not fit to dataset'
        #          ', extending number of bins to extra one')
        #
        #n_batch = X.shape[0] // self.n_bins
        #X_sorted = np.sort(X)
        #x_bins = [X[i*n_batch:(i+1)*n_batch] for i in range(self.n_bins + extra_bin)]
        
        iterations = np.log2(self.n_bins)
        if iterations - int(iterations) != 0:
            raise Exception('bins should be a power of 2')
        
        # получаем вариационный ряд
        x_bins = [np.sort(X)]
        for i in range(int(iterations)):
            new_bins = list()
            for j in x_bins:
                # середина среза вариационного ряда
                index = j.shape[0] // 2
                new_bins += [j[:index], j[index:]]
            x_bins = new_bins
        return x_bins
    
    def _lookup(self, x):
        for k, _bin in enumerate(self.x_bins):
            if x <= _bin[-1]:
                return k
        return k
    
    def _real_to_category(self, X):
        # X should not be a column vector
        X_new = []
        for x in X:
            i = self._lookup(x)
            
            if i != -1:
                X_new.append(i)
            else:
                raise Exception('something went wrong')

        return np.array(X_new).reshape(-1, 1)
    
    def fit(self, X):
        self.x_bins = self._get_feature_bins(X)
        X_cat = self._real_to_category(X)

        self.encoder = OneHotEncoder(n_values=self.n_bins, sparse=False)
        self.encoder.fit(X_cat)
        
        return self
    
    def transform_categorical(self, X):
        return self._real_to_category(X)
    
    def transform_onehot(self, X):
        x_cat = self._real_to_category(X)
        return self.encoder.transform(x_cat)
    
    def onehot_to_categorical(self, X):
        return np.argwhere(X)[:, 1]
    
    def categorical_to_onehot(self, X):
        return self.encoder.transform(X)

# RF regressor optimization code

In [4]:
# hyperopt for lightgbm shows terrible results
def hyperopt_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    space = hp.choice('clr_type', [
        {
            'type': 'lightgbm',
            'feature_fraction': hp.uniform('feature_fraction', 0.05, 0.95),
            'bagging_fraction': hp.uniform('bagging_fraction', 0.05, 0.95),
            'bagging_freq': hp.uniform('bagging_freq', 1, 50),
            'n_estimators': hp.uniform('n_estimators', 5, 50),
            #'max_bin': hp.uniform('max_bin', )
        }
    ])

    best = fmin(
        fn=lambda args: cross_val_score(
            LGBMClassifier(
                boosting_type='rf', 
                feature_fraction=args['feature_fraction'], 
                bagging_freq=int(args['bagging_freq']), 
                bagging_fraction=args['bagging_fraction'],
                n_estimators=int(args['n_estimators'])
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        ).mean(),
        space=space,
        algo=tpe.suggest,
        max_evals=max_iter_opt
    )
    
    return best

def bayesian_optimization_lightgbm(X, y, cv=6, max_iter_opt=15):
    svr_opt = BayesianOptimization(
        lambda feature_fraction, bagging_freq, bagging_fraction, n_estimators: cross_val_score(
            LGBMClassifier(
                boosting_type='rf', 
                feature_fraction=feature_fraction, 
                bagging_freq=int(bagging_freq), 
                bagging_fraction=bagging_fraction,
                n_estimators=int(n_estimators)
            ),
            X, y.squeeze(), cv=KFold(n_splits=cv).split(X), scoring='accuracy'
        ).mean(),
        {'feature_fraction': (0.05, 0.95),
         'bagging_fraction': (0.05, 0.95),
         'bagging_freq': (1, 50),
         'n_estimators': (5, 50) },
        verbose=0
    )
    
    svr_opt.init(10)
    svr_opt.maximize(n_iter=max_iter_opt)
    
    return svr_opt.res['max']['max_params']#['C']

In [30]:
def synchronize_counts(_from: dict, _to: dict):
    subset = np.setdiff1d(list(_from.keys()), list(_to.keys()))
    for i in subset:
        _to[i] = 0
        
def cross_entropy(p, q):
    p = np.array(p)
    q = np.array(q)
    q[q == 0] = 0.0001
    return -np.sum(p * np.log(q))

# Facebook Comment Volume Dataset

In [64]:
df = pd.read_csv('../datasets/facebook_comments.csv', index_col=0)

In [65]:
print('dataset size: {}'.format(df.shape))

dataset size: (40949, 54)


In [66]:
y = MedianBinarizer(n_bins=32).fit(df.Target.values.reshape(-1, 1)).transform_categorical(df.Target.values.reshape(-1, 1))
df.drop(['Target'], axis=1, inplace=True)

categorical_features = ['H Local', 'Post Promotion Status', 'Base Time', 'Page Category']


features_categorical = list()
features_real = list()
encoders = list()

for i in categorical_features:
    encoder = OneHotEncoder().fit(df[i].values.reshape(-1, 1))
    
    onehot_repr = encoder.transform(df[i].values.reshape(-1, 1)).todense()
    categorical_repr = df[i].values.reshape(-1, 1)
    
    if np.unique(categorical_repr).size < 2:
        print('feature {} is ignored'.format(i))
        continue
        
    encoders.append(encoder)
    
    features_categorical.append({
        'onehot': onehot_repr,
        'category': categorical_repr
    })

for i in np.setdiff1d(ar1=df.columns, ar2=categorical_features):
    binarizer = MedianBinarizer(n_bins=32).fit(df[i].values.reshape(-1, 1))
    
    onehot_repr = binarizer.transform_onehot(df[i].values.reshape(-1, 1)),
    categorical_repr = binarizer.transform_categorical(df[i].values.reshape(-1, 1))
    
    if np.unique(categorical_repr).size < 2:
        print('feature {} is ignored'.format(i))
        continue
        
    encoders.append(binarizer.encoder)
    
    features_real.append({
        'onehot': binarizer.transform_onehot(df[i].values.reshape(-1, 1)),
        'category': binarizer.transform_categorical(df[i].values.reshape(-1, 1))
    })

feature_space = features_categorical + features_real

feature Post Promotion Status is ignored
feature base_weekday_1 is ignored
feature published_weekday_0 is ignored


In [67]:
models = {
    'rf': LGBMClassifier(boosting_type='rf', bagging_freq=5, bagging_fraction=.05, feature_fraction=.1),
    'lr': LogisticRegression(random_state=42, multi_class='ovr'),
    'svc': LinearSVC(multi_class='ovr'),
}

def score_models(models, X, y, folds = 8):
    stats = {}

    for k, model in models.items():
        stats[k] = []
        kfold = KFold(n_splits=8, shuffle=True)

        for train_ix, test_ix in kfold.split(X, y):
            X_crossval_train, X_crossval_test = X[train_ix], X[test_ix]
            y_crossval_train, y_crossval_test = y[train_ix], y[test_ix]

            # here must be sume sort of optimization
            model.fit(X_crossval_train, y_crossval_train.ravel())
            stats[k].append(accuracy_score(model.predict(X_crossval_test), y_crossval_test, normalize=True))
    
    for model, model_stats in stats.items():
        print('{}, Accuracy: {}'.format(model, np.mean(model_stats)))


In [77]:
np.random.seed(42)

global_stats = []
ix_subsets = []

initial_feature = 0# np.random.randint(0, len(feature_space))

dataset_size = X.shape[0]
# free_features is a list of observation indices
free_features = [i for i in range(len(feature_space)) if i != initial_feature]
# subset is a list of observations
subset = [feature_space[initial_feature]['onehot']]
# indices of the features in the subset
subset_indices = [initial_feature]

while len(subset) != len(feature_space):
    max_entropy = -1
    feature_index = -1
    new_feature = None

    local_stats = []


    for ix_feature, feature in enumerate(free_features):
        if len(subset) > 1:
            input_features = np.concatenate(subset, axis=1)
        else:
            input_features = subset[0]

        clr = copy(models['lr'])
        #train model on the given subset and new feature
        clr.fit(input_features, feature_space[feature]['category'].squeeze())
        #predict that feature with the given subset
        predicted = clr.predict(input_features)

        predicted_onehot = encoders[feature].transform(predicted.reshape(-1, 1))
        pred_difference = (predicted_onehot != feature_space[feature]['onehot']).astype(np.int32) # 0110
        #pred_difference = ((predicted_onehot == 1) & (feature_space[feature]['onehot'] == 0)).astype(np.int32) # 0010

        pred_category, pred_counts = np.unique(predicted, return_counts=True)
        real_category, real_counts = np.unique(feature_space[feature]['category'], return_counts=True)

        pred_proba = pred_counts / dataset_size
        real_proba = real_counts / dataset_size

        real_stats = dict(zip(real_category, real_proba))
        pred_stats =  dict(zip(pred_category, pred_proba))

        synchronize_counts(real_stats, pred_stats)

        ce_r_r = cross_entropy(list(real_stats.values()), list(real_stats.values()))
        ce_p_p = cross_entropy(list(pred_stats.values()), list(pred_stats.values()))
        ce_r_p = cross_entropy(list(real_stats.values()), list(pred_stats.values()))

        if max_entropy < ce_r_p:
            max_entropy = ce_r_p
            feature_index = ix_feature
            new_feature = pred_difference

        local_stats.append({'rr': ce_r_r, 'pp': ce_p_p, 'rp': ce_r_p})
    #print('#'*100)
    #print('subset', subset)
    #print(pd.DataFrame(stats, index=free_features))
    #print('#'*100)

    subset.append(new_feature)
    subset_indices.append(free_features[feature_index])
    del free_features[feature_index]

#print('generated ordered list of variables')
global_stats.append(local_stats)
ix_subsets.append(subset_indices)
print(subset_indices)
print('score on new features:')
score_models(models, np.hstack(subset), y, 8)

[0, 2, 1, 6, 11, 34, 12, 25, 48, 18, 44, 3, 5, 41, 49, 24, 37, 4, 14, 42, 31, 29, 36, 23, 28, 43, 22, 30, 38, 32, 7, 10, 20, 33, 40, 27, 45, 8, 17, 9, 16, 46, 15, 47, 13, 19, 26, 35, 39, 21]
score on new features:
rf, Accuracy: 0.5729566371167312
lr, Accuracy: 0.6778919626145108
svc, Accuracy: 0.6784047437688753


## Parkinsons Telemonitoring Data Set

In [87]:
df = pd.read_csv('../datasets/parkinsons_updrs.data')

In [88]:
print('dataset size: {}'.format(df.shape))

dataset size: (5875, 22)


In [89]:
y = MedianBinarizer(n_bins=32).fit(df.total_UPDRS.values.reshape(-1, 1)).transform_categorical(df.total_UPDRS.values.reshape(-1, 1))
df.drop(['total_UPDRS'], axis=1, inplace=True)

categorical_features = ['subject#']


features_categorical = list()
features_real = list()
encoders = list()

for i in categorical_features:
    encoder = OneHotEncoder().fit(df[i].values.reshape(-1, 1))
    
    onehot_repr = encoder.transform(df[i].values.reshape(-1, 1)).todense()
    categorical_repr = df[i].values.reshape(-1, 1)
    
    if np.unique(categorical_repr).size < 2:
        print('feature {} is ignored'.format(i))
        continue
        
    encoders.append(encoder)
    
    features_categorical.append({
        'onehot': onehot_repr,
        'category': categorical_repr
    })

for i in np.setdiff1d(ar1=df.columns, ar2=categorical_features):
    binarizer = MedianBinarizer(n_bins=32).fit(df[i].values.reshape(-1, 1))
    
    onehot_repr = binarizer.transform_onehot(df[i].values.reshape(-1, 1)),
    categorical_repr = binarizer.transform_categorical(df[i].values.reshape(-1, 1))
    
    if np.unique(categorical_repr).size < 2:
        print('feature {} is ignored'.format(i))
        continue
        
    encoders.append(binarizer.encoder)
    
    features_real.append({
        'onehot': binarizer.transform_onehot(df[i].values.reshape(-1, 1)),
        'category': binarizer.transform_categorical(df[i].values.reshape(-1, 1))
    })

feature_space = features_categorical + features_real

In [90]:
np.random.seed(42)

global_stats = []
ix_subsets = []

initial_feature = 0# np.random.randint(0, len(feature_space))

dataset_size = X.shape[0]
# free_features is a list of observation indices
free_features = [i for i in range(len(feature_space)) if i != initial_feature]
# subset is a list of observations
subset = [feature_space[initial_feature]['onehot']]
# indices of the features in the subset
subset_indices = [initial_feature]

while len(subset) != len(feature_space):
    max_entropy = -1
    feature_index = -1
    new_feature = None

    local_stats = []


    for ix_feature, feature in enumerate(free_features):
        if len(subset) > 1:
            input_features = np.concatenate(subset, axis=1)
        else:
            input_features = subset[0]

        clr = copy(models['lr'])
        #train model on the given subset and new feature
        clr.fit(input_features, feature_space[feature]['category'].squeeze())
        #predict that feature with the given subset
        predicted = clr.predict(input_features)

        predicted_onehot = encoders[feature].transform(predicted.reshape(-1, 1))
        pred_difference = (predicted_onehot != feature_space[feature]['onehot']).astype(np.int32) # 0110
        #pred_difference = ((predicted_onehot == 1) & (feature_space[feature]['onehot'] == 0)).astype(np.int32) # 0010

        pred_category, pred_counts = np.unique(predicted, return_counts=True)
        real_category, real_counts = np.unique(feature_space[feature]['category'], return_counts=True)

        pred_proba = pred_counts / dataset_size
        real_proba = real_counts / dataset_size

        real_stats = dict(zip(real_category, real_proba))
        pred_stats =  dict(zip(pred_category, pred_proba))

        synchronize_counts(real_stats, pred_stats)

        ce_r_r = cross_entropy(list(real_stats.values()), list(real_stats.values()))
        ce_p_p = cross_entropy(list(pred_stats.values()), list(pred_stats.values()))
        ce_r_p = cross_entropy(list(real_stats.values()), list(pred_stats.values()))

        if max_entropy < ce_r_p:
            max_entropy = ce_r_p
            feature_index = ix_feature
            new_feature = pred_difference

        local_stats.append({'rr': ce_r_r, 'pp': ce_p_p, 'rp': ce_r_p})
    #print('#'*100)
    #print('subset', subset)
    #print(pd.DataFrame(stats, index=free_features))
    #print('#'*100)

    subset.append(new_feature)
    subset_indices.append(free_features[feature_index])
    del free_features[feature_index]

#print('generated ordered list of variables')
global_stats.append(local_stats)
ix_subsets.append(subset_indices)
print(subset_indices)
print('score on new features:')
score_models(models, np.hstack(subset), y, 8)

[0, 3, 20, 6, 4, 9, 18, 13, 7, 8, 10, 11, 5, 12, 2, 17, 15, 14, 16, 1, 19]
score on new features:
rf, Accuracy: 0.4090235685554876
lr, Accuracy: 0.8280848579213702
svc, Accuracy: 0.8326769263563736


## Energy efficiency Data Set

In [98]:
df = pd.read_excel('../datasets/ENB2012_data.xlsx')

In [99]:
print('dataset size: {}'.format(df.shape))

dataset size: (768, 10)


In [100]:
y = MedianBinarizer(n_bins=32).fit(df.Y1.values.reshape(-1, 1)).transform_categorical(df.Y1.values.reshape(-1, 1))
df.drop(['Y1', 'Y2'], axis=1, inplace=True)

categorical_features = ['X6']


features_categorical = list()
features_real = list()
encoders = list()

for i in categorical_features:
    encoder = OneHotEncoder().fit(df[i].values.reshape(-1, 1))
    
    onehot_repr = encoder.transform(df[i].values.reshape(-1, 1)).todense()
    categorical_repr = df[i].values.reshape(-1, 1)
    
    if np.unique(categorical_repr).size < 2:
        print('feature {} is ignored'.format(i))
        continue
        
    encoders.append(encoder)
    
    features_categorical.append({
        'onehot': onehot_repr,
        'category': categorical_repr
    })

for i in np.setdiff1d(ar1=df.columns, ar2=categorical_features):
    binarizer = MedianBinarizer(n_bins=32).fit(df[i].values.reshape(-1, 1))
    
    onehot_repr = binarizer.transform_onehot(df[i].values.reshape(-1, 1)),
    categorical_repr = binarizer.transform_categorical(df[i].values.reshape(-1, 1))
    
    if np.unique(categorical_repr).size < 2:
        print('feature {} is ignored'.format(i))
        continue
        
    encoders.append(binarizer.encoder)
    
    features_real.append({
        'onehot': binarizer.transform_onehot(df[i].values.reshape(-1, 1)),
        'category': binarizer.transform_categorical(df[i].values.reshape(-1, 1))
    })

feature_space = features_categorical + features_real

feature X3 is ignored
feature X5 is ignored


In [102]:
np.random.seed(42)

global_stats = []
ix_subsets = []

initial_feature = 0# np.random.randint(0, len(feature_space))

dataset_size = X.shape[0]
# free_features is a list of observation indices
free_features = [i for i in range(len(feature_space)) if i != initial_feature]
# subset is a list of observations
subset = [feature_space[initial_feature]['onehot']]
# indices of the features in the subset
subset_indices = [initial_feature]

while len(subset) != len(feature_space):
    max_entropy = -1
    feature_index = -1
    new_feature = None

    local_stats = []


    for ix_feature, feature in enumerate(free_features):
        if len(subset) > 1:
            input_features = np.concatenate(subset, axis=1)
        else:
            input_features = subset[0]

        clr = copy(models['lr'])
        #train model on the given subset and new feature
        clr.fit(input_features, feature_space[feature]['category'].squeeze())
        #predict that feature with the given subset
        predicted = clr.predict(input_features)

        predicted_onehot = encoders[feature].transform(predicted.reshape(-1, 1))
        pred_difference = (predicted_onehot != feature_space[feature]['onehot']).astype(np.int32) # 0110
        #pred_difference = ((predicted_onehot == 1) & (feature_space[feature]['onehot'] == 0)).astype(np.int32) # 0010

        pred_category, pred_counts = np.unique(predicted, return_counts=True)
        real_category, real_counts = np.unique(feature_space[feature]['category'], return_counts=True)

        pred_proba = pred_counts / dataset_size
        real_proba = real_counts / dataset_size

        real_stats = dict(zip(real_category, real_proba))
        pred_stats =  dict(zip(pred_category, pred_proba))

        synchronize_counts(real_stats, pred_stats)

        ce_r_r = cross_entropy(list(real_stats.values()), list(real_stats.values()))
        ce_p_p = cross_entropy(list(pred_stats.values()), list(pred_stats.values()))
        ce_r_p = cross_entropy(list(real_stats.values()), list(pred_stats.values()))

        if max_entropy < ce_r_p:
            max_entropy = ce_r_p
            feature_index = ix_feature
            new_feature = pred_difference

        local_stats.append({'rr': ce_r_r, 'pp': ce_p_p, 'rp': ce_r_p})
    #print('#'*100)
    #print('subset', subset)
    #print(pd.DataFrame(stats, index=free_features))
    #print('#'*100)

    subset.append(new_feature)
    subset_indices.append(free_features[feature_index])
    del free_features[feature_index]

#print('generated ordered list of variables')
global_stats.append(local_stats)
ix_subsets.append(subset_indices)
print(subset_indices)
print('score on new features:')
score_models(models, np.hstack(subset), y, 8)

[0, 5, 3, 4, 2, 1]
score on new features:
rf, Accuracy: 0.54296875
lr, Accuracy: 0.7604166666666666
svc, Accuracy: 0.7591145833333334
