In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools

import sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from brew.base import Ensemble, EnsembleClassifier
from brew.stacking.stacker import EnsembleStack, EnsembleStackClassifier
from brew.combination.combiner import Combiner
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


from mlxtend.data import wine_data, iris_data

from mlxtend.plotting import plot_decision_regions


from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.ensemble import ExtraTreesClassifier
from itertools import combinations
import random
random.seed(10)

from sklearn.tree import DecisionTreeClassifier
from progress.bar import Bar

import sys, time
try:
    from IPython.core.display import clear_output
    have_ipython = True
except ImportError:
    have_ipython = False

class ProgressBar:
    def __init__(self, iterations):
        self.iterations = iterations
        self.prog_bar = '[]'
        self.fill_char = '*'
        self.width = 40
        self.__update_amount(0)
        if have_ipython:
            self.animate = self.animate_ipython
        else:
            self.animate = self.animate_noipython

    def animate_ipython(self, iter):
        try:
            pass
            #clear_output()
        except Exception:
            # terminal IPython has no clear_output
            pass
        print '\r', self,
        #sys.stdout.flush()
        self.update_iteration(iter + 1)

    def update_iteration(self, elapsed_iter):
        self.__update_amount((elapsed_iter / float(self.iterations)) * 100.0)
        self.prog_bar += '  %d of %s complete' % (elapsed_iter, self.iterations)

    def __update_amount(self, new_amount):
        percent_done = int(round((new_amount / 100.0) * 100.0))
        all_full = self.width - 2
        num_hashes = int(round((percent_done / 100.0) * all_full))
        self.prog_bar = '[' + self.fill_char * num_hashes + ' ' * (all_full - num_hashes) + ']'
        pct_place = (len(self.prog_bar) / 2) - len(str(percent_done))
        pct_string = '%d%%' % percent_done
        self.prog_bar = self.prog_bar[0:pct_place] + \
            (pct_string + self.prog_bar[pct_place + len(pct_string):])

    def __str__(self):
        return str(self.prog_bar)




class Multiviewer(BaseEstimator, TransformerMixin):
    
    def __init__(self, max_level=3, num_at_each_level=4, base_estimator=ExtraTreesClassifier(n_estimators=50)):
        self.max_level = max_level
        self.base_estimator = base_estimator
        self.num_at_each_level = num_at_each_level
        self.estimators = []
        self.estim_features = []
        self.classes_ = None
        
    def fit(self, X, y):
        if self.max_level > X.shape[1]:
            print "Max level of feature combinations can't be bigger than num of features"
            print "%d > %d" % (self.max_level, X.shape[1])
            raise ValueError
        if not(isinstance(self.num_at_each_level, list)):
            self.num_at_each_level = [self.num_at_each_level for i in xrange(1, self.max_level)]
            self.num_at_each_level = [X.shape[1]] + self.num_at_each_level 
        #print self.num_at_each_level
        self.classes_ = list(set(y))
        rang = np.arange(X.shape[1])
        total = 0
        for i in xrange(1, self.max_level+1):
            total += i* self.num_at_each_level[i-1]
        print "Will create %d trees!" % total
        cc = 0
        bar =ProgressBar(total)
        for level in xrange(self.max_level):
            #print [comb for comb in combinations(rang, level+1)]
            wanted_feature_sets = get_cols(rang, level+1, self.num_at_each_level[level] )
            for wanted_features in wanted_feature_sets:
                c = sklearn.clone(self.base_estimator)
                c.fit(X[:, wanted_features], y)
                self.estimators.append(c)
                #print self.estimators[-1].n_features_
                self.estim_features.append(wanted_features)
                bar.animate(cc)
                cc += 1
        #bar.finish()
        return self
    
    def predict(self, X):
        if not(isinstance(X, np.ndarray)):
            X = np.array(X)
        print X.shape
        print X.shape[0]
        predictions = np.empty((X.shape[0], len(self.estimators)))
        for i, est in enumerate(self.estimators):
#             print est.n_features_
#             print i, self.estim_features[i]
#             print X.shape
#             print X[:, self.estim_features[i]].shape
            predictions[:, i] = est.predict(X[:, self.estim_features[i]])
        final_pred = []
        #print predictions
        for sample in xrange(X.shape[0]):
            votes = []
            for i, mod_vote in enumerate(predictions[sample,:]):
                votes.extend([predictions[sample, i] for j in xrange(1)])
            final_pred.append(most_common(votes))
        return np.array(final_pred).reshape(-1,)
    
def get_cols(iterable, level_, total_times_):
    wanted = [random.sample(iterable, k=level_) for time in xrange(total_times_)]
    return wanted

def most_common(lst):
    return max(set(lst), key=lst.count)



# Loading some example data
X, y = wine_data()
#X, y = iris_data()
#X = X[:,[0, 2]]


print('Dimensions: %s x %s' % (X.shape[0], X.shape[1]))
print('1st row', X[0])




# Initializing Classifiersa
clf1 = LogisticRegression(random_state=0)
clf2 = RandomForestClassifier(random_state=0)
clf3 = SVC(random_state=0, probability=True)
mu = Multiviewer(max_level=3, num_at_each_level=[10, 10, 10])
# Creating Ensemble
ensemble = Ensemble([clf1, clf2, clf3])
eclf = EnsembleClassifier(ensemble=ensemble, combiner=Combiner('mean'))
# Creating Stacking
layer_1 = Ensemble([clf1, clf2, clf3])
layer_2 = Ensemble([sklearn.clone(clf1)])

stack = EnsembleStack(cv=3)

stack.add_layer(layer_1)
stack.add_layer(layer_2)

sclf = EnsembleStackClassifier(stack)

clf_list = [clf1, clf2, clf3, eclf, sclf, mu]
lbl_list = ['Logistic Regression', 'Random Forest', 'RBF kernel SVM', 'Ensemble', 'Stacking', 'MULTIVIEWER']



# WARNING, WARNING, WARNING
# brew requires classes from 0 to N, no skipping allowed
d = {yi : i for i, yi in enumerate(set(y))}
y = np.array([d[yi] for yi in y])

# Plotting Decision Regions
#gs = gridspec.GridSpec(2, 3)
#fig = plt.figure(figsize=(10, 8))

itt = itertools.product([0, 1, 2], repeat=2)

from sklearn.model_selection import train_test_split
split = 0.2
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=split, stratify=y, random_state=100)

for clf, lab, grd in zip(clf_list, lbl_list, itt):
    clf.fit(X_train, y_train)
#    ax = plt.subplot(gs[grd[0], grd[1]])
#    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
#    plt.title(lab)
    print "Results for: %s" % lab
    pred = clf.predict(X_cv)
    print accuracy_score(y_cv, pred, normalize=True)
    print confusion_matrix(y_cv, pred, labels=list(set(y)))
    print classification_report(y_cv, pred, labels=list(set(y)))

#plt.show()



Dimensions: 178 x 13
('1st row', array([1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
       3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
       1.065e+03]))
Results for: Logistic Regression
0.9722222222222222
[[11  1  0]
 [ 0 14  0]
 [ 0  0 10]]
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        12
          1       0.93      1.00      0.97        14
          2       1.00      1.00      1.00        10

avg / total       0.97      0.97      0.97        36

Results for: Random Forest
0.9166666666666666
[[12  0  0]
 [ 1 13  0]
 [ 0  2  8]]
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        12
          1       0.87      0.93      0.90        14
          2       1.00      0.80      0.89        10

avg / total       0.92      0.92      0.92        36

Results for: RBF kernel SVM
0.4444444444444444
[[ 1 11  0]
 [ 0 14  0]
 [ 0  9  1]]
         

In [16]:
clf1.fit(X_train, y_train)
clf1.predict(X_cv).shape

(36,)

In [5]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.ensemble import ExtraTreesClassifier
from itertools import combinations
import random
random.seed(10)

class Multiviewer(BaseEstimator, TransformerMixin):
    
    def __init__(self, max_level=3, num_at_each_level=4, base_estimator=ExtraTreesClassifier(n_estimators=50)):
        self.max_level = max_level
        self.base_estimator = base_estimator
        self.num_at_each_level = num_at_each_level
        self.estimators = []
        self.estim_features = []
        self.classes_ = None
        
    def fit(self, X, y):
        if self.max_level > X.shape[1]:
            print "Max level of feature combinations can't be bigger than num of features"
            print "%d > %d" % (self.max_level, X.shape[1])
            raise ValueError
        if not(isinstance(self.num_at_each_level, list)):
            self.num_at_each_level = [self.num_at_each_level for i in xrange(1, self.max_level)]
            self.num_at_each_level = [X.shape[1]] + self.num_at_each_level 
        #print self.num_at_each_level
        self.classes_ = list(set(y))
        rang = np.arange(X.shape[1])
        total = 0
        for i in xrange(1, self.max_level+1):
            total += i* self.num_at_each_level[i-1]
        print "Will create %d trees!" % total
        cc = 0
        bar =ProgressBar(total)
        for level in xrange(self.max_level):
            #print [comb for comb in combinations(rang, level+1)]
            wanted_feature_sets = get_cols(rang, level+1, self.num_at_each_level[level])
            print wanted_feature_sets
            for wanted_features in wanted_feature_sets:
                c = sklearn.clone(self.base_estimator)
                c.fit(X[:, wanted_features], y)
                self.estimators.append(c)
                #print self.estimators[-1].n_features_
                self.estim_features.append(wanted_features)
                bar.animate(cc)
                cc += 1
        #bar.finish()
        return self
    
    def predict(self, X):
        if not(isinstance(X, np.ndarray)):
            X = np.array(X)
        print X.shape
        print X.shape[0]
        predictions = np.empty((X.shape[0], len(self.estimators)))
        for i, est in enumerate(self.estimators):
#             print est.n_features_
#             print i, self.estim_features[i]
#             print X.shape
#             print X[:, self.estim_features[i]].shape
            predictions[:, i] = est.predict(X[:, self.estim_features[i]])
        final_pred = []
        #print predictions
        for sample in xrange(X.shape[0]):
            votes = []
            for i, mod_vote in enumerate(predictions[sample,:]):
                votes.extend([predictions[sample, i] for j in xrange(1)])
            final_pred.append(most_common(votes))
        return np.array(final_pred).reshape(-1,)
    
def get_cols(iterable, level_, total_times_):
    wanted = [random.sample(iterable, k=level_) for time in xrange(total_times_)]
    return wanted

def most_common(lst):
    return max(set(lst), key=lst.count)

In [12]:
print X.shape[1]
mu = Multiviewer(max_level=4)
mu.fit(X,y)
pred = mu.predict(X_cv)
print accuracy_score(y_cv, pred, normalize=True)
print confusion_matrix(y_cv, pred, labels=list(set(y)))
print classification_report(y_cv, pred, labels=list(set(y)))

13
Will create 49 trees!
[[1], [2], [2], [1], [12], [11], [6], [9], [9], [6], [5], [8], [7]]
[*********        24%                  ]  12 of 49 complete [[3, 4], [8, 7], [2, 7], [11, 10]]
[*************    33%                  ]  16 of 49 complete [[8, 0, 4], [1, 0, 2], [5, 9, 6], [3, 8, 5]]
[**************** 41%                  ]  20 of 49 complete [[6, 3, 4, 11], [9, 5, 4, 10], [6, 8, 7, 11], [5, 9, 10, 11]]
[*****************49%                  ]  24 of 49 complete (36, 13)
36
1.0
[[12  0  0]
 [ 0 14  0]
 [ 0  0 10]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        12
          1       1.00      1.00      1.00        14
          2       1.00      1.00      1.00        10

avg / total       1.00      1.00      1.00        36



(36, 13)
36


array([1., 1., 0., 0., 0., 1., 1., 0., 2., 0., 2., 1., 2., 2., 0., 2., 2.,
       1., 2., 0., 0., 1., 1., 0., 0., 1., 1., 2., 1., 1., 0., 2., 0., 1.,
       1., 2.])

In [80]:
mu.estimators

[ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=

In [16]:
rang = np.arange(X.shape[1])

In [42]:
from itertools import combinations
import random
random.seed(10)
ss = random.sample([comb for comb in combinations(rang, 2)],2)


In [58]:
X[:, ss[1]].shape

[(7,),
 (5,),
 (6,),
 (2,),
 (12,),
 (10,),
 (4,),
 (0,),
 (9,),
 (1,),
 (11,),
 (3,),
 (8,)]