# Liquid Democracy
Trying to implement an ensemble methodology for voting based on the [Liquid Democracy](http://procaccia.info/papers/liquid.pdf) paper.

The important bits i see in this process is:
- the *competence level* should be an estimate based on train/valid accuracy or distance to each new sample (or a mix of both)
- the cap(n) is constant in  our case? This should be optimized?

In [1]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')


# #############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = fetch_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

#############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__max_iter': (10, 50, 80, 150),
}

X = data.data
y = data.target

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

#     print("Performing grid search...")
#     print("pipeline:", [name for name, _ in pipeline.steps])
#     print("parameters:")
#     pprint(parameters)
#     t0 = time()
#     # grid_search.fit(data.data, data.target)
#     grid_search.fit(X, y)
#     print("done in %0.3fs" % (time() - t0))
#     print()

#     print("Best score: %0.3f" % grid_search.best_score_)
#     print("Best parameters set:")
#     best_parameters = grid_search.best_estimator_.get_params()
#     for param_name in sorted(parameters.keys()):
#         print("\t%s: %r" % (param_name, best_parameters[param_name]))

Automatically created module for IPython interactive environment
Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']
857 documents
2 categories



In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test= train_test_split(X, y, random_state=42, test_size=0.3)
# X_valid, X_test, y_valid, y_test= train_test_split(X_test, y_test, 
#                                                    random_state=42, test_size=0.5)
grid_search.fit(X_train, y_train)
cur_mod = grid_search.best_estimator_
pred = cur_mod.predict(X_test)
print(accuracy_score(y_test, pred))

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    6.5s finished


0.9302325581395349


In [3]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import TfidfVectorizer

pip = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer())])

tr = pip.fit_transform(X_train, y_train)
mi = mutual_info_classif(tr, y_train)
print(len(mi),tr.shape[1])
mi = mi/sum(mi)
#tr_valid = pip.transform(X_valid)
tr_test = pip.transform(X_test)

15443 15443


In [33]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100 ,random_state=42)
clf.fit(tr, y_train)
pred = clf.predict(tr_test)
print(accuracy_score(y_test, pred))

0.9147286821705426


In [97]:
k = np.copy(competence)
alpha = 0.5
print(np.where(k - k[0]-alpha>0))
print(k[11], k[0])
if np.where(k - k[0]-alpha>0)[0]:
    print('lala')

(array([], dtype=int64),)
0.7674418604651163 0.6744186046511628


  """


In [149]:
from collections import defaultdict
competence = []
for est in clf.estimators_:
    competence.append(est.score(tr_valid, y_valid))
alpha = 0.2
approves = {}
approved_by = {}
approved_by_length = {}
for i, sc_i in enumerate(competence):
    k = np.copy(competence)
    approves[i] = np.where(k - sc_i - alpha>0)[0].tolist()
    approved_by[i] = np.where(k - sc_i + alpha<0)[0].tolist()
    approved_by_length[i] = len(approved_by[i])
import operator
sorted_approved = sorted(approved_by_length.items(), key=operator.itemgetter(1))[::-1]

    


In [150]:
cap = int(len(competence)*0.2)
cap_per_est = {}
for i in xrange(len(competence)):
    cap_per_est[i] = cap
delegated = {}
poss_nodes = set([i for i in xrange(len(competence))])
for est_id, _ in sorted_approved:
    print(est_id)
    to_deleg = []
    poss_deleg = list(poss_nodes.intersection(approved_by[est_id]))
    if len(poss_deleg) <= cap_per_est[est_id] - 1:
        to_deleg = poss_deleg
    else:
        to_deleg = np.random.choice(poss_deleg, cap_per_est[est_id] - 1).tolist()
    to_deleg = to_deleg + [est_id]
    cap_per_est[est_id] -= len(to_deleg)
    poss_nodes = poss_nodes.difference(to_deleg)
    delegated[est_id] = to_deleg
    if len(poss_nodes) == 0:
        break
delegated

40
11
95
99
93
16
77
71
49
45
31
26
19
18
92
75
61
58
47
38
30
98
97
96
94
91
90
89
88
87
86
85
84
83
82
81
80
79
78
76
74
73
72
70
69
68
67
66
65
64
63
62
60
59
57
56
55
54
53
52
51
50
48
46
44
43
42
41
39
37
36
35
34
33
32
29
28
27
25
24
23
22
21
20
17
15
14
13
12
10
9
8
7
6
5
4
3
2
1
0


{0: [0],
 1: [1],
 2: [2],
 3: [3],
 4: [4],
 5: [5],
 6: [6],
 7: [7],
 8: [8],
 9: [9],
 10: [10],
 11: [11],
 12: [12],
 13: [13],
 14: [14],
 15: [15],
 16: [16],
 17: [17],
 18: [18],
 19: [19],
 20: [20],
 21: [21],
 22: [22],
 23: [23],
 24: [24],
 25: [25],
 26: [26],
 27: [27],
 28: [28],
 29: [29],
 30: [30],
 31: [31],
 32: [32],
 33: [33],
 34: [34],
 35: [35],
 36: [36],
 37: [37],
 38: [38],
 39: [39],
 40: [37, 8, 41, 80, 22, 23, 89, 40],
 41: [41],
 42: [42],
 43: [43],
 44: [44],
 45: [45],
 46: [46],
 47: [47],
 48: [48],
 49: [49],
 50: [50],
 51: [51],
 52: [52],
 53: [53],
 54: [54],
 55: [55],
 56: [56],
 57: [57],
 58: [58],
 59: [59],
 60: [60],
 61: [61],
 62: [62],
 63: [63],
 64: [64],
 65: [65],
 66: [66],
 67: [67],
 68: [68],
 69: [69],
 70: [70],
 71: [71],
 72: [72],
 73: [73],
 74: [74],
 75: [75],
 76: [76],
 77: [77],
 78: [78],
 79: [79],
 80: [80],
 81: [81],
 82: [82],
 83: [83],
 84: [84],
 85: [85],
 86: [86],
 87: [87],
 88: [88],
 89: [89],
 90

In [151]:
import collections
def predict_delegate(ensemble, del_map, X):
    final_pred = []
    for sample in xrange(X.shape[0]):
        sample_pred = []
        for est_id, delegates in del_map.items():
            n_votes = len(delegates)
            est_predict = ensemble.estimators_[est_id].predict(X[sample,:][0])[0]
            sample_pred.extend([est_predict for i in xrange(n_votes)])
        final_pred.append(collections.Counter(sample_pred).most_common()[0][0])
    return np.array(final_pred)  
y_test = predict_delegate(clf, delegated, tr_test)
print(accuracy_score(y_test, pred))

0.9922480620155039


In [32]:
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.utils import check_X_y, check_random_state, check_array
from sklearn.metrics import get_scorer
from sklearn.utils.validation import column_or_1d, check_is_fitted
from sklearn.multiclass import check_classification_targets
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
import numpy as np

class Liquid_Voter(BaseEstimator, ClassifierMixin):
    """
    Liquid Voter delegation mechanism. It builds a standard random forest but
    votes according to the liquid voting mechanism of action. Currently, the competence
    is expressed through the accuracy of the trees based on a validation split on the 
    train set. The alpha and cap parameters essentially control a) how much better does
    one model to be with regards to another so as to become it's delegate and b) the maximum
    capacity of votes one delegate can carry.
    """
    def __init__(self, 
                 ensemble=RandomForestClassifier(), 
                 n_estimators=100,  
                 random_state=42,
                 #competence = 'valid_acc',
                 valid_size = 0.1,
                 alpha = 0.2,
                 cap = 0.2,
                 metric='accuracy'):  
        self.n_estimators = n_estimators
        self.random_state = check_random_state(random_state)
        self.ensemble = ensemble.set_params(**{'random_state':self.random_state,
                                            'n_estimators':self.n_estimators})
        self.alpha = alpha
        self.cap = cap
        self.scoring = get_scorer(metric)
        #self.competence = competence
        self.valid_size = valid_size
        self.delegation_map = {}
        self.competence = []
    
    def fit(self, X, y):
        return self._fit(X, y)
    
    def _validate_y(self, y):
        y = column_or_1d(y, warn=True)
        check_classification_targets(y)
        self.classes_, y = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        return y
    
    def _fit(self,X,y):
        X, y = check_X_y(
            X, y, ['csr', 'csc'], dtype=None, force_all_finite=False,
            multi_output=True)
        y = self._validate_y(y)
        n_samples, self.n_features_ = X.shape
        X_train, X_valid, y_train, y_valid = train_test_split(X, y,random_state= self.random_state,
                         test_size = self.valid_size)
        self.ensemble.fit(X_train, y_train)
        from collections import defaultdict
        import operator
        for est in self.ensemble.estimators_:
            self.competence.append(est.score(X_valid, y_valid))
        self.ensemble.fit(X,y)
        approves = {}
        approved_by = {}
        approved_by_length = {}
        for i, sc_i in enumerate(self.competence):
            k = np.copy(self.competence)
            approves[i] = np.where(k - sc_i - self.alpha>0)[0].tolist()
            approved_by[i] = np.where(k - sc_i + self.alpha<0)[0].tolist()
            approved_by_length[i] = len(approved_by[i])
        
        sorted_approved = sorted(approved_by_length.items(), key=operator.itemgetter(1))[::-1]
        cap = int(self.n_estimators*self.cap)
        cap_per_est = {}
        for i in xrange(self.n_estimators):
            cap_per_est[i] = cap
        poss_nodes = set([i for i in xrange(self.n_estimators)])
        for est_id, _ in sorted_approved:
            to_deleg = []
            poss_deleg = list(poss_nodes.intersection(approved_by[est_id]))
            if len(poss_deleg) <= cap_per_est[est_id] - 1:
                to_deleg = poss_deleg
            else:
                to_deleg = np.random.choice(poss_deleg, cap_per_est[est_id] - 1).tolist()
            to_deleg = to_deleg + [est_id]
            cap_per_est[est_id] -= len(to_deleg)
            poss_nodes = poss_nodes.difference(to_deleg)
            self.delegation_map[est_id] = to_deleg
            if len(poss_nodes) == 0:
                break
        return self
    
    
    def _validate_y(self, y):
        y = column_or_1d(y, warn=True)
        check_classification_targets(y)
        self.classes_, y = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        return y
    
    def predict(self, X):
        """Predict class for X.
        The predicted class of an input sample is computed as the class with
        the highest mean predicted probability. If base estimators do not
        implement a ``predict_proba`` method, then it resorts to voting.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.
        Returns
        -------
        y : array of shape = [n_samples]
            The predicted classes.
        """
        
        return self.predict_delegate_(X)
    
    def predict_delegate_(self, X):
        import collections
        final_pred = []
        for sample in xrange(X.shape[0]):
            sample_pred = []
            for est_id, delegates in self.delegation_map.items():
                n_votes = len(delegates)
                est_predict = self.ensemble.estimators_[est_id].predict(X[sample,:][0])[0]
                sample_pred.extend([est_predict for i in xrange(n_votes)])
            final_pred.append(collections.Counter(sample_pred).most_common()[0][0])
        return np.array(final_pred)  

lv = Liquid_Voter(alpha=0.2, cap=0.1, n_estimators=100)
lv.fit(tr,y_train)
y_test = lv.predict(tr_test)
print(accuracy_score(y_test, pred))

0.810077519379845


In [None]:
parameters2 = {
    "alpha":np.linspace(0.05, 0.3, 10),
    "cap":[0.1,0.2,0.05,0.3]
}
grid_search = GridSearchCV(Liquid_Voter(n_estimators=100), 
                           parameters2, 
                           n_jobs=-1, 
                           verbose=2)
grid_search.fit(tr, y_train)
cur_mod = grid_search.best_estimator_
pred = cur_mod.predict(tr_test)

print(accuracy_score(y_test, pred))

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   49.8s


In [36]:
print(grid_search.best_params_, grid_search.best_score_)

{'alpha': 0.3, 'cap': 0.1} 0.8530884808013356


In [7]:


# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'n_neighbors': [2,4,6,8,10],
    'metric':['euclidean', 'manhattan', 'cosine', 'l2']
}


parameters2 = {
    'clf__max_features': [50, 0.2, 0.3, 0.4,0.8, 'auto', 'log2'],
    'clf__bootstrap_feats': [True, False],
    'clf__n_estimators': [100,250,500],
    'clf__feat_importance':[mi, []]
}

# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', Knn_Forest(n_estimators=500, 
                       max_features=0.2, 
                       bootstrap_feats=False,
                       optim=False, 
                       parameters=parameters,
                       feat_importance=mi
                       )),
])

pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

# grid_search = GridSearchCV(pipeline, parameters2, n_jobs=1, verbose=2)
# grid_search.fit(X_train, y_train)
# cur_mod = grid_search.best_estimator_
# pred = cur_mod.predict(X_test)

print(accuracy_score(y_test, pred))



(15676, 15676, 15676, 15676)
(15676, 15676, 15676)
500 ESTIMATORS -- 94.393
0.8511627906976744


In [128]:
np.array(mi).shape

(15285,)

In [107]:
if mi == []:
    feat_probas = [1/float(15285) for i in xrange(15285)]
else:
    feat_probas = mi/float(sum(mi))
print(feat_probas)

[3.66458532e-05 1.17863765e-04 2.08337729e-05 ... 7.93856492e-06
 1.04044869e-05 1.04044869e-05]


  """Entry point for launching an IPython kernel.


In [89]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import TfidfVectorizer
tr = TfidfVectorizer().fit_transform(X_train, y_train)
mi = mutual_info_classif(tr, y_train)

array([0.00434618, 0.01397857, 0.00247087, ..., 0.00094151, 0.00123397,
       0.00123397])