In [1]:
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.utils import check_X_y, check_random_state, check_array
from sklearn.metrics import get_scorer
from sklearn.utils.validation import column_or_1d, check_is_fitted
from sklearn.multiclass import check_classification_targets
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
import numpy as np

class Knn_Forest(BaseEstimator, ClassifierMixin):
    """
    Random feature selection for ensemble of knn classifiers.
    Each knn model will view the samples from different perspectives.
    Aggregating their views will result in a good ensemble result.
    Can also bootstrap the features, optimize the knn params per each
    different model and also sample the features based on their initial importance.
    """
    def __init__(self, 
                 base_estimator=KNeighborsClassifier(), 
                 n_estimators=10,  
                 random_state=42, 
                 optim=False, 
                 parameters=None,
                 max_features = 'auto',
                 bootstrap_feats = False,
                 feat_importance = [],
                 metric='accuracy'):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.random_state = check_random_state(random_state)
        self.bootstrap_feats = bootstrap_feats
        self.optim = optim
        self.feat_importance = feat_importance
        if self.optim:
            self.parameters = parameters
        else:
            self.parameters = None
        self.scoring = get_scorer(metric)
        self.ensemble = []
        self.selected_feat_indices= []
    
    def fit(self, X, y):
        return self._fit(X, y)
    
    def _validate_y(self, y):
        y = column_or_1d(y, warn=True)
        check_classification_targets(y)
        self.classes_, y = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        return y
    
    def _fit(self,X,y):
        X, y = check_X_y(
            X, y, ['csr', 'csc'], dtype=None, force_all_finite=False,
            multi_output=True)
        y = self._validate_y(y)
        n_samples, self.n_features_ = X.shape
        if self.max_features == 'auto':
            self.N_feats_per_knn = int(np.sqrt(self.n_features_))
        elif self.max_features == 'log2':
            self.N_feats_per_knn = int(np.log2(self.n_features_))
        elif type(self.max_features) == float:
            self.N_feats_per_knn = int(self.max_features*self.n_features_)
        elif type(self.max_features) == int:
            self.N_feats_per_knn = int(self.max_features)
        if self.feat_importance == []:
            self.feat_probas = [1/float(self.n_features_) for i in xrange(self.n_features_)]
        else:

            self.feat_probas = self.feat_importance#MinMaxScaler().fit_transform(mutual_info_classif(X, y).reshape(1, -1),y)
        print(X.shape[1], self.n_features_, len(self.feat_importance), len(self.feat_probas)) 
        print(len(self.feat_probas), self.n_features_, len(self.feat_importance))
        for i_est in xrange(self.n_estimators):
            self.selected_feat_indices.append(np.random.choice(np.arange(self.n_features_),
                                                          self.N_feats_per_knn,
                                                          replace=self.bootstrap_feats,
                                                              p=self.feat_probas))
            cur_X, cur_y = X[:, self.selected_feat_indices[i_est]], y
            cur_mod = clone(self.base_estimator)
            if self.optim:
                grid_search = GridSearchCV(cur_mod, self.parameters, n_jobs=-1, verbose=0, refit=True)
                grid_search.fit(cur_X, cur_y)
                cur_mod = grid_search.best_estimator_
            else:
                cur_mod.fit(cur_X, cur_y)
            self.ensemble.append(cur_mod)
            #print(cur_X.shape, cur_y.shape)
        print("%d ESTIMATORS -- %0.3f" % (len(self.ensemble), 100*accuracy_score(y, self.predict(X), normalize=True)))
        return self
    
   
    
    def _validate_y(self, y):
        y = column_or_1d(y, warn=True)
        check_classification_targets(y)
        self.classes_, y = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        return y
    
    def predict(self, X):
        """Predict class for X.
        The predicted class of an input sample is computed as the class with
        the highest mean predicted probability. If base estimators do not
        implement a ``predict_proba`` method, then it resorts to voting.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.
        Returns
        -------
        y : array of shape = [n_samples]
            The predicted classes.
        """
        
        if hasattr(self.base_estimator, "predict_proba"):
            predicted_probability = self.predict_proba(X)
            return self.classes_.take((np.argmax(predicted_probability, axis=1)),
                                  axis=0)
        else:
            predicted_probability = np.zeros((X.shape[0],1), dtype=int)
            for i, ens in enumerate(self.ensemble):
                predicted_probability = np.hstack((predicted_probability, 
                                                   ens.predict(X[:, self.selected_feat_indices[i]]).reshape(-1,1)))
            predicted_probability = np.delete(predicted_probability,0,axis=1)
            final_pred = []
            for sample in xrange(X.shape[0]):
                final_pred.append(most_common(predicted_probability[sample,:]))
            return np.array(final_pred)   


    def predict_proba(self, X):
        """Predict class probabilities for X.
        The predicted class probabilities of an input sample is computed as
        the mean predicted class probabilities of the base estimators in the
        ensemble. If base estimators do not implement a ``predict_proba``
        method, then it resorts to voting and the predicted class probabilities
        of an input sample represents the proportion of estimators predicting
        each class.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.
        Returns
        -------
        p : array of shape = [n_samples, n_classes]
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute `classes_`.
        """
        check_is_fitted(self, "classes_")
        # Check data
        X = check_array(
            X, accept_sparse=['csr', 'csc'], dtype=None,
            force_all_finite=False
        )

        if self.n_features_ != X.shape[1]:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {0} and "
                             "input n_features is {1}."
                             "".format(self.n_features_, X.shape[1]))

        all_proba = np.zeros((X.shape[0], self.n_classes_))
        for i, ens in enumerate(self.ensemble):
            all_proba += ens.predict_proba(X[:, self.selected_feat_indices[i]])
        all_proba /= self.n_estimators
        return all_proba


    @if_delegate_has_method(delegate='base_estimator')
    def decision_function(self, X):
        """Average of the decision functions of the base classifiers.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.
        Returns
        -------
        score : array, shape = [n_samples, k]
            The decision function of the input samples. The columns correspond
            to the classes in sorted order, as they appear in the attribute
            ``classes_``. Regression and binary classification are special
            cases with ``k == 1``, otherwise ``k==n_classes``.
        """
        check_is_fitted(self, "classes_")

        # Check data
        X = check_array(
            X, accept_sparse=['csr', 'csc'], dtype=None,
            force_all_finite=False
        )

        if self.n_features_ != X.shape[1]:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {0} and "
                             "input n_features is {1} "
                             "".format(self.n_features_, X.shape[1]))
        all_decisions = np.zeros((X.shape[0], self.n_classes_))
        for i, ens in enumerate(self.ensemble):
            all_decisions += ens.predict_proba(X) 
        decisions = sum(all_decisions) / self.n_estimators

        return decisions

In [2]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')


# #############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = fetch_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

#############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__max_iter': (10, 50, 80, 150),
}

X = data.data
y = data.target

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

#     print("Performing grid search...")
#     print("pipeline:", [name for name, _ in pipeline.steps])
#     print("parameters:")
#     pprint(parameters)
#     t0 = time()
#     # grid_search.fit(data.data, data.target)
#     grid_search.fit(X, y)
#     print("done in %0.3fs" % (time() - t0))
#     print()

#     print("Best score: %0.3f" % grid_search.best_score_)
#     print("Best parameters set:")
#     best_parameters = grid_search.best_estimator_.get_params()
#     for param_name in sorted(parameters.keys()):
#         print("\t%s: %r" % (param_name, best_parameters[param_name]))

Automatically created module for IPython interactive environment
Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']
857 documents
2 categories



In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test= train_test_split(X, y)
grid_search.fit(X_train, y_train)
cur_mod = grid_search.best_estimator_
pred = cur_mod.predict(X_test)
print(accuracy_score(y_test, pred))

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    6.6s finished


0.9348837209302325


In [6]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import TfidfVectorizer

# pip = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer())])

# tr = pip.fit_transform(X_train, y_train)
# mi = mutual_info_classif(tr, y_train)
# print(len(mi),tr.shape[1])
mi = mi/sum(mi)

15676 15676


In [7]:


# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'n_neighbors': [2,4,6,8,10],
    'metric':['euclidean', 'manhattan', 'cosine', 'l2']
}


parameters2 = {
    'clf__max_features': [50, 0.2, 0.3, 0.4,0.8, 'auto', 'log2'],
    'clf__bootstrap_feats': [True, False],
    'clf__n_estimators': [100,250,500],
    'clf__feat_importance':[mi, []]
}

# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', Knn_Forest(n_estimators=500, 
                       max_features=0.2, 
                       bootstrap_feats=False,
                       optim=False, 
                       parameters=parameters,
                       feat_importance=mi
                       )),
])

pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

# grid_search = GridSearchCV(pipeline, parameters2, n_jobs=1, verbose=2)
# grid_search.fit(X_train, y_train)
# cur_mod = grid_search.best_estimator_
# pred = cur_mod.predict(X_test)

print(accuracy_score(y_test, pred))



(15676, 15676, 15676, 15676)
(15676, 15676, 15676)
500 ESTIMATORS -- 94.393
0.8511627906976744


In [128]:
np.array(mi).shape

(15285,)

In [107]:
if mi == []:
    feat_probas = [1/float(15285) for i in xrange(15285)]
else:
    feat_probas = mi/float(sum(mi))
print(feat_probas)

[3.66458532e-05 1.17863765e-04 2.08337729e-05 ... 7.93856492e-06
 1.04044869e-05 1.04044869e-05]


  """Entry point for launching an IPython kernel.


In [89]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import TfidfVectorizer
tr = TfidfVectorizer().fit_transform(X_train, y_train)
mi = mutual_info_classif(tr, y_train)

array([0.00434618, 0.01397857, 0.00247087, ..., 0.00094151, 0.00123397,
       0.00123397])