In [2]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')


# #############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = fetch_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(data.data, data.target)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Automatically created module for IPython interactive environment
Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']
857 documents
2 categories

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__penalty': ('l2', 'elasticnet'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   17.2s finished


done in 18.127s

Best score: 0.936
Best parameters set:
	clf__alpha: 1e-05
	clf__penalty: 'l2'
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)




In [156]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedKFold
from copy import deepcopy
from sklearn.metrics import accuracy_score, get_scorer
from sklearn.neighbors import BallTree

class Cascador(BaseEstimator, TransformerMixin):
    
    def __init__(self, model=None, cv=5, mem=True, policy='majority', 
                 random_state=42, optim=False, parameters=None, metric='accuracy', max_steps=3):
        self.base = model
        self.cv = cv
        self.mem = mem
        if self.mem:
            self.tree = 'Will be fitted!'
        else:
            self.tree = None
        self.rs = random_state
        self.optim = optim
        if self.optim:
            self.parameters = parameters
        else:
            self.parameters = None
        self.scoring = get_scorer(metric)
        self.max_steps = max_steps
        self.acc = []
        self.models = []
        
    def fit(self, X, y):
        if not(isinstance(X, np.ndarray)):
            X_skf_next = np.array(X)
        else:
            X_skf_next = X
        if not(isinstance(y, np.ndarray)):
            y_skf_next = np.array(y)
        else:
            y_skf_next = y
        #if self.tree:
        #    self.tree = BallTree(X, leaf_size=20)
        skf = StratifiedKFold(n_splits=self.cv, random_state=self.rs, shuffle=False)
        for i in xrange(self.max_steps):
            print('STEP %d' %i)
            X_skf = X_skf_next[1:]
            y_skf = y_skf_next[1:]
            print(X_skf.shape, y_skf.shape)
            #print(y_skf)
            print(type(y_skf))
            print(y_skf.shape)
            #print(X_skf)
            print('Y~'*50)
            print(y_skf)
            print(set(y_skf))
            split = skf.split(X_skf, y_skf)
            #print(split.next())
            X_skf_next = None
            y_skf_next = None
            for train_index, test_index in split:
                print(train_index.shape, test_index.shape)
                X_train, X_test = X_skf[train_index], X_skf[test_index]
                y_train, y_test = y_skf[train_index], y_skf[test_index]
                cur_mod = deepcopy(self.base)
                if self.optim:
                    grid_search = GridSearchCV(cur_mod, self.parameters, n_jobs=-1, verbose=1, refit=True)
                    grid_search.fit(X_train, y_train)
                    cur_mod = grid_search.best_estimator_
                else:
                    cur_mod.fit(X_train, y_train)
                cur_pred = cur_mod.predict(X_test)
                self.acc.append(accuracy_score(y_test, cur_pred))
                self.models.append(cur_mod)
                if X_skf is None:
                    X_skf_next = X_test[cur_pred != y_test]
                    y_skf_next = y_test[cur_pred != y_test]
                else:
                    X_skf_next = np.hstack((X_skf_next, X_test[cur_pred != y_test]))
                    y_skf_next = np.hstack((y_skf_next, y_test[cur_pred != y_test]))
                print(X_skf_next.shape, y_skf_next.shape)
        self.acc = [100*acc/float(sum(self.acc)) for acc in self.acc]
        return self
        
    def predict(self, X):
        if not(isinstance(X, np.ndarray)):
            X = np.array(X)
        predictions = np.empty((X.shape[0], len(self.models)))
        for i, model in enumerate(self.models):
            predictions[:, i] = model.predict(X)
        final_pred = []
        for sample in xrange(X.shape[0]):
            votes = []
            for i, mod_vote in predictions[sample,:]:
                votes.extend([predictions[sample, i] for j in xrange(int(self.acc[i]))])
            final_pred = most_common(votes)
        return final_pred
    
def most_common(lst):
    return max(set(lst), key=lst.count)
            
            
                
cs = Cascador(pipeline, cv=3, mem=True, policy='majority', 
                 random_state=42, optim=False, parameters=None, metric='accuracy', max_steps=2)           
            

In [180]:
def make_batch(model, X, y, cv=6, rs=42, optim=False, parameters=None, scoring=accuracy_score):
    if not(isinstance(X, np.ndarray)):
        X= np.array(X)
    else:
        X = X
    if not(isinstance(y, np.ndarray)):
        y = np.array(y)
    else:
        y = y
    skf = StratifiedKFold(n_splits=cv, random_state=rs, shuffle=False)
    split = skf.split(X, y)
    X_next = []
    y_next = []
    for train_index, test_index in split:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        cur_mod = deepcopy(model)
        if optim:
            grid_search = GridSearchCV(cur_mod, parameters, n_jobs=-1, verbose=1, refit=True)
            grid_search.fit(X_train, y_train)
            cur_mod = grid_search.best_estimator_
        else:
            cur_mod.fit(X_train, y_train)
        cur_pred = cur_mod.predict(X_test)
        X_next.extend(X_test[cur_pred != y_test])
        y_next.extend(y_test[cur_pred != y_test])
    cur_mod = deepcopy(model)
    if optim:
        grid_search = GridSearchCV(cur_mod, parameters, n_jobs=-1, verbose=1, refit=True)
        grid_search.fit(X_train, y_train)
        cur_mod = grid_search.best_estimator_
    else:
        cur_mod.fit(X, y)
    return cur_mod, X_next, y_next

models = []
X_next = deepcopy(X_train)
y_next = deepcopy(y_train)
while len(X_next)> 10:
    print(len(X_next))
    mod, X_next, y_next = make_batch(pipeline, X_next, y_next, optim=True, parameters=parameters)
    #print(len(X_next))
    models.append(mod)
    print("~"*50)

642
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   22.5s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   22.9s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   20.7s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   20.0s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   20.5s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   23.1s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   21.4s finished


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
40
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    1.6s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    1.6s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    1.7s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    1.8s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    1.7s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    1.2s finished


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
21
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    1.2s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.4s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.5s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.5s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.5s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.5s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.5s finished


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
12
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.5s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.4s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.4s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.4s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.4s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.4s finished


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.4s finished


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.4s finished


In [175]:
def similarity_samples(sample, X):
    from fuzzywuzzy import process
    process.extractOne(sample, X)

In [183]:
def predict(models, X_test, policy='voting'):
    if policy == 'voting':
        pred = []
        for x in X_test:
            votes = []
            for model in models:
                votes.append(model.predict([x])[0])
            pred.append(most_common(votes))
    return pred
pred2 = predict(models, X_test)
print(accuracy_score(y_test, pred2))

0.604651162791


In [173]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y)
grid_search.fit(X_train, y_train)
cur_mod = grid_search.best_estimator_
pred = cur_mod.predict(X_test)
print(accuracy_score(y_test, pred))

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   25.9s finished


0.920930232558


In [140]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=False)
split = skf.split(X, y)

In [157]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y)
cs.fit(X_train, y_train)
cs.predict(X_test)

STEP 0
(641,) (641,)
<type 'numpy.ndarray'>
(641,)
Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~
[0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 1 1 1 1 0 1 1 1 1 0 1 1 0 0 1 1 0 0 0
 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 0 0 1
 0 1 1 0 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 1 0 1 1 0 0 1 1 1 1 0 1 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 1 1 0 0
 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 1 1 0 1 0 1 0
 0 1 0 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1 1 1 0 0 1 0 1 1
 0 0 0 1 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0
 0 1 0 0 0 1 0 1 1 1 1 1 1 0 1 0 0 1 0 0 1 1 1 0 0 1 0 1 1 0 0 1 0 1 0 0 0
 1 0 0 0 1 0 0 0 1 1 1 1 0 1 1 1 1 0 1 0 1 0 0 1 1 0 1 0 1 1 0 0 0 1 0 0 1
 1 1 1 0 0 1 0 0 0 1 1 1 1 1 0 1 1 0 1 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 0 1 1
 1 0 0 1 0 0 0 1 0 1 0 

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'unknown' instead.

In [22]:
y = data.target
X = data.data
X = np.array(X)
y = np.array(y)

In [23]:
skf = StratifiedKFold(n_splits=2, random_state=42, shuffle=False)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
grid_search.fit(X_train, y_train)

In [25]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   15.8s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': ((1, 1), (1, 2)), 'clf__penalty': ('l2', 'elasticnet'), 'clf__alpha': (1e-05, 1e-06), 'vect__max_df': (0.5, 0.75, 1.0)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [94]:

1 - accuracy_score(y_test, pred)

0.032558139534883734

In [98]:
np.where(pred != y_test)[0]
#print(X_test.shape)

array([ 17,  61,  63,  89,  90, 135, 171])

In [55]:
y_test.

NameError: global name 'max_steps' is not defined