In [4]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')


# #############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = fetch_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
# pipeline = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', SGDClassifier()),
# ])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way


if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    # grid_search.fit(data.data, data.target)
    grid_search.fit(X, y)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Automatically created module for IPython interactive environment
Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']
857 documents
2 categories

Performing grid search...
pipeline: ['std', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__max_iter': (10, 50, 80, 150),
 'clf__penalty': ('l2', 'elasticnet')}


NameError: name 'X' is not defined

Fitting 3 folds for each of 16 candidates, totalling 48 fits
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      8129
          1       0.95      0.78      0.86       819

avg / total       0.98      0.98      0.97      8948

[[8097   32]
 [ 182  637]]


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    2.4s finished


In [3]:
from sklearn.preprocessing import StandardScaler
parameters = {
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__max_iter': (10, 50, 80, 150),
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
pipeline = Pipeline([ ('std', StandardScaler()), ('clf', SGDClassifier())])

In [2]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedKFold
from copy import deepcopy
from sklearn.metrics import accuracy_score, get_scorer
from sklearn.neighbors import BallTree

class Cascador(BaseEstimator, TransformerMixin):
    
    def __init__(self, model=None, cv=5, mem=True, policy='majority', 
                 random_state=42, optim=False, parameters=None, metric='accuracy', max_steps=3):
        self.base = model
        self.cv = cv
        self.mem = mem
        if self.mem:
            self.tree = 'Will be fitted!'
        else:
            self.tree = None
        self.rs = random_state
        self.optim = optim
        if self.optim:
            self.parameters = parameters
        else:
            self.parameters = None
        self.scoring = get_scorer(metric)
        self.max_steps = max_steps
        self.acc = []
        self.models = []
        
    def fit(self, X, y):
        if not(isinstance(X, np.ndarray)):
            X_skf_next = np.array(X)
        else:
            X_skf_next = X
        if not(isinstance(y, np.ndarray)):
            y_skf_next = np.array(y)
        else:
            y_skf_next = y
        #if self.tree:
        #    self.tree = BallTree(X, leaf_size=20)
        skf = StratifiedKFold(n_splits=self.cv, random_state=self.rs, shuffle=False)
        for i in xrange(self.max_steps):
            print('STEP %d' %i)
            X_skf = X_skf_next[1:]
            y_skf = y_skf_next[1:]
            print(X_skf.shape, y_skf.shape)
            #print(y_skf)
            print(type(y_skf))
            print(y_skf.shape)
            #print(X_skf)
            print('Y~'*50)
            print(y_skf)
            print(set(y_skf))
            split = skf.split(X_skf, y_skf)
            #print(split.next())
            X_skf_next = None
            y_skf_next = None
            for train_index, test_index in split:
                print(train_index.shape, test_index.shape)
                X_train, X_test = X_skf[train_index], X_skf[test_index]
                y_train, y_test = y_skf[train_index], y_skf[test_index]
                cur_mod = deepcopy(self.base)
                if self.optim:
                    grid_search = GridSearchCV(cur_mod, self.parameters, n_jobs=-1, verbose=1, refit=True)
                    grid_search.fit(X_train, y_train)
                    cur_mod = grid_search.best_estimator_
                else:
                    cur_mod.fit(X_train, y_train)
                cur_pred = cur_mod.predict(X_test)
                self.acc.append(accuracy_score(y_test, cur_pred))
                self.models.append(cur_mod)
                if X_skf is None:
                    X_skf_next = X_test[cur_pred != y_test]
                    y_skf_next = y_test[cur_pred != y_test]
                else:
                    X_skf_next = np.hstack((X_skf_next, X_test[cur_pred != y_test]))
                    y_skf_next = np.hstack((y_skf_next, y_test[cur_pred != y_test]))
                print(X_skf_next.shape, y_skf_next.shape)
        self.acc = [100*acc/float(sum(self.acc)) for acc in self.acc]
        return self
        
    def predict(self, X):
        if not(isinstance(X, np.ndarray)):
            X = np.array(X)
        predictions = np.empty((X.shape[0], len(self.models)))
        for i, model in enumerate(self.models):
            predictions[:, i] = model.predict(X)
        final_pred = []
        for sample in xrange(X.shape[0]):
            votes = []
            for i, mod_vote in predictions[sample,:]:
                votes.extend([predictions[sample, i] for j in xrange(int(self.acc[i]))])
            final_pred = most_common(votes)
        return final_pred
    
def most_common(lst):
    return max(set(lst), key=lst.count)
            
            
                
cs = Cascador(pipeline, cv=3, mem=True, policy='majority', 
                 random_state=42, optim=False, parameters=None, metric='accuracy', max_steps=2)           
            

In [104]:
def make_batch(model, X, y, cv=6, rs=42, optim=False, parameters=None, scoring=accuracy_score):
    if not(isinstance(X, np.ndarray)):
        X= np.array(X)
    else:
        X = X
    if not(isinstance(y, np.ndarray)):
        y = np.array(y)
    else:
        y = y
    skf = StratifiedKFold(n_splits=cv, random_state=rs, shuffle=False)
    split = skf.split(X, y)
    X_next = []
    y_next = []
    for train_index, test_index in split:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        cur_mod = deepcopy(model)
        if optim:
            grid_search = GridSearchCV(cur_mod, parameters, n_jobs=-1, verbose=1, refit=True)
            grid_search.fit(X_train, y_train)
            cur_mod = grid_search.best_estimator_
        else:
            cur_mod.fit(X_train, y_train)
        cur_pred = cur_mod.predict(X_test)
        X_next.extend(X_test[cur_pred != y_test])
        y_next.extend(y_test[cur_pred != y_test])
    cur_mod = deepcopy(model)
    if optim:
        grid_search = GridSearchCV(cur_mod, parameters, n_jobs=-1, verbose=1, refit=True)
        grid_search.fit(X_train, y_train)
        cur_mod = grid_search.best_estimator_
    else:
        cur_mod.fit(X, y)
    return cur_mod, X_next, y_next

models = []
X_next = deepcopy(X_train)
y_next = deepcopy(y_train)
trees = []

while len(X_next)> 20:
    trees.append(BallTree(X_next))
    print(len(X_next))
    mod, X_next, y_next = make_batch(pipeline, X_next, y_next, optim=True, parameters=parameters)
    #print(len(X_next))
    models.append(mod)
    print("~"*50)

8950
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    2.0s finished
[Parallel(n_jobs=-1)]: Done  33 out of  48 | elapsed:    1.4s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    2.0s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    2.0s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    2.0s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    2.1s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    2.1s finished


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
201
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    2.1s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.2s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
36
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.3s finished


In [114]:
from sklearn.metrics import classification_report, confusion_matrix

def predict_from_forest(sample, models, trees):
    dist = []
    ind = []
    for i, tree in enumerate(trees):
        dist.append(tree.query(sample.reshape(1,-1))[0][0][0])
        #ind.append(tree.query(sample.reshape(1,-1))[1][0][0])
    wanted_mod = np.argmin(dist)
    order = np.argsort(dist)
    votes = []
    for i, or_ in enumerate(order):
        votes.extend([models[or_].predict(sample.reshape(1,-1))[0] for ii in xrange(len(order)-i)])
    #print(dist, ind)
    #print(len(models), wanted_mod)
    return most_common(votes)
    #return models[wanted_mod].predict(sample.reshape(1,-1))[0]

pred2 = []
for x in X_test:
    #print(x.shape)
    pred2.append(predict_from_forest(x, models, trees))
    #return models
print(accuracy_score(y_test, pred2))
print(classification_report(y_test, pred2))
print(confusion_matrix(y_test, pred2))

0.974631202503
             precision    recall  f1-score   support

          0       0.98      0.99      0.99      8129
          1       0.89      0.83      0.86       819

avg / total       0.97      0.97      0.97      8948

[[8043   86]
 [ 141  678]]


In [103]:
parameters = {
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__max_iter': (10, 50, 80, 150),
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, refit=True)
grid_search.fit(X_train, y_train)
pred = grid_search.best_estimator_.predict(X_test)
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))


Fitting 3 folds for each of 16 candidates, totalling 48 fits
0.979213232007
             precision    recall  f1-score   support

          0       0.99      0.99      0.99      8129
          1       0.91      0.86      0.88       819

avg / total       0.98      0.98      0.98      8948

[[8061   68]
 [ 118  701]]


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    2.3s finished


In [112]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
#pipeline2 = Pipeline([('clf', SVC())])
#ac = AdaBoostClassifier(SVC(), algorithm='SAMME')
#ac.fit(X_train, y_train)
pred3 = ac.predict(X_test)
print(accuracy_score(y_test, pred3))
print(classification_report(y_test, pred3))
print(confusion_matrix(y_test, pred3))

0.908471166741
             precision    recall  f1-score   support

          0       0.91      1.00      0.95      8129
          1       0.00      0.00      0.00       819

avg / total       0.83      0.91      0.86      8948

[[8129    0]
 [ 819    0]]


  'precision', 'predicted', average, warn_for)


In [49]:
from sklearn.metrics import euclidean_distances
def similarity_samples(sample, X):
    from fuzzywuzzy import process
    process.extractOne(sample, X)

closest = np.argmin(euclidean_distances(X[0].reshape(1,-1), X))



0

In [41]:
def predict(models, X_test, policy='voting'):
    if policy == 'voting':
        pred = []
        for x in X_test:
            votes = []
            for model in models:
                votes.append(model.predict([x])[0])
            pred.append(most_common(votes))
    return pred
pred2 = predict(models, X_test)
print(accuracy_score(y_test, pred2))

0.90891819401


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y)
grid_search.fit(X_train, y_train)
cur_mod = grid_search.best_estimator_
pred = cur_mod.predict(X_test)
print(accuracy_score(y_test, pred))

NameError: name 'X' is not defined

In [7]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=False)
split = skf.split(X, y)

NameError: name 'X' is not defined

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y)
cs.fit(X_train, y_train)
cs.predict(X_test)

STEP 0
(641,) (641,)
<type 'numpy.ndarray'>
(641,)
Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~Y~
[0 1 1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 1 0 0
 1 0 1 0 1 0 1 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0
 1 1 1 1 0 0 0 1 1 1 0 1 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 1 1
 0 0 1 1 1 0 1 0 1 0 1 0 1 1 1 0 1 0 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 1 1 1 0
 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 1 1 1
 1 0 1 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 0 1
 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 1 0 1 0 1 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1
 1 0 1 0 0 1 0 0 1 1 1 0 0 1 1 0 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0
 0 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 1 0 1 0
 0 1 0 0 0 1 0 1 1 0 0 0 1 1 1 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 1
 0 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0
 1 0 1 1 0 0 0 0 1 0 1 

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'unknown' instead.

In [26]:
import pandas as pd
with open('/home/bogas/workspace/DATA/HTRU2/HTRU_2.csv', 'r') as f:
    df = pd.read_csv(f, header=None)
y = df[8]
del df[8]
X = df.as_matrix()

In [27]:
print(X.shape, y.shape)
print(type(X))

(17898, 8) (17898,)
<type 'numpy.ndarray'>


In [9]:
y = data.target
X = data.data
X = np.array(X)
y = np.array(y)

In [28]:
skf = StratifiedKFold(n_splits=2, random_state=42, shuffle=False)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


JoblibAttributeError: JoblibAttributeError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/usr/lib/python2.7/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    157     pkg_name = mod_name.rpartition('.')[0]
    158     main_globals = sys.modules["__main__"].__dict__
    159     if alter_argv:
    160         sys.argv[0] = fname
    161     return _run_code(code, main_globals, None,
--> 162                      "__main__", fname, loader, pkg_name)
        fname = '/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py'
        loader = <pkgutil.ImpLoader instance>
        pkg_name = 'ipykernel'
    163 
    164 def run_module(mod_name, init_globals=None,
    165                run_name=None, alter_sys=False):
    166     """Execute a module's code without importing it

...........................................................................
/usr/lib/python2.7/runpy.py in _run_code(code=<code object <module> at 0x7fb1c07b48b0, file "/...2.7/dist-packages/ipykernel/__main__.py", line 1>, run_globals={'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.pyc'>}, init_globals=None, mod_name='__main__', mod_fname='/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py', mod_loader=<pkgutil.ImpLoader instance>, pkg_name='ipykernel')
     67         run_globals.update(init_globals)
     68     run_globals.update(__name__ = mod_name,
     69                        __file__ = mod_fname,
     70                        __loader__ = mod_loader,
     71                        __package__ = pkg_name)
---> 72     exec code in run_globals
        code = <code object <module> at 0x7fb1c07b48b0, file "/...2.7/dist-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.pyc'>}
     73     return run_globals
     74 
     75 def _run_module_code(code, init_globals=None,
     76                     mod_name=None, mod_fname=None,

...........................................................................
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py in <module>()
      1 if __name__ == '__main__':
      2     from ipykernel import kernelapp as app
----> 3     app.launch_new_instance()

...........................................................................
/usr/local/lib/python2.7/dist-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    400         
    401         if self.poller is not None:
    402             self.poller.start()
    403         self.kernel.start()
    404         try:
--> 405             ioloop.IOLoop.instance().start()
    406         except KeyboardInterrupt:
    407             pass
    408 
    409 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/usr/local/lib/python2.7/dist-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    157             PollIOLoop.configure(ZMQIOLoop)
    158         return PollIOLoop.current(*args, **kwargs)
    159     
    160     def start(self):
    161         try:
--> 162             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    163         except ZMQError as e:
    164             if e.errno == ETERM:
    165                 # quietly return on ETERM
    166                 pass

...........................................................................
/usr/local/lib/python2.7/dist-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    878                 self._events.update(event_pairs)
    879                 while self._events:
    880                     fd, events = self._events.popitem()
    881                     try:
    882                         fd_obj, handler_func = self._handlers[fd]
--> 883                         handler_func(fd_obj, events)
        handler_func = <function null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    884                     except (OSError, IOError) as e:
    885                         if errno_from_exception(e) == errno.EPIPE:
    886                             # Happens when the client closes the connection
    887                             pass

...........................................................................
/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    255         if self.control_stream:
    256             self.control_stream.on_recv(self.dispatch_control, copy=False)
    257 
    258         def make_dispatcher(stream):
    259             def dispatcher(msg):
--> 260                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    261             return dispatcher
    262 
    263         for s in self.shell_streams:
    264             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'skf = StratifiedKFold(n_splits=2, random_state=4..., y[test_index]\ngrid_search.fit(X_train, y_train)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2018-02-13T00:00:49.117907', 'msg_id': '1AC0E6EDC71C41D5A0A739F92346209C', 'msg_type': 'execute_request', 'session': '4FC3A118EDB349AFA22A73B0C1416B74', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '1AC0E6EDC71C41D5A0A739F92346209C', 'msg_type': 'execute_request', 'parent_header': {}})
    207             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    208         else:
    209             self.log.debug("%s: %s", msg_type, msg)
    210             self.pre_handler_hook()
    211             try:
--> 212                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = ['4FC3A118EDB349AFA22A73B0C1416B74']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'skf = StratifiedKFold(n_splits=2, random_state=4..., y[test_index]\ngrid_search.fit(X_train, y_train)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2018-02-13T00:00:49.117907', 'msg_id': '1AC0E6EDC71C41D5A0A739F92346209C', 'msg_type': 'execute_request', 'session': '4FC3A118EDB349AFA22A73B0C1416B74', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '1AC0E6EDC71C41D5A0A739F92346209C', 'msg_type': 'execute_request', 'parent_header': {}}
    213             except Exception:
    214                 self.log.error("Exception in message handler:", exc_info=True)
    215             finally:
    216                 self.post_handler_hook()

...........................................................................
/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=['4FC3A118EDB349AFA22A73B0C1416B74'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'skf = StratifiedKFold(n_splits=2, random_state=4..., y[test_index]\ngrid_search.fit(X_train, y_train)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2018-02-13T00:00:49.117907', 'msg_id': '1AC0E6EDC71C41D5A0A739F92346209C', 'msg_type': 'execute_request', 'session': '4FC3A118EDB349AFA22A73B0C1416B74', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '1AC0E6EDC71C41D5A0A739F92346209C', 'msg_type': 'execute_request', 'parent_header': {}})
    365         if not silent:
    366             self.execution_count += 1
    367             self._publish_execute_input(code, parent, self.execution_count)
    368 
    369         reply_content = self.do_execute(code, silent, store_history,
--> 370                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    371 
    372         # Flush output before sending the reply.
    373         sys.stdout.flush()
    374         sys.stderr.flush()

...........................................................................
/usr/local/lib/python2.7/dist-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code=u'skf = StratifiedKFold(n_splits=2, random_state...y[test_index]\ngrid_search.fit(X_train, y_train)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    170 
    171         reply_content = {}
    172         # FIXME: the shell calls the exception handler itself.
    173         shell._reply_content = None
    174         try:
--> 175             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = u'skf = StratifiedKFold(n_splits=2, random_state...y[test_index]\ngrid_search.fit(X_train, y_train)'
        store_history = True
        silent = False
    176         except:
    177             status = u'error'
    178             # FIXME: this code right now isn't being used yet by default,
    179             # because the run_cell() call above directly fires off exception

...........................................................................
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell=u'skf = StratifiedKFold(n_splits=2, random_state...y[test_index]\ngrid_search.fit(X_train, y_train)', store_history=True, silent=False, shell_futures=True)
   2897                 self.displayhook.exec_result = result
   2898 
   2899                 # Execute the user code
   2900                 interactivity = "none" if silent else self.ast_node_interactivity
   2901                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2902                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler instance>
   2903 
   2904                 # Reset this so later displayed values do not modify the
   2905                 # ExecutionResult
   2906                 self.displayhook.exec_result = None

...........................................................................
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.For object>, <_ast.Expr object>], cell_name='<ipython-input-28-476ca1e50d1e>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler instance>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3007                     return True
   3008 
   3009             for i, node in enumerate(to_run_interactive):
   3010                 mod = ast.Interactive([node])
   3011                 code = compiler(mod, cell_name, "single")
-> 3012                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7fb1528e19b0, file "<ipython-input-28-476ca1e50d1e>", line 5>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   3013                     return True
   3014 
   3015             # Flush softspace
   3016             if softspace(sys.stdout, 0):

...........................................................................
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7fb1528e19b0, file "<ipython-input-28-476ca1e50d1e>", line 5>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3061         outflag = 1  # happens in more places, so it's easier as default
   3062         try:
   3063             try:
   3064                 self.hooks.pre_run_code_hook()
   3065                 #rprint('Running code', repr(code_obj)) # dbg
-> 3066                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7fb1528e19b0, file "<ipython-input-28-476ca1e50d1e>", line 5>
        self.user_global_ns = {'BallTree': <type 'sklearn.neighbors.ball_tree.BallTree'>, 'BaseEstimator': <class 'sklearn.base.BaseEstimator'>, 'Cascador': <class '__main__.Cascador'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u'from __future__ import print_function\n\nfrom ...%r" % (param_name, best_parameters[param_name]))', u"import numpy as np\nfrom sklearn.base import B...accuracy', max_steps=2)           \n            ", u'def make_batch(model, X, y, cv=6, rs=42, optim...ext))\n    models.append(mod)\n    print("~"*50)', u'def similarity_samples(sample, X):\n    from f...mport process\n    process.extractOne(sample, X)', u"def predict(models, X_test, policy='voting'):\...s, X_test)\nprint(accuracy_score(y_test, pred2))", u'from sklearn.model_selection import train_test...ict(X_test)\nprint(accuracy_score(y_test, pred))', u'skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=False)\nsplit = skf.split(X, y)', u'from sklearn.model_selection import train_test...y)\ncs.fit(X_train, y_train)\ncs.predict(X_test)', u'y = data.target\nX = data.data\nX = np.array(X)\ny = np.array(y)', u'from sklearn.model_selection import train_test...y)\ncs.fit(X_train, y_train)\ncs.predict(X_test)', u'def make_batch(model, X, y, cv=6, rs=42, optim...ext))\n    models.append(mod)\n    print("~"*50)', u"import pandas as pd\nwith open('/home/bogas/workspace/DATA/HTRU2/HTRU_2.csv', 'r') as f:\n    pd", u"import pandas as pd\nwith open('/home/bogas/wo..._2.csv', 'r') as f:\n    df = pd.read_csv(f)\ndf", u"import pandas as pd\nwith open('/home/bogas/wo...r') as f:\n    df = pd.read_csv(f, header=False)", u"import pandas as pd\nwith open('/home/bogas/wo...'r') as f:\n    df = pd.read_csv(f, header=None)", u"import pandas as pd\nwith open('/home/bogas/wo...n    df = pd.read_csv(f, header=None)\ndf.head()", u"import pandas as pd\nwith open('/home/bogas/wo...r=None)\ny = df[8]\ndel df[8]\nX = df.to_dense()", u'X', u'X.to_xarray', ...], 'Out': {13:          140.5625  55.68378214  -0.234571412  -0...97527     1.429475  0  

[17897 rows x 9 columns], 16:             0          1         2         3    ...171909  0  
3   53.593661  0  
4  252.567306  0  , 18:                 0          1         2         3...-1.597527    1.429475  

[17898 rows x 8 columns], 19: <bound method DataFrame.to_xarray of            ...1.597527    1.429475  

[17898 rows x 8 columns]>, 21: array([[ 140.5625    ,   55.68378214,   -0.23457...71256228,
          -1.59752658,    1.42947536]]), 22: (17898, 8)}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'SGDClassifier': <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>, 'StratifiedKFold': <class 'sklearn.model_selection._split.StratifiedKFold'>, ...}
        self.user_ns = {'BallTree': <type 'sklearn.neighbors.ball_tree.BallTree'>, 'BaseEstimator': <class 'sklearn.base.BaseEstimator'>, 'Cascador': <class '__main__.Cascador'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u'from __future__ import print_function\n\nfrom ...%r" % (param_name, best_parameters[param_name]))', u"import numpy as np\nfrom sklearn.base import B...accuracy', max_steps=2)           \n            ", u'def make_batch(model, X, y, cv=6, rs=42, optim...ext))\n    models.append(mod)\n    print("~"*50)', u'def similarity_samples(sample, X):\n    from f...mport process\n    process.extractOne(sample, X)', u"def predict(models, X_test, policy='voting'):\...s, X_test)\nprint(accuracy_score(y_test, pred2))", u'from sklearn.model_selection import train_test...ict(X_test)\nprint(accuracy_score(y_test, pred))', u'skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=False)\nsplit = skf.split(X, y)', u'from sklearn.model_selection import train_test...y)\ncs.fit(X_train, y_train)\ncs.predict(X_test)', u'y = data.target\nX = data.data\nX = np.array(X)\ny = np.array(y)', u'from sklearn.model_selection import train_test...y)\ncs.fit(X_train, y_train)\ncs.predict(X_test)', u'def make_batch(model, X, y, cv=6, rs=42, optim...ext))\n    models.append(mod)\n    print("~"*50)', u"import pandas as pd\nwith open('/home/bogas/workspace/DATA/HTRU2/HTRU_2.csv', 'r') as f:\n    pd", u"import pandas as pd\nwith open('/home/bogas/wo..._2.csv', 'r') as f:\n    df = pd.read_csv(f)\ndf", u"import pandas as pd\nwith open('/home/bogas/wo...r') as f:\n    df = pd.read_csv(f, header=False)", u"import pandas as pd\nwith open('/home/bogas/wo...'r') as f:\n    df = pd.read_csv(f, header=None)", u"import pandas as pd\nwith open('/home/bogas/wo...n    df = pd.read_csv(f, header=None)\ndf.head()", u"import pandas as pd\nwith open('/home/bogas/wo...r=None)\ny = df[8]\ndel df[8]\nX = df.to_dense()", u'X', u'X.to_xarray', ...], 'Out': {13:          140.5625  55.68378214  -0.234571412  -0...97527     1.429475  0  

[17897 rows x 9 columns], 16:             0          1         2         3    ...171909  0  
3   53.593661  0  
4  252.567306  0  , 18:                 0          1         2         3...-1.597527    1.429475  

[17898 rows x 8 columns], 19: <bound method DataFrame.to_xarray of            ...1.597527    1.429475  

[17898 rows x 8 columns]>, 21: array([[ 140.5625    ,   55.68378214,   -0.23457...71256228,
          -1.59752658,    1.42947536]]), 22: (17898, 8)}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'SGDClassifier': <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>, 'StratifiedKFold': <class 'sklearn.model_selection._split.StratifiedKFold'>, ...}
   3067             finally:
   3068                 # Reset our crash handler in place
   3069                 sys.excepthook = old_excepthook
   3070         except SystemExit as e:

...........................................................................
/home/bogas/workspace/GIT/Cascada/<ipython-input-28-476ca1e50d1e> in <module>()
      1 skf = StratifiedKFold(n_splits=2, random_state=42, shuffle=False)
      2 for train_index, test_index in skf.split(X, y):
      3     X_train, X_test = X[train_index], X[test_index]
      4     y_train, y_test = y[train_index], y[test_index]
----> 5 grid_search.fit(X_train, y_train)

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/model_selection/_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
     ...ain_score='warn',
       scoring=None, verbose=1), X=array([[  1.40562500e+02,   5.56837821e+01,  -2....915249e+01,   7.58592123e+00,   8.34460142e+01]]), y=0       0
1       0
2       0
3       0
4       ...  0
9322    0
Name: 8, Length: 8950, dtype: int64, groups=None, **fit_params={})
    634                                   return_train_score=self.return_train_score,
    635                                   return_n_test_samples=True,
    636                                   return_times=True, return_parameters=False,
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
        cv.split = <bound method StratifiedKFold.split of Stratifie...ld(n_splits=3, random_state=None, shuffle=False)>
        X = array([[  1.40562500e+02,   5.56837821e+01,  -2....915249e+01,   7.58592123e+00,   8.34460142e+01]])
        y = 0       0
1       0
2       0
3       0
4       ...  0
9322    0
Name: 8, Length: 8950, dtype: int64
        groups = None
    640 
    641         # if one choose to see train score, "out" will contain train score info
    642         if self.return_train_score:
    643             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object <genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
AttributeError                                     Tue Feb 13 00:00:49 2018
PID: 5000                                     Python 2.7.6: /usr/bin/python
...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (Pipeline(memory=None,
     steps=[('vect', Count...e=True, tol=None, verbose=0, warm_start=False))]), array([[  1.40562500e+02,   5.56837821e+01,  -2....915249e+01,   7.58592123e+00,   8.34460142e+01]]), 0       0
1       0
2       0
3       0
4       ...  0
9322    0
Name: 8, Length: 8950, dtype: int64, {'score': <function _passthrough_scorer>}, array([2977, 2981, 2982, ..., 8947, 8948, 8949]), array([   0,    1,    2, ..., 2984, 2985, 2986]), 1, {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'vect__max_df': 0.5, 'vect__ngram_range': (1, 1)})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}
        self.items = [(<function _fit_and_score>, (Pipeline(memory=None,
     steps=[('vect', Count...e=True, tol=None, verbose=0, warm_start=False))]), array([[  1.40562500e+02,   5.56837821e+01,  -2....915249e+01,   7.58592123e+00,   8.34460142e+01]]), 0       0
1       0
2       0
3       0
4       ...  0
9322    0
Name: 8, Length: 8950, dtype: int64, {'score': <function _passthrough_scorer>}, array([2977, 2981, 2982, ..., 8947, 8948, 8949]), array([   0,    1,    2, ..., 2984, 2985, 2986]), 1, {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'vect__max_df': 0.5, 'vect__ngram_range': (1, 1)}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=Pipeline(memory=None,
     steps=[('vect', Count...e=True, tol=None, verbose=0, warm_start=False))]), X=array([[  1.40562500e+02,   5.56837821e+01,  -2....915249e+01,   7.58592123e+00,   8.34460142e+01]]), y=0       0
1       0
2       0
3       0
4       ...  0
9322    0
Name: 8, Length: 8950, dtype: int64, scorer={'score': <function _passthrough_scorer>}, train=array([2977, 2981, 2982, ..., 8947, 8948, 8949]), test=array([   0,    1,    2, ..., 2984, 2985, 2986]), verbose=1, parameters={'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'vect__max_df': 0.5, 'vect__ngram_range': (1, 1)}, fit_params={}, return_train_score='warn', return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    453 
    454     try:
    455         if y_train is None:
    456             estimator.fit(X_train, **fit_params)
    457         else:
--> 458             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method Pipeline.fit of Pipeline(memory=No...=True, tol=None, verbose=0, warm_start=False))])>
        X_train = array([[  8.03515625e+01,   3.97448090e+01,   1....915249e+01,   7.58592123e+00,   8.34460142e+01]])
        y_train = 2977    1
2981    1
2982    1
2987    1
2988    ...  0
9322    0
Name: 8, Length: 5966, dtype: int64
        fit_params = {}
    459 
    460     except Exception as e:
    461         # Note fit time as time until error
    462         fit_time = time.time() - start_time

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/pipeline.py in fit(self=Pipeline(memory=None,
     steps=[('vect', Count...e=True, tol=None, verbose=0, warm_start=False))]), X=array([[  8.03515625e+01,   3.97448090e+01,   1....915249e+01,   7.58592123e+00,   8.34460142e+01]]), y=2977    1
2981    1
2982    1
2987    1
2988    ...  0
9322    0
Name: 8, Length: 5966, dtype: int64, **fit_params={})
    243         Returns
    244         -------
    245         self : Pipeline
    246             This estimator
    247         """
--> 248         Xt, fit_params = self._fit(X, y, **fit_params)
        Xt = undefined
        fit_params = {}
        self._fit = <bound method Pipeline._fit of Pipeline(memory=N...=True, tol=None, verbose=0, warm_start=False))])>
        X = array([[  8.03515625e+01,   3.97448090e+01,   1....915249e+01,   7.58592123e+00,   8.34460142e+01]])
        y = 2977    1
2981    1
2982    1
2987    1
2988    ...  0
9322    0
Name: 8, Length: 5966, dtype: int64
    249         if self._final_estimator is not None:
    250             self._final_estimator.fit(Xt, y, **fit_params)
    251         return self
    252 

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/pipeline.py in _fit(self=Pipeline(memory=None,
     steps=[('vect', Count...e=True, tol=None, verbose=0, warm_start=False))]), X=array([[  8.03515625e+01,   3.97448090e+01,   1....915249e+01,   7.58592123e+00,   8.34460142e+01]]), y=2977    1
2981    1
2982    1
2987    1
2988    ...  0
9322    0
Name: 8, Length: 5966, dtype: int64, **fit_params={})
    208                 else:
    209                     cloned_transformer = clone(transformer)
    210                 # Fit or load from cache the current transfomer
    211                 Xt, fitted_transformer = fit_transform_one_cached(
    212                     cloned_transformer, None, Xt, y,
--> 213                     **fit_params_steps[name])
        fit_params_steps = {'clf': {}, 'tfidf': {}, 'vect': {}}
        name = 'vect'
    214                 # Replace the transformer of the step with the fitted
    215                 # transformer. This is necessary when loading the transformer
    216                 # from the cache.
    217                 self.steps[step_idx] = (name, fitted_transformer)

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/memory.py in __call__(self=NotMemorizedFunc(func=<function _fit_transform_one at 0x7fb17f6077d0>), *args=(CountVectorizer(analyzer=u'word', binary=False, ...\w+\\b',
        tokenizer=None, vocabulary=None), None, array([[  8.03515625e+01,   3.97448090e+01,   1....915249e+01,   7.58592123e+00,   8.34460142e+01]]), 2977    1
2981    1
2982    1
2987    1
2988    ...  0
9322    0
Name: 8, Length: 5966, dtype: int64), **kwargs={})
    357     # Should be a light as possible (for speed)
    358     def __init__(self, func):
    359         self.func = func
    360 
    361     def __call__(self, *args, **kwargs):
--> 362         return self.func(*args, **kwargs)
        self.func = <function _fit_transform_one>
        args = (CountVectorizer(analyzer=u'word', binary=False, ...\w+\\b',
        tokenizer=None, vocabulary=None), None, array([[  8.03515625e+01,   3.97448090e+01,   1....915249e+01,   7.58592123e+00,   8.34460142e+01]]), 2977    1
2981    1
2982    1
2987    1
2988    ...  0
9322    0
Name: 8, Length: 5966, dtype: int64)
        kwargs = {}
    363 
    364     def call_and_shelve(self, *args, **kwargs):
    365         return NotMemorizedResult(self.func(*args, **kwargs))
    366 

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer=CountVectorizer(analyzer=u'word', binary=False, ...\w+\\b',
        tokenizer=None, vocabulary=None), weight=None, X=array([[  8.03515625e+01,   3.97448090e+01,   1....915249e+01,   7.58592123e+00,   8.34460142e+01]]), y=2977    1
2981    1
2982    1
2987    1
2988    ...  0
9322    0
Name: 8, Length: 5966, dtype: int64, **fit_params={})
    576 
    577 
    578 def _fit_transform_one(transformer, weight, X, y,
    579                        **fit_params):
    580     if hasattr(transformer, 'fit_transform'):
--> 581         res = transformer.fit_transform(X, y, **fit_params)
        res = undefined
        transformer.fit_transform = <bound method CountVectorizer.fit_transform of C...w+\\b',
        tokenizer=None, vocabulary=None)>
        X = array([[  8.03515625e+01,   3.97448090e+01,   1....915249e+01,   7.58592123e+00,   8.34460142e+01]])
        y = 2977    1
2981    1
2982    1
2987    1
2988    ...  0
9322    0
Name: 8, Length: 5966, dtype: int64
        fit_params = {}
    582     else:
    583         res = transformer.fit(X, y, **fit_params).transform(X)
    584     # if we have a weight for this transformer, multiply output
    585     if weight is None:

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py in fit_transform(self=CountVectorizer(analyzer=u'word', binary=False, ...\w+\\b',
        tokenizer=None, vocabulary=None), raw_documents=array([[  8.03515625e+01,   3.97448090e+01,   1....915249e+01,   7.58592123e+00,   8.34460142e+01]]), y=2977    1
2981    1
2982    1
2987    1
2988    ...  0
9322    0
Name: 8, Length: 5966, dtype: int64)
    864         max_df = self.max_df
    865         min_df = self.min_df
    866         max_features = self.max_features
    867 
    868         vocabulary, X = self._count_vocab(raw_documents,
--> 869                                           self.fixed_vocabulary_)
        self.fixed_vocabulary_ = False
    870 
    871         if self.binary:
    872             X.data.fill(1)
    873 

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py in _count_vocab(self=CountVectorizer(analyzer=u'word', binary=False, ...\w+\\b',
        tokenizer=None, vocabulary=None), raw_documents=array([[  8.03515625e+01,   3.97448090e+01,   1....915249e+01,   7.58592123e+00,   8.34460142e+01]]), fixed_vocab=False)
    787         indptr = _make_int_array()
    788         values = _make_int_array()
    789         indptr.append(0)
    790         for doc in raw_documents:
    791             feature_counter = {}
--> 792             for feature in analyze(doc):
        feature = undefined
        analyze = <function <lambda>>
        doc = array([ 80.3515625 ,  39.74480899,   1.16691178,...86957,  26.66195868,   6.62937054,  44.88527139])
    793                 try:
    794                     feature_idx = vocabulary[feature]
    795                     if feature_idx not in feature_counter:
    796                         feature_counter[feature_idx] = 1

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py in <lambda>(doc=array([ 80.3515625 ,  39.74480899,   1.16691178,...86957,  26.66195868,   6.62937054,  44.88527139]))
    261         elif self.analyzer == 'word':
    262             stop_words = self.get_stop_words()
    263             tokenize = self.build_tokenizer()
    264 
    265             return lambda doc: self._word_ngrams(
--> 266                 tokenize(preprocess(self.decode(doc))), stop_words)
        doc = array([ 80.3515625 ,  39.74480899,   1.16691178,...86957,  26.66195868,   6.62937054,  44.88527139])
    267 
    268         else:
    269             raise ValueError('%s is not a valid tokenization scheme/analyzer' %
    270                              self.analyzer)

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/feature_extraction/text.py in <lambda>(x=array([ 80.3515625 ,  39.74480899,   1.16691178,...86957,  26.66195868,   6.62937054,  44.88527139]))
    227         else:
    228             raise ValueError('Invalid value for "strip_accents": %s' %
    229                              self.strip_accents)
    230 
    231         if self.lowercase:
--> 232             return lambda x: strip_accents(x.lower())
        x = array([ 80.3515625 ,  39.74480899,   1.16691178,...86957,  26.66195868,   6.62937054,  44.88527139])
        x.lower = undefined
    233         else:
    234             return strip_accents
    235 
    236     def build_tokenizer(self):

AttributeError: 'numpy.ndarray' object has no attribute 'lower'
___________________________________________________________________________

In [None]:
grid_search.fit(X_train, y_train)

In [None]:

1 - accuracy_score(y_test, pred)

In [None]:
np.where(pred != y_test)[0]
#print(X_test.shape)

In [None]:
y_test.