In [1]:
from scribe_classifier.data.canada import AllCodes, CodeRecord, TitleRecord, TitleSet, SimpleModel
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, cross_validate
from floatrange import floatrange
from sklearn.model_selection import cross_val_predict
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [2]:
target_level = 1
emptyset_label = "NA"
all_codes = AllCodes.load_from_pickle("./source_data/pickles/canada/tidy_sets/all_codes.P", is_path=True)
code_vec = sorted(all_codes.get_codes_for_level(target_level=target_level))
if emptyset_label is not None:
    code_vec.append(emptyset_label)

print(code_vec)
lbl_enc = LabelEncoder()
labels = lbl_enc.fit_transform(code_vec)
print(labels)
cv = StratifiedShuffleSplit(n_splits=10, train_size=0.10, test_size=0.90)

mdl=SimpleModel(target_level=target_level, emptyset_label=emptyset_label, use_bayes=False, cv=cv)
mdl.clf.verbose=3

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'NA']
[ 0  1  2  3  4  5  6  7  8  9 10]


In [3]:
test = TitleSet.load_from_pickle('source_data/pickles/canada/test_sets/test.set.lvl%d.P' % target_level, is_path=True)
valid = TitleSet.load_from_pickle('source_data/pickles/canada/test_sets/valid.set.lvl%d.P' % target_level, is_path=True)
train = TitleSet.load_from_pickle('source_data/pickles/canada/test_sets/train.set.lvl%d.P' % target_level, is_path=True)

valid = valid.copy_and_append_empty_string_class(label=emptyset_label, prop_records=0.25)
test = test.copy_and_append_empty_string_class(label=emptyset_label, prop_records=0.25)

T_titles = train.get_title_vec()
t_titles = test.get_title_vec()
v_titles = valid.get_title_vec()
T_labels = lbl_enc.transform(train.get_code_vec(target_level=target_level))
t_labels = lbl_enc.transform(train.get_code_vec(target_level=target_level))
v_labels = lbl_enc.transform(valid.get_code_vec(target_level=target_level))

In [4]:
mdl.clf.fit(T_titles, T_labels)

Fitting 10 folds for each of 72 candidates, totalling 720 fits
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1), score=0.7653676582374702, total=   0.6s
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.001, clf__max_iter=1000, cl

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.1s


[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7584249733948208, total=   1.6s
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7599959458774641, total=   1.8s
[CV]  clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 5), score=0.7666852480616226, total=   1.6s
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 5), score=0.7552830284295343, total=   1.5s
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV] clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.001, vect__ngram

[CV] clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 1), score=0.7649622459838848, total=   0.9s
[CV] clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 1), score=0.7564992651902904, total=   0.9s
[CV] clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.001, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7553337049612324, total=   1.5s
[CV] clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 1), score=0.7511782293619824, total=   0.8s
[CV]  clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 1), score=0.7547255865808544, total=   0.6s
[CV] clf__alpha=0.001, clf__m

[CV]  clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7535600263517965, total=   0.7s
[CV] clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV] clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7566512947853848, total=   0.7s
[CV] clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7567526478487813, total=   0.7s
[CV] clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7532559671616075, total=   0.9s
[CV] clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, v

[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   22.0s


[CV] clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7648102163887903, total=   1.6s
[CV]  clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.764708863325394, total=   1.7s
[CV] clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV] clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7584249733948208, total=   1.8s
[CV] clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 1), score=0.7681548674808696, total=   0.8s
[CV]  clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 1), score=0.7499619926012264, total=   0.6s
[CV] clf__alpha=0.001, clf__

[CV] clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5), score=0.7598945928140678, total=   1.7s
[CV] clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5), score=0.7551309988344398, total=   1.6s
[CV] clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7631378908427507, total=   0.8s
[CV] clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5), score=0.7548269396442507, total=   1.8s
[CV] clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.001, clf__max_iter=7000, clf__tol=0.0001, vect

[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7512795824253788, total=   1.8s
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7534586732884001, total=   1.4s
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7467186945725435, total=   1.6s
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7493538742208483, total=   1.8s
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.001, v

[CV]  clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.753712055946891, total=   1.3s
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 3), score=0.7539147620736837, total=   1.9s
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7551816753661379, total=   1.9s
[CV] clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7440835149242386, total=   2.0s
[CV] clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.0001, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7468200476359398, total=   2.0s
[CV] clf__alph

[CV] clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7513809354887752, total=   0.8s
[CV] clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 5), score=0.7479856078649977, total=   1.6s
[CV] clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7535093498200983, total=   0.9s
[CV] clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 5), score=0.7516849946789642, total=   1.7s
[CV] clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0

[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   53.1s


[CV] clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.750722140576699, total=   0.7s
[CV] clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7396746566664978, total=   0.7s
[CV]  clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7518877008057568, total=   0.8s
[CV] clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV] clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.751026199766888, total=   0.6s
[CV] clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=0.0001, clf__max_iter=4000, clf__tol=0

[CV]  clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7455024578117874, total=   1.6s
[CV] clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7509248467034916, total=   1.8s
[CV] clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7515836416155678, total=   1.8s
[CV] clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.752546495717833, total=   1.4s
[CV] clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7405868342370648, total=   1.7s
[CV] clf__alpha=0.0001

[CV] clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7491511680940557, total=   1.7s
[CV]  clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7489991384989612, total=   1.5s
[CV] clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 5) 
[CV] clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7422084832514063, total=   2.2s
[CV] clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7513809354887752, total=   2.0s
[CV] clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=0.0001, clf__max_iter=7000, clf__tol=0.000

[CV] clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 5), score=0.7245223736887447, total=   1.6s
[CV] clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7250798155374246, total=   0.7s
[CV] clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 5), score=0.7180357776313789, total=   1.7s
[CV]  clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.72132975219176, total=   0.6s
[CV] clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV] clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=1e-05, clf__max_iter=1000, clf__tol=0.0001, vect_

[CV] clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 1), score=0.7099782090913698, total=   0.8s
[CV] clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7146404500076015, total=   1.3s
[CV] clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7262960522981807, total=   1.6s
[CV] clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7147418030709978, total=   1.7s
[CV] clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.001, vect__ngram

[CV] clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 3), score=0.7221912532306289, total=   1.5s
[CV] clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 3), score=0.7220899001672325, total=   1.6s
[CV] clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7245730502204429, total=   1.9s
[CV] clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7159073633000558, total=   2.0s
[CV] clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-05, clf__max_iter=4000, clf__tol=0.0001, v

[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  1.6min


[CV]  clf__alpha=1e-05, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7122586530177875, total=   1.6s
[CV] clf__alpha=1e-05, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-05, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.721633811381949, total=   1.4s
[CV] clf__alpha=1e-05, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-05, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7141336846906198, total=   1.5s
[CV] clf__alpha=1e-05, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-05, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7215324583185527, total=   2.0s
[CV] clf__alpha=1e-05, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-05, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5), score=0.7293873207317692, total=   1.6s
[CV] clf__alpha=1e-05, clf__max

[CV]  clf__alpha=1e-05, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.723103430801196, total=   1.8s
[CV] clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1e-05, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7178837480362844, total=   1.4s
[CV]  clf__alpha=1e-05, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7163634520853392, total=   1.5s
[CV] clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV] clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1), score=0.7273095829321441, total=   0.9s
[CV]  clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1), score=0.7170222469974155, total=   0.7s
[CV] clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1e-06, clf_

[CV] clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV] clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7057213804287235, total=   0.7s
[CV] clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7056200273653271, total=   0.9s
[CV] clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.710028885623068, total=   1.1s
[CV] clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.6969036639132418, total=   0.9s
[CV] clf__alpha=1e-06, clf__max_iter=1000, clf__tol=0.0001, vec

[CV] clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV] clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7256372573861045, total=   1.5s
[CV] clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7099275325596717, total=   1.8s
[CV] clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7095727968377844, total=   1.8s
[CV] clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 5) 
[CV]  clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.001, vect__ngram_range=(1, 3), score=0.7199614858359094, total=   1.7s
[CV] clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.001, vect__ngram_

[CV] clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5) 
[CV] clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7075457355698576, total=   1.8s
[CV] clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7184411898849643, total=   1.7s
[CV] clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 1), score=0.7148431561343942, total=   1.0s
[CV] clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1e-06, clf__max_iter=4000, clf__tol=0.0001, vect__ngram_range=(1, 5), score=0.7054173212385344, total=   2.0s
[CV] clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.001, vect__ng

[CV]  clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5), score=0.6970050169766381, total=   1.6s
[CV] clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 1) 
[CV]  clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.001, vect__ngram_range=(1, 5), score=0.7160087163634521, total=   1.9s
[CV] clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7061267926823088, total=   0.9s
[CV] clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.7037449956924948, total=   0.8s
[CV]  clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 1), score=0.719556073582324, total=   0.8s
[CV] clf__alpha=1e-06, clf__max_iter=7000, clf__tol=0.0001, vect__ngram_range=(1, 3) 
[CV]  clf__alpha=1e-06, 

[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 40.4min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=None, test_size=0.9,
            train_size=0.1),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...l2', power_t=0.5, random_state=None,
       shuffle=True, tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 3), (1, 5)], 'clf__alpha': (0.001, 0.0001, 1e-05, 1e-06), 'clf__max_iter': range(1000, 10000, 3000), 'clf__tol': (0.001, 0.0001)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=3)

In [None]:
t_preds = mdl.predict(test)
v_preds = mdl.predict(valid)

In [6]:
ngram_start=1
ngram_stop=6
ngram_step=2
[(1, x) for x in range(ngram_start, ngram_stop, ngram_step)]

[(1, 1), (1, 3), (1, 5)]

In [5]:
mdl.clf.best_parameters_

AttributeError: 'GridSearchCV' object has no attribute 'best_parameters_'