In [1]:
import re
from collections import namedtuple

class Tokenizer:

    Token = namedtuple('Token', 'name text span')

    def __init__(self, tokens):
        self.tokens = tokens
        pat_list = []
        for tok, pat in self.tokens:
            pat_list.append('(?P<%s>%s)' % (tok, pat))
            self.re = re.compile('|'.join(pat_list))

    def iter_tokens(self, input, ignore_ws=True):
        for match in self.re.finditer(input):
            if ignore_ws and match.lastgroup == 'WHITESPACE':
                continue
            yield Tokenizer.Token(match.lastgroup, match.group(0), match.span(0))
    
    def iter_sentences(self, input, ignore_ws=True):
        for sentence in re.compile('[.;!?] ').split(input):
            yield ('SENTENCE', list(self.iter_tokens(sentence, ignore_ws)))
            
    def tokenize(self, input, ignore_ws=True):
        return list(iter_sentences(input, ignore_ws))

# test program
if __name__ == "__main__":

    TOKENS = [
        ('NIL'        , r"nil|\'()"),
        ('TRUE'       , r'true|#t'),
        ('FALSE'      , r'false|#f'),
        ('NUMBER'     , r'\d+'),
        ('STRING'     , r'"(\\.|[^"])*"'),
        ('WORD'     , r'[A-ZĄĆĘŁŃÓŚŹŻa-ząćęłńóśźż]+'),
        ('QUOTE'      , r"'"),
        ('LPAREN'     , r'\('),
        ('RPAREN'     , r'\)'),
        ('DOT'        , r'\.'),
        ('WHITESPACE' , r'\w+'),
    ]

    for t in Tokenizer(TOKENS).iter_sentences('Linux 5.9 miał być mniejszą aktualizacją niż ostatecznie jest. Na większą liczbę nowinek nikt chyba jednak nie będzie narzekać. Szczególnie, że są to nowinki nie bez znaczenia, takie jak (dodana po ponad 5 latach prób) obsługa FSGSBASE, oznaczająca spory wzrost wydajności – zarówno w przypadku platform opartych na procesorach Intela, jak i tych, których sercami są układy AMD. Najbardziej odczujesz to w sytuacjach dużego obciążenia.'):
        print(t)

('SENTENCE', [Token(name='WORD', text='Linux', span=(0, 5)), Token(name='NUMBER', text='5', span=(6, 7)), Token(name='DOT', text='.', span=(7, 8)), Token(name='NUMBER', text='9', span=(8, 9)), Token(name='WORD', text='miał', span=(10, 14)), Token(name='WORD', text='być', span=(15, 18)), Token(name='WORD', text='mniejszą', span=(19, 27)), Token(name='WORD', text='aktualizacją', span=(28, 40)), Token(name='WORD', text='niż', span=(41, 44)), Token(name='WORD', text='ostatecznie', span=(45, 56)), Token(name='WORD', text='jest', span=(57, 61))])
('SENTENCE', [Token(name='WORD', text='Na', span=(0, 2)), Token(name='WORD', text='większą', span=(3, 10)), Token(name='WORD', text='liczbę', span=(11, 17)), Token(name='WORD', text='nowinek', span=(18, 25)), Token(name='WORD', text='nikt', span=(26, 30)), Token(name='WORD', text='chyba', span=(31, 36)), Token(name='WORD', text='jednak', span=(37, 43)), Token(name='WORD', text='nie', span=(44, 47)), Token(name='WORD', text='będzie', span=(48, 54)), 

In [15]:
import pickle
import gzip
import numpy

def load_poleval2019():
    with open('data.pkl', 'rb') as f:
        u = pickle._Unpickler(f)
        u.encoding = 'utf-8'
        data = u.load()
    return data

In [3]:
def cardinality(data):
    return {'data': len(data)}

def class_frequencies(data):
    class_freq = {}
    for d in data:
        if d[-1] not in class_freq:
            class_freq[d[-1]] = 1
        else:
            class_freq[d[-1]] += 1
    return class_freq

def class_balance(data):
    freq = class_frequencies(data)
    total = sum([v for k, v in freq.items()])
    return {k: round(float(v)*100/total, 2) for k, v in freq.items()}

In [4]:
def visualize_quality(loader):
    data = loader()
    cls = [d[-1] for d in data]
    print('Sample data:')
    print(data[0])

    print('Total: {}'.format(cardinality(data)))
    print('Freq: {}'.format(class_frequencies(data)))
    print('Balance: {}'.format(class_balance(data)))

visualize_quality(load_poleval2019)

Sample data:
['Dla mnie faworytem do tytułu będzie Cracovia. Zobaczymy, czy typ się sprawdzi.', '0']
Total: {'data': 11041}
Freq: {'0': 10056, '2': 707, '1': 278}
Balance: {'0': 91.08, '2': 6.4, '1': 2.52}


In [22]:
from twitter_preprocessor import TwitterPreprocessor
import demoji
demoji.download_codes()
def save_sentences(loader, sentences, labels):
    data = loader()
    with open(sentences, "w+") as f:
        for index, text in enumerate(data):
            tweet = TwitterPreprocessor(text[0])
            tweet.remove_mentions().remove_urls().remove_hashtags()
            tweetText = demoji.replace(tweet.text, "")
            tweetText = tweetText.replace("\\", "")
            tweetText = tweetText.replace(":)", "")
            tweetText = tweetText.replace(";)", "")
            tweetText = tweetText.replace(":-)", "")
            tweetText = tweetText.replace(";-)", "")
            tweetText = tweetText.replace(":D", "")
            tweetText = tweetText.replace(":-D", "")
            f.write(tweetText + ' # ' + '\n')
        
    with open(labels, "w+") as f:
        for index, text in enumerate(data):
            f.write(text[1] + '\n')

save_sentences(load_poleval2019, "tweets.txt", "labels.txt")

Downloading emoji data ...
... OK (Got response in 0.30 seconds)
Writing emoji data to /home/karol/.demoji/codes.json ...
... OK


In [12]:
import xml.etree.ElementTree as ET
def write_part_of_speech(readfile, filewrite, parts_of_speech):
    tree = ET.parse(readfile)
    chunklist = tree.getroot()
    with open(filewrite, "w+") as f:
        tweet = []
        for token in chunklist.iter('tok'):
            lex = token.find('lex')
            base = lex.find('base').text
            if base == '#':
                tweetStr = ' '.join(tweet)
                f.write(tweetStr + '\n')
                tweet = []
            else:
                ctag = lex.find('ctag').text
                print(ctag)
                pos = ctag.split(':')[0]
                if pos in parts_of_speech:
                    tweet.append(base)

In [13]:
parts_of_speech = ['subst', 'depr']
write_part_of_speech("MorphoDita.xml", 'MorphoDitaRzecz.txt', parts_of_speech)

prep:gen
ppron12:sg:gen:m1:pri:akc
subst:sg:inst:m3
prep:gen
subst:sg:gen:m3
bedzie:sg:ter:imperf
subst:sg:nom:f
interp
fin:pl:pri:perf
interp
qub
subst:sg:nom:m1
qub
fin:sg:ter:perf
interp
subst:sg:acc:n
ppron12:sg:nom:m1:sec
subst:sg:nom:f
subst:sg:nom:m1
fin:sg:ter:imperf
inf:imperf
prep:acc
adj:pl:acc:n:pos
conj
adj:pl:acc:n:pos
adj:sg:nom:m1:pos
interp
adj:sg:nom:m1:pos
subst:sg:nom:m1
fin:sg:ter:imperf
subst:pl:acc:m3
prep:loc
subst:pl:loc:m3
subst:pl:gen:m1
interp
qub
praet:pl:m1:perf
aglt:pl:pri:perf:nwok
subst:pl:gen:m3
interp
fin:sg:ter:imperf
interp
adj:sg:gen:f:pos
subst:sg:gen:f
qub
fin:pl:pri:imperf
interp
subst:sg:acc:m3
adj:sg:acc:m3:pos
interp
adj:sg:nom:f:pos
subst:sg:nom:f
interp
fin:sg:pri:imperf
subst:sg:acc:m3
adj:sg:nom:m3:pos
ppron3:sg:nom:m3:ter:akc:npraep
praet:sg:m3:imperf
adj:sg:nom:m3:pos
subst:sg:nom:n
fin:sg:pri:imperf
interp
comp
praet:sg:f:perf
aglt:sg:pri:perf:nwok
qub
prep:acc
ppron3:sg:gen:m1:ter:akc:npraep
adj:pl:acc:n:pos
subst:pl:acc:n
conj
subst:

In [8]:
parts_of_speech = ['adj', 'adja', 'adjp', 'adjc']
write_part_of_speech("MorphoDita.xml", 'MorphoDitaPrzym.txt', parts_of_speech)

In [9]:
parts_of_speech = ['fin', 'bedzie', 'aglt', 'praet', 'impt', 'imps', 'inf', 'pcon', 'pant', 'ger', 'pact', 'ppas', 'winien']
write_part_of_speech("MorphoDita.xml", 'MorphoDitaCzas.txt', parts_of_speech)

In [10]:
parts_of_speech = ['NOUN', 'PROPN']
write_part_of_speech("Spacy.xml", 'SpacyRzecz.txt', parts_of_speech)

In [11]:
parts_of_speech = ['ADJ']
write_part_of_speech("Spacy.xml", 'SpacyPrzym.txt', parts_of_speech)

In [12]:
parts_of_speech = ['VERB']
write_part_of_speech("Spacy.xml", 'SpacyCzas.txt', parts_of_speech)

In [5]:
from time import time

import pandas as pd
from sklearn.metrics import f1_score, make_scorer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline

def test_classifier(text_file):
    columns = ['tweet', 'target']
    rows = []
    with open(text_file) as texts, open("labels.txt") as labels:
        for text, label in zip(texts, labels):
            rows.append([text.rstrip('\n'), label.rstrip('\n')])

    data = pd.DataFrame(rows, columns=columns)

    categories = ['0','2']

    print(categories)
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', ComplementNB(fit_prior=True, class_prior=None)),
    ])
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'clf__alpha': (0.3, 0.6, 1.0)
    }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1_micro')
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    print(parameters)
    t0 = time()
    grid_search.fit(list(data.tweet), list(data.target))
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [6]:
test_classifier("MorphoDitaCzas.txt")

['0', '2']
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (0.3, 0.6, 1.0)}
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:   28.2s
done in 34.167s

Best score: 0.908
Best parameters set:
	clf__alpha: 1.0
	tfidf__norm: 'l2'
	tfidf__use_idf: False
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_ran

In [7]:
test_classifier("MorphoDitaPrzym.txt")

['0', '2']
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (0.3, 0.6, 1.0)}
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 828 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:   15.2s
done in 21.945s

Best score: 0.911
Best parameters set:
	clf__alpha: 1.0
	tfidf__norm: 'l1'
	tfidf__use_idf: False
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 3)
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:   21.9s finished


In [64]:
test_classifier("MorphoDitaRzecz.txt")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
['0', '2']
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (0.3, 0.6, 1.0)}
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 828 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:   30.8s
done in 43.688s

Best score: 0.906
Best parameters set:
	clf__alpha: 1.0
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 3)
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:   43.5s finished


In [65]:
test_classifier("WCRFT2czas.txt")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
['0', '2']
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (0.3, 0.6, 1.0)}
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 828 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:   19.3s
done in 27.068s

Best score: 0.905
Best parameters set:
	clf__alpha: 1.0
	tfidf__norm: 'l2'
	tfidf__use_idf: False
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 3)
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:   27.0s finished


In [66]:
test_classifier("WCRFT2przym.txt")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
['0', '2']
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (0.3, 0.6, 1.0)}
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 828 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:   12.3s
done in 17.580s

Best score: 0.903
Best parameters set:
	clf__alpha: 0.3
	tfidf__norm: 'l1'
	tfidf__use_idf: False
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 3)
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:   17.5s finished


In [67]:
test_classifier("WCRFT2rzecz.txt")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
['0', '2']
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (0.3, 0.6, 1.0)}
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 828 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:   31.4s
done in 44.325s

Best score: 0.913
Best parameters set:
	clf__alpha: 1.0
	tfidf__norm: 'l2'
	tfidf__use_idf: False
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 3)
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:   44.2s finished


In [18]:
test_classifier("SpacyRzecz.txt")

['0', '2']
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (0.3, 0.6, 1.0)}
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:   29.3s
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:   42.7s
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:   52.1s finished
done in 52.425s

Best score: 0.908
Best parameters set:
	clf__alpha: 1.0
	tfidf__norm: 'l2'
	tfidf__use

In [19]:
test_classifier("SpacyPrzym.txt")

['0', '2']
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (0.3, 0.6, 1.0)}
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 828 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:   20.7s finished
done in 20.741s

Best score: 0.911
Best parameters set:
	clf__alpha: 1.0
	tfidf__norm: 'l1'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 3)


In [20]:
test_classifier("SpacyCzas.txt")

['0', '2']
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (0.3, 0.6, 1.0)}
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 828 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:   30.8s finished
done in 30.945s

Best score: 0.911
Best parameters set:
	clf__alpha: 1.0
	tfidf__norm: 'l2'
	tfidf__use_idf: False
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 3)


In [71]:
test_classifier("tweets.txt")

['0', '2']
Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1.0), 'vect__max_features': (None, 5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)), 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (0.3, 0.6, 1.0)}
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   53.7s
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:  2.5min finished
done in 152.600s

Best score: 0.935
Best parameters set:
	clf__alpha: 0.6
	tfidf__norm: 'l1'
	tfidf__us