In [1]:
import sklearn

In [2]:
categories = ['ANGER', 'FEAR', 'JOY', 'LOVE', 'SADNESS', 'SURPRISE']

In [3]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

In [6]:
twenty_train['target']

array([1, 1, 3, ..., 2, 2, 2])

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts[0]

<1x35788 sparse matrix of type '<class 'numpy.int64'>'
	with 73 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [8]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [9]:
docs_new = ['God is love', 'OpenGL on the GPU is fast', 'believe in the sky', 'linear transform']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [10]:
predicted=clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print("%r => %s" % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'believe in the sky' => soc.religion.christian
'linear transform' => comp.graphics


In [11]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])

In [12]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [13]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', 
                                 categories=categories, shuffle=True, 
                                 random_state=42)
doc_test = twenty_test.data
predicted = text_clf.predict(doc_test)
np.mean(predicted == twenty_test.target)

0.83488681757656458

In [14]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None))
                    ])
text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(doc_test)
np.mean(predicted == twenty_test.target)

0.9127829560585885

In [15]:
from sklearn.model_selection import GridSearchCV
parameters = {'vector__ngram_range':[(1,1), (1,2)],
              'tfidf__use_idf':(True, False),
              'clf__alpha':(1e-2, 1e-3),}

In [18]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)

In [19]:
gs_clf=gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

ValueError: Invalid parameter vector for estimator Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))]). Check the list of available parameters with `estimator.get_params().keys()`.