In [2]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism','soc.religion.christian','comp.graphics','sci.med']
twenty_train = fetch_20newsgroups(subset ='train',categories = categories,shuffle=True,random_state =42)

In [3]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [4]:
len(twenty_train.data)

2257

In [5]:
len(twenty_train.filenames)

2257

In [6]:
print('\n'.join(twenty_train.data[0].split('\n')[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [7]:
print(twenty_train.target_names[twenty_train.target[0]])

comp.graphics


In [8]:
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [9]:
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(twenty_train.data)
x_train_counts.shape

(2257, 35788)

In [11]:
count_vect.vocabulary_.get(u'algorithm')

4690

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf = False).fit(x_train_counts)
x_train_tf = tf_transformer.transform(x_train_counts)
x_train_tf.shape

(2257, 35788)

In [15]:
tfidf_transformer = TfidfTransformer(use_idf = False)
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_train_tfidf.shape

(2257, 35788)

In [16]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(x_train_tfidf,twenty_train.target)
docs_new = ['God is love','OpenGL on the GPU is fast']
x_new_counts = count_vect.transform(docs_new)
x_new_tfidf = tfidf_transformer.transform(x_new_counts)
predicted = clf.predict(x_new_tfidf)
for doc, category in zip(docs_new,predicted):
    print('%r => %s' % (doc,twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [18]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline ([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('clf',MultinomialNB())])
text_clf.fit(twenty_train.data,twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [19]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',categories=categories,shuffle=True,random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted==twenty_test.target)

0.83488681757656458

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline ([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('clf',SGDClassifier(
    loss='hinge',penalty ='l2',alpha=1e-3,random_state=42,max_iter=5,tol =None))])
text_clf.fit(twenty_train.data,twenty_train.target)
print(text_clf)
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',categories=categories,shuffle=True,random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted==twenty_test.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])


0.9127829560585885

In [27]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target,predicted,target_names = twenty_test.target_names))
print(metrics.classification_report(twenty_test.target,predicted,labels = twenty_test.target))
print(metrics.classification_report(twenty_test.target,predicted))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502

             precision    recall  f1-score   support

          2       0.94      0.90      0.92       396
          2       0.94      0.90      0.92       396
          2       0.94      0.90      0.92       396
          0       0.95      0.81      0.87       319
          3       0.90      0.95      0.93       398
          0       0.95      0.81      0.87       319
          1       0.88      0.97      0.92       389
          3       0.90      0.95      0.93       398
          2       0.94      0.90      0.92       396
          2       0.94      0.90      0.92       396
          1       0.88      0

In [25]:
metrics.confusion_matrix(twenty_test.target,predicted)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]], dtype=int64)

In [32]:
from sklearn.model_selection import GridSearchCV
parameters ={'vect__ngram_range':[(1,1),(1,2)],
            'tfidf__use_idf':(True,False),
            'clf__alpha':(1e-2,1e-3)}
gs_clf = GridSearchCV(text_clf,parameters,n_jobs=-1)
gs_clf.fit(twenty_train.data[:400],twenty_train.target[:400])
print(gs_clf.predict(['God is love']))
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

[3]


'soc.religion.christian'

In [33]:
gs_clf.best_score_

0.90000000000000002

In [35]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [36]:
print(parameters)

{'clf__alpha': (0.01, 0.001), 'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False)}


In [37]:
print(parameters.keys())

dict_keys(['clf__alpha', 'vect__ngram_range', 'tfidf__use_idf'])


In [38]:
print(gs_clf.best_params_)

{'clf__alpha': 0.001, 'vect__ngram_range': (1, 1), 'tfidf__use_idf': True}


In [1]:
% cp -r skeletons workspace

ERROR:root:Line magic function `%cp` not found.
