# scikit-learn Working with Text Data Tutorial

This notebook follows the official scikit-learn tutorial "Working With Text Data":
https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#tutorial-setup

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Newsgroup categories to load; the rest of the notebook works with this
# four-category subset.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [3]:
# Load the training split restricted to the selected categories.
# Documents are shuffled; the fixed random_state keeps the order reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [4]:
# Category names as stored on the loaded dataset. Integer targets index into
# this list, and its order differs from the order of the `categories` list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [5]:
# Number of documents in the training subset.
len(twenty_train.data)

2257

In [6]:
# Number of source filenames recorded for the training subset
# (same count as the number of documents).
len(twenty_train.filenames)

2257

In [7]:
# Show the first three lines of the first training document.
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Token-count vectorizer created with its default settings.
count_vect = CountVectorizer()

In [10]:
# Fit the vectorizer on the raw training text and turn it into a
# document-by-term count matrix in a single call.
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [11]:
# (number of documents, number of distinct vocabulary terms)
X_train_counts.shape

(2257, 35788)

In [12]:
# Column index assigned to the token 'algorithm' in the fitted vocabulary;
# .get() yields None instead of raising if the token is absent.
count_vect.vocabulary_.get('algorithm')


4690

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

In [14]:
# Fit a transformer with inverse-document-frequency weighting switched off
# (use_idf=False), i.e. term-frequency features only.
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

In [15]:
# Apply the fitted term-frequency transformer to the raw count matrix.
X_train_tf = tf_transformer.transform(X_train_counts)

In [16]:
# The transform re-weights values only; the shape matches the count matrix.
X_train_tf.shape

(2257, 35788)

In [17]:
# Transformer with default settings (use_idf is not overridden here, unlike
# the use_idf=False variant used for plain term frequencies).
tfidf_transformer = TfidfTransformer()

In [18]:
# Fit on the training counts and transform them in one step.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [19]:
# Same (documents, terms) shape as the count matrix it was derived from.
X_train_tfidf.shape

(2257, 35788)

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
# Train a multinomial naive Bayes classifier on the tf-idf features, using
# the dataset's integer category targets as labels.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [22]:
 docs_new = ['Science', 'OpenGL on the GPU is fast']

In [23]:
# transform(), not fit_transform(): reuse the vocabulary already learned from
# the training data so the columns line up with the training features.
X_new_counts = count_vect.transform(docs_new)

In [24]:
# Re-weight the new counts with the transformer fitted on the training set.
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [25]:
# Predicted category indices for the new documents; each value indexes into
# twenty_train.target_names.
predicted = clf.predict(X_new_tfidf)

In [26]:
# Pair each new document with its predicted category index and print the
# document's repr next to the human-readable category name.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'Science' => sci.med
'OpenGL on the GPU is fast' => comp.graphics


In [27]:
from sklearn.pipeline import Pipeline

In [28]:
# Chain vectorizer -> tf-idf transformer -> classifier into one estimator so
# raw text can be passed straight to fit() / predict().
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [29]:
# Fit every pipeline step on the raw training text and its targets. The call
# is the last expression in the cell, so the pipeline's repr is displayed.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [30]:
import numpy as np

In [31]:
# Load the held-out test split with the same category selection, shuffle
# flag and seed used when loading the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [32]:
# Raw text of the test documents, fed to the pipelines for evaluation.
docs_test = twenty_test.data

In [33]:
# Run the full pipeline on the raw test text. Note that this rebinds
# `predicted`, replacing the earlier predictions made for docs_new.
predicted = text_clf.predict(docs_test)

In [34]:
# Accuracy: the mean of the element-wise match between predicted and true
# labels, i.e. the fraction of test documents classified correctly.
np.mean(predicted == twenty_test.target)

0.8348868175765646

In [35]:
from sklearn.linear_model import SGDClassifier

In [36]:
# Same vectorizer and tf-idf front end, with a linear classifier trained by
# SGD as the final step (hinge loss, L2 penalty). Rebinds `text_clf`, so the
# naive Bayes pipeline is no longer reachable under that name.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])

In [37]:
# Fit the SGD-based pipeline on the raw training text and its targets; the
# fitted pipeline's repr is shown as the cell output.
text_clf.fit(twenty_train.data, twenty_train.target)


Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [38]:
# Predict on the test text with the SGD-based pipeline; this overwrites the
# predictions previously stored in `predicted`.
predicted = text_clf.predict(docs_test)

In [39]:
# Fraction of test documents whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)

0.9101198402130493

In [40]:
from sklearn import metrics

In [41]:
# Per-category precision / recall / F1 report for the current predictions,
# labelled with the category names instead of integer indices.
print(metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.80      0.87       319
         comp.graphics       0.87      0.98      0.92       389
               sci.med       0.94      0.89      0.91       396
soc.religion.christian       0.90      0.95      0.93       398

              accuracy                           0.91      1502
             macro avg       0.91      0.91      0.91      1502
          weighted avg       0.91      0.91      0.91      1502



In [42]:
# Confusion matrix for the current predictions. Rows correspond to the true
# class and columns to the predicted class: each row sums to that class's
# support in the classification report.
metrics.confusion_matrix(twenty_test.target, predicted)

array([[256,  11,  16,  36],
       [  4, 380,   3,   2],
       [  5,  35, 353,   3],
       [  5,  11,   4, 378]], dtype=int64)