In [1]:
import numpy as np
import pandas as pd

In [12]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 newsgroups text dataset using the loader's default arguments.
newsgroups = fetch_20newsgroups()

In [7]:
# List the fields exposed by the loaded dataset object.
newsgroups.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [15]:
# Print the description text that ships with the dataset.
print(newsgroups.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features       

In [11]:
# Names of the topics (classes) present in the loaded dataset.
newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [26]:
# Show the raw text of the first post (message headers are part of the text).
print(newsgroups.data[0])

From: HADCRJAM@admin.uh.edu (MILLER, JIMMY A.)
Subject: Re: BATF/FBI revenge
Organization: University of Houston Administrative Computing
Lines: 38
Distribution: world
NNTP-Posting-Host: uhad2.admin.uh.edu
X-News-Reader: VMS NEWS 1.24
In-Reply-To: donb@netcom.com's message of Tue, 20 Apr 1993 17:10:52 GMT

In <donbC5sL24.Ewu@netcom.com> donb@netcom.com writes:

> Anyway, here's how I see the Waco affair; I'd be interested in other peoples'
> interpretations...
> 
> 1. Koresh and his people were basically minding their own business.
> 2. Some weapons violations may have been committed and I wouldn't have
>    disapproved of prosecuting him for those violations.  However, I think
>    the BATF was criminal for starting negotiations with a military style
>    assault and for firing into a house where there were children and other
>    noncombatants.
> 3. I don't see they couldn't just leave a token guard on the place and wait
>    the BDs out; I don't approve of the tear gas approach and,

In [10]:
# Translate the first post's integer label into its topic name.
# NOTE(review): the saved output ('rec.autos') disagrees with the post printed in the
# previous cell and with the same lookup further down ('talk.politics.guns'); the
# execution counts suggest the cells were run out of order. Restart the kernel and
# run all cells top to bottom to refresh the outputs.
newsgroups.target_names[newsgroups.target[0]]

'rec.autos'

In [72]:
# Restrict the corpus to four of the twenty topics.
categories = [
    'talk.politics.guns',
    'talk.religion.misc',
    'comp.graphics',
    'rec.autos',
]
# Rebinds `newsgroups`: from here on it only holds posts from `categories`.
newsgroups = fetch_20newsgroups(categories=categories)
#newsgroups_train = fetch_20newsgroups(categories=categories, subset='train')
#newsgroups_test = fetch_20newsgroups(categories=categories, subset='test')

In [73]:
#newsgroups_train.target.shape

In [74]:
#newsgroups_test.target.shape

In [75]:
# Show the first post again, now taken from the four-topic subset.
print(newsgroups.data[0])

From: HADCRJAM@admin.uh.edu (MILLER, JIMMY A.)
Subject: Re: BATF/FBI revenge
Organization: University of Houston Administrative Computing
Lines: 38
Distribution: world
NNTP-Posting-Host: uhad2.admin.uh.edu
X-News-Reader: VMS NEWS 1.24
In-Reply-To: donb@netcom.com's message of Tue, 20 Apr 1993 17:10:52 GMT

In <donbC5sL24.Ewu@netcom.com> donb@netcom.com writes:

> Anyway, here's how I see the Waco affair; I'd be interested in other peoples'
> interpretations...
> 
> 1. Koresh and his people were basically minding their own business.
> 2. Some weapons violations may have been committed and I wouldn't have
>    disapproved of prosecuting him for those violations.  However, I think
>    the BATF was criminal for starting negotiations with a military style
>    assault and for firing into a house where there were children and other
>    noncombatants.
> 3. I don't see they couldn't just leave a token guard on the place and wait
>    the BDs out; I don't approve of the tear gas approach and,

In [76]:
# Topic name of the first post in the four-topic subset.
newsgroups.target_names[newsgroups.target[0]]

'talk.politics.guns'

In [117]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the posts for evaluation; the fixed random_state keeps the
# split identical between runs.
data_train, data_test, target_train, target_test = train_test_split(
    newsgroups.data,
    newsgroups.target,
    test_size=0.2,
    random_state=42,
)

In [118]:
# Number of labelled posts before splitting.
newsgroups.target.shape

(2101,)

In [119]:
# Number of labels in the training portion of the split.
target_train.shape

(1680,)

In [120]:
# Number of labels in the held-out test portion of the split.
target_test.shape

(421,)

In [121]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy example: learn a vocabulary from two short sentences and list it.
vectorizer = CountVectorizer()
vectorizer.fit(['I am teapot', 'Short and stout'])
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists so the cell runs
# on both old and current releases. list() keeps the displayed value a plain list.
demo_get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
list(demo_get_names())

['am', 'and', 'short', 'stout', 'teapot']

In [122]:
# Count vocabulary words in two new sentences: words that are not in the fitted
# vocabulary (e.g. 'little') get no column, repeated words are counted.
demo_counts = vectorizer.transform(['I am a little teapot', 'Short and short']).toarray()
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists so the cell runs
# on both old and current releases.
demo_get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
pd.DataFrame(data=demo_counts, columns=list(demo_get_names()))

Unnamed: 0,am,and,short,stout,teapot
0,1,0,0,0,1
1,0,1,2,0,0


In [174]:
# Learn the vocabulary from the training posts and convert them into a
# document-by-term count matrix in one step.
tf_vectorizer = CountVectorizer()
tf_train = tf_vectorizer.fit_transform(data_train)
# Rows are training posts, columns are vocabulary terms.
tf_train.shape

(1680, 29450)

In [213]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training word counts.
clf = MultinomialNB()
clf.fit(tf_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [214]:
from sklearn import metrics

# Vectorize the held-out posts with the vocabulary already learned on the
# training set (transform only, no refit), then score the predictions against
# the true labels.
tf_test = tf_vectorizer.transform(data_test)
pred = clf.predict(tf_test)
metrics.accuracy_score(target_test, pred)

0.9738717339667459

In [215]:
# Accuracy on the training data itself, for comparison with the held-out score.
# Note: this rebinds `pred`, replacing the test-set predictions.
pred = clf.predict(tf_train)
metrics.accuracy_score(target_train, pred)

0.9964285714285714

In [237]:
def show_most_informative_features(classifier, vectorizer, categories, n=10):
    """Print the highest-weighted features of each category, one line per category.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose per-class feature weights as ``feature_log_prob_``
        (e.g. ``MultinomialNB``) or, failing that, ``coef_``. Row ``i`` of the
        weights is used for ``categories[i]``.
    vectorizer : fitted vectorizer
        Supplies the feature names through ``get_feature_names_out()`` or, on
        older scikit-learn releases, the legacy ``get_feature_names()``.
    categories : sequence of str
        Category names, in the same order as the classifier's weight rows.
    n : int, default 10
        Number of features to list per category. Values larger than the
        vocabulary list every feature; ``0`` lists none.

    Each printed line has the form ``"<category>: <feat>, <feat>, ..."`` with
    features in ascending order of weight (strongest last).
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
    if hasattr(vectorizer, "get_feature_names_out"):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    names = np.asarray(names)

    # MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1;
    # feature_log_prob_ carries the per-class weights. Estimators without it
    # (e.g. linear models) still go through coef_.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_

    for i, c in enumerate(categories):
        order = np.argsort(weights[i])
        # Slice from an explicit start index: a plain [-n:] would return
        # everything for n == 0.
        top = order[max(len(order) - n, 0):]
        print("%s: %s" % (c, ", ".join(names[top])))

In [238]:
# Top-weighted words per topic for the classifier trained on the raw counts.
show_most_informative_features(clf, tf_vectorizer, newsgroups.target_names)

comp.graphics: edu, from, it, for, is, in, of, and, to, the
rec.autos: for, you, that, is, it, in, and, of, to, the
talk.politics.guns: for, it, you, is, that, in, and, of, to, the
talk.religion.misc: not, it, you, is, in, that, and, to, of, the


In [239]:
# Repeat vectorization and training with the built-in English stop-word list
# enabled, then list the top-weighted words per topic again.
# Note: `tf_vectorizer`, `tf_train` and `clf` are rebound here; `tf_test` from
# the earlier evaluation cell was built with the previous vocabulary.
tf_vectorizer = CountVectorizer(stop_words='english')
tf_train = tf_vectorizer.fit_transform(data_train)
clf = MultinomialNB().fit(tf_train, target_train)
show_most_informative_features(clf, tf_vectorizer, newsgroups.target_names)

comp.graphics: host, posting, university, image, com, organization, subject, graphics, lines, edu
rec.autos: just, cars, writes, article, organization, lines, subject, com, car, edu
talk.politics.guns: guns, article, writes, organization, lines, subject, people, com, gun, edu
talk.religion.misc: article, writes, lines, organization, jesus, subject, people, god, com, edu


In [233]:
def predict(classifier, vectorizer, categories, doc):
    """Return the category name the classifier assigns to one raw document.

    ``doc`` is vectorized as a one-element batch, the classifier's first (and
    only) predicted label is taken, and that label is used as an index into
    ``categories``.
    """
    features = vectorizer.transform([doc])
    label_index = classifier.predict(features)[0]
    return categories[label_index]

# Classify a short ad-hoc sentence with the current classifier and vectorizer.
# NOTE(review): every word in this sentence shows up among the top features of the
# raw-count model and none survive once stop words are removed; if the stop-word
# vectorizer is the active one, the input is presumably all zeros and the
# prediction reflects class priors only - confirm.
predict(clf, tf_vectorizer, newsgroups.target_names, 'it is for you')

'rec.autos'

In [None]:
#import pyLDAvis
#import pyLDAvis.sklearn
#pyLDAvis.enable_notebook()