### Load text dataset

In [1]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 newsgroups corpus; no category filter
# is given here, so every newsgroup is included.
newsgroups_train = fetch_20newsgroups(subset='train')

In [2]:
from pprint import pprint
# Pretty-print the newsgroup (category) names of the loaded split.
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [3]:
# Size of the filenames array for the training split.
newsgroups_train.filenames.shape

(11314,)

In [4]:
# Size of the label array; it is expected to match the number of filenames.
newsgroups_train.target.shape

(11314,)

In [5]:
# Peek at the first ten labels. They are integer codes
# (presumably indices into target_names -- confirm against the sklearn docs).
newsgroups_train.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

### Converting text into numeric vectors

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Work with a four-category subset of the corpus from here on.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# NOTE: this rebinds newsgroups_train to the four-category training subset.
newsgroups_train = fetch_20newsgroups(categories=categories, subset='train')
# Fit the TF-IDF vectorizer on the training posts and build the sparse
# document-term matrix in one step.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 34118)

In [7]:
# Average number of stored (non-zero) entries per row of the sparse matrix,
# i.e. per document -- a measure of how sparse the TF-IDF vectors are.
vectors.nnz / float(vectors.shape[0])

159.0132743362832

In [8]:
# Alternative: load an already-vectorized version of the training split
# directly, instead of vectorizing the raw text ourselves.
from sklearn.datasets import fetch_20newsgroups_vectorized
newsgroups_train_vec = fetch_20newsgroups_vectorized(subset='train')

In [9]:
# Shape of the pre-vectorized feature matrix.
newsgroups_train_vec.data.shape

(11314, 130107)

In [10]:
# Size of the label array of the pre-vectorized dataset.
newsgroups_train_vec.target.shape

(11314,)

In [11]:
# Category names of the pre-vectorized dataset.
newsgroups_train_vec.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### Filtering text for more realistic training

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Baseline: score a multinomial naive Bayes model on the unfiltered test
# split of the same four categories.
newsgroups_test = fetch_20newsgroups(categories=categories, subset='test')
# Only transform the test posts: the vocabulary and IDF weights must come
# from the vectorizer already fitted on the training data.
vectors_test = vectorizer.transform(newsgroups_test.data)
clf = MultinomialNB(alpha=.01).fit(vectors, newsgroups_train.target)
pred = clf.predict(vectors_test)
metrics.f1_score(y_true=newsgroups_test.target, y_pred=pred, average='macro')

0.88213592402729568

In [13]:
# find the most informative features 
import numpy as np
def show_top50(classifier, vectorizer, categories, n=50):
    """Print the highest-weighted features of each category.

    One line is printed per category, in the form ``"<category>: <features>"``.
    Features are listed in ascending order of weight, so the strongest
    feature comes last.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose a per-class weight matrix, either ``feature_log_prob_``
        (naive Bayes models) or ``coef_`` (linear models), with one row per
        category.
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or, on older scikit-learn
        releases, ``get_feature_names()``.
    categories : sequence of str
        Category names, in the same order as the rows of the weight matrix.
    n : int, default 50
        Number of features to show per category.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and only fall back for old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    # MultinomialNB.coef_ was deprecated and later removed; the same
    # per-class weights are available as feature_log_prob_.
    if hasattr(classifier, "feature_log_prob_"):
        weights = classifier.feature_log_prob_
    else:
        weights = classifier.coef_

    for i, category in enumerate(categories):
        order = np.argsort(weights[i])
        # Explicit start index so that n <= 0 yields no features
        # (a plain [-0:] slice would return every feature).
        top = order[max(len(order) - n, 0):]
        print("%s: %s" % (category, " ".join(feature_names[top])))

In [14]:
# Inspect which features carry the most weight for the model trained on
# unfiltered posts (target_names supplies the label for each row).
show_top50(clf, vectorizer, newsgroups_train.target_names)

alt.atheism: all atheism don with no there by can sgi atheists livesey on so do writes re from one people an he or your they we but what if was caltech for com as have this be god are keith not edu it and in you that is of to the
comp.graphics: am at anyone would program help as polygon not need me nntp host windows posting but are file 3d there if organization files thanks with image com subject be lines university any can or have this on you that from edu in graphics it is for and of to the
sci.space: launch lines organization subject there about writes orbit article re by with will but pat or if digex not have moon they would are toronto at as gov alaska com access from this henry was be you on for nasa edu it that is in and space to of the
talk.religion.misc: article apple were re me there one no kent but do my if all by or people on his christian your from what with who was we have they jesus for be this as he sandvik are edu god com not it you in is that and to of the


In [15]:
# Re-score the classifier that was trained on unfiltered posts, this time
# on test posts with headers, footers and quoted replies removed.
newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)
# f1_score is documented as (y_true, y_pred): pass the ground truth first,
# consistent with the other evaluation cells in this notebook.
metrics.f1_score(newsgroups_test.target, pred, average='macro')

0.77310350681274775

In [16]:
# Retrain on posts with headers, footers and quoted replies removed, so the
# model is fitted on text stripped the same way as newsgroups_test.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
# Refit the vectorizer on the stripped training text before refitting the model.
vectors = vectorizer.fit_transform(newsgroups_train.data)
clf = MultinomialNB(alpha=.01).fit(vectors, newsgroups_train.target)
# The test posts must be re-transformed with the refitted vectorizer.
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)
metrics.f1_score(y_true=newsgroups_test.target, y_pred=pred, average='macro')

0.76995175184521725

In [17]:
# Inspect the top-weighted features again, now for the model retrained on
# posts without headers, footers and quotes.
show_top50(clf, vectorizer, newsgroups_train.target_names)

alt.atheism: which some atheism at say just my about by would who there think with all no don people can he on one we was so or an your but do god if they for what as have this be are not in and it you is that of to the
comp.graphics: some format about will use out need hi please windows my at does looking not am program from as know would anyone me file are image files there but be if with thanks can or any have this that on graphics you in it is for of and to the
sci.space: which could so has do more get moon orbit launch just what one some we like an about will can by there with nasa but not from if or at would have are they as this was be you on for that it space is in and of to the
talk.religion.misc: christians will an out one would about christian there so were no from people me do on all we my by if his or what who your but jesus with have was for they god as be this are he not it in you is and that to of the
