### Load text dataset

In [1]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 newsgroups corpus, asking the loader to
# drop headers, footers and quoted replies from every post.
train_Xy = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

In [2]:
from pprint import pprint
# Show the newsgroup category names, one per line.
pprint(list(train_Xy.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [3]:
# Number of documents in the training split.
len(train_Xy.data)

11314

In [4]:
# The label array is 1-D with one entry per training document.
train_Xy.target.shape

(11314,)

In [5]:
# Peek at the raw text of the first three training documents.
train_Xy.data[:3]

[u'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 u"A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't 

In [6]:
# Integer class labels of the first ten training documents
# (presumably indices into train_Xy.target_names -- see the loader's docs).
train_Xy.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

### Converting text into numeric vectors

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer (default settings) on the training text and encode
# every training document as one row of the resulting matrix.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(train_Xy.data)
# (number of documents, number of vocabulary terms)
vectors.shape

(11314, 101631)

In [8]:
# Average number of non-zero entries per document row. Compared with the
# number of columns in `vectors`, this shows that each row is very sparse.
vectors.nnz / float(vectors.shape[0])  # float() forces true division on Python 2

97.54525366802191

In [9]:
# Category names again, for reference (same list as pretty-printed earlier).
train_Xy.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [10]:
# Load the held-out test split with the same `remove` filter as the training
# split, so both sets contain the same kind of text.
test_Xy = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
# Number of test documents.
len(test_Xy.data)

7532

In [11]:
# Encode the test documents with the vectorizer that was already fitted on the
# training text: transform() only, no refit, so the column count matches the
# training matrix.
test_vec = vectorizer.transform(test_Xy.data)
test_vec.shape

(7532, 101631)

### Filtering text for more realistic training

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Train a multinomial naive Bayes model (alpha = 0.01) on the TF-IDF training
# matrix; fit() returns the estimator itself, so construction and fitting chain.
clf = MultinomialNB(alpha=0.01).fit(vectors, train_Xy.target)
pred = clf.predict(test_vec)
# Macro-averaged F1 of the predictions on the held-out test split.
metrics.f1_score(y_true=test_Xy.target, y_pred=pred, average='macro')

0.68286112952505695

In [13]:
# find the most informative features 
import numpy as np
def show_top50(classifier, vectorizer, categories, n=50):
    """Print the ``n`` highest-weighted features for each category.

    One line is printed per category in the form ``"<category>: <features>"``,
    with the features separated by spaces and listed in ascending weight
    order (the strongest feature comes last).

    Parameters
    ----------
    classifier : fitted estimator
        Must expose per-class feature weights as ``coef_`` or, failing that,
        ``feature_log_prob_``; row ``i`` is used for ``categories[i]``.
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older
        ``get_feature_names()``.
    categories : sequence of str
        Category names, in the same order as the classifier's weight rows.
    n : int, optional
        Number of features to print per category (default 50). Values larger
        than the vocabulary print every feature; values <= 0 print none.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    # Naive Bayes estimators lost their coef_ alias in scikit-learn 1.1;
    # feature_log_prob_ carries the per-class weights there.
    weights = getattr(classifier, "coef_", None)
    if weights is None:
        weights = classifier.feature_log_prob_

    for i, category in enumerate(categories):
        order = np.argsort(weights[i])
        # Explicit start index: a plain [-n:] would return everything for n == 0.
        top = order[max(len(order) - n, 0):]
        print("%s: %s" % (category, " ".join(feature_names[top])))

In [14]:
# Print the 50 highest-weighted terms per newsgroup. Note that many of the
# top terms are generic words ("the", "to", "of", "and") shared by all groups.
show_top50(clf, vectorizer, train_Xy.target_names)

alt.atheism: islam which some just say with about religion would who by all there think no atheism can don one on he people or was we an so your but if they do for what have as god this be are not in and it you is that of to the
comp.graphics: ftp do out some about software need hi at please does looking not am format 3d from as know anyone would me program file are there files with but if be thanks image or can any have this on that you in graphics it is for of and to the
comp.os.ms-windows.misc: am does cica at get as version know me anyone from will using ftp program problem are card any ax thanks not be use if drivers there driver my or but files this can on dos with have that in file of you for and is it to windows the
comp.sys.ibm.pc.hardware: all dos one what get does there at do me isa drives has monitor would an system as are disk but not thanks pc be if any or ide can controller bus this in my on card you that have with scsi for of drive is it and to the
comp.sys.mac.hardware