In [1]:
# Import packages and libraries
import numpy as np
import random as rnd

from scipy.sparse import csr_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from pprint import pprint

In [2]:
# Fetch the official train and test partitions of 20 Newsgroups (texts plus
# class labels). Headers, footers and quoted replies are stripped from every
# post, so only the message body is left as input text.
train_Xy, test_Xy = (
    fetch_20newsgroups(subset=part, remove=('headers', 'footers', 'quotes'))
    for part in ('train', 'test')
)

In [3]:
# Convert all text data into tf-idf vectors.
# Vocabulary pruning: drop English stop words, terms that occur in fewer than
# 3 documents (min_df=3), and terms that occur in more than 90% of documents
# (max_df=0.9).
vectorizer = TfidfVectorizer(stop_words='english', min_df=3, max_df=0.9)
# The vocabulary and IDF weights are learned from the training texts only ...
train_vec = vectorizer.fit_transform(train_Xy.data)
# ... and the test texts are merely projected onto that fitted vocabulary.
test_vec = vectorizer.transform(test_Xy.data)

In [4]:
# Divide train data set into labeled and unlabeled data sets.
# X_l / y_l is the portion treated as labeled; X_u is the portion treated as
# unlabeled (y_u holds its true labels, returned by the same split).
n_train_data = train_vec.shape[0]
split_ratio = 0.2 # labeled vs unlabeled
# Fixed seed: without it the partition (and every metric computed from it
# below) changes on each Restart & Run All.
split_seed = 42
X_l, X_u, y_l, y_u = train_test_split(
    train_vec, train_Xy.target,
    train_size=split_ratio, random_state=split_seed)
# Explicit formatting prints the same "(rows, cols) (rows, cols)" text under
# both Python 2 and Python 3 (a bare `print a, b` is Python-2-only syntax).
print("%s %s" % (X_l.shape, X_u.shape))

(2262, 26747) (9052, 26747)


In [5]:
# Train Naive Bayes classifier (imported from scikit-learn).
# NOTE: this fit uses ONLY the labeled portion (X_l, y_l); the unlabeled
# portion X_u is not used in this cell.
# alpha=1e-8 is the additive smoothing parameter, set very close to zero
# (i.e. almost no smoothing).
clf = MultinomialNB(alpha=1e-8)
# Left as the cell's last expression so the fitted estimator's repr is shown.
clf.fit(X_l, y_l)
# Disabled alternative: fit on the entire training set with all of its labels.
# clf.fit(train_vec, train_Xy.target)

MultinomialNB(alpha=1e-08, class_prior=None, fit_prior=True)

In [6]:
# Evaluate NB classifier using test data set.
# `pred` holds the predicted class for every test document; it is reused by
# the confusion-matrix and accuracy cells that follow.
pred = clf.predict(test_vec)
# Per-class precision / recall / F1 / support, labeled with newsgroup names.
print(metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.45      0.31      0.37       319
           comp.graphics       0.61      0.29      0.39       389
 comp.os.ms-windows.misc       0.53      0.30      0.39       394
comp.sys.ibm.pc.hardware       0.54      0.51      0.53       392
   comp.sys.mac.hardware       0.65      0.36      0.46       385
          comp.windows.x       0.43      0.77      0.56       395
            misc.forsale       0.73      0.51      0.60       390
               rec.autos       0.43      0.70      0.53       396
         rec.motorcycles       0.78      0.33      0.46       398
      rec.sport.baseball       0.85      0.62      0.71       397
        rec.sport.hockey       0.85      0.83      0.84       399
               sci.crypt       0.39      0.72      0.50       396
         sci.electronics       0.40      0.43      0.42       393
                 sci.med       0.62      0.64      0.63       396
         

In [7]:
# Confusion matrix on the test set: true labels are passed first and
# predictions second, so rows are the true classes and columns are the
# predicted classes.
pprint(metrics.confusion_matrix(test_Xy.target, pred))

array([[ 99,   1,   0,   2,   1,   1,   3,  11,   1,   1,   2,  14,   1,
         10,  18,  57,   8,  43,  19,  27],
       [  3, 113,  15,  17,   8, 110,   5,   7,   3,   2,   2,  51,  15,
         18,  10,   3,   1,   5,   1,   0],
       [  0,   9, 120,  52,   8, 118,   2,  16,   2,   1,   0,  26,   4,
          7,  16,   4,   1,   2,   5,   1],
       [  0,   7,  36, 201,  18,  42,   8,  12,   0,   0,   0,  19,  36,
          5,   4,   2,   0,   1,   1,   0],
       [  1,  10,  16,  44, 138,  33,  11,  23,   1,   0,   1,  30,  54,
          7,   7,   2,   0,   5,   2,   0],
       [  1,  14,   9,   7,   1, 306,   2,   8,   0,   4,   0,  26,   5,
          5,   6,   0,   1,   0,   0,   0],
       [  1,   7,   3,  30,  16,  15, 197,  30,   3,   0,   2,  14,  28,
         15,  15,   1,   3,   6,   3,   1],
       [  4,   1,   2,   0,   3,   7,   4, 277,   8,   5,   6,  14,  20,
          7,  12,   2,   7,  13,   4,   0],
       [  9,   3,   1,   0,   3,   5,   5,  82, 132,   7,   6,  

In [8]:
# Overall accuracy on the test set for the classifier trained above.
print(metrics.accuracy_score(test_Xy.target, pred))

0.532926181625


In [9]:
# from scipy.linalg import get_blas_funcs
# b_w_d = (X_u > 0).T.toarray()
# lp_w_c = (clf.feature_log_prob_)
# lp_d_c = get_blas_funcs("gemm", (lp_w_c, b_w_d))
# print type(lp_w_c), type(b_w_d), type(lp_d_c)
# # lp_d_c(alpha=1.0, a=lp_w_c, b=b_w_d.T, trans_a=True, trans_b=True)
# lp_d_c(alpha=1.0, a=lp_w_c, b=b_w_d).shape

In [10]:
# find the most informative features 
import numpy as np
def show_topK(classifier, vectorizer, categories, K=10):
    """Print the K highest-weighted vocabulary terms for each category.

    One line is printed per category in the form ``"<category>: t1 t2 ..."``.
    Terms are listed in ascending weight order, so the strongest term for the
    category comes last.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose per-class feature weights as ``feature_log_prob_``
        (Naive Bayes models) or ``coef_`` (e.g. linear models), indexable
        by class position.
    vectorizer : fitted vectorizer
        Must provide ``get_feature_names_out()`` or the older
        ``get_feature_names()``; its terms must line up with the weight
        columns of ``classifier``.
    categories : sequence of str
        Category names, in the same order as the classifier's classes.
    K : int, optional
        Number of terms to show per category (default 10). Values <= 0 show
        no terms; values larger than the vocabulary show every term.

    Returns
    -------
    None
    """
    # Newer scikit-learn only offers get_feature_names_out(); older releases
    # only offer get_feature_names(). Use whichever this vectorizer has.
    name_getter = getattr(vectorizer, "get_feature_names_out", None)
    if name_getter is None:
        name_getter = vectorizer.get_feature_names
    feature_names = np.asarray(name_getter())

    # Prefer the Naive Bayes per-class log-probabilities: on those models
    # `coef_` is gone from recent scikit-learn, and where it still exists it
    # drops a row for two-class problems, which breaks the per-category
    # indexing below. Estimators without feature_log_prob_ fall back to coef_.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_

    for i, category in enumerate(categories):
        order = np.argsort(weights[i])  # ascending by weight
        # A bare order[-K:] with K == 0 is order[0:], i.e. *every* feature;
        # guard so that a non-positive K really selects nothing.
        topK = order[-K:] if K > 0 else order[:0]
        print("%s: %s" % (category, " ".join(feature_names[topK])))

In [11]:
# List the 20 highest-weighted terms of the trained classifier for every
# newsgroup, using the vocabulary of the fitted tf-idf vectorizer.
show_topK(clf, vectorizer, train_Xy.target_names, K=20)

alt.atheism: talking islamic satan atheist evidence claim cheat does hillary moral freewill objective morality just people think islam don atheism god
comp.graphics: images unix code don software using video windows help looking format use know program 3d image files file thanks graphics
comp.os.ms-windows.misc: like hi drivers ms use using know microsoft problem program does mail card thanks ftp dos ax files file windows
comp.sys.ibm.pc.hardware: board irq just does computer bios 486 monitor cards isa card use pc drives thanks ide bus controller scsi drive
comp.sys.mac.hardware: mail disk lc cable fpu keyboard monitor speed problem serial c650 adb card know use thanks drive does mac apple
comp.windows.x: windows widgets mouse mit ncd error clients manager running use using display xterm does file hi thanks motif server window
misc.forsale: monitor best floppy 10 50 includes excellent asking used new sell drive price email 00 interested shipping condition offer sale
rec.autos: vw tires