Update: use unlabeled sample to train classifier by EM algorithm.

In [1]:
# Import packages and libraries
import numpy as np
import random as rnd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from pprint import pprint

from Semi_EM_NB import Semi_EM_MultinomialNB

In [2]:
# Load train and test data set with class labels 
train_Xy = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test_Xy = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [3]:
# Convert all text data into tf-idf vectors 
vectorizer = TfidfVectorizer(stop_words='english', min_df=3, max_df=0.9)
train_vec = vectorizer.fit_transform(train_Xy.data)
test_vec = vectorizer.transform(test_Xy.data)
print train_vec.shape, test_vec.shape

(11314, 26747) (7532, 26747)


In [4]:
# Divide train data set into labeled and unlabeled data sets
n_train_data = train_vec.shape[0]
split_ratio = 0.2 # labeled vs unlabeled
X_l, X_u, y_l, y_u = train_test_split(train_vec, train_Xy.target, train_size=split_ratio, stratify=train_Xy.target)
print X_l.shape, X_u.shape

(2262, 26747) (9052, 26747)


In [5]:
# Train Naive Bayes classifier (imported) 
# using labeled data set only
nb_clf = MultinomialNB(alpha=1e-8)
nb_clf.fit(X_l, y_l)

MultinomialNB(alpha=1e-08, class_prior=None, fit_prior=True)

In [6]:
# Train Naive Bayes classifier (imported) 
# using both labeled and unlabeled data set
em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-8) # semi supervised EM based Naive Bayes classifier
em_nb_clf.fit(X_l, y_l, X_u)
# em_nb_clf.fit_with_clustering(X_l, y_l, X_u)

Initial expected log likelihood = -6698867.611

EM iteration #1
	Expected log likelihood = -4481121.514
EM iteration #2
	Expected log likelihood = -4480017.280
EM iteration #3
	Expected log likelihood = -4479941.228
EM iteration #4
	Expected log likelihood = -4479930.903
EM iteration #5
	Expected log likelihood = -4479930.903


<Semi_EM_NB.Semi_EM_MultinomialNB instance at 0x107d71908>

In [7]:
# Evaluate original NB classifier using test data set
pred = nb_clf.predict(test_vec)
print(metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names))
# pprint(metrics.confusion_matrix(test_Xy.target, pred))
print(metrics.accuracy_score(test_Xy.target, pred))

                          precision    recall  f1-score   support

             alt.atheism       0.45      0.46      0.45       319
           comp.graphics       0.44      0.59      0.50       389
 comp.os.ms-windows.misc       0.50      0.27      0.35       394
comp.sys.ibm.pc.hardware       0.65      0.36      0.46       392
   comp.sys.mac.hardware       0.49      0.52      0.51       385
          comp.windows.x       0.70      0.59      0.64       395
            misc.forsale       0.72      0.49      0.59       390
               rec.autos       0.73      0.39      0.50       396
         rec.motorcycles       0.62      0.48      0.54       398
      rec.sport.baseball       0.87      0.60      0.71       397
        rec.sport.hockey       0.54      0.76      0.63       399
               sci.crypt       0.40      0.71      0.51       396
         sci.electronics       0.44      0.49      0.46       393
                 sci.med       0.57      0.63      0.60       396
         

In [8]:
# Evaluate semi-supervised EM NB classifier using test data set
pred = em_nb_clf.predict(test_vec)
print(metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names))
# pprint(metrics.confusion_matrix(test_Xy.target, pred))
print(metrics.accuracy_score(test_Xy.target, pred))

                          precision    recall  f1-score   support

             alt.atheism       0.54      0.29      0.37       319
           comp.graphics       0.42      0.62      0.50       389
 comp.os.ms-windows.misc       0.54      0.15      0.24       394
comp.sys.ibm.pc.hardware       0.61      0.33      0.43       392
   comp.sys.mac.hardware       0.49      0.47      0.48       385
          comp.windows.x       0.71      0.63      0.66       395
            misc.forsale       0.73      0.41      0.52       390
               rec.autos       0.77      0.43      0.56       396
         rec.motorcycles       0.69      0.33      0.45       398
      rec.sport.baseball       0.95      0.58      0.72       397
        rec.sport.hockey       0.88      0.78      0.83       399
               sci.crypt       0.33      0.76      0.46       396
         sci.electronics       0.45      0.44      0.44       393
                 sci.med       0.68      0.66      0.67       396
         

In [9]:
# find the most informative features 
import numpy as np
def show_topK(classifier, vectorizer, categories, K=10):
    feature_names = np.asarray(vectorizer.get_feature_names())
    for i, category in enumerate(categories):
        topK = np.argsort(classifier.coef_[i])[-K:]
        print("%s: %s" % (category, " ".join(feature_names[topK])))

In [10]:
show_topK(nb_clf, vectorizer, train_Xy.target_names, K=10) # keywords for each class by original NB classifier

alt.atheism: islam islamic just atheists say don atheism people religion god
comp.graphics: software vesa images 3d files thanks program file graphics image
comp.os.ms-windows.misc: use drivers problem card files driver ax file dos windows
comp.sys.ibm.pc.hardware: 486 vlb thanks board drive card controller port ide scsi
comp.sys.mac.hardware: simms fpu vram thanks monitor does drive problem apple mac
comp.windows.x: thanks color windows use application program motif widget window server
misc.forsale: new interested best brand condition asking shipping offer 00 sale
rec.autos: highway engine thanks dealer new right like make cars car
rec.motorcycles: make motorcycle dog just like helmet ride bikes dod bike
rec.sport.baseball: braves players think time cubs game team baseball year pitching
rec.sport.hockey: chicago nhl season games play detroit leafs team game hockey
sci.crypt: algorithm des crypto nsa government keys clipper encryption chip key
sci.electronics: want chip know circuit a

In [11]:
show_topK(em_nb_clf, vectorizer, train_Xy.target_names, K=10) # keywords for each class by semisupervised EM NB classifier

alt.atheism: atheism just believe morality say think objective don people god
comp.graphics: looking hi know image program files file thanks windows graphics
comp.os.ms-windows.misc: ftp driver card ax thanks drivers files file dos windows
comp.sys.ibm.pc.hardware: dos windows drives thanks ide controller bus card scsi drive
comp.sys.mac.hardware: know problem monitor does scsi thanks card apple drive mac
comp.windows.x: xterm x11r5 application thanks program widget use motif server window
misc.forsale: drive email new sell asking condition offer shipping 00 sale
rec.autos: honda know new dealer just ford like engine cars car
rec.motorcycles: good know riding like just motorcycle ride bikes dod bike
rec.sport.baseball: players hit braves runs games pitching baseball game team year
rec.sport.hockey: year league nhl season play games players hockey team game
sci.crypt: don just use people keys government chip clipper encryption key
sci.electronics: just used does output know voltage circ

In [12]:
print nb_clf.class_log_prior_, em_nb_clf.clf.class_log_prior_

[-3.15965647 -2.96183072 -2.95332003 -2.95332003 -2.97041447 -2.94488116
 -2.96183072 -2.94488116 -2.94488116 -2.94488116 -2.93651291 -2.94488116
 -2.95332003 -2.94488116 -2.95332003 -2.93651291 -3.03265677 -2.99661684
 -3.19140516 -3.40651654] [-3.1231961  -2.81317505 -3.28406272 -3.23123758 -2.9056909  -2.97422231
 -3.19823128 -3.32744302 -3.21349876 -3.16627969 -3.07230449 -2.5225518
 -2.89125601 -2.87702652 -3.00585939 -2.81170338 -3.13126066 -2.26647633
 -3.12119008 -4.22180839]
