Update: use unlabeled samples to train the classifier with the EM algorithm.

In [1]:
# Import packages and libraries
import numpy as np
import random as rnd
import nltk as nk

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from pprint import pprint

from Semi_EM_NB import Semi_EM_MultinomialNB

In [2]:
# Load the 20 Newsgroups train and test subsets (documents plus class labels).
# The same `remove` tuple is passed for both subsets, so headers, footers and
# quoted text are excluded from every document.
train_Xy, test_Xy = (
    fetch_20newsgroups(subset=subset_name, remove=('headers', 'footers', 'quotes'))
    for subset_name in ('train', 'test')
)

In [3]:
# Convert all text data into tf-idf vectors.
# English stop words are dropped; min_df=3 ignores terms found in fewer than
# three documents and max_df=0.9 ignores terms found in more than 90% of them.
vectorizer = TfidfVectorizer(stop_words='english', min_df=3, max_df=0.9)
# Alternative without any filtering: vectorizer = TfidfVectorizer()
# The vocabulary and idf weights are fitted on the training documents only and
# then reused for the test documents, so both matrices share the same columns.
train_vec = vectorizer.fit_transform(train_Xy.data)
test_vec = vectorizer.transform(test_Xy.data)
# A formatted print() call runs under both Python 2 and Python 3 and emits the
# same text as the former Python 2-only `print a, b` statement.
print("%s %s" % (train_vec.shape, test_vec.shape))

(11314, 26747) (7532, 26747)


In [4]:
# Divide train data set into labeled and unlabeled data sets
n_train_data = train_vec.shape[0]
split_ratio = 0.5 # labeled vs unlabeled
split_seed = 0 # fixed seed so the labeled/unlabeled partition is identical on every run
# The split is stratified on the class labels. y_u holds the true labels of the
# "unlabeled" half; the semi-supervised fit below is only given X_u, not y_u.
X_l, X_u, y_l, y_u = train_test_split(
    train_vec, train_Xy.target,
    train_size=split_ratio, stratify=train_Xy.target, random_state=split_seed)
# A formatted print() call runs under both Python 2 and Python 3 and emits the
# same text as the former Python 2-only `print a, b` statement.
print("%s %s" % (X_l.shape, X_u.shape))

(5657, 26747) (5657, 26747)


In [5]:
# Supervised baseline: scikit-learn's multinomial Naive Bayes classifier,
# fitted on the labeled split only (alpha=1e-2).
# fit() returns the estimator itself, so construction and training are chained.
nb_clf = MultinomialNB(alpha=1e-2).fit(X_l, y_l)
# Echo the fitted estimator as the cell's output.
nb_clf

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [6]:
# Train the semi-supervised Naive Bayes classifier (project-local Semi_EM_NB module)
# using both the labeled split (X_l, y_l) and the unlabeled split (X_u).
# The recorded output of this cell shows an expected log likelihood reported
# before the first EM iteration and after each subsequent one.
em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-2) # semi supervised EM based Naive Bayes classifier
em_nb_clf.fit(X_l, y_l, X_u)
# Alternative training calls left by the author (not run in this notebook):
# em_nb_clf.fit_with_clustering(X_l, y_l, X_u)
# em_nb_clf.partial_fit(X_l, y_l, X_u)

Initial expected log likelihood = -2954689.899

EM iteration #1
	Expected log likelihood = -2798528.882
EM iteration #2
	Expected log likelihood = -2798484.321
EM iteration #3
	Expected log likelihood = -2798484.321


<Semi_EM_NB.Semi_EM_MultinomialNB instance at 0x117b2ac68>

In [7]:
# Score the labeled-only NB baseline on the held-out test set.
pred = nb_clf.predict(test_vec)
# Single print: the per-class precision/recall/F1 report first, then the overall
# accuracy on its own line.
# (metrics.confusion_matrix(test_Xy.target, pred) gives the confusion matrix if needed.)
print("%s\n%s" % (
    metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names),
    metrics.accuracy_score(test_Xy.target, pred),
))

                          precision    recall  f1-score   support

             alt.atheism       0.49      0.41      0.45       319
           comp.graphics       0.61      0.66      0.63       389
 comp.os.ms-windows.misc       0.61      0.54      0.57       394
comp.sys.ibm.pc.hardware       0.58      0.64      0.61       392
   comp.sys.mac.hardware       0.69      0.62      0.66       385
          comp.windows.x       0.78      0.73      0.75       395
            misc.forsale       0.75      0.70      0.72       390
               rec.autos       0.73      0.68      0.70       396
         rec.motorcycles       0.73      0.68      0.70       398
      rec.sport.baseball       0.88      0.78      0.82       397
        rec.sport.hockey       0.58      0.92      0.71       399
               sci.crypt       0.72      0.73      0.72       396
         sci.electronics       0.62      0.52      0.56       393
                 sci.med       0.82      0.74      0.78       396
         

In [8]:
# Score the semi-supervised EM NB classifier on the held-out test set.
# Note: this rebinds `pred`, replacing the baseline predictions from the previous cell.
pred = em_nb_clf.predict(test_vec)
# Single print: the per-class precision/recall/F1 report first, then the overall
# accuracy on its own line.
# (metrics.confusion_matrix(test_Xy.target, pred) gives the confusion matrix if needed.)
print("%s\n%s" % (
    metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names),
    metrics.accuracy_score(test_Xy.target, pred),
))

                          precision    recall  f1-score   support

             alt.atheism       0.55      0.41      0.47       319
           comp.graphics       0.62      0.67      0.64       389
 comp.os.ms-windows.misc       0.65      0.49      0.56       394
comp.sys.ibm.pc.hardware       0.56      0.66      0.61       392
   comp.sys.mac.hardware       0.68      0.63      0.66       385
          comp.windows.x       0.77      0.74      0.76       395
            misc.forsale       0.75      0.69      0.72       390
               rec.autos       0.72      0.70      0.71       396
         rec.motorcycles       0.75      0.70      0.73       398
      rec.sport.baseball       0.93      0.80      0.86       397
        rec.sport.hockey       0.58      0.94      0.71       399
               sci.crypt       0.73      0.73      0.73       396
         sci.electronics       0.64      0.52      0.58       393
                 sci.med       0.85      0.76      0.80       396
         

In [9]:
# find the most informative features 
import numpy as np
def show_topK(classifier, vectorizer, categories, K=10):
    """Print the K highest-weighted vocabulary terms for each category.

    One line is printed per category, formatted as ``"<category>: <terms>"``.
    Terms are listed in ascending weight order, so the strongest term for the
    category comes last.

    Parameters
    ----------
    classifier : object
        Fitted model exposing per-class feature weights as ``coef_``. If that
        attribute is missing (newer scikit-learn Naive Bayes estimators no
        longer provide it), ``feature_log_prob_`` is used instead. Row ``i``
        of the weights is taken to belong to ``categories[i]``.
    vectorizer : object
        Fitted vectorizer providing ``get_feature_names_out()`` or the older
        ``get_feature_names()``.
    categories : iterable of str
        Category names, in the same order as the classifier's weight rows.
    K : int, default 10
        Number of terms to print per category. Values larger than the
        vocabulary print every term; zero or negative values print none.

    Returns
    -------
    None
    """
    # Prefer the current scikit-learn accessor and fall back to the legacy one
    # so the helper works across library versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    feature_names = np.asarray(vocabulary)

    weights = getattr(classifier, "coef_", None)
    if weights is None:
        weights = classifier.feature_log_prob_

    # A plain `[-K:]` slice selects *everything* when K == 0, so the number of
    # terms to keep is clamped explicitly instead.
    K = max(int(K), 0)
    for i, category in enumerate(categories):
        order = np.argsort(weights[i])
        topK = order[order.size - min(K, order.size):]
        print("%s: %s" % (category, " ".join(feature_names[topK])))

In [10]:
# Keywords for each class according to the original (labeled-only) NB classifier.
# Each line lists ten terms in ascending weight order, so the strongest term is last.
show_topK(nb_clf, vectorizer, train_Xy.target_names, K=10) # keywords for each class by original NB classifier

alt.atheism: bobby say atheism bible just religion don think people god
comp.graphics: know file 3d program does software files thanks image graphics
comp.os.ms-windows.misc: using thanks driver use program drivers files dos file windows
comp.sys.ibm.pc.hardware: dos thanks monitor ide pc card bus controller scsi drive
comp.sys.mac.hardware: know lc software use problem does thanks drive apple mac
comp.windows.x: application xterm using display thanks x11r5 widget motif server window
misc.forsale: drive interested email condition price new offer shipping 00 sale
rec.autos: ford oil just don new like engine dealer cars car
rec.motorcycles: riding dog like bmw just bikes motorcycle ride dod bike
rec.sport.baseball: think pitching braves games hit runs game baseball year team
rec.sport.hockey: like year games play nhl players season hockey team game
sci.crypt: people use nsa escrow keys government chip clipper encryption key
sci.electronics: ve electronics good used amp know does circuit 

In [11]:
# Keywords for each class according to the semi-supervised EM NB classifier.
# Each line lists ten terms in ascending weight order, so the strongest term is last.
show_topK(em_nb_clf, vectorizer, train_Xy.target_names, K=10) # keywords for each class by semisupervised EM NB classifier

alt.atheism: islam say atheists religion just atheism don people think god
comp.graphics: 3d program looking file hi know image files thanks graphics
comp.os.ms-windows.misc: ftp card thanks use driver drivers files dos file windows
comp.sys.ibm.pc.hardware: monitor thanks disk pc ide controller bus scsi card drive
comp.sys.mac.hardware: quadra know monitor simms use problem thanks drive apple mac
comp.windows.x: x11r5 program windows use application thanks widget motif server window
misc.forsale: asking sell price new email condition offer shipping 00 sale
rec.autos: don ford new good dealer just like engine cars car
rec.motorcycles: don helmet like riding motorcycle just ride bikes dod bike
rec.sport.baseball: players braves pitching hit runs games game baseball team year
rec.sport.hockey: teams year nhl season players play games hockey team game
sci.crypt: people escrow use nsa keys government chip clipper encryption key
sci.electronics: electronics thanks used voltage does know lik

In [12]:
# Compare the learned class log-priors: the baseline NB classifier first, then
# the classifier held in the semi-supervised model's `clf` attribute.
# A formatted print() call runs under both Python 2 and Python 3 and emits the
# same text as the former Python 2-only `print a, b` statement.
print("%s %s" % (nb_clf.class_log_prior_, em_nb_clf.clf.class_log_prior_))

[-3.16001007 -2.96389519 -2.95367364 -2.95367364 -2.97422231 -2.94691686
 -2.96389519 -2.94691686 -2.94020542 -2.94020542 -2.93686652 -2.94355551
 -2.95367364 -2.94691686 -2.95028954 -2.93686652 -3.0311772  -2.99874192
 -3.18961054 -3.40420703] [-3.21130337 -2.98465718 -3.03853017 -2.9040767  -3.00229433 -2.93022198
 -2.99874192 -2.94860178 -2.94523477 -2.99343687 -2.70840381 -2.94355551
 -3.00585939 -2.97076807 -2.99167476 -2.80876652 -2.98291046 -2.98116678
 -3.22900294 -3.61676847]
