In [12]:
# Import packages and libraries
import numpy as np
import random as rnd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from pprint import pprint

from Semi_EM_NB import Semi_EM_MultinomialNB

In [13]:
# Load train and test data set with class labels 
train_Xy = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test_Xy = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [14]:
# Convert all text data into tf-idf vectors 
vectorizer = TfidfVectorizer(stop_words='english', min_df=3, max_df=0.9)
train_vec = vectorizer.fit_transform(train_Xy.data)
test_vec = vectorizer.transform(test_Xy.data)

In [15]:
# Divide train data set into labeled and unlabeled data sets
n_train_data = train_vec.shape[0]
split_ratio = 0.2 # labeled vs unlabeled
X_l, X_u, y_l, y_u = train_test_split(train_vec, train_Xy.target, train_size=split_ratio)
print X_l.shape, X_u.shape

(2262, 26747) (9052, 26747)


In [16]:
# Train Naive Bayes classifier (imported) 
# using labeled data set only
nb_clf = MultinomialNB(alpha=1e-8)
nb_clf.fit(X_l, y_l)

MultinomialNB(alpha=1e-08, class_prior=None, fit_prior=True)

In [17]:
# Train Naive Bayes classifier (imported) 
# using both labeled and unlabeled data set
em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-8) # semi supervised EM based Naive Bayes classifier
em_nb_clf.fit(X_l, y_l, X_u)
# clf.fit(train_vec, train_Xy.target, X_u)

-6641048.89471
1
-5649123.7872
2
-5648504.88052
3
-5648453.34607
4
-5648449.5753
5
-5648453.19796


<Semi_EM_NB.Semi_EM_MultinomialNB instance at 0x10df61950>

In [18]:
# Evaluate original NB classifier using test data set
pred = nb_clf.predict(test_vec)
print(metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names))
# pprint(metrics.confusion_matrix(test_Xy.target, pred))
print(metrics.accuracy_score(test_Xy.target, pred))

                          precision    recall  f1-score   support

             alt.atheism       0.41      0.33      0.36       319
           comp.graphics       0.67      0.31      0.43       389
 comp.os.ms-windows.misc       0.31      0.49      0.38       394
comp.sys.ibm.pc.hardware       0.58      0.43      0.49       392
   comp.sys.mac.hardware       0.61      0.43      0.50       385
          comp.windows.x       0.47      0.78      0.59       395
            misc.forsale       0.73      0.44      0.55       390
               rec.autos       0.65      0.54      0.59       396
         rec.motorcycles       0.78      0.36      0.49       398
      rec.sport.baseball       0.90      0.61      0.73       397
        rec.sport.hockey       0.87      0.78      0.82       399
               sci.crypt       0.51      0.66      0.58       396
         sci.electronics       0.50      0.37      0.43       393
                 sci.med       0.49      0.66      0.56       396
         

In [19]:
# Evaluate semi-supervised EM NB classifier using test data set
pred = em_nb_clf.predict(test_vec)
print(metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names))
# pprint(metrics.confusion_matrix(test_Xy.target, pred))
print(metrics.accuracy_score(test_Xy.target, pred))

                          precision    recall  f1-score   support

             alt.atheism       0.57      0.15      0.24       319
           comp.graphics       0.78      0.17      0.28       389
 comp.os.ms-windows.misc       0.44      0.34      0.38       394
comp.sys.ibm.pc.hardware       0.59      0.42      0.49       392
   comp.sys.mac.hardware       0.69      0.36      0.47       385
          comp.windows.x       0.35      0.84      0.49       395
            misc.forsale       0.82      0.35      0.49       390
               rec.autos       0.70      0.52      0.60       396
         rec.motorcycles       0.79      0.23      0.36       398
      rec.sport.baseball       0.96      0.53      0.68       397
        rec.sport.hockey       0.83      0.77      0.80       399
               sci.crypt       0.54      0.64      0.59       396
         sci.electronics       0.53      0.31      0.39       393
                 sci.med       0.54      0.67      0.60       396
         

In [20]:
# find the most informative features 
import numpy as np
def show_topK(classifier, vectorizer, categories, K=10):
    feature_names = np.asarray(vectorizer.get_feature_names())
    for i, category in enumerate(categories):
        topK = np.argsort(classifier.coef_[i])[-K:]
        print("%s: %s" % (category, " ".join(feature_names[topK])))

In [21]:
show_topK(nb_clf, vectorizer, train_Xy.target_names, K=20) # keywords for each class by original NB classifier

alt.atheism: islamic faith existence belief said just moral believe does say evidence argument atheists atheism people islam religion think don god
comp.graphics: ftp hi like advance anybody good windows cview use program code file files know does software 3d image thanks graphics
comp.os.ms-windows.misc: just mail ax ms fonts drivers files mouse using know problem thanks does use dos version printer driver file windows
comp.sys.ibm.pc.hardware: use gateway bios board speed 486 using just controller thanks monitor card drives vlb scsi memory bus isa ide drive
comp.sys.mac.hardware: cpu duo monitor 610 cards want price does thanks centris nubus lciii scsi fpu lc se problem card apple mac
comp.windows.x: use mit using file widget problem thanks mouse windows error lib display application hi r5 sun x11r5 server motif window
misc.forsale: good brand plus card phone mail package modem edu asking 50 price condition interested offer new shipping sell 00 sale
rec.autos: think got fast don taur

In [22]:
show_topK(em_nb_clf, vectorizer, train_Xy.target_names, K=20) # keywords for each class by semisupervised EM NB classifier

alt.atheism: said moral agree islam like argument does evidence atheists religion atheism just believe say morality don objective think people god
comp.graphics: help format use software gif advance windows code ftp program know does 3d image file hi files looking thanks graphics
comp.os.ms-windows.misc: memory pc program ax ftp using problem know version printer use files does card thanks drivers driver file dos windows
comp.sys.ibm.pc.hardware: floppy like ram hard vlb does pc thanks use disk motherboard drives monitor isa ide controller bus card scsi drive
comp.sys.mac.hardware: nubus just duo video centris use simms fpu lc does scsi quadra know problem card thanks monitor drive apple mac
comp.windows.x: edu x11r5 sun code does display problem like application widget know using file program use windows motif server thanks window
misc.forsale: best excellent edu mail 50 10 cd drive card price asking interested email condition new sell offer 00 shipping sale
rec.autos: drive think pri