In [1]:
# Import packages and libraries
import numpy as np
import random as rnd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from pprint import pprint

from Semi_EM_NB import Semi_EM_MultinomialNB

In [2]:
# Fetch the 20 Newsgroups train and test subsets (documents + class labels).
# The same parts of each post are removed from both subsets.
STRIPPED_PARTS = ('headers', 'footers', 'quotes')
train_Xy = fetch_20newsgroups(subset='train', remove=STRIPPED_PARTS)
test_Xy = fetch_20newsgroups(subset='test', remove=STRIPPED_PARTS)

In [3]:
# Turn the raw documents into tf-idf vectors. The vectorizer is fitted on the
# training documents only and then applied unchanged to the test documents.
vectorizer = TfidfVectorizer()
train_docs, test_docs = train_Xy.data, test_Xy.data
train_vec = vectorizer.fit_transform(train_docs)
test_vec = vectorizer.transform(test_docs)

In [4]:
# Divide the training set into a labeled part (X_l, y_l) and a part that is
# treated as unlabeled (X_u; its true labels y_u are kept but not passed to fit).
n_train_data = train_vec.shape[0]  # number of training documents (not used in the cells shown here)
split_ratio = 0.3  # fraction of the training documents that goes to the labeled part
SEED = 42  # fixed seed so the labeled/unlabeled split is the same on every run
X_l, X_u, y_l, y_u = train_test_split(
    train_vec,
    train_Xy.target,
    train_size=split_ratio,
    random_state=SEED,
)
# Explicit formatting keeps the output identical under Python 2 and Python 3
# (a bare `print a, b` statement is a SyntaxError on Python 3).
print("%s %s" % (X_l.shape, X_u.shape))

(3394, 101631) (7920, 101631)


In [5]:
# Train the semi-supervised, EM-based Naive Bayes classifier (imported from the
# local Semi_EM_NB module) using both the labeled set (X_l, y_l) and the
# unlabeled feature matrix X_u.
# NOTE(review): alpha is presumably an additive smoothing parameter, as in
# sklearn's MultinomialNB -- confirm against Semi_EM_NB.
# NOTE(review): fit() prints progress values while it runs; they look like a
# per-iteration objective and iteration counter -- confirm in Semi_EM_NB.
clf = Semi_EM_MultinomialNB(alpha=1e-8) # semi supervised EM based Naive Bayes classifier
clf.fit(X_l, y_l, X_u)

-8851790.55769
1
-6437159.6717
2
-6436536.34232
3
-6436334.42252
4
-6436331.61876
5
-6436331.61876


<Semi_EM_NB.Semi_EM_MultinomialNB instance at 0x7f919684e440>

In [6]:
# Predict labels for the test vectors, then print the per-class
# precision / recall / f1 report against the true test labels.
pred = clf.predict(test_vec)
report = metrics.classification_report(
    test_Xy.target,
    pred,
    target_names=test_Xy.target_names,
)
print(report)

                          precision    recall  f1-score   support

             alt.atheism       0.61      0.18      0.28       319
           comp.graphics       0.66      0.52      0.58       389
 comp.os.ms-windows.misc       0.54      0.38      0.44       394
comp.sys.ibm.pc.hardware       0.53      0.53      0.53       392
   comp.sys.mac.hardware       0.81      0.39      0.52       385
          comp.windows.x       0.77      0.65      0.71       395
            misc.forsale       0.74      0.57      0.65       390
               rec.autos       0.76      0.55      0.64       396
         rec.motorcycles       0.73      0.43      0.54       398
      rec.sport.baseball       0.94      0.67      0.78       397
        rec.sport.hockey       0.88      0.85      0.86       399
               sci.crypt       0.23      0.85      0.36       396
         sci.electronics       0.58      0.42      0.49       393
                 sci.med       0.67      0.73      0.70       396
         

In [7]:
# Confusion matrix of the test predictions (true labels first, predictions second)
conf_mat = metrics.confusion_matrix(test_Xy.target, pred)
pprint(conf_mat)

array([[ 59,   1,   0,   3,   0,   0,   0,   1,   3,   1,   3,  38,   1,
          3,   9, 117,   4,  44,   8,  24],
       [  0, 204,  23,  12,   3,  29,   4,   0,   2,   1,   0,  80,   9,
          5,  12,   2,   1,   1,   0,   1],
       [  1,  26, 149,  56,   2,  26,   6,   2,   2,   0,   0,  94,   3,
          8,  10,   4,   1,   2,   2,   0],
       [  0,  14,  40, 208,  11,   5,  16,   0,   1,   0,   2,  57,  32,
          1,   1,   0,   0,   1,   3,   0],
       [  1,  13,  26,  56, 149,   5,  18,   2,   0,   0,   2,  74,  21,
          6,  11,   1,   0,   0,   0,   0],
       [  0,  24,  13,   4,   1, 257,   3,   0,   0,   0,   0,  72,   2,
          9,   5,   2,   1,   2,   0,   0],
       [  1,   2,   8,  29,  12,   2, 224,  12,   6,   4,   5,  38,  12,
          7,  13,   2,   4,   8,   1,   0],
       [  1,   1,   0,   0,   1,   0,  10, 216,  25,   2,   2,  84,  13,
          6,  11,   5,   2,  12,   3,   2],
       [  2,   1,   2,   1,   0,   2,   6,  38, 173,   0,   5,  

In [8]:
# Overall accuracy of the test predictions
accuracy = metrics.accuracy_score(test_Xy.target, pred)
print(accuracy)

0.553505045141


In [9]:
# Boolean mask marking the strictly positive entries of the unlabeled matrix;
# the cell displays the type of the resulting object.
b_w_d = (X_u > 0)
type(b_w_d)

scipy.sparse.csr.csr_matrix

In [10]:
# Multiply the boolean mask by its own transpose and display the result
b_w_d.dot(b_w_d.T)

<7920x7920 sparse matrix of type '<type 'numpy.bool_'>'
	with 56642993 stored elements in Compressed Sparse Row format>

In [11]:
# Shape of the labeled targets joined end-to-end with themselves
np.concatenate([y_l, y_l]).shape

(6788,)