In [1]:
# Import packages and libraries
import numpy as np
import random as rnd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from pprint import pprint

In [2]:
# Fetch the 20 Newsgroups 'train' and 'test' subsets (documents plus class
# labels), asking the loader to remove headers, footers and quotes from both.
train_Xy, test_Xy = (
    fetch_20newsgroups(subset=split, remove=('headers', 'footers', 'quotes'))
    for split in ('train', 'test')
)

In [3]:
# Turn the raw documents into tf-idf feature vectors.
# The vectorizer is fitted on the training documents only; the test documents
# are then transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
train_vec = vectorizer.fit_transform(raw_documents=train_Xy.data)
test_vec = vectorizer.transform(raw_documents=test_Xy.data)

In [4]:
# Divide train data set into labeled and unlabeled data sets
n_train_data = train_vec.shape[0]
split_ratio = 0.3 # fraction of the training documents kept as labeled data
# Fixed random_state so the labeled/unlabeled partition (and therefore every
# metric computed below) is the same on each Restart & Run All.
X_l, X_u, y_l, y_u = train_test_split(
    train_vec, train_Xy.target, train_size=split_ratio, random_state=0)
# Format first, then print(): the original `print a, b` statement is a
# SyntaxError on Python 3, while print(a, b) would print a tuple on Python 2.
# This form prints "<labeled shape> <unlabeled shape>" on both.
print("%s %s" % (X_l.shape, X_u.shape))

(3394, 101631) (7920, 101631)


In [5]:
# Train the Naive Bayes classifier using both the labeled data (X_l, y_l)
# and the unlabeled data (X_u).
# NOTE(review): Semi_EM_NB is not imported or defined in any cell visible in
# this notebook, so this cell raises NameError on a fresh kernel unless the
# class is brought into scope elsewhere -- add its import to the first cell.
# NOTE(review): alpha is presumably the additive smoothing parameter (the
# recorded output shows a MultinomialNB repr with alpha=1e-08) -- confirm
# against the Semi_EM_NB implementation.
clf = Semi_EM_NB(alpha=1e-8) # semi supervised EM based Naive Bayes classifier
clf.fit(X_l, y_l, X_u)

MultinomialNB(alpha=1e-08, class_prior=None, fit_prior=True)

In [6]:
# Predict labels for the held-out test vectors, then print the per-class
# precision / recall / f1-score / support table.
pred = clf.predict(test_vec)
print(metrics.classification_report(
    y_true=test_Xy.target,
    y_pred=pred,
    target_names=test_Xy.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.39      0.28      0.32       319
           comp.graphics       0.56      0.55      0.56       389
 comp.os.ms-windows.misc       0.48      0.24      0.32       394
comp.sys.ibm.pc.hardware       0.54      0.61      0.57       392
   comp.sys.mac.hardware       0.68      0.46      0.55       385
          comp.windows.x       0.49      0.79      0.61       395
            misc.forsale       0.78      0.58      0.66       390
               rec.autos       0.76      0.45      0.57       396
         rec.motorcycles       0.71      0.57      0.64       398
      rec.sport.baseball       0.89      0.72      0.80       397
        rec.sport.hockey       0.89      0.82      0.85       399
               sci.crypt       0.59      0.69      0.64       396
         sci.electronics       0.60      0.38      0.47       393
                 sci.med       0.63      0.74      0.68       396
         

In [7]:
# Confusion matrix of test predictions: true labels vs. predicted labels.
pprint(metrics.confusion_matrix(y_true=test_Xy.target, y_pred=pred))

array([[ 88,   2,   3,   2,   1,   4,   1,   2,   3,   3,   2,   7,   1,
         10,  19,  94,   9,  30,  10,  28],
       [  2, 214,  13,  11,  11,  71,   3,   1,   1,   1,   0,  17,   8,
          7,  17,   5,   2,   3,   2,   0],
       [  2,  40,  93,  72,   6, 106,   5,   2,   0,   0,   2,  13,   3,
          9,  30,   3,   2,   1,   3,   2],
       [  0,  18,  24, 238,  26,  26,   9,   1,   0,   0,   0,   7,  22,
          3,  15,   0,   2,   0,   1,   0],
       [  0,  15,  17,  51, 176,  30,   6,   4,   1,   0,   1,  11,  14,
         12,  34,   5,   3,   1,   2,   2],
       [  0,  27,   6,   6,   3, 311,   2,   0,   2,   0,   2,   8,   2,
          8,  13,   0,   1,   1,   2,   1],
       [  1,   8,   5,  31,  13,  15, 225,  11,   4,   4,   2,   6,  11,
          9,  26,   5,   4,   7,   2,   1],
       [  1,   4,   6,   2,   3,   4,  12, 178,  57,   3,   2,   7,  11,
         16,  55,   7,  17,   4,   4,   3],
       [  5,   6,   2,   2,   2,   4,   9,  16, 228,   5,   3,  

In [8]:
# Overall accuracy of the classifier on the test set.
print(metrics.accuracy_score(y_true=test_Xy.target, y_pred=pred))

0.575146043548
