In [1]:
# Import packages and libraries
import numpy as np
import random as rnd
import progressbar as pgb

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn import metrics
from pprint import pprint

from Semi_EM_NB import Semi_EM_MultinomialNB
from time import time

In [2]:
# Load train and test data set with class labels 
train_Xy = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test_Xy = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [3]:
# Convert all text data into tf-idf vectors 
vectorizer = TfidfVectorizer(stop_words='english', min_df=3, max_df=0.9)
train_vec = vectorizer.fit_transform(train_Xy.data)
test_vec = vectorizer.transform(test_Xy.data)
print train_vec.shape, test_vec.shape

(11314, 26747) (7532, 26747)


In [4]:
# Divide train data set into labeled and unlabeled data sets
n_train_data = train_vec.shape[0]
split_ratio = 0.2 # labeled vs unlabeled
X_l, X_u, y_l, y_u = train_test_split(train_vec, train_Xy.target, train_size=split_ratio, stratify=train_Xy.target)
print X_l.shape, X_u.shape

(2262, 26747) (9052, 26747)


In [5]:
def cross_validation(clf, data_X, data_y, unlabeled=None, n_folds=5):
    print('=' * 80)
    print("Validation: ")
    print(clf)
    kf = StratifiedKFold(n_splits=n_folds)
    start_time = time()
    train_accuracies= list() # training accuracy
    fold_count = 1
    for train_ids, valid_ids in kf.split(data_X):
        print("Fold # %d" % fold_count)
        fold_count += 1
        train_X, train_y, valid_X, valid_y = data_X[train_ids], data_y[train_ids], data_X[valid_ids], data_y[valid_ids]
        if unlabeled==None:
            clf.fit(train_X, train_y)
        else:
            clf.fit(train_X, train_y, unlabeled)
        pred = clf.predict(valid_X)
        train_accuracies.append(metrics.accuracy_score(valid_y, pred))
    train_time = time() - start_time
    print("Validation time: %0.3f seconds" % train_time)
    print("Average training accuracy: %0.3f" % np.mean(np.array(train_accuracies)))
    return train_accuracies, train_time

In [6]:
# Cross validation for Naive Bayes classifier 
# using labeled data set only
nb_clf = MultinomialNB(alpha=1e-8)
cross_validation(nb_clf, X_l, y_l)

Validation: 
MultinomialNB(alpha=1e-08, class_prior=None, fit_prior=True)
Fold # 1
Fold # 2
Fold # 3
Fold # 4
Fold # 5
Validation time: 0.231 seconds
Average training accuracy: 0.572


([0.5938189845474614,
  0.55408388520971308,
  0.5663716814159292,
  0.56858407079646023,
  0.57743362831858402],
 0.23102092742919922)

In [7]:
# Cross validation for semisupervised EM Naive Bayes classifier 
# using both labeled and unlabeled data set
em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-8) # semi supervised EM based Naive Bayes classifier
cross_validation(em_nb_clf, X_l, y_l, X_u)

Validation: 
<Semi_EM_NB.Semi_EM_MultinomialNB instance at 0x10ab1ecb0>
Fold # 1
-6899831.75582
EM iteration #1
	Expected log likelihood = -4518620.202
EM iteration #2
	Expected log likelihood = -4517269.914
EM iteration #3
	Expected log likelihood = -4517182.618
EM iteration #4
	Expected log likelihood = -4517178.472
EM iteration #5
	Expected log likelihood = -4517179.566
Fold # 2
-6914242.55779
Fold # 3
-7017133.53385
Fold # 4
-6965772.02216
EM iteration #1
	Expected log likelihood = -4512028.455
EM iteration #2
	Expected log likelihood = -4510858.478
EM iteration #3
	Expected log likelihood = -4510832.164
EM iteration #4
	Expected log likelihood = -4510808.316
EM iteration #5
	Expected log likelihood = -4510808.316
Fold # 5
-6976630.42757
Validation time: 78.752 seconds
Average training accuracy: 0.550


([0.52538631346578368,
  0.55408388520971308,
  0.5663716814159292,
  0.52876106194690264,
  0.57743362831858402],
 78.7515640258789)