In [1]:
# Import packages and libraries
import numpy as np
import random as rnd
import nltk as nk
import re

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn import metrics
from pprint import pprint

from Semi_EM_NB import Semi_EM_MultinomialNB
from time import time
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk import PorterStemmer

In [2]:
# Load train and test data set with class labels 
train_Xy = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test_Xy = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [3]:
def remove_noise(sentence):
    result = ''
    poster = PorterStemmer()
    stopword_set = set(stopwords.words('english'))
    wordlist = re.sub(r"\n|(\\(.*?){)|}|[!$%^&*#()_+|~\-={}\[\]:\";'<>?,.\/\\]|[0-9]|[@]", ' ', sentence) # remove punctuation
    wordlist = re.sub('\s+', ' ', wordlist) # remove extra space
    wordlist_normal = [poster.stem(word.lower()) for word in wordlist.split()] # restore word to its root form
    wordlist_clean = [word for word in wordlist_normal if word not in stopword_set] # remove stopwords
    result = ' '.join(wordlist_clean)
    return result

In [4]:
# preprocess train and test text data
train_Xy.data_clean = map(remove_noise, train_Xy.data)
test_Xy.data_clean = map(remove_noise, test_Xy.data)

In [5]:
# Convert all text data into tf-idf vectors 
# vectorizer = TfidfVectorizer(stop_words='english', min_df=3, max_df=0.9)
vectorizer = TfidfVectorizer()
train_vec = vectorizer.fit_transform(train_Xy.data_clean)
test_vec = vectorizer.transform(test_Xy.data_clean)
print train_vec.shape, test_vec.shape

(11314, 55445) (7532, 55445)


In [6]:
# Divide train data set into labeled and unlabeled data sets
n_train_data = train_vec.shape[0]
split_ratio = 0.5 # labeled vs unlabeled
X_l, X_u, y_l, y_u = train_test_split(train_vec, train_Xy.target, train_size=split_ratio, stratify=train_Xy.target)
print X_l.shape, X_u.shape

(5657, 55445) (5657, 55445)


In [7]:
def cross_validation(clf, data_X, data_y, unlabeled=None, n_folds=5):
    print('=' * 80)
    print("Validation: ")
    print(clf)
    kf = StratifiedKFold(n_splits=n_folds)
    start_time = time()
    train_accuracies= list() # training accuracy
    fold_count = 1
    for train_ids, valid_ids in kf.split(data_X, data_y):
        print("Fold # %d" % fold_count)
        fold_count += 1
        train_X, train_y, valid_X, valid_y = data_X[train_ids], data_y[train_ids], data_X[valid_ids], data_y[valid_ids]
        if unlabeled==None:
            clf.fit(train_X, train_y)
        else:
            clf.fit(train_X, train_y, unlabeled)
        pred = clf.predict(valid_X)
        train_accuracies.append(metrics.accuracy_score(valid_y, pred))
    train_time = time() - start_time
    print("Validation time: %0.3f seconds" % train_time)
    print("Average training accuracy: %0.3f" % np.mean(np.array(train_accuracies)))
    return train_accuracies, train_time

In [8]:
# Cross validation for Naive Bayes classifier 
# using labeled data set only
nb_clf = MultinomialNB(alpha=1)
cross_validation(nb_clf, X_l, y_l)

Validation: 
MultinomialNB(alpha=1, class_prior=None, fit_prior=True)
Fold # 1
Fold # 2
Fold # 3
Fold # 4
Fold # 5
Validation time: 0.342 seconds
Average training accuracy: 0.663


([0.66929824561403506,
  0.66402814423922607,
  0.66843501326259946,
  0.64742451154529312,
  0.66429207479964381],
 0.34151387214660645)

In [9]:
# Cross validation for semisupervised EM Naive Bayes classifier 
# using both labeled and unlabeled data set
em_nb_clf = Semi_EM_MultinomialNB(alpha=1) # semi supervised EM based Naive Bayes classifier
cross_validation(em_nb_clf, X_l, y_l, X_u)

Validation: 
MultinomialNB(alpha=1, class_prior=None, fit_prior=True)
Fold # 1
Initial expected log likelihood = -3820685.485

EM iteration #1
	Expected log likelihood = -3626889.086
EM iteration #2
	Expected log likelihood = -3457240.369
EM iteration #3
	Expected log likelihood = -3339599.120
EM iteration #4
	Expected log likelihood = -3331276.741
EM iteration #5
	Expected log likelihood = -3331211.154
EM iteration #6
	Expected log likelihood = -3331211.154
Fold # 2
Initial expected log likelihood = -3820368.239

Fold # 3
Initial expected log likelihood = -3819610.726

EM iteration #1
	Expected log likelihood = -3628008.032
EM iteration #2
	Expected log likelihood = -3462848.680
EM iteration #3
	Expected log likelihood = -3342948.738
EM iteration #4
	Expected log likelihood = -3331052.865
EM iteration #5
	Expected log likelihood = -3331122.336
Fold # 4
Initial expected log likelihood = -3819945.654

Fold # 5
Initial expected log likelihood = -3821841.407

Validation time: 114.253 seco

([0.052631578947368418,
  0.66402814423922607,
  0.053050397877984087,
  0.64742451154529312,
  0.66429207479964381],
 114.25321817398071)