Update: use NLTK to preprocess text data

In [1]:
# Import packages and libraries
import numpy as np
import random as rnd
import nltk as nk
import re

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn import metrics
from pprint import pprint
from copy import deepcopy

from Semi_EM_NB import Semi_EM_MultinomialNB
from time import time
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
# Load train and test data set with class labels 
train_Xy = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
test_Xy = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [3]:
def remove_noise(sentence):
    result = ''
    poster = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stopword_set = set(stopwords.words('english'))
    wordlist = re.sub(r"\n|(\\(.*?){)|}|[!$%^&*#()_+|~\-={}\[\]:\";'<>?,.\/\\]|[0-9]|[@]", ' ', sentence) # remove punctuation
    wordlist = re.sub('\s+', ' ', wordlist) # remove extra space
    wordlist_normal = [poster.stem(word.lower()) for word in wordlist.split()] # restore word to its original form (stemming)
    wordlist_normal = [lemmatizer.lemmatize(word, pos='v') for word in wordlist_normal] # restore word to its root form (lemmatization)
    wordlist_clean = [word for word in wordlist_normal if word not in stopword_set] # remove stopwords
    result = ' '.join(wordlist_clean)
    return result

In [4]:
# preprocess train and test text data
train_Xy.data_clean = map(remove_noise, train_Xy.data)
test_Xy.data_clean = map(remove_noise, test_Xy.data)

In [5]:
# Convert all text data into tf-idf vectors 
# vectorizer = TfidfVectorizer(stop_words='english', min_df=3, max_df=0.9)
vectorizer = TfidfVectorizer()
train_vec = vectorizer.fit_transform(train_Xy.data_clean)
test_vec = vectorizer.transform(test_Xy.data_clean)
print train_vec.shape, test_vec.shape

(11314, 55213) (7532, 55213)


In [6]:
# Divide train data set into labeled and unlabeled data sets
n_train_data = train_vec.shape[0]
split_ratio = 0.1 # labeled vs unlabeled
X_l, X_u, y_l, y_u = train_test_split(train_vec, train_Xy.target, train_size=split_ratio, stratify=train_Xy.target)
print X_l.shape, X_u.shape

(1131, 55213) (10183, 55213)


In [7]:
def cross_validation(clf, data_X, data_y, unlabeled=None, n_folds=5):
    print('=' * 80)
    print("Validation: ")
    print(clf)
    kf = StratifiedKFold(n_splits=n_folds)
    start_time = time()
    train_accuracies= list() # training accuracy
    fold_count = 1
    original_clf = deepcopy(clf)
    for train_ids, valid_ids in kf.split(data_X, data_y):
        cv_clf = deepcopy(original_clf)
        print("Fold # %d" % fold_count)
        fold_count += 1
        train_X, train_y, valid_X, valid_y = data_X[train_ids], data_y[train_ids], data_X[valid_ids], data_y[valid_ids]
        if unlabeled==None:
            cv_clf.fit(train_X, train_y)
        else:
            cv_clf.fit(train_X, train_y, unlabeled)
        pred = cv_clf.predict(valid_X)
        train_accuracies.append(metrics.accuracy_score(valid_y, pred))
    train_time = time() - start_time
    print("Validation time: %0.3f seconds" % train_time)
    print("Average training accuracy: %0.3f" % np.mean(np.array(train_accuracies)))
    return train_accuracies, train_time

In [8]:
# Cross validation for Naive Bayes classifier 
# using labeled data set only
nb_clf = MultinomialNB(alpha=1e-2)
cross_validation(nb_clf, X_l, y_l)

Validation: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
Fold # 1
Fold # 2
Fold # 3
Fold # 4
Fold # 5
Validation time: 0.231 seconds
Average training accuracy: 0.567


([0.58874458874458879,
  0.59999999999999998,
  0.58515283842794763,
  0.55111111111111111,
  0.5092592592592593],
 0.23097610473632812)

In [9]:
# Cross validation for semisupervised EM Naive Bayes classifier 
# using both labeled and unlabeled data set
em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-2) # semi supervised EM based Naive Bayes classifier
cross_validation(em_nb_clf, X_l, y_l, X_u)

Validation: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
Fold # 1
Initial expected log likelihood = -6199090.954

EM iteration #1
	Expected log likelihood = -5409398.051
EM iteration #2
	Expected log likelihood = -5404909.930
EM iteration #3
	Expected log likelihood = -5404122.379
EM iteration #4
	Expected log likelihood = -5403887.445
EM iteration #5
	Expected log likelihood = -5403840.804
EM iteration #6
	Expected log likelihood = -5403839.485
EM iteration #7
	Expected log likelihood = -5403839.485
Fold # 2
Initial expected log likelihood = -6196076.322

EM iteration #1
	Expected log likelihood = -5411278.872
EM iteration #2
	Expected log likelihood = -5401494.662
EM iteration #3
	Expected log likelihood = -5398884.683
EM iteration #4
	Expected log likelihood = -5397134.440
EM iteration #5
	Expected log likelihood = -5396168.538
EM iteration #6
	Expected log likelihood = -5396168.538
Fold # 3
Initial expected log likelihood = -6203828.985

EM iteration #1
	Expected lo

([0.61038961038961037,
  0.60434782608695647,
  0.6026200873362445,
  0.59111111111111114,
  0.58796296296296291],
 659.3714730739594)

In [10]:
# Evaluate original NB classifier using test data set
nb_clf = MultinomialNB(alpha=1e-2).fit(X_l, y_l)
pred = nb_clf.predict(test_vec)
print(metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names))
# pprint(metrics.confusion_matrix(test_Xy.target, pred))
print(metrics.accuracy_score(test_Xy.target, pred))

                          precision    recall  f1-score   support

             alt.atheism       0.50      0.32      0.39       319
           comp.graphics       0.49      0.49      0.49       389
 comp.os.ms-windows.misc       0.56      0.33      0.41       394
comp.sys.ibm.pc.hardware       0.46      0.66      0.54       392
   comp.sys.mac.hardware       0.63      0.44      0.52       385
          comp.windows.x       0.67      0.66      0.67       395
            misc.forsale       0.76      0.53      0.62       390
               rec.autos       0.68      0.63      0.65       396
         rec.motorcycles       0.42      0.59      0.49       398
      rec.sport.baseball       0.83      0.69      0.75       397
        rec.sport.hockey       0.89      0.85      0.87       399
               sci.crypt       0.39      0.80      0.52       396
         sci.electronics       0.47      0.45      0.46       393
                 sci.med       0.83      0.56      0.67       396
         

In [11]:
# Evaluate semi-supervised EM NB classifier using test data set
em_nb_clf = Semi_EM_MultinomialNB(alpha=1e-2).fit(X_l, y_l, X_u)
pred = em_nb_clf.predict(test_vec)
print(metrics.classification_report(test_Xy.target, pred, target_names=test_Xy.target_names))
# pprint(metrics.confusion_matrix(test_Xy.target, pred))
print(metrics.accuracy_score(test_Xy.target, pred))

Initial expected log likelihood = -6116325.162

EM iteration #1
	Expected log likelihood = -5407907.427
EM iteration #2
	Expected log likelihood = -5403350.415
EM iteration #3
	Expected log likelihood = -5402498.342
EM iteration #4
	Expected log likelihood = -5402217.925
EM iteration #5
	Expected log likelihood = -5402159.326
EM iteration #6
	Expected log likelihood = -5402153.228
EM iteration #7
	Expected log likelihood = -5402153.228
                          precision    recall  f1-score   support

             alt.atheism       0.63      0.11      0.18       319
           comp.graphics       0.58      0.56      0.57       389
 comp.os.ms-windows.misc       0.75      0.19      0.30       394
comp.sys.ibm.pc.hardware       0.39      0.80      0.53       392
   comp.sys.mac.hardware       0.75      0.37      0.49       385
          comp.windows.x       0.75      0.76      0.75       395
            misc.forsale       0.86      0.43      0.57       390
               rec.autos       

In [12]:
# find the most informative features 
import numpy as np
def show_topK(classifier, vectorizer, categories, K=10):
    feature_names = np.asarray(vectorizer.get_feature_names())
    for i, category in enumerate(categories):
        topK = np.argsort(classifier.coef_[i])[-K:]
        print("%s: %s" % (category, " ".join(feature_names[topK])))

In [13]:
show_topK(nb_clf, vectorizer, train_Xy.target_names, K=10) # keywords for each class by original NB classifier

alt.atheism: god believ religion think thi object atheist say wa moral
comp.graphics: thi use ani polygon cview window file imag program graphic
comp.os.ms-windows.misc: system download thi ani thank problem driver use file window
comp.sys.ibm.pc.hardware: problem mous isa motherboard use thi drive card bu system
comp.sys.mac.hardware: get mb ani quadra problem mous thank drive thi mac
comp.windows.x: motif display mit list server widget thank thi use window
misc.forsale: use trade condit cd price interest ship new offer sale
rec.autos: ani anyon would buy thi gt dealer like wa car
rec.motorcycles: front harley ride dod one know wa get motorcycl bike
rec.sport.baseball: year cub brave score wa hi thi team win game
rec.sport.hockey: main hawk wa hockey season play player nhl team game
sci.crypt: bite govern protect use chip clipper secur thi encrypt key
sci.electronics: smd one radio grind data thi would line wire use
sci.med: jxp dsl diet food medic wa diseas one msg thi
sci.space: sat

In [14]:
show_topK(em_nb_clf, vectorizer, train_Xy.target_names, K=10) # keywords for each class by semisupervised EM NB classifier

alt.atheism: religion god think object atheism atheist islam say thi moral
comp.graphics: packag window ani use thi thank program imag graphic file
comp.os.ms-windows.misc: ftp thi thank font ax program use driver file window
comp.sys.ibm.pc.hardware: mb disk thank problem ani thi scsi use card drive
comp.sys.mac.hardware: monitor mb get card drive thank thi appl simm mac
comp.windows.x: ani run program thank motif widget thi server use window
misc.forsale: manual email price new condit includ sell ship offer sale
rec.autos: one ani would buy like drive get thi wa car
rec.motorcycles: go one helmet thi dod get motorcycl wa ride bike
rec.sport.baseball: hit basebal win hi player pitch wa year team game
rec.sport.hockey: thi win nhl season player hockey wa play team game
sci.crypt: peopl wa clipper chip would govern use encrypt thi key
sci.electronics: like grind would amp power circuit one get thi use
sci.med: chastiti jxp diseas dsl pitt geb food gordon msg thi
sci.space: shuttl one mo