In [1]:
import numpy as np
np.random.seed(42)
import random
random.seed(42)
from sklearn import datasets
import matplotlib.pyplot as plt

In [2]:
from nltk import WordNetLemmatizer
from nltk import pos_tag
import nltk
from nltk.corpus import wordnet
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
#Stop words: the built-in English stop-word collection exposed by sklearn's feature_extraction.text module.
#NOTE(review): in the visible cells this variable is only used for the count printed below;
#the CountVectorizer is configured with stop_words='english' rather than with this object.
stop_words=text.ENGLISH_STOP_WORDS
print("number of stop_words in sklearn text:%s" % len(stop_words))

number of stop_words in sklearn text:318


In [3]:
#Lemmatization: a single WordNetLemmatizer instance shared by every call to preprocessing()
lemmatizer=WordNetLemmatizer()
#POS lookup for the lemmatizer, so adjectives, verbs, nouns and adverbs are each lemmatized with their own rules
def get_wordnet_pos(word):
    """Return the WordNet POS constant to hand to ``lemmatize()`` for ``word``.

    The word is tagged on its own (``nltk.pos_tag`` receives a one-element
    list), and the upper-cased first letter of the resulting tag selects the
    constant: J -> ADJ, N -> NOUN, V -> VERB, R -> ADV. Any other tag falls
    back to NOUN.
    """
    pos_label = nltk.pos_tag([word])[0][1]
    initial = pos_label[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns.
    return wordnet.NOUN

#Preprocessing: drop symbols and digits, turn punctuation into spaces, then lemmatize each token
exclude_sign="~#$%^&*(){}[]<>|+=1234567890"  # characters removed outright
replace_sign="@,.?!-;"  # characters replaced by a single space
def preprocessing(data):
    """Clean and lemmatize every document in ``data``.

    For each document, characters listed in ``exclude_sign`` are deleted and
    characters listed in ``replace_sign`` become a space. The cleaned text is
    split with ``nltk.word_tokenize`` and each token is lemmatized with the
    POS returned by ``get_wordnet_pos``.

    Returns a list with one string per document, in input order, made of the
    lemmatized tokens joined by single spaces.
    """
    # One translation table stands in for the two character-by-character
    # replace passes: the two sets are disjoint and neither replacement ("" or
    # " ") yields a character that belongs to either set, so a single pass
    # gives the same text.
    cleanup_table = str.maketrans(replace_sign, " " * len(replace_sign), exclude_sign)

    def lemmatize_document(document):
        tokens = nltk.word_tokenize(document.translate(cleanup_table))
        return " ".join(
            lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens
        )

    return [lemmatize_document(document) for document in data]
#Vectorizer configuration (nothing is fitted here): bag-of-words counts with sklearn's
#built-in 'english' stop words removed and min_df=3 as the minimum document-frequency cut-off.

vectorizer=CountVectorizer(stop_words='english',min_df=3)

In [4]:
#Load the 20 Newsgroups posts for two topic groups: computer technology vs. recreation
categories_computer = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
]
categories_rec = [
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
]


def _fetch_posts(subset, categories):
    """Return the raw post texts (the ``.data`` field) for ``categories`` in ``subset``."""
    # NOTE(review): shuffle=True with random_state=None -- the shuffle order
    # presumably follows the global NumPy seed set in the first cell; confirm.
    bunch = datasets.fetch_20newsgroups(
        subset=subset, categories=categories, shuffle=True, random_state=None
    )
    return bunch.data


# Same call order as before: computer train/test first, then recreation train/test.
computer_train = _fetch_posts('train', categories_computer)
computer_test = _fetch_posts('test', categories_computer)
recreation_train = _fetch_posts('train', categories_rec)
recreation_test = _fetch_posts('test', categories_rec)

In [5]:
#Assemble the corpora and labels. Within each split the computer posts come first,
#followed by the recreation posts; labels are 1 = computer, 0 = recreation.
X_train = computer_train + recreation_train
X_test = computer_test + recreation_test
Y_train = [1 for _ in computer_train] + [0 for _ in recreation_train]
Y_test = [1 for _ in computer_test] + [0 for _ in recreation_test]

#X_overall is the full corpus: every training post followed by every test post.
X_overall = X_train + X_test
print(np.shape(X_overall))

(7882,)


In [6]:
#Lemmatization: clean and lemmatize every document of X_overall.
#NOTE: despite its name, processed_train_data holds the combined train+test corpus, not only the training split.
processed_train_data=preprocessing(X_overall)

#Vectorization: fit the CountVectorizer on the processed corpus and build the document-term count matrix
data_vec=vectorizer.fit_transform(processed_train_data)

#TF-IDF: re-weight the raw term counts
from sklearn.feature_extraction.text import TfidfTransformer
TdT=TfidfTransformer()
data_vec_ti=TdT.fit_transform(data_vec)





In [7]:
#Dimensionality reduction of the TF-IDF matrix to 50 components, once with LSI and once with NMF
from sklearn.decomposition import NMF, TruncatedSVD

#LSI: truncated SVD of the TF-IDF matrix
transformer = TruncatedSVD(n_components=50)
data_svd = transformer.fit_transform(data_vec_ti)
print('The shape of dimensionality reduced feature map of LSI')
print(data_svd.shape)

#NMF: non-negative matrix factorization of the same matrix
nmf = NMF(n_components=50)
data_nmf = nmf.fit_transform(data_vec_ti)
print('The shape of dimensionality reduced feature map of NMF')
print(data_nmf.shape)

The shape of dimensionality reduced feature map of LSI
(7882, 50)
The shape of dimensionality reduced feature map of NMF
(7882, 50)


In [9]:
#Reconstruction error: how far the 50-component LSI approximation is from the TF-IDF matrix, next to NMF's own figure
# Map the reduced representation back to term space (documents x terms).
xhat=np.dot(data_svd,transformer.components_)

# Dense forms of the TF-IDF matrix: data_mat is what todense() returns,
# data_ar is the same data obtained through getA().
data_mat=data_vec_ti.todense()
data_ar=data_mat.getA()

dif=xhat-data_mat
error=0  # NOTE(review): not read anywhere in this cell; kept in case a later cell relies on it

# Frobenius norm of the residual. Mathematically identical to
# sqrt(trace(dif.T . dif)), but computed straight from the entries, so the
# (n_terms x n_terms) product is never materialised just to read its diagonal.
error_lsi=np.linalg.norm(dif,'fro')
print('LSI error:')
print(error_lsi)

# NMF's reconstruction error as reported by the fitted estimator itself.
nmf_div=nmf.reconstruction_err_
print('NMF error:')
print(nmf_div)

LSI error:
83.38413069454103
NMF error:
83.66398349177325
