In [0]:
# Author: Julianna Cole <jec56@njit.edu>
#         Rhea Pavithran <rp553@njit.edu>
# Class: IS 465-002
# Assignment: Classify documents using NLP


# The purpose of this program is topic extraction, which allows readers to 
# identify key terms in an article or document. This allows the reader to
# quickly determine what the content of the document is about, saving them
# time from having to read it in its entirety. Both Frobenius norm and the
# generalized Kullback-Leibler divergence are used for NMF
# (Non-negative Matrix Factorization), which is a tool for analyzing data and
# automatically extracting meaningful information. These are different methods
# for classifying text in a given document and detecting a particular topic
# for the extracted words.


# ------------------------------------------------------------------------
# The first section of this code loads the dataset, extracts data from it,
# and displays the amount of time it took to complete each method in seconds.

from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# Sizing knobs shared by the feature-extraction and model-fitting steps.
n_top_words = 20     # terms printed for each topic
n_components = 10    # topics extracted by each model
n_features = 1000    # vocabulary cap handed to the vectorizers
n_samples = 2000     # documents kept from the loaded dataset


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    One line is printed per row of ``model.components_`` in the form
    ``Topic #<index>: <term> <term> ...`` with terms ordered from the largest
    weight to the smallest, followed by a single blank line after the last
    topic.

    Args:
        model: Object exposing ``components_``, an iterable of weight rows
            that each support ``argsort()``.
        feature_names: Indexable collection mapping a column index to its term.
        n_top_words: Number of terms to print for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so reverse it to rank by descending weight
        # before keeping the leading n_top_words indices.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = (feature_names[term_idx] for term_idx in ranked)
        print(("Topic #%d: " % topic_idx) + " ".join(top_terms))
    print()


# ------------------------------------------------------------------------
# What does the code do?
# After loading the datasets and vectorizing them, the code analyzes the 
# language used while eliminating common words as well as headers, footers,
# and quoted replies. 

# Words that are common across documents are removed.
# This lets the code focus on the distinctive words in the documents
# in order to identify their common themes.

# Fetch the 20 newsgroups posts with headers, footers and quoted replies
# stripped, keep the first n_samples of them, and report the elapsed time.
print("Loading dataset...")
t0 = time()
fetch_options = dict(shuffle=True, random_state=1,
                     remove=('headers', 'footers', 'quotes'),
                     return_X_y=True)
# return_X_y=True yields a (documents, labels) pair; the labels are unused.
data, _ = fetch_20newsgroups(**fetch_options)
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))


# ------------------------------------------------------------------------
# Build the tf-idf (term frequency-inverse document frequency) matrix that
# the NMF models are fitted on. Only the fit_transform call is timed.
print("Extracting tf-idf features for NMF...")
tfidf_options = dict(max_df=0.95, min_df=2,
                     max_features=n_features,
                     stop_words='english')
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


# ------------------------------------------------------------------------
# Build the raw tf (term frequency) count matrix that the LDA (Latent
# Dirichlet Allocation) model is fitted on. Only fit_transform is timed.
print("Extracting tf features for LDA...")
tf_options = dict(max_df=0.95, min_df=2,
                  max_features=n_features,
                  stop_words='english')
tf_vectorizer = CountVectorizer(**tf_options)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()


# ------------------------------------------------------------------------
# Fit the NMF (Non-Negative Matrix Factorization) model on the tf-idf matrix
# using the default Frobenius-norm loss, then print the top terms per topic.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
try:
    nmf = NMF(n_components=n_components, random_state=1,
              alpha=.1, l1_ratio=.5)
except TypeError:
    # scikit-learn >= 1.2 removed `alpha` in favour of alpha_W / alpha_H.
    # The newer objective multiplies alpha_W by the number of features and
    # alpha_H by the number of samples, so divide by those sizes to keep the
    # same penalty strength the legacy alpha=.1 applied.
    n_docs, n_terms = tfidf.shape
    nmf = NMF(n_components=n_components, random_state=1,
              alpha_W=.1 / n_terms, alpha_H=.1 / n_docs, l1_ratio=.5)
nmf.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and fall back on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


# ------------------------------------------------------------------------
# Fit the NMF (Non-Negative Matrix Factorization) model on the tf-idf matrix
# using the generalized Kullback-Leibler divergence as the loss (which needs
# the multiplicative-update solver), then print the top terms per topic.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
try:
    nmf = NMF(n_components=n_components, random_state=1,
              beta_loss='kullback-leibler', solver='mu', max_iter=1000,
              alpha=.1, l1_ratio=.5)
except TypeError:
    # scikit-learn >= 1.2 removed `alpha` in favour of alpha_W / alpha_H.
    # The newer objective multiplies alpha_W by the number of features and
    # alpha_H by the number of samples, so divide by those sizes to keep the
    # same penalty strength the legacy alpha=.1 applied.
    n_docs, n_terms = tfidf.shape
    nmf = NMF(n_components=n_components, random_state=1,
              beta_loss='kullback-leibler', solver='mu', max_iter=1000,
              alpha_W=.1 / n_terms, alpha_H=.1 / n_docs, l1_ratio=.5)
nmf.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and fall back on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


# ------------------------------------------------------------------------
# Fit the LDA (Latent Dirichlet Allocation) model on the raw term counts
# and print the top terms per topic. Only the fit call is timed.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and fall back on older releases.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Loading dataset...
done in 12.341s.
Extracting tf-idf features for NMF...
done in 0.396s.
Extracting tf features for LDA...
done in 0.381s.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.415s.

Topics in NMF model (Frobenius norm):
Topic #0: just people don think like know time good make way really say right ve want did ll new use years
Topic #1: windows use dos using window program os drivers application help software pc running ms screen files version card code work
Topic #2: god jesus bible faith christian christ christians does heaven sin believe lord life church mary atheism belief human love religion
Topic #3: thanks know does mail advance hi info interested email anybody looking card help like appreciated information send list video need
Topic #4: car cars tires miles 00 new engine insurance price condition oil power speed good 000 brake year models used bought
Topic #5: edu soon com send university internet mit ftp 