In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import networkx as nx
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to newsgroups whose subject matter I know well enough
# to judge whether the extracted topics actually make sense.
categs = [
    'alt.atheism', 'rec.autos',
    'sci.electronics', 'sci.med', 'sci.space',
    'soc.religion.christian',
    'talk.politics.guns', 'talk.politics.mideast',
]

In [3]:
# Strip headers, footers and quoted replies before modelling so that only the
# message text itself is used (keeps the comparison fair).
news_train = fetch_20newsgroups(
    categories=categs,
    remove=('headers', 'footers', 'quotes'),
)

In [4]:
# Build the tf-idf matrix from the raw posts, dropping English stop words.
# TfidfVectorizer is already imported in the notebook's first cell, so it is
# not imported a second time here.
vectorizer = TfidfVectorizer(stop_words='english')
news_train_tfidf = vectorizer.fit_transform(news_train.data)

In [5]:
# Display the matrix dimensions; presumably (documents, vocabulary terms) — confirm against the vectorizer docs.
news_train_tfidf.shape

(4561, 46431)

In [6]:
# Get the vocabulary (the terms labelling the tf-idf columns).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
# The result is kept as a plain list, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

In [7]:
# Linking words to topics
def word_topic(tfidf, solution, wordlist):
    """Compute a loading for each word on each topic/component.

    Parameters
    ----------
    tfidf : scipy sparse matrix or 2-D array
        Matrix whose transpose is multiplied with ``solution``; the callers
        in this notebook pass the tf-idf matrix (rows = documents).
    solution : 2-D array
        Per-row topic/component scores with the same number of rows as
        ``tfidf``.
    wordlist : sequence
        Labels for the rows of the result, one per column of ``tfidf``.

    Returns
    -------
    pandas.DataFrame
        ``tfidf.T @ solution`` indexed by ``wordlist``: one row per word,
        one column per topic/component.
    """
    # Use the explicit matrix-product operator. `*` is only a matrix product
    # for scipy.sparse *matrix* classes; for dense ndarrays (and sparse
    # arrays) it is element-wise, which would fail or give wrong loadings.
    words_by_topic = tfidf.T @ solution

    # Linking the loadings to the words in an easy-to-read way.
    components = pd.DataFrame(words_by_topic, index=wordlist)

    return components

# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    """Return the highest-loading words of every topic as "word loading" strings.

    Parameters
    ----------
    components : pandas.DataFrame
        One row per word (the index holds the words) and one column per topic.
    n_top_words : int
        Maximum number of words to keep for each topic.

    Returns
    -------
    pandas.Series
        Object-dtype series of strings such as ``"people 51.49"``. The index
        holds the topic's column position, repeated once per kept word, so
        ``result.loc[k]`` selects the words of topic ``k``. Within a topic
        the entries are ordered from highest to lowest loading.
    """
    topic_labels = []
    entries = []
    for column in range(components.shape[1]):
        # Sort the column so that highest loadings are at the top.
        ranked = components.iloc[:, column].sort_values(ascending=False)
        # Keep at most N rows; if there are fewer words than N the slice is
        # simply shorter instead of causing a length mismatch.
        loadings = ranked.iloc[:n_top_words].round(2).map(str)
        # Combine word and rounded loading into a single string.
        for word, loading in zip(loadings.index, loadings):
            entries.append("{} {}".format(word, loading))
            topic_labels.append(column)
    # Build the result in one go with an explicit dtype. This avoids
    # pre-allocating a NaN-filled series and assigning strings into it
    # through duplicated index labels.
    return pd.Series(entries, index=topic_labels, dtype=object)

# Settings shared by the LSA, LDA and NNMF cells below:
# how many topics to extract and how many top words to list per topic.
ntopics = 8
n_top_words = 10

### LSA (Latent Semantic Analysis)

In [8]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Seed the SVD so the LSA topics are reproducible on re-run, consistent with
# the random_state=0 used for the LDA and NMF models in this notebook.
svd = TruncatedSVD(ntopics, random_state=0)
# Reduce the tf-idf matrix to `ntopics` components, then normalise each
# document's component vector in place (copy=False).
lsa = make_pipeline(svd, Normalizer(copy=False))
news_train_lsa = lsa.fit_transform(news_train_tfidf)

# Word-by-component loadings, labelled with the vocabulary.
components_lsa = word_topic(news_train_tfidf, news_train_lsa, vocabulary)

# Start the comparison table; the LDA and NNMF columns are added by later cells.
topwords = pd.DataFrame()
topwords['LSA'] = top_words(components_lsa, n_top_words)

### LDA (Latent Dirichlet Allocation)

In [22]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Fit an LDA model with `ntopics` topics on the tf-idf matrix.
# NOTE(review): LDA is commonly fitted on raw term counts rather than tf-idf
# weights — confirm that tf-idf input is intended here.
lda = LDA(n_components=ntopics, 
          doc_topic_prior=None, # None = library default; scikit-learn documents this as 1/n_components (not 1/n_documents)
          topic_word_prior=1/ntopics, # Prior on the topic-word distribution, set explicitly to 1/ntopics
          learning_decay=0.7, # Learning-rate decay; per scikit-learn docs this applies to the online learning method
          learning_offset=10.0, # Causes earlier iterations to have less influence on the learning
          max_iter=10, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=1, 
          verbose=0, # amount of output to give while iterating
          random_state=0 # fixed seed so the fitted topics are reproducible
         )

# Per-document topic scores returned by the fitted model.
news_train_lda = lda.fit_transform(news_train_tfidf) 

# Word-by-topic loadings, labelled with the vocabulary.
components_lda = word_topic(news_train_tfidf, news_train_lda, vocabulary)

# Add the LDA column to the comparison table created in the LSA cell.
topwords['LDA']=top_words(components_lda, n_top_words)


### NNMF (Non-Negative Matrix Factorization)

In [10]:
from sklearn.decomposition import NMF

# The `alpha` keyword is intentionally not passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2 (replaced by alpha_W / alpha_H).
# The original value was 0.0, i.e. no regularisation, which is also the
# default on every version, so omitting it keeps the model unchanged while
# letting the cell run on current releases.
nmf = NMF(init='nndsvdar', # how starting values are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics,
          random_state=0, # fixed seed so the factorisation is reproducible
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # tolerance of the stopping condition
          verbose=0 # amount of output to give while iterating
         )
# Per-document component weights from the fitted factorisation.
news_train_nmf = nmf.fit_transform(news_train_tfidf)

# Word-by-component loadings, labelled with the vocabulary.
components_nmf = word_topic(news_train_tfidf, news_train_nmf, vocabulary)

# Add the NNMF column to the comparison table created in the LSA cell.
topwords['NNMF'] = top_words(components_nmf, n_top_words)

### Identified Topics

In [23]:
# Print, topic by topic, the top words found by each of the three methods.
for topic in range(ntopics):
    print('Topic {}:'.format(topic), topwords.loc[topic], sep='\n')

Topic 0:
            LSA          LDA         NNMF
0  people 51.49    like 2.21     don 2.48
0     don 48.21     don 2.16  people 2.45
0     just 47.6  people 2.13    just 2.41
0    like 45.72    just 1.99    think 2.1
0    know 43.82    know 1.88    know 2.09
0   think 43.29    good 1.79    like 1.96
0     god 38.93   think 1.76    does 1.34
0    does 36.49    does 1.73    time 1.31
0    time 33.26     car 1.72      say 1.3
0     good 32.4    time 1.44    good 1.28
Topic 1:
               LSA          LDA             NNMF
1         edu 13.1    like 2.43         geb 2.92
1       banks 11.0    just 2.23         pitt 2.9
1        geb 10.86  people 2.08          dsl 2.9
1        pitt 10.8     don 2.08     chastity 2.9
1     gordon 10.78    know 1.96        n3jxp 2.9
1   shameful 10.67   think 1.79        cadre 2.9
1  surrender 10.55    good 1.62    shameful 2.89
1   chastity 10.51    right 1.6   intellect 2.87
1      n3jxp 10.51     edu 1.58  skepticism 2.87
1        dsl 10.51     car 1.5

In [12]:
# List the newsgroup category names of the loaded data, for comparison with the topics found above.
list(news_train.target_names)

['alt.atheism',
 'rec.autos',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast']