In [188]:
import re
import scipy
import spacy
import numpy as np
import pandas as pd
import seaborn as sns
from os import listdir
import matplotlib.pyplot as plt
from os.path import isfile, join
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline

# Comparing LSA, LDA, and NNMF
Take the well-known 20 newsgroups dataset and use each of the methods on it. Your goal is to determine which method, if any, best reproduces the topics represented by the newsgroups. Write up a report where you evaluate each method in light of the 'ground truth'- the known source of each newsgroup post. Which works best, and why do you think this is the case?

In [213]:
#saves list of files names to loop though
atheism_file_names = ['20news-18828\\alt.atheism\\{}'.format(f) for f in listdir('20news-18828\\alt.atheism') if isfile(join('20news-18828\\alt.atheism', f))]
compGraphic_file_names = ['20news-18828\\comp.graphics\\{}'.format(f) for f in listdir('20news-18828\\comp.graphics') if isfile(join('20news-18828\\comp.graphics', f))]
lite_file_names = compGraphic_file_names[0:10] + atheism_file_names[0:10]

In [214]:
nlp = spacy.load('en')
pattern = "[-|*&#$%\?/<>()\d\\t]"
atheism_df = []
i = 0
for file in lite_file_names:
    file1_open = open(file)
    file1_content = file1_open.read()
    #removing the double-dash from all words
    file1_content=file1_content.replace('\\', '')
    file1_content=[re.sub(r'\n','',word) for word in file1_content]
    file1_content=[re.sub(pattern,'',word) for word in file1_content]
    if i < 9:
        atheism_df.append([''.join(file1_content), 'Comp Graphic'])
    else:
        atheism_df.append([''.join(file1_content), 'atheism'])
    i += 1

In [220]:
# Creating the tf-idf matrix.
vectorizer = TfidfVectorizer(stop_words='english')
emma_paras_tfidf=vectorizer.fit_transform(atheism_df[0])

terms = vectorizer.get_feature_names()

# Number of topics.
ntopics=2

# Linking words to topics
def word_topic(tfidf,solution, wordlist):
    
    # Loading scores for each word on each topic/component.
    words_by_topic=tfidf.T * solution

    # Linking the loadings to the words in an easy-to-read way.
    components=pd.DataFrame(words_by_topic,index=wordlist)
    
    return components

# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    n_topics = range(components.shape[1])
    index= np.repeat(n_topics, n_top_words, axis=0)
    topwords=pd.Series(index=index)
    for column in range(components.shape[1]):
        # Sort the column so that highest loadings are at the top.
        sortedwords=components.iloc[:,column].sort_values(ascending=False)
        # Choose the N highest loadings.
        chosen=sortedwords[:n_top_words]
        # Combine loading and index into a string.
        chosenlist=chosen.index +" "+round(chosen,2).map(str) 
        topwords.loc[column]=chosenlist
    return(topwords)

# Number of words to look at for each topic.
n_top_words = 15

In [221]:
# LSA

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

svd= TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
emma_paras_lsa = lsa.fit_transform(emma_paras_tfidf)

components_lsa = word_topic(emma_paras_tfidf, emma_paras_lsa, terms)

topwords=pd.DataFrame()
topwords['LSA']=top_words(components_lsa, n_top_words)                


In [222]:
# LDA
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda = LDA(n_topics=ntopics, 
          doc_topic_prior=None, # Prior = 1/n_documents
          topic_word_prior=1/ntopics,
          learning_decay=0.7, # Convergence rate.
          learning_offset=10.0, # Causes earlier iterations to have less influence on the learning
          max_iter=10, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=-1, # Use all available CPUs to speed up processing time.
          verbose=0, # amount of output to give while iterating
          random_state=0
         )

emma_paras_lda = lda.fit_transform(emma_paras_tfidf) 

components_lda = word_topic(emma_paras_tfidf, emma_paras_lda, terms)

topwords['LDA']=top_words(components_lda, n_top_words)




In [223]:
# NNMF

from sklearn.decomposition import NMF

nmf = NMF(alpha=0.0, 
          init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics, 
          random_state=0, 
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # model will stop if tfidf-WH <= tol
          verbose=0 # amount of output to give while iterating
         )
emma_paras_nmf = nmf.fit_transform(emma_paras_tfidf) 

components_nmf = word_topic(emma_paras_tfidf, emma_paras_nmf, terms)

topwords['NNMF']=top_words(components_nmf, n_top_words)

In [224]:
for topic in range(ntopics):
    print('Topic {}:'.format(topic))
    print(topwords.loc[topic])

Topic 0:
                   LSA                 LDA                NNMF
0            comp 0.71           navy 0.36           navy 0.39
0         graphic 0.71         lipman 0.24         lipman 0.26
0  worksinprogress 0.0  presentations 0.24  presentations 0.26
0             legs 0.0   visualization 0.2  visualization 0.22
0               dt 0.0         virtual 0.2        virtual 0.22
0            email 0.0      scientific 0.2        reality 0.22
0      engineering 0.0         reality 0.2     scientific 0.22
0         exchange 0.0           comp 0.17        seminar 0.17
0         factsnet 0.0        graphic 0.17   presentation 0.17
0              fax 0.0             dt 0.16             dt 0.17
0   fornavyrelated 0.0        seminar 0.16         robert 0.17
0            group 0.0         robert 0.16         center 0.13
0          include 0.0   presentation 0.16          oasys 0.13
0      information 0.0       bethesda 0.12       bethesda 0.13
0         internet 0.0         center 0.12    

In [7]:
from ipywidgets import IntSlider
slider = IntSlider(value=50)
slider

A Jupyter Widget

In [8]:
slider.value = 100

In [9]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

In [14]:
def f(x):
    return x**2

In [15]:
interact(f, x=10)

A Jupyter Widget

<function __main__.f>