In [1]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint

newsgroups_test = fetch_20newsgroups(subset='test')
pprint(list(newsgroups_test.target_names))
pprint(newsgroups_test.filenames.shape)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
(7532,)


In [40]:
categories = ['rec.sport.baseball', 'talk.politics.guns', 'misc.forsale']

newsgroups = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'), 
                                categories = categories)

data = newsgroups.data
len(data)

1728

In [30]:
import gensim, spacy
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import nltk

In [41]:
data_words = list(map(gensim.utils.simple_preprocess, data))

In [42]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['com', 'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

data_words = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in data_words]

data_ready = []
# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en', disable=['parser', 'ner'])
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
for sent in data_words:
    # Parse the sentence using the loaded 'en' model object `nlp`. Extract the lemma for each token and join
    doc = nlp(" ".join(sent)) 
    data_ready.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
# remove stopwords once more after lemmatization
data_ready = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in data_ready]    

In [43]:
# Create Dictionary
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)
pprint(lda_model.print_topics())

[(0,
  '0.013*"offer" + 0.013*"sale" + 0.012*"price" + 0.012*"include" + '
  '0.010*"sell" + 0.009*"drive" + 0.008*"new" + 0.008*"card" + 0.007*"ask" + '
  '0.007*"work"'),
 (1,
  '0.016*"year" + 0.013*"game" + 0.010*"last" + 0.009*"team" + 0.009*"player" '
  '+ 0.008*"play" + 0.007*"baseball" + 0.007*"win" + 0.006*"home" + '
  '0.006*"hit"'),
 (2,
  '0.014*"gun" + 0.008*"time" + 0.008*"people" + 0.007*"state" + 0.006*"bill" '
  '+ 0.005*"weapon" + 0.005*"law" + 0.004*"thing" + 0.004*"case" + '
  '0.004*"well"'),
 (3,
  '0.029*"lens" + 0.023*"weaver" + 0.013*"camera" + 0.011*"file" + '
  '0.007*"ticket" + 0.007*"ca" + 0.006*"mar" + 0.006*"picture" + '
  '0.005*"exposure" + 0.005*"film"')]


In [57]:
# from gensim.models import TfidfModel
# tfidf = TfidfModel(corpus)

# model = TfidfModel(corpus)  # fit model
# tfidf = [model[corpus[i]] for i in range(len(corpus))]
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None)  

tfidf = tfidf_vectorizer.fit_transform(corpus)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

from sklearn.decomposition import NMF
no_topics = 4

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([id2word[feature_names[i][0]]
            for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
    
print_top_words(nmf, tfidf_feature_names, n_top_words=10)

Topic #0: year time well much last way anyone thing really people
Topic #1: sale shipping offer sell include condition new interested mail manual
Topic #2: game bat first late cleveland win york anyone brave mize
Topic #3: email reply interested address look ship offer call cheap title



In [66]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
display(vis)