In [30]:
# example from https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

# Preprocessing tools shared by the cells below.
tokenizer = RegexpTokenizer(r'\w+')   # tokens are runs of word characters
p_stemmer = PorterStemmer()
# The stop-word list is intentionally left empty here, so the stop-word
# filter applied during preprocessing removes nothing.
en_stop = []

# Toy corpus: five short documents used as the dataset for the LDA demo.
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."
doc_set = [
    doc_a,
    doc_b,
    doc_c,
    doc_d,
    doc_e,
]

In [32]:
# Preprocess every document: lower-case -> tokenize -> drop stop words -> stem.
texts = []
for doc in doc_set:
    # split the lower-cased text into word tokens
    tokens = tokenizer.tokenize(doc.lower())
    # discard any token that appears in the stop-word list
    stopped_tokens = [token for token in tokens if token not in en_stop]
    # reduce each remaining token to its Porter stem
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
    texts.append(stemmed_tokens)

# Show the stemmed token list of each document, one per line.
for stemmed_doc in texts:
    print(stemmed_doc)

['brocolli', 'is', 'good', 'to', 'eat', 'my', 'brother', 'like', 'to', 'eat', 'good', 'brocolli', 'but', 'not', 'my', 'mother']
['my', 'mother', 'spend', 'a', 'lot', 'of', 'time', 'drive', 'my', 'brother', 'around', 'to', 'basebal', 'practic']
['some', 'health', 'expert', 'suggest', 'that', 'drive', 'may', 'caus', 'increas', 'tension', 'and', 'blood', 'pressur']
['i', 'often', 'feel', 'pressur', 'to', 'perform', 'well', 'at', 'school', 'but', 'my', 'mother', 'never', 'seem', 'to', 'drive', 'my', 'brother', 'to', 'do', 'better']
['health', 'profession', 'say', 'that', 'brocolli', 'is', 'good', 'for', 'your', 'health']


In [33]:
# Build the term <-> integer-id dictionary from the stemmed documents.
dictionary = corpora.dictionary.Dictionary(texts)
# Fill in the reverse (id -> token) lookup by inverting token2id.
dictionary.id2token = {token_id: token for token, token_id in dictionary.token2id.items()}
import pandas as pd
# One row per token: the token, its id, and its entry in dictionary.dfs
# (shown as the "doc-freq" column).
df = pd.DataFrame(
    {
        "word": list(dictionary.token2id.keys()),
        "id": list(dictionary.token2id.values()),
        "doc-freq": [dictionary.dfs[token_id] for token_id in dictionary.token2id.values()],
    },
    columns=["word", "id", "doc-freq"],
)
df

Unnamed: 0,word,id,doc-freq
0,brocolli,0,2
1,brother,1,3
2,but,2,2
3,eat,3,1
4,good,4,2
5,is,5,2
6,like,6,1
7,mother,7,3
8,my,8,3
9,not,9,1


In [35]:
# Encode each stemmed document as a bag-of-words vector via the dictionary;
# together these vectors form the document-term matrix used for training.
corpus = list(map(dictionary.doc2bow, texts))
# Show the first document's tokens followed by its bag-of-words encoding.
print(texts[0], corpus[0], sep="\n")

['brocolli', 'is', 'good', 'to', 'eat', 'my', 'brother', 'like', 'to', 'eat', 'good', 'brocolli', 'but', 'not', 'my', 'mother']
[(0, 2), (1, 1), (2, 1), (3, 2), (4, 2), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 2)]


In [36]:
# generate LDA model
# `corpus` is the bag-of-words output of dictionary.doc2bow.
# TODO: check whether a TF-IDF-weighted corpus would also be appropriate here.
# random_state pins the model's random initialisation so that the topics
# printed below are reproducible after "Restart Kernel -> Run All".
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus, num_topics=2, id2word=dictionary, passes=20, random_state=42
)

# show both topics of the model, 4 highest-weighted words each
print(ldamodel.print_topics(num_topics=2, num_words=4))
print("\n")
# -1 requests every topic of the model
for idx, topic in ldamodel.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

# NOTE: num_topics in print_topics only limits how many of the model's topics
# are displayed; it does not retrain anything. The model above has 2 topics,
# so asking for 3 still prints 2 (now with 3 words each). To compare against
# a 3-topic model, train a separate LdaModel with num_topics=3.
print("\n")
print(ldamodel.print_topics(num_topics=3, num_words=3))

[(0, '0.075*"to" + 0.075*"my" + 0.040*"brother" + 0.040*"mother"'), (1, '0.082*"health" + 0.048*"that" + 0.045*"is" + 0.044*"good"')]


Topic: 0 Word: 0.075*"to" + 0.075*"my" + 0.040*"brother" + 0.040*"mother" + 0.040*"drive" + 0.029*"but" + 0.029*"eat" + 0.029*"pressur" + 0.029*"brocolli" + 0.029*"good"
Topic: 1 Word: 0.082*"health" + 0.048*"that" + 0.045*"is" + 0.044*"good" + 0.044*"brocolli" + 0.044*"profession" + 0.044*"for" + 0.044*"your" + 0.044*"say" + 0.015*"and"


[(0, '0.075*"to" + 0.075*"my" + 0.040*"brother"'), (1, '0.082*"health" + 0.048*"that" + 0.045*"is"')]


In [37]:
# new document
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
# Preprocess the unseen text exactly like the training documents
# (lower-case -> regex tokenize -> stop-word filter -> Porter stem).
# The dictionary was built from lower-cased stems, so capitalized, punctuated
# or unstemmed tokens produced by a bare split(" ") cannot match its entries.
unseen_tokens = [
    p_stemmer.stem(token)
    for token in tokenizer.tokenize(unseen_document.lower())
    if token not in en_stop
]
bow_vector = dictionary.doc2bow(unseen_tokens)

# List the document's topics from highest to lowest score.
for index, score in sorted(ldamodel[bow_vector], key=lambda pair: pair[1], reverse=True):
    print("Score: {}\t Topic: {}".format(score, ldamodel.print_topic(index, 5)))


Score: 0.6420323252677917	 Topic: 0.082*"health" + 0.048*"that" + 0.045*"is" + 0.044*"good" + 0.044*"brocolli"
Score: 0.35796764492988586	 Topic: 0.075*"to" + 0.075*"my" + 0.040*"brother" + 0.040*"mother" + 0.040*"drive"


In [39]:
# Topic coherence of the trained LDA model, using the 'c_v' measure.
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
)
# Compute the coherence value and report it.
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.46510267502360986
