In [None]:
# example from https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\w+')
p_stemmer = PorterStemmer()
en_stop = []

# create sample documents: doc_set = dataset
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health." 
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# tokenization and preprocessing
texts = []
# loop through document list
for i in doc_set:
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    # add tokens to list
    texts.append(stemmed_tokens)
for i in range(0,len(texts)):
  print(texts[i])

['brocolli', 'is', 'good', 'to', 'eat', 'my', 'brother', 'like', 'to', 'eat', 'good', 'brocolli', 'but', 'not', 'my', 'mother']
['my', 'mother', 'spend', 'a', 'lot', 'of', 'time', 'drive', 'my', 'brother', 'around', 'to', 'basebal', 'practic']
['some', 'health', 'expert', 'suggest', 'that', 'drive', 'may', 'caus', 'increas', 'tension', 'and', 'blood', 'pressur']
['i', 'often', 'feel', 'pressur', 'to', 'perform', 'well', 'at', 'school', 'but', 'my', 'mother', 'never', 'seem', 'to', 'drive', 'my', 'brother', 'to', 'do', 'better']
['health', 'profession', 'say', 'that', 'brocolli', 'is', 'good', 'for', 'your', 'health']


In [None]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.dictionary.Dictionary(texts)
dictionary.id2token = { v:k for k, v in dictionary.token2id.items()}
import pandas as pd
df = pd.DataFrame(columns=["word", "id", "doc-freq"])
df["word"] = dictionary.token2id.keys()
df["id"] = dictionary.token2id.values()
df["doc-freq"] = [dictionary.dfs[id] for id in dictionary.token2id.values()]
df

Unnamed: 0,word,id,doc-freq
0,brocolli,0,2
1,brother,1,3
2,but,2,2
3,eat,3,1
4,good,4,2
5,is,5,2
6,like,6,1
7,mother,7,3
8,my,8,3
9,not,9,1


In [None]:
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]
print(texts[0])
print(corpus[0])

['brocolli', 'is', 'good', 'to', 'eat', 'my', 'brother', 'like', 'to', 'eat', 'good', 'brocolli', 'but', 'not', 'my', 'mother']
[(0, 2), (1, 1), (2, 1), (3, 2), (4, 2), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 2)]


In [None]:
# generate LDA model
# corpur can be doc2bow, doc2tfidf?
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

# when choose n_topics = 2
print(ldamodel.print_topics(num_topics=2, num_words=4))
print("\n")
for idx, topic in ldamodel.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

# when choose n_topics = 3
print("\n")
print(ldamodel.print_topics(num_topics=3, num_words=3))

[(0, '0.075*"to" + 0.075*"my" + 0.040*"brother" + 0.040*"mother"'), (1, '0.082*"health" + 0.048*"that" + 0.045*"is" + 0.044*"good"')]


Topic: 0 Word: 0.075*"to" + 0.075*"my" + 0.040*"brother" + 0.040*"mother" + 0.040*"drive" + 0.029*"but" + 0.029*"eat" + 0.029*"pressur" + 0.029*"brocolli" + 0.029*"good"
Topic: 1 Word: 0.082*"health" + 0.048*"that" + 0.045*"is" + 0.044*"good" + 0.044*"brocolli" + 0.044*"profession" + 0.044*"for" + 0.044*"your" + 0.044*"say" + 0.015*"and"


[(0, '0.075*"to" + 0.075*"my" + 0.040*"brother"'), (1, '0.082*"health" + 0.048*"that" + 0.045*"is"')]


In [None]:
# new document
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(unseen_document.split(" "))

for index, score in sorted(ldamodel[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, ldamodel.print_topic(index, 5)))


Score: 0.6420323252677917	 Topic: 0.082*"health" + 0.048*"that" + 0.045*"is" + 0.044*"good" + 0.044*"brocolli"
Score: 0.35796764492988586	 Topic: 0.075*"to" + 0.075*"my" + 0.040*"brother" + 0.040*"mother" + 0.040*"drive"


In [None]:
# coherence score
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=ldamodel, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.46510267502360986


# **Mini-LDA-to-20NGs**

In [None]:
# other test
# download data and covert to the from that etm can understand and process
from sklearn.datasets import fetch_20newsgroups
train_data = fetch_20newsgroups(subset='train').data
test_data = fetch_20newsgroups(subset='test').data
documents = train_data
documents.extend(test_data)
print(f'Number of documents {len(documents)}')

Number of documents 18846


In [None]:
documents[:10]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [None]:
# preprocessing
# tokenization and preprocessing
texts = []
# loop through document list
for i in documents:
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in set(stopwords.words('english'))]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    # add tokens to list
    texts.append(stemmed_tokens)

In [None]:
for i in range(0,len(texts[:5])):
  print(texts[i])

['lerxst', 'wam', 'umd', 'edu', 'thing', 'subject', 'car', 'nntp', 'post', 'host', 'rac3', 'wam', 'umd', 'edu', 'organ', 'univers', 'maryland', 'colleg', 'park', 'line', '15', 'wonder', 'anyon', 'could', 'enlighten', 'car', 'saw', 'day', '2', 'door', 'sport', 'car', 'look', 'late', '60', 'earli', '70', 'call', 'bricklin', 'door', 'realli', 'small', 'addit', 'front', 'bumper', 'separ', 'rest', 'bodi', 'know', 'anyon', 'tellm', 'model', 'name', 'engin', 'spec', 'year', 'product', 'car', 'made', 'histori', 'whatev', 'info', 'funki', 'look', 'car', 'pleas', 'e', 'mail', 'thank', 'il', 'brought', 'neighborhood', 'lerxst']
['guykuo', 'carson', 'u', 'washington', 'edu', 'guy', 'kuo', 'subject', 'si', 'clock', 'poll', 'final', 'call', 'summari', 'final', 'call', 'si', 'clock', 'report', 'keyword', 'si', 'acceler', 'clock', 'upgrad', 'articl', 'shelley', '1qvfo9innc3', 'organ', 'univers', 'washington', 'line', '11', 'nntp', 'post', 'host', 'carson', 'u', 'washington', 'edu', 'fair', 'number', '

In [None]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.dictionary.Dictionary(texts)
dictionary.id2token = { v:k for k, v in dictionary.token2id.items()}
import pandas as pd
df = pd.DataFrame(columns=["word", "id", "doc-freq"])
df["word"] = dictionary.token2id.keys()
df["id"] = dictionary.token2id.values()
df["doc-freq"] = [dictionary.dfs[id] for id in dictionary.token2id.values()]
df.iloc[:50]

Unnamed: 0,word,id,doc-freq
0,15,0,1774
1,2,1,4544
2,60,2,539
3,70,3,420
4,addit,4,656
5,anyon,5,3560
6,bodi,6,682
7,bricklin,7,10
8,brought,8,394
9,bumper,9,50


In [None]:
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]
print(texts[0])
print(corpus[0])

['lerxst', 'wam', 'umd', 'edu', 'thing', 'subject', 'car', 'nntp', 'post', 'host', 'rac3', 'wam', 'umd', 'edu', 'organ', 'univers', 'maryland', 'colleg', 'park', 'line', '15', 'wonder', 'anyon', 'could', 'enlighten', 'car', 'saw', 'day', '2', 'door', 'sport', 'car', 'look', 'late', '60', 'earli', '70', 'call', 'bricklin', 'door', 'realli', 'small', 'addit', 'front', 'bumper', 'separ', 'rest', 'bodi', 'know', 'anyon', 'tellm', 'model', 'name', 'engin', 'spec', 'year', 'product', 'car', 'made', 'histori', 'whatev', 'info', 'funki', 'look', 'car', 'pleas', 'e', 'mail', 'thank', 'il', 'brought', 'neighborhood', 'lerxst']
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 5), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 2), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1

In [None]:
# generate LDA model
# corpur can be doc2bow, doc2tfidf?
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word = dictionary, passes=20)
for idx, topic in ldamodel.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.018*"com" + 0.010*"govern" + 0.007*"articl" + 0.007*"organ" + 0.007*"law" + 0.007*"state" + 0.007*"write" + 0.006*"public" + 0.006*"subject" + 0.005*"line"
Topic: 1 Word: 0.012*"use" + 0.009*"file" + 0.008*"system" + 0.007*"1" + 0.007*"imag" + 0.006*"2" + 0.006*"drive" + 0.006*"window" + 0.006*"softwar" + 0.006*"do"
Topic: 2 Word: 0.103*"_" + 0.023*"netcom" + 0.020*"__" + 0.019*"___" + 0.018*"com" + 0.016*"ohio" + 0.014*"ac" + 0.012*"state" + 0.011*"sandvik" + 0.011*"kent"
Topic: 3 Word: 0.018*"god" + 0.011*"christian" + 0.008*"one" + 0.006*"jesu" + 0.006*"say" + 0.005*"believ" + 0.005*"would" + 0.005*"homosexu" + 0.005*"bibl" + 0.005*"peopl"
Topic: 4 Word: 0.069*"x" + 0.018*"1" + 0.013*"uk" + 0.010*"newsread" + 0.008*"version" + 0.008*"tin" + 0.008*"ac" + 0.008*"use" + 0.007*"file" + 0.007*"line"
Topic: 5 Word: 0.032*"com" + 0.012*"car" + 0.011*"line" + 0.010*"subject" + 0.010*"organ" + 0.010*"write" + 0.009*"articl" + 0.006*"edu" + 0.005*"post" + 0.005*"bike"
Topic: 