In [22]:
!pip install icecream

Collecting icecream
  Downloading https://files.pythonhosted.org/packages/1f/c0/8e2bc1b5eab95e5155841c826b431692638c19bf04ee4cdc86b379f85150/icecream-2.1.1-py2.py3-none-any.whl
Collecting asttokens>=2.0.1 (from icecream)
  Downloading https://files.pythonhosted.org/packages/16/d5/b0ad240c22bba2f4591693b0ca43aae94fbd77fb1e2b107d54fff1462b6f/asttokens-2.0.5-py2.py3-none-any.whl
Collecting executing>=0.3.1 (from icecream)
  Downloading https://files.pythonhosted.org/packages/17/85/b84ea78f52bcb5513a790e64edc19687d8699ea6b4197f075da28547a370/executing-0.7.0-py2.py3-none-any.whl
Installing collected packages: asttokens, executing, icecream
Successfully installed asttokens-2.0.5 executing-0.7.0 icecream-2.1.1
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [90]:
from icecream import ic

import numpy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    """Print every topic of a fitted model as a header line plus its top terms.

    model         -- object exposing ``components_``; each row holds one topic's
                     per-feature weights.
    feature_names -- sequence indexed by feature position, giving the term text.
    no_top_words  -- how many of the highest-weighted terms to print per topic.

    Returns nothing; output goes to stdout.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so walk it backwards to get the heaviest terms first.
        ranked = weights.argsort()[:-no_top_words - 1:-1]
        print("Topic %d:" % (topic_idx))
        print(" ".join(feature_names[i] for i in ranked))

def get_topics(model, feature_names, no_top_words):
    """Return one space-joined string of top terms per topic.

    model         -- object exposing ``components_``; each row holds one topic's
                     per-feature weights.
    feature_names -- sequence indexed by feature position, giving the term text.
    no_top_words  -- how many of the highest-weighted terms to keep per topic.

    The returned list is ordered like ``model.components_``, so position ``k``
    describes topic ``k``.
    """
    return [
        # argsort is ascending; the reversed slice picks the heaviest terms first.
        " ".join(feature_names[i] for i in weights.argsort()[:-no_top_words - 1:-1])
        for weights in model.components_
    ]


In [2]:
# Fetch the newsgroup posts with headers, footers and quoted replies removed;
# shuffle=True together with random_state=1 keeps the document order repeatable.
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

# Upper bound on vocabulary size, passed to the vectorizer as max_features.
no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it
# exists and fall back to the old method on releases that predate it.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()



In [31]:
# Sanity-check sizes: number of documents, vocabulary entries, and idf weights.
ic(len(documents))
ic(len(tfidf_vectorizer.vocabulary_))
ic(len(tfidf_vectorizer.idf_))


ic| len(documents): 11314
ic| len(tfidf_vectorizer.vocabulary_): 1000
ic| len(tfidf_vectorizer.idf_): 1000


5

In [20]:
# Shape of the tf-idf matrix: (number of documents, number of vocabulary terms)
tfidf.shape

(11314, 1000)

In [48]:
# Vocabulary term that is mapped to the largest value in vocabulary_
max(tfidf_vectorizer.vocabulary_, key=tfidf_vectorizer.vocabulary_.get)

'young'

In [47]:
# Inspect the vocabulary: each term maps to its column index in the tf-idf matrix (not to a count)
# {k: v for k, v in sorted(tfidf_vectorizer.vocabulary_.items(), key=lambda item: item[1], reverse = True)}
tfidf_vectorizer.vocabulary_

{'young': 999,
 'york': 998,
 'yes': 997,
 'years': 996,
 'year': 995,
 'xt': 994,
 'x11': 993,
 'wrote': 992,
 'wrong': 991,
 'written': 990,
 'writing': 989,
 'write': 988,
 'wouldn': 987,
 'worth': 986,
 'world': 985,
 'works': 984,
 'working': 983,
 'worked': 982,
 'work': 981,
 'words': 980,
 'word': 979,
 'won': 978,
 'women': 977,
 'wm': 976,
 'wish': 975,
 'wire': 974,
 'windows': 973,
 'window': 972,
 'win': 971,
 'willing': 970,
 'widget': 969,
 'wide': 968,
 'white': 967,
 'went': 966,
 'weeks': 965,
 'week': 964,
 'weapons': 963,
 'ways': 962,
 'way': 961,
 'water': 960,
 'wasn': 959,
 'washington': 958,
 'war': 957,
 'wants': 956,
 'wanted': 955,
 'want': 954,
 'wait': 953,
 'w7': 952,
 'vs': 951,
 'volume': 950,
 'voice': 949,
 'view': 948,
 'video': 947,
 'vga': 946,
 'version': 945,
 've': 944,
 'various': 943,
 'van': 942,
 'values': 941,
 'value': 940,
 'usually': 939,
 'using': 938,
 'uses': 937,
 'users': 936,
 'user': 935,
 'usenet': 934,
 'useful': 933,
 'used': 9

In [54]:
# Sparse tf-idf row of the document at index 1, printed as (row, column) -> weight entries
print(tfidf[1])


  (0, 321)	0.08508020140594663
  (0, 162)	0.13642493998180574
  (0, 978)	0.13689915003677672
  (0, 103)	0.1789108565351029
  (0, 627)	0.16782173086611904
  (0, 625)	0.10206217262820534
  (0, 851)	0.1369674547870616
  (0, 584)	0.1316147324734244
  (0, 341)	0.12931381835191216
  (0, 508)	0.08326180125444764
  (0, 644)	0.15281486344818457
  (0, 837)	0.2995085066493483
  (0, 503)	0.5317344106298215
  (0, 781)	0.16296634579039537
  (0, 366)	0.33460542357935297
  (0, 546)	0.12239619292707879
  (0, 620)	0.2155941077784601
  (0, 138)	0.20055635956076379
  (0, 441)	0.2620770918816339
  (0, 83)	0.16222236317687802
  (0, 88)	0.12543203928691785
  (0, 369)	0.17310614842947136
  (0, 741)	0.12144336090596407
  (0, 670)	0.09317917351235136
  (0, 359)	0.15795055553362805


In [56]:
# LDA is a probabilistic graphical model, so it is fitted on raw term counts rather than tf-idf weights
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it
# exists and fall back to the old method on releases that predate it.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
# Inspect the vocabulary: each term maps to its column index in the count matrix (not to a count)
tf_vectorizer.vocabulary_

{'sure': 872,
 'story': 860,
 'did': 303,
 'statement': 854,
 'media': 589,
 'pro': 714,
 'israeli': 497,
 'world': 985,
 'having': 444,
 'letter': 535,
 'try': 914,
 'think': 897,
 'reason': 745,
 'report': 760,
 'clearly': 229,
 'reports': 762,
 'received': 748,
 'government': 424,
 'makes': 573,
 'away': 146,
 'look': 554,
 'jews': 502,
 'got': 422,
 'power': 703,
 'expect': 359,
 'people': 670,
 'read': 741,
 'faq': 369,
 'actually': 88,
 'accept': 83,
 'hard': 441,
 'atheism': 138,
 'need': 620,
 'little': 546,
 'faith': 366,
 'runs': 781,
 'jim': 503,
 'sorry': 837,
 'oh': 644,
 'just': 508,
 'end': 341,
 'maybe': 584,
 'start': 851,
 'new': 625,
 'newsgroup': 627,
 'alt': 103,
 'won': 978,
 'big': 162,
 'don': 321,
 'points': 691,
 'like': 540,
 'know': 516,
 'ask': 134,
 'question': 734,
 'sort': 838,
 'want': 954,
 'continue': 258,
 'israel': 496,
 'stop': 859,
 'asking': 136,
 'questions': 735,
 'work': 981,
 'bad': 149,
 'attack': 139,
 'group': 429,
 'center': 203,
 'policy

In [57]:

# Sparse term-count row of the document at index 1, printed as (row, column) -> count entries
print(tf[1])

  (0, 359)	1
  (0, 670)	1
  (0, 741)	1
  (0, 369)	1
  (0, 88)	1
  (0, 83)	1
  (0, 441)	2
  (0, 138)	1
  (0, 620)	2
  (0, 546)	1
  (0, 366)	2
  (0, 781)	1
  (0, 503)	3
  (0, 837)	2
  (0, 644)	1
  (0, 508)	1
  (0, 341)	1
  (0, 584)	1
  (0, 851)	1
  (0, 625)	1
  (0, 627)	1
  (0, 103)	1
  (0, 978)	1
  (0, 162)	1
  (0, 321)	1


In [60]:
# Number of topics each model is asked to extract.
no_topics = 20

# NMF is fitted on the tf-idf matrix.
nmf = NMF(
    n_components=no_topics,
    init='nndsvd',
    l1_ratio=.5,
    alpha=.1,
    random_state=1,
).fit(tfidf)

# LDA is fitted on the raw term-count matrix.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
).fit(tf)


In [62]:
# Shape of the NMF topic-term matrix: (number of topics, number of vocabulary terms)
nmf.components_.shape

(20, 1000)

In [91]:
# Number of highest-weighted terms to show per topic.
no_top_words = 10
# NMF
ic("=== NMF topics ===")
# Print each NMF topic's top terms, then keep the same strings in a list indexed by topic number.
display_topics(nmf, tfidf_feature_names, no_top_words)
nmf_topics = get_topics(nmf, tfidf_feature_names, no_top_words)


Topic 0:
people time right did good said say make way government
Topic 1:
window problem using server application screen display motif manager running
Topic 2:
god jesus bible christ faith believe christian christians sin church
Topic 3:
game team year games season players play hockey win league
Topic 4:
new 00 sale 10 price offer shipping condition 20 15
Topic 5:
thanks mail advance hi looking info help information address appreciated
Topic 6:
windows file files dos program version ftp ms directory running
Topic 7:
edu soon cs university ftp internet article email pub david
Topic 8:
key chip clipper encryption keys escrow government public algorithm nsa
Topic 9:
drive scsi drives hard disk ide floppy controller cd mac
Topic 10:
just ll thought tell oh little fine work wanted mean
Topic 11:
does know anybody mean work say doesn help exist program
Topic 12:
card video monitor cards drivers bus vga driver color memory
Topic 13:
like sounds looks look bike sound lot things really thing
To

In [136]:
# Number of highest-weighted terms to show per topic.
no_top_words = 10
ic("=== LDA topics ===")
# Print each LDA topic's top terms, then keep the same strings in a list indexed by topic number.
display_topics(lda, tf_feature_names, no_top_words)
lda_topics = get_topics(lda, tf_feature_names, no_top_words)


ic| '=== LDA topics ==='
Topic 0:
people gun state control right guns crime states law police
Topic 1:
time question book years did like don space answer just
Topic 2:
mr line rules science stephanopoulos title current define int yes
Topic 3:
key chip keys clipper encryption number des algorithm use bit
Topic 4:
edu com cs vs w7 cx mail uk 17 send
Topic 5:
use does window problem way used point different case value
Topic 6:
windows thanks know help db does dos problem like using
Topic 7:
bike water effect road design media dod paper like turn
Topic 8:
don just like think know people good ve going say
Topic 9:
car new price good power used air sale offer ground
Topic 10:
file available program edu ftp information files use image version
Topic 11:
ax max b8f g9v a86 145 pl 1d9 0t 34u
Topic 12:
government law privacy security legal encryption court fbi technology information
Topic 13:
card bit memory output video color data mode monitor 16
Topic 14:
drive scsi disk mac hard apple drives c

In [150]:
# Ask for a document index, then report the strongest NMF topic and the strongest
# LDA topic for that document.
docIdx = int(input("Enter doc number:"))

# ic.disable() is global and persistent: left as-is it would silence every ic(...)
# call in any cell executed afterwards. Remember the current state and restore it
# once the (muted) debug probes below have run.
ic_was_enabled = getattr(ic, "enabled", True)
ic.disable()
try:
    ic("=============================================")
    # NMF result: topic weights for the selected document's tf-idf row
    NMFresult = nmf.transform(tfidf[docIdx])
    ic(NMFresult)
    ic(NMFresult.sum())
    ic(NMFresult.max())
    # Position of the largest weight = index of the dominant NMF topic
    NMFtopicIdx = numpy.argmax(NMFresult)
    ic("=============================================")
    # LDA result: topic weights for the selected document's term-count row
    LDAresult = lda.transform(tf[docIdx])
    ic(LDAresult.sum())
    ic(LDAresult)
    ic(LDAresult.max())
    # Position of the largest weight = index of the dominant LDA topic
    LDAtopicIdx = numpy.argmax(LDAresult)
finally:
    if ic_was_enabled:
        ic.enable()

# Show the document text followed by the topic each model assigns it to
print("=============================================")
print("Doc Number{}:".format(docIdx))
print("=============================================")
print(documents[docIdx])

print("=============================================")
print("Doc:{} is in NMF topic: {} => {}".format(docIdx, NMFtopicIdx,nmf_topics[NMFtopicIdx]) )
print("Doc:{} is in LDA topic: {} => {}".format(docIdx, LDAtopicIdx,lda_topics[LDAtopicIdx]) )

Doc Number900:
Anyone have any information on the effects/origin of oxaprozin?
It's marketed under the name "DAYpro", and appears to be an
anti-inflammatory.  Is it similar to naproxin?  Stronger?

TIA
Doc:900 is in NMF topic: 5 => thanks mail advance hi looking info help information address appreciated
Doc:900 is in LDA topic: 12 => government law privacy security legal encryption court fbi technology information
