In [1]:
#==============================================================================
# CellStrat Hub Pack - Natural Language Processing
# Compatible tier : Free Tier or above  
# Kernel : conda_pytorch_latest_p36
#==============================================================================

In [2]:
#==============================================================================================
# Topic modeling is an unsupervised machine learning technique that's capable of scanning a set
# of documents, detecting word and phrase patterns within them, and automatically clustering
# word groups and similar expressions that best characterize a set of documents.
#==============================================================================================

In [3]:
#==============================================================================================
# Latent Dirichlet Allocation (LDA) and Latent Semantic Analysis (LSA) are based on the same
# underlying assumptions: the distributional hypothesis (i.e. similar topics make use of
# similar words) and the statistical mixture hypothesis (i.e. documents talk about several
# topics), for which a statistical distribution can be determined. The purpose of LDA is to
# map each document in our corpus to a set of topics that covers a good deal of the words
# in the document.
#==============================================================================================


In [4]:
# -*- coding: utf-8 -*-

#==============================================================================
# Install LDA library
#=============================================================================
import numpy as np
!pip install lda
import lda
import lda.datasets

Collecting lda
  Downloading lda-2.0.0.tar.gz (320 kB)
Collecting pbr<4,>=0.6
  Downloading pbr-3.1.1-py2.py3-none-any.whl (99 kB)
Building wheels for collected packages: lda
  Building wheel for lda (setup.py): started
  Building wheel for lda (setup.py): finished with status 'error'
  Running setup.py clean for lda
Failed to build lda
Installing collected packages: pbr, lda
    Running setup.py install for lda: started
    Running setup.py install for lda: finished with status 'error'


  ERROR: Command errored out with exit status 1:
   command: 'C:\ProgramData\Anaconda3\python.exe' -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Pradip Kumar Bala\\AppData\\Local\\Temp\\pip-install-vo412ypf\\lda\\setup.py'"'"'; __file__='"'"'C:\\Users\\Pradip Kumar Bala\\AppData\\Local\\Temp\\pip-install-vo412ypf\\lda\\setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d 'C:\Users\Pradip Kumar Bala\AppData\Local\Temp\pip-wheel-xw0ep3bh'
       cwd: C:\Users\Pradip Kumar Bala\AppData\Local\Temp\pip-install-vo412ypf\lda\
  Complete output (35 lines):
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-3.8
  creating build\lib.win-amd64-3.8\lda
  creating build\lib.win-amd64-3.8\lda\tests
  copying lda\tests\test_datasets.py -> build\lib.win-amd64-3.8\lda\tests
  copying lda\te

ModuleNotFoundError: No module named 'lda'

In [None]:
#==============================================================================
# Load the Reuters sample data set that ships with the lda package.
#
# x is a document-term matrix (DTM): documents in rows, terms (word
# frequencies) in columns. The documents are already vectorized, and a DTM is
# the input format LDA accepts (sparse matrices are accepted as well).
#==============================================================================
x = lda.datasets.load_reuters()
print(x)

In [None]:
#==============================================================================
# Show the shape of the document-term matrix (DTM)
#==============================================================================
x.shape


In [None]:
#==============================================================================
# Vocabulary of the data set: the terms that are represented in the
# document-term matrix.
#==============================================================================
vocab = lda.datasets.load_reuters_vocab()
print(vocab)

In [None]:
#==============================================================================
# Titles of the documents in the data set
#==============================================================================
titles = lda.datasets.load_reuters_titles()
print(titles)

In [None]:
#==========================================================================================
# Configure the LDA model.
# n_topics     : number of latent topics to extract from the training corpus.
# n_iter       : number of sampling iterations run while fitting.
# random_state : fixed seed for the random generator, so repeated runs are comparable.
#==========================================================================================
model = lda.LDA(
    n_topics=20,
    n_iter=1500,
    random_state=1,
)

In [None]:
# Fit the LDA model on the document-term matrix x.
model.fit(x)

In [None]:
# Per-topic word weights taken from the fitted model. Each row belongs to one
# topic and is later ranked against vocab to find that topic's top words.
topic_word = model.topic_word_
print(topic_word)

In [None]:
 n_top_words = 8

In [None]:
#========================================================================================
# Describe every topic by its n_top_words highest-weighted vocabulary words.
# topic_word is iterated one row per topic (20 topics were requested from the model).
#========================================================================================
# Loop-invariant: convert the vocabulary to an array once instead of once per topic.
vocab_array = np.array(vocab)

for i, topic_dist in enumerate(topic_word):
    # argsort sorts ascending, so reverse it and keep the first n_top_words indices.
    top_indices = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = vocab_array[top_indices]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

In [None]:
#========================================================================================
# Document-topic weights from the fitted model: each document is represented by a
# vector with one entry per topic.
#=========================================================================================
doc_topic = model.doc_topic_
print(doc_topic)

In [None]:
#========================================================================================
# Each document is represented by means of 20 topics
#=========================================================================================
doc_topic.shape

In [None]:
# Print the document titles again for reference; the next cell pairs titles with topics.
print(titles)

In [None]:
#========================================================================================
# For the first 10 documents, show the title together with the index of the
# topic that has the largest weight for that document.
#========================================================================================
# Pair titles with their topic vectors via zip over a slice rather than indexing
# with range(10): a corpus with fewer than 10 documents no longer raises IndexError.
for title, topic_weights in zip(titles[:10], doc_topic):
    print("{} (top topic: {})".format(title, topic_weights.argmax()))