In [None]:
# Let's read in our document-term matrix
import pandas as pd
import pickle

# NOTE(review): read_pickle unpickles arbitrary objects -- only load files from a trusted source.
data = pd.read_pickle('dtm_stop.pkl')
# Preview only the first rows rather than rendering the whole matrix.
data.head()

In [None]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
from gensim import matutils, models
import scipy.sparse

# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# The LDA input has to be a term-document matrix, so swap the rows and
# columns of the document-term matrix loaded above.
tdm = data.T
tdm.head()

In [None]:
# Convert the term-document matrix to gensim's corpus format in two hops:
# DataFrame --> scipy sparse matrix --> gensim corpus.
sparse_counts = scipy.sparse.csr_matrix(tdm)
# documents_columns=True is the default; spelled out because each column of
# the term-document matrix is one document.
corpus = matutils.Sparse2Corpus(sparse_counts, documents_columns=True)

In [None]:
# Gensim also requires a dictionary mapping each term's location in the
# term-document matrix back to the term itself.
# NOTE(review): pickle.load can execute arbitrary code -- only unpickle files you trust.
with open("cv_stop.pkl", "rb") as cv_file:  # context manager closes the handle after loading
    cv = pickle.load(cv_file)
# cv.vocabulary_ maps term -> location; invert it to location -> term.
id2word = {index: term for term, index in cv.vocabulary_.items()}

In [24]:
# With the corpus (term-document matrix) and id2word (dictionary of location: term)
# ready, the remaining choices are the number of topics and the number of passes.
# LDA training is randomized; a fixed random_state makes the printed topics
# repeatable across kernel restarts.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10, random_state=42)
lda.print_topics()

[(0,
  '0.012*"سرمایه" + 0.010*"سال" + 0.008*"مبلغ" + 0.007*"گزارش" + 0.007*"محقق" + 0.006*"مجمع" + 0.006*"افزایش" + 0.006*"میلیون" + 0.006*"ریالی" + 0.006*"گذاری"'),
 (1,
  '0.035*"ریالی" + 0.028*"نرخ" + 0.016*"عدد" + 0.016*"مواجه" + 0.013*"خوبی" + 0.013*"عملیاتی" + 0.012*"رسیده" + 0.012*"ماه" + 0.012*"خوب" + 0.012*"سال"')]

In [25]:
# LDA for num_topics = 3
# A fixed random_state keeps the randomized training repeatable between runs.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10, random_state=42)
lda.print_topics()

[(0,
  '0.014*"سرمایه" + 0.009*"سال" + 0.008*"مبلغ" + 0.007*"مجمع" + 0.007*"محقق" + 0.006*"گزارش" + 0.006*"گذاری" + 0.006*"افزایش" + 0.006*"میلیون" + 0.006*"درامد"'),
 (1,
  '0.034*"ریالی" + 0.023*"نرخ" + 0.018*"عدد" + 0.015*"مواجه" + 0.014*"خوبی" + 0.013*"رسیده" + 0.013*"خوب" + 0.012*"عملیاتی" + 0.012*"ماه" + 0.012*"سال"'),
 (2,
  '0.031*"نرخ" + 0.029*"ریالی" + 0.015*"مواجه" + 0.012*"سال" + 0.011*"عملیاتی" + 0.011*"عدد" + 0.010*"خوبی" + 0.010*"ماه" + 0.009*"رشد" + 0.009*"محقق"')]

In [26]:
# LDA for num_topics = 4
# A fixed random_state keeps the randomized training repeatable between runs.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10, random_state=42)
lda.print_topics()

[(0,
  '0.032*"نرخ" + 0.030*"ریالی" + 0.013*"عدد" + 0.013*"مواجه" + 0.011*"عملیاتی" + 0.011*"سال" + 0.011*"خوبی" + 0.011*"رسیده" + 0.010*"خوب" + 0.010*"محقق"'),
 (1,
  '0.014*"سرمایه" + 0.010*"سال" + 0.010*"افزایش" + 0.010*"مبلغ" + 0.009*"ریالی" + 0.008*"رسیده" + 0.008*"مجمع" + 0.008*"محقق" + 0.007*"زیان" + 0.007*"گزارش"'),
 (2,
  '0.012*"سرمایه" + 0.010*"سال" + 0.008*"بانک" + 0.007*"گذاری" + 0.007*"سهام" + 0.006*"محقق" + 0.006*"گزارش" + 0.006*"مبلغ" + 0.006*"میلیون" + 0.006*"مجمع"'),
 (3,
  '0.037*"ریالی" + 0.026*"نرخ" + 0.018*"مواجه" + 0.017*"عدد" + 0.014*"خوبی" + 0.013*"عملیاتی" + 0.013*"ماه" + 0.012*"خوب" + 0.012*"رسیده" + 0.012*"سال"')]

# reference
https://github.com/adashofdata/nlp-in-python-tutorial