In [1]:
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pprint import pprint

In [2]:

# Sample documents: a small in-notebook corpus of five short English
# sentences used as the input to the topic model below.
documents = [
    "We are looking for a skilled software engineer with experience in Python and machine learning.",
    "An article about Python programming and its applications in machine learning.",
    "A detailed guide on how to become a machine learning specialist.",
    "An overview of the latest trends in software engineering.",
    "Machine learning and Python are popular among software engineers."
]

# English stopword list from NLTK, held as a set because `preprocess`
# performs a membership test against it for every token.
# NOTE(review): this presumably needs the NLTK 'stopwords' corpus to be
# available locally (e.g. via nltk.download('stopwords')) — confirm for
# fresh environments.
stop_words = set(stopwords.words('english'))

def preprocess(doc):
    """Turn a raw document string into a list of filtered tokens.

    The text is lowercased and split with ``word_tokenize``; a token is
    kept only if it is purely alphabetic and is not in the module-level
    ``stop_words`` set.

    Args:
        doc: The document text to process.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(doc.lower()):
        # Skip punctuation/number tokens and stopwords
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenize and filter every sample document
processed_docs = list(map(preprocess, documents))

# Build the token <-> id dictionary, then encode each document as a
# bag-of-words via doc2bow
id2word = corpora.Dictionary(processed_docs)
corpus = [id2word.doc2bow(tokens) for tokens in processed_docs]

# Train the LDA model; settings are collected in one place for easy tuning
num_topics = 2  # Set the number of topics
lda_settings = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=100,  # fixed seed so repeated runs use the same random state
    update_every=1,
    chunksize=10,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.LdaModel(**lda_settings)

# Print the topics
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)

# Compute coherence score ('c_v' measure) from the trained model and the
# tokenized texts
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_docs,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

[(0,
  '0.122*"machine" + 0.122*"learning" + 0.089*"python" + 0.075*"software" + '
  '0.044*"looking" + 0.044*"experience" + 0.044*"guide" + 0.044*"skilled" + '
  '0.044*"detailed" + 0.044*"become"'),
 (1,
  '0.075*"trends" + 0.075*"overview" + 0.075*"engineering" + 0.075*"latest" + '
  '0.073*"software" + 0.072*"article" + 0.072*"programming" + '
  '0.072*"applications" + 0.048*"python" + 0.042*"learning"')]

Coherence Score:  0.4699240140220493
