In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from latentDirichletLatentAllocation import LDA as CustomLDA
import time
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

In [2]:
df = pd.read_csv("bbc_news.csv")
documents = df['description'].tolist()[:600]

vectorizer = CountVectorizer(max_df=0.85, min_df=2, stop_words="english")
X = vectorizer.fit_transform(documents)
vocab = vectorizer.get_feature_names_out()

In [3]:
num_topics = 5
max_iter = 50

start_time = time.time()
lda_custom = CustomLDA(num_topics=num_topics, max_iter=max_iter)
lda_custom.fit(documents)
custom_time = time.time() - start_time

print(f"Custom realisation time: {custom_time:.3f}")
for topic_id, words in enumerate(lda_custom.get_top_words(n_words=5)):
    print(f"Topic {topic_id}: {words}")

Custom realisation time: 1.683
Topic 0: ['says', 'help', 'years', 'ukraine', 'johnson']
Topic 1: ['world', 'says', 'chelsea', 'russian', 'women']
Topic 2: ['ukraine', 'russian', 'ukrainian', 'war', 'uk']
Topic 3: ['russia', 'england', 'west', 'ukraine', 'president']
Topic 4: ['country', 'old', 'ukraine', 'year', 'war']


In [4]:
texts = [doc.lower().split() for doc in documents]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

topics_custom = lda_custom.get_top_words(n_words=10)
topics_as_tokens = [[word for word in topic] for topic in topics_custom]
coherence_model = CoherenceModel(
    topics=topics_as_tokens,
    texts=texts,
    dictionary=dictionary,
    coherence="c_v"
)
custom_coherence = coherence_model.get_coherence()
print(f"Когерентность (custom): {custom_coherence:.3f}")

Когерентность (custom): 0.394


In [5]:
from sklearn.decomposition import LatentDirichletAllocation

start_time = time.time()
lda_sklearn = LatentDirichletAllocation(
    n_components=num_topics,
    max_iter=max_iter,
    learning_method="online",
    random_state=47
)
lda_sklearn.fit(X)
sklearn_time = time.time() - start_time

print(f"\nScikit-learn time: {sklearn_time:.3f}")
for topic_id, topic in enumerate(lda_sklearn.components_):
    top_words = [vocab[i] for i in topic.argsort()[-5:][::-1]]
    print(f"topic {topic_id}: {top_words}")


Scikit-learn time: 1.184
topic 0: ['russian', 'world', 'ukraine', 'war', 'old']
topic 1: ['says', 'league', 'say', 'war', 'finals']
topic 2: ['ukraine', 'uk', 'russia', 'ukrainian', 'children']
topic 3: ['ukraine', 'says', 'social', 'bbc', 'thousands']
topic 4: ['england', 'russia', 'west', 'ukraine', 'indies']


In [6]:
topics_sklearn = []
for topic in lda_sklearn.components_:
    top_words = [vocab[i] for i in topic.argsort()[-10:][::-1]]
    topics_sklearn.append(top_words)

coherence_model = CoherenceModel(
    topics=topics_sklearn,
    texts=texts,
    dictionary=dictionary,
    coherence="c_v"
)
sklearn_coherence = coherence_model.get_coherence()
print(f"Когерентность (sklearn): {sklearn_coherence:.3f}")

Когерентность (sklearn): 0.372
