In [1]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# re-imported before each cell executes (presumably for the local `lda` and
# `coherence` modules used below, so edits apply without a kernel restart).
%reload_ext autoreload
%autoreload 2

In [None]:
import time
import gensim
from coherence import calculate_lda_coherence
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer

### 20 Newsgroups Dataset

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Number of topics shared by every LDA model fitted in this notebook.
n_topics = 20

print("Loading dataset...")
# return_X_y=True yields a (texts, labels) pair; the labels are not used here.
data, _ = fetch_20newsgroups(
    return_X_y=True,
    remove=("headers", "footers", "quotes"),
    random_state=1,
    shuffle=True,
)

# Keep only the first 1000 documents of the shuffled corpus.
data_samples = data[:1000]

# Plain whitespace tokenisation of each document.
# NOTE(review): nothing is lower-cased, stripped of punctuation, or filtered
# here, while CountVectorizer applies its own tokenisation — these tokens may
# not line up with the vectorizer's feature names used by the coherence
# calls below. Confirm this is intended.
cleaned_docs_by_words = [text.split() for text in data_samples]

# Term-count matrix restricted to at most 50 features, with English stop
# words removed and very rare / very common terms dropped.
vectorizer = CountVectorizer(
    stop_words="english", max_features=50, min_df=2, max_df=0.95
)
tf = vectorizer.fit_transform(data_samples)
tf

Loading dataset...


<1000x50 sparse matrix of type '<class 'numpy.int64'>'
	with 5087 stored elements in Compressed Sparse Row format>

### Custom LDA

In [4]:
from lda import LDA

# Custom LDA implementation, fitted on the term-count matrix `tf`.
# NOTE(review): no seed is passed here, so results may differ between runs —
# confirm whether this LDA class exposes one.
lda = LDA(n_topics=n_topics, alpha=1.1, beta=1.1, max_iter=100)

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
t1 = time.perf_counter()
lda.fit(tf)
t2 = time.perf_counter()
print(f"Fitting Time: {(t2-t1):.3f} s")

  return array / np.sum(array, axis=axis, keepdims=True)


Iter 1: 2.4237
Iter 2: 2.4079
Iter 3: 2.6024
Iter 4: 2.6783
Iter 5: 2.6667
Iter 6: 2.6229
Iter 7: 2.5630
Iter 8: 2.4226
Iter 9: 2.2576
Iter 10: 2.1028
Iter 11: 1.9303
Iter 12: 1.7188
Iter 13: 1.5812
Iter 14: 1.4840
Iter 15: 1.3898
Iter 16: 1.2856
Iter 17: 1.1831
Iter 18: 1.0983
Iter 19: 1.0133
Iter 20: 0.9259
Iter 21: 0.8306
Iter 22: 0.7314
Iter 23: 0.6318
Iter 24: 0.5427
Iter 25: 0.4756
Iter 26: 0.4295
Iter 27: 0.4066
Iter 28: 0.3960
Iter 29: 0.3752
Iter 30: 0.3116
Iter 31: 0.2375
Iter 32: 0.1877
Iter 33: 0.1560
Iter 34: 0.1321
Iter 35: 0.1127
Iter 36: 0.0978
Iter 37: 0.0882
Iter 38: 0.0825
Iter 39: 0.0796
Iter 40: 0.0795
Iter 41: 0.0812
Iter 42: 0.0846
Iter 43: 0.0885
Iter 44: 0.0913
Iter 45: 0.0900
Iter 46: 0.0849
Iter 47: 0.0788
Iter 48: 0.0727
Iter 49: 0.0667
Iter 50: 0.0614
Iter 51: 0.0572
Iter 52: 0.0536
Iter 53: 0.0500
Iter 54: 0.0464
Fitting Time: 212.270 s


In [5]:
# Coherence of the custom LDA topics, scored against the whitespace-tokenised
# documents.
# `phi_wt` is transposed before being passed — presumably to obtain a
# (topics x words) layout; TODO confirm against the `lda` module.
# NOTE(review): n_words=50 equals the vectorizer's max_features=50, so each
# topic's top-word list may span the entire vocabulary — confirm intended.
coherence_score = calculate_lda_coherence(
    n_topics=n_topics,
    n_words=50,
    feature_names=vectorizer.get_feature_names_out(),
    texts=cleaned_docs_by_words,
    topic_word_distribution=lda.phi_wt.T,
)

coherence_score

0.372149434718127

### Gensim LDA

In [6]:
# Build the gensim vocabulary from the whitespace-tokenised documents, then
# convert every document to its bag-of-words form with that vocabulary.
# NOTE(review): this vocabulary comes from the raw tokens, not from the
# 50-feature CountVectorizer matrix `tf` used by the other two models, so the
# gensim model is fitted on a different input.
dictionary = corpora.Dictionary(cleaned_docs_by_words)
corpus = [dictionary.doc2bow(tokens) for tokens in cleaned_docs_by_words]

In [7]:
# Fit gensim's LDA on the bag-of-words corpus and report the elapsed time.
# random_state is fixed so the fitted topics (and the coherence computed from
# them in the next cell) are reproducible across runs.
# perf_counter() is used because it is the clock intended for interval timing.
t1 = time.perf_counter()
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=n_topics,
    random_state=0,
)
t2 = time.perf_counter()
print(f"Fitting time: {(t2-t1):.3f} s")

Fitting time: 8.703 s


In [8]:
from gensim.models.coherencemodel import CoherenceModel

# c_v coherence of the gensim model, scored against the same tokenised texts
# the bag-of-words corpus was built from.
cm = CoherenceModel(
    coherence="c_v",
    corpus=corpus,
    texts=cleaned_docs_by_words,
    model=lda_model,
)
coherence = cm.get_coherence()
coherence

0.3569362157948577

### Sklearn LDA Model

In [9]:
from sklearn.decomposition import LatentDirichletAllocation

In [10]:
# scikit-learn LDA with online learning and a fixed seed, fitted on the same
# term-count matrix `tf` as the custom model.
lda_sklearn = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=20,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
t1 = time.perf_counter()
lda_sklearn.fit(tf)
t2 = time.perf_counter()
print(f"Fitting time: {(t2-t1):.3f} s")

Fitting time: 2.247 s


In [11]:
# Coherence of the scikit-learn LDA topics, using the same helper and the same
# settings as the custom-LDA cell.
# NOTE(review): this rebinds `coherence_score`, overwriting the value computed
# for the custom LDA above.
# NOTE(review): n_words=50 equals the vectorizer's max_features=50, so each
# topic's top-word list may span the entire vocabulary — confirm intended.
coherence_score = calculate_lda_coherence(
    n_topics=n_topics,
    n_words=50,
    texts=cleaned_docs_by_words,
    feature_names=vectorizer.get_feature_names_out(),
    topic_word_distribution=lda_sklearn.components_,
)
coherence_score


0.38795283193956526