The LDA portion of this notebook is based on the OCTIS example Colab notebook: https://colab.research.google.com/github/MIND-Lab/OCTIS/blob/master/examples/OCTIS_LDA_training_only.ipynb

BERTopic evaluation package: https://github.com/MaartenGr/BERTopic_evaluation

In [None]:
!pip install octis
!pip install .[bertopic]
!pip install bertopic==v0.9.4
!pip install sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting octis
  Downloading octis-1.10.4-py2.py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 7.4 MB/s 
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.2 MB/s 
Collecting tomotopy
  Downloading tomotopy-0.12.3-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (16.5 MB)
[K     |████████████████████████████████| 16.5 MB 55.9 MB/s 
Collecting gensim>=4.0.0
  Downloading gensim-4.2.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 57 kB/s 
[?25hCollecting libsvm
  Downloading libsvm-3.23.0.4.tar.gz (170 kB)
[K     |████████████████████████████████| 170 kB 42.3 MB/s 
Collecting scikit-learn==0.24.2
  Downloading scikit_learn-0.24.2-cp38-cp38-manylinux2010_x86_64.whl (24.9 MB)
[K     |█████████

In [None]:
from octis.models.LDA import LDA
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
from sklearn.feature_extraction.text import CountVectorizer
# from evaluation import Trainer
# from data import DataLoader

In [None]:
# Define dataset
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")

# Summarise the vocabulary instead of printing every entry, which floods the cell output.
vocabulary = list(dataset.get_vocabulary())
print(f"Vocabulary size: {len(vocabulary)}")
print(vocabulary[:20])

# Number of topics requested from both the LDA model and the BERTopic runs below.
NUM_TOPICS = 20



In [None]:
# Create Model
# Fix the random seed so that re-running the notebook reproduces the same topics
# and therefore the same metric scores.
model = LDA(num_topics=NUM_TOPICS, alpha=0.1, random_state=42)

In [None]:
# Fit the LDA model; no partitioning arguments are passed, so the model's defaults apply.
output = model.train_model(dataset)

# Show the identifiers available in the training output, one per line.
print(*output.keys(), sep="\n")



topic-word-matrix
topics
topic-document-matrix
test-topic-document-matrix


In [None]:
# Show the first five topics, one space-separated list of words per line.
for topic_words in output['topics'][:5]:
  topic_line = " ".join(topic_words)
  print(topic_line)

game team win year play good player make season time
car buy money problem company pay year work sell good
chip card mode bit run time communication bank work system
make thing good time point problem people find light back
book jewish patient arab disease page information title greek program


In [None]:
# Initialize the four coherence metrics. They differ only in `measure`; each one is
# built from the dataset corpus with topk=10, in the order NPMI, UMass, C_V, UCI.
npmi, umass, cv, uci = (
    Coherence(texts=dataset.get_corpus(), topk=10, measure=measure)
    for measure in ('c_npmi', 'u_mass', 'c_v', 'c_uci')
)

In [None]:
# Initialize metric
# Topic diversity is configured with the same topk (10) as the coherence metrics
# defined in the previous cell.
topic_diversity = TopicDiversity(topk=10)

In [None]:
# Retrieve metrics score
# Every metric is scored on the same LDA training output. Each coherence value is
# printed with the name of its measure so the four lines can be told apart.
topic_diversity_score = topic_diversity.score(output)
print(f"Topic diversity: {topic_diversity_score}")

npmi_score = npmi.score(output)
print(f"Coherence (NPMI): {npmi_score}")
umass_score = umass.score(output)
print(f"Coherence (UMass): {umass_score}")
cv_score = cv.score(output)
print(f"Coherence (C_V): {cv_score}")
uci_score = uci.score(output)
print(f"Coherence (UCI): {uci_score}")

Topic diversity: 0.74
Coherence: 0.04804667849438671
Coherence: -2.2445109826639307
Coherence: 0.519133103675679
Coherence: 0.017321758979920034


In [None]:
%%capture
from sentence_transformers import SentenceTransformer

# Prepare data
dataset, custom = "20NewsGroup", False
data_loader = DataLoader(dataset)
_, timestamps = data_loader.load_docs()
data = data_loader.load_octis(custom)
data = [" ".join(words) for words in data.get_corpus()]

# Extract embeddings
model = SentenceTransformer("all-mpnet-base-v2")
embeddings = model.encode(data, show_progress_bar=True)

In [None]:
import pickle

# Bundle the dataset name, the custom-dataset flag and the embeddings into one tuple
# and write it to disk so the embeddings can be reloaded later.
embedding_cache = (dataset, custom, embeddings)
with open('embeddings.pickle', mode='wb') as handle:
    pickle.dump(embedding_cache, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import pickle

# Restore the dataset name, the custom-dataset flag and the embeddings from disk.
# pickle.load can execute arbitrary code: only load a file written by this notebook.
with open('embeddings.pickle', mode='rb') as handle:
    embedding_cache = pickle.load(handle)
dataset, custom, embeddings = embedding_cache

In [None]:
from evaluation import Trainer

print(dataset)

# BERTopic settings. `nr_topics` holds NUM_TOPICS five times; the recorded output
# shows one training run per entry.
params = {
    "embedding_model": "all-mpnet-base-v2",
    "nr_topics": [NUM_TOPICS] * 5,
    "min_topic_size": 15,
    "diversity": None,
    "verbose": True,
}

# Pass the embeddings computed earlier to the trainer via `bt_embeddings`.
trainer = Trainer(
    dataset=dataset,
    model_name="BERTopic",
    params=params,
    bt_embeddings=embeddings,
    custom_dataset=custom,
)
results = trainer.train(save="BERTopic_news_1")

20NewsGroup


2022-12-05 00:41:07,738 - BERTopic - Reduced dimensionality with UMAP
2022-12-05 00:41:08,725 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2022-12-05 00:41:27,239 - BERTopic - Reduced number of topics from 85 to 21


Results123
npmi: 0.18182166234282388
umass: -1.7965300794488983
cv: 0.7425894106367811
uci: 1.4153266737303767
diversity: 0.9
 


2022-12-05 00:42:25,919 - BERTopic - Reduced dimensionality with UMAP
2022-12-05 00:42:26,854 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2022-12-05 00:42:46,870 - BERTopic - Reduced number of topics from 86 to 21


Results123
npmi: 0.18374540244260112
umass: -1.8825536021191254
cv: 0.7416305115371481
uci: 1.4124947917432362
diversity: 0.88
 


2022-12-05 00:43:40,050 - BERTopic - Reduced dimensionality with UMAP
2022-12-05 00:43:41,011 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2022-12-05 00:44:00,310 - BERTopic - Reduced number of topics from 85 to 21


Results123
npmi: 0.18598519827807083
umass: -1.8521944391744114
cv: 0.7400174770881423
uci: 1.462846612425816
diversity: 0.905
 


2022-12-05 00:44:56,659 - BERTopic - Reduced dimensionality with UMAP
2022-12-05 00:44:57,577 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2022-12-05 00:45:16,782 - BERTopic - Reduced number of topics from 81 to 21


Results123
npmi: 0.1855918150331591
umass: -1.8944230857100421
cv: 0.7527964370622736
uci: 1.44410616962557
diversity: 0.905
 


2022-12-05 00:46:12,068 - BERTopic - Reduced dimensionality with UMAP
2022-12-05 00:46:12,980 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2022-12-05 00:46:32,984 - BERTopic - Reduced number of topics from 89 to 21


Results123
npmi: 0.17778786299342791
umass: -1.8798098878148835
cv: 0.7342561860382728
uci: 1.3821971649500149
diversity: 0.92
 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>