# Importing libraries

In [1]:
import os
import sys

import pandas as pd
from bertopic import BERTopic
from git_root import git_root
from hdbscan import HDBSCAN
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from sentence_transformers import SentenceTransformer
from umap import UMAP

# Set the PYTORCH_CUDA_ALLOC_CONF environment variable for this process.
# NOTE(review): this runs after the torch-dependent imports above; presumably
# the setting is read lazily when CUDA memory is first allocated — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Resolve the repository root; it is used to build data paths and to make
# project-local modules importable.
my_git_root = git_root()

# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if my_git_root not in sys.path:
    sys.path.append(my_git_root)

ModuleNotFoundError: No module named 'octis'

# Loading the data

In [2]:
# Location of the fixed-length chunked documents CSV inside the repository.
df_segmented_paragraphs_path = f'{my_git_root}/data/output/documents_chunked_fixed_length.csv'

# The first CSV column is the unnamed row index written when the file was
# saved. Read it back as the index so it does not show up as a spurious
# "Unnamed: 0" data column.
df_segmented_paragraphs = pd.read_csv(df_segmented_paragraphs_path, index_col=0)

In [3]:
# Display the loaded frame to inspect its columns and a sample of its rows.
df_segmented_paragraphs

Unnamed: 0,name,text
0,aalto-university.md,# Aalto University \n\n## Aalto University Re...
1,aalto-university.md,about the user rights of third parties accord...
2,aalto-university.md,Research data and the necessary software to ac...
3,aalto-university.md,and documentation to make it useful. Aalto Un...
4,aberystwyth-university.md,# Research Data Management Policy\n\nVersion 3...
...,...,...
960,wrexham-university.md,atic monitoring of a publicly accessible area ...
961,wrexham-university.md,"loss and corruption, and unauthorised access ..."
962,wrexham-university.md,"2021, these are Andorra, Argentina, Canada, F..."
963,wrexham-university.md,no longer than is necessary for the purposes ...


In [4]:
# Pull the chunk texts out of the frame as a plain Python list; this list is
# what gets embedded and passed to the topic model.
documents = df_segmented_paragraphs.loc[:, 'text'].tolist()

In [5]:
# Number of text chunks that will be embedded and clustered.
len(documents)

965

# Topic Modelling

In [6]:
# Load the sentence-embedding model.
# `trust_remote_code` is intentionally left at its default (disabled): enabling
# it allows Python code shipped in the model repository to run locally, and
# this model is documented to load without it.
# NOTE(review): re-enable only if the model is swapped for one that requires
# custom modelling code.
embedding_model = SentenceTransformer('thenlper/gte-small')

# Pre-compute one embedding per document so they can be passed to BERTopic
# directly.
embeddings = embedding_model.encode(documents)

Apply Default Parameters

In [7]:
# UMAP settings; random_state is fixed at 42.
umap_parameters = dict(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
umap_model = UMAP(**umap_parameters)

# HDBSCAN settings.
hdbscan_parameters = dict(
    min_cluster_size=10,
    min_samples=10,
)
hdbscan_model = HDBSCAN(**hdbscan_parameters)

# Keyword arguments for the BERTopic constructor, wiring in the UMAP and
# HDBSCAN models configured above.
bertopic_parameters = dict(
    top_n_words=25,
    n_gram_range=(2, 5),
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
)

In [8]:
# Build the topic model from the configured parameters, then fit it on the
# documents using the pre-computed embeddings.
topic_model = BERTopic(**bertopic_parameters)
topics, probs = topic_model.fit_transform(
    documents=documents,
    embeddings=embeddings,
)

# Evaluation

In [13]:
# Preprocess Documents
def remove_empty_topics(topic_words, top_n_words):
    """Return the topics from ``topic_words`` that are not entirely empty.

    A topic is treated as empty only when it is equal to a list of exactly
    ``top_n_words`` empty strings. All other topics are kept, in their
    original order. The input list itself is left untouched.

    Args:
        topic_words: List of topics, each a list of word strings.
        top_n_words: Number of words per topic; defines the length of the
            all-empty list that marks a topic as empty.

    Returns:
        A new list containing only the non-empty topics.
    """
    return [topic for topic in topic_words if topic != [''] * top_n_words]

# Join all documents assigned to the same topic into one string per topic,
# then run them through BERTopic's own text preprocessing.
documents_df = pd.DataFrame({
    "Document": documents,
    "ID": range(len(documents)),
    "Topic": topics,
})
documents_per_topic = documents_df.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]

# Keep only the words of each topic, dropping their scores.
# NOTE(review): get_topics() presumably also returns BERTopic's outlier topic
# (-1), so it is included in both metrics — confirm that this is intended.
topics_dict = topic_model.get_topics()
topic_words = [[word for word, _ in word_scores] for word_scores in topics_dict.values()]
topic_term_matrix = topic_model.c_tf_idf_.toarray()

# Use the configured number of words per topic instead of repeating the literal.
evaluation_top_n_words = bertopic_parameters['top_n_words']

# remove_empty_topics returns a new list and leaves its argument unchanged, so
# the result has to be assigned for the filtering to take effect.
topic_words = remove_empty_topics(topic_words, evaluation_top_n_words)

# NOTE(review): the value stored under 'topic-document-matrix' is BERTopic's
# c-TF-IDF matrix; presumably the two metrics below only read 'topics' — verify.
octis_topics = {'topics': topic_words, 'topic-document-matrix': topic_term_matrix}

coherence = Coherence(texts=tokens, measure='c_npmi')
diversity = TopicDiversity(topk=evaluation_top_n_words)

diversity_score = diversity.score(octis_topics)
coherence_score = coherence.score(octis_topics)

print(f'Diversity: {diversity_score}')
print(f'Coherence: {coherence_score}')

Diversity: 0.5283333333333333
Coherence: -0.019322583743616013


In [14]:
# Rescale the coherence score with (x + 1) / 2, which maps a value in [-1, 1]
# onto [0, 1], so it can be combined with the diversity score.
coherence_score_normalized = (coherence_score + 1) / 2

# Harmonic mean of normalised coherence and diversity: 2ab / (a + b).
harmonic_mean = (
    2 * (coherence_score_normalized * diversity_score)
    / (coherence_score_normalized + diversity_score)
)
print(harmonic_mean)

0.5086274553212281
