# Importing libraries

In [33]:
import math
import os
import sys

import optuna
import pandas as pd
import plotly.express as px
from bertopic import BERTopic
from git_root import git_root
from hdbscan import HDBSCAN
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from optuna.samplers import TPESampler
from sentence_transformers import SentenceTransformer
from umap import UMAP

# Configure the CUDA caching allocator through the environment before any model is loaded below.
# NOTE(review): presumably meant to reduce GPU memory fragmentation — confirm against the PyTorch docs.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

# Put the repository root on the import path so the project-local `src` package resolves.
my_git_root = git_root()
sys.path.extend([my_git_root])
# NOTE(review): the star import hides which names this notebook takes from `src`; prefer explicit imports.
from src import *

# Loading the data

In [34]:
# CSV of chunked documents, located relative to the repository root.
df_segmented_paragraphs_path = f'{my_git_root}/final_notebooks/data/chunked_documents_final.csv'
df_segmented_paragraphs = pd.read_csv(filepath_or_buffer=df_segmented_paragraphs_path)

In [35]:
# Preview the loaded frame (the notebook display is truncated by pandas).
df_segmented_paragraphs

Unnamed: 0,name,text
0,aalto-university.md,Aalto University
1,aalto-university.md,Aalto University Research Data Management Policy
2,aalto-university.md,The research data management policy aims to ma...
3,aalto-university.md,The data management policy shall be implemente...
4,aalto-university.md,Ownership of copyright protected research data...
...,...,...
3481,wrexham-university.md,"FAIR Data\n\nWhere it is lawful to do so, the ..."
3482,wrexham-university.md,Prifysgol Wrecsam Wrexham University\n\nPublic...
3483,wrexham-university.md,Reporting a Data Incident/Breach\n\nThe UK GDP...
3484,wrexham-university.md,"Other Polices, Procedures, Legislation\n\nThis..."


In [36]:
# Plain Python list of the chunk texts; this is the corpus passed to the embedder and to BERTopic.
documents = df_segmented_paragraphs['text'].tolist()

In [37]:
# Number of text chunks in the corpus.
len(documents)

3486

# Topic Modelling

In [38]:
# Sentence-embedding checkpoint used for every chunk.
embedding_model_name = 'thenlper/gte-small'
embedding_model = SentenceTransformer(embedding_model_name, trust_remote_code=True)

# Encode the whole corpus once; the result is handed to BERTopic when the model is fitted.
embeddings = embedding_model.encode(documents)

In [39]:
# Dimensionality reduction settings; the fixed random_state seeds UMAP.
umap_parameters = dict(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
umap_model = UMAP(**umap_parameters)

# Clustering settings applied to the reduced embeddings.
hdbscan_parameters = dict(min_cluster_size=10, min_samples=10)
hdbscan_model = HDBSCAN(**hdbscan_parameters)

# BERTopic configuration: 25 descriptors per topic, built from 2- to 5-grams,
# using the UMAP and HDBSCAN instances created above.
bertopic_parameters = dict(
    top_n_words=25,
    n_gram_range=(2, 5),
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
)

In [40]:
# Fit BERTopic on the corpus, passing the precomputed embeddings alongside the documents.
topic_model = BERTopic(**bertopic_parameters)
topics, probs = topic_model.fit_transform(documents, embeddings=embeddings)

In [41]:
# Show the fitted topics: a mapping of topic id -> list of (term, weight) pairs, as seen in the output below.
topic_model.get_topics()

{-1: [('research data', 0.006405989163478966),
  ('data management', 0.004176109077401026),
  ('of the', 0.004115023415912813),
  ('the data', 0.003575825328360981),
  ('the research', 0.0032421216328901124),
  ('research data management', 0.0030461067828477088),
  ('of research', 0.0029012449592728897),
  ('should be', 0.002847663952203793),
  ('the university', 0.0027669448132520706),
  ('in the', 0.0027553348718865195),
  ('with the', 0.002642496533857493),
  ('to the', 0.002596658451536849),
  ('must be', 0.002276991817122829),
  ('for the', 0.002213755612152254),
  ('of research data', 0.0021316734443994046),
  ('personal data', 0.0019899402365309363),
  ('of data', 0.0019302519197593737),
  ('data and', 0.0019266039464694328),
  ('will be', 0.0018673415336398329),
  ('this policy', 0.0018493325394575964),
  ('and the', 0.0018227937156031237),
  ('the research data', 0.0017934841716344073),
  ('responsible for', 0.0017755288011868837),
  ('by the', 0.001771057674620925),
  ('it is

In [42]:
# for key, value in topic_model.get_topics().items():
#     topic_descriptors = ', '.join([x[0] for x in value])
#     print(f'\\item \\textbf{{Topic {key}}}: {topic_descriptors}\\\\ \\hline')

In [43]:
# Per-document assignment table (topic id, representation, probability, ...) for the whole corpus.
df_topic_info = topic_model.get_document_info(docs=documents)
df_topic_info

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,Aalto University,-1,-1_research data_data management_of the_the data,"[research data, data management, of the, the d...",[1. Introduction\n\nResearch data are central ...,research data - data management - of the - the...,0.000000,False
1,Aalto University Research Data Management Policy,-1,-1_research data_data management_of the_the data,"[research data, data management, of the, the d...",[1. Introduction\n\nResearch data are central ...,research data - data management - of the - the...,0.000000,False
2,The research data management policy aims to ma...,-1,-1_research data_data management_of the_the data,"[research data, data management, of the, the d...",[1. Introduction\n\nResearch data are central ...,research data - data management - of the - the...,0.000000,False
3,The data management policy shall be implemente...,-1,-1_research data_data management_of the_the data,"[research data, data management, of the, the d...",[1. Introduction\n\nResearch data are central ...,research data - data management - of the - the...,0.000000,False
4,Ownership of copyright protected research data...,-1,-1_research data_data management_of the_the data,"[research data, data management, of the, the d...",[1. Introduction\n\nResearch data are central ...,research data - data management - of the - the...,0.000000,False
...,...,...,...,...,...,...,...,...
3481,"FAIR Data\n\nWhere it is lawful to do so, the ...",11,11_fair principles_findable accessible_the fai...,"[fair principles, findable accessible, the fai...",[5. FAIR principles \n\nThe FAIR principles12...,fair principles - findable accessible - the fa...,0.347049,False
3482,Prifysgol Wrecsam Wrexham University\n\nPublic...,-1,-1_research data_data management_of the_the data,"[research data, data management, of the, the d...",[1. Introduction\n\nResearch data are central ...,research data - data management - of the - the...,0.000000,False
3483,Reporting a Data Incident/Breach\n\nThe UK GDP...,-1,-1_research data_data management_of the_the data,"[research data, data management, of the, the d...",[1. Introduction\n\nResearch data are central ...,research data - data management - of the - the...,0.000000,False
3484,"Other Polices, Procedures, Legislation\n\nThis...",-1,-1_research data_data management_of the_the data,"[research data, data management, of the, the d...",[1. Introduction\n\nResearch data are central ...,research data - data management - of the - the...,0.000000,False


In [44]:
# Render BERTopic's topic visualisation as this cell's output.
topic_model.visualize_topics()

# Evaluation

In [45]:
# Preprocess Documents
def remove_empty_topics(topic_words, top_n_words):
    """Return the topics whose word list is not made up solely of empty strings.

    Parameters
    ----------
    topic_words : iterable of list of str
        One word list per topic.
    top_n_words : int
        Length of the all-empty placeholder list (``[''] * top_n_words``) that
        marks a topic as empty.

    Returns
    -------
    list
        A new list holding, in their original order, every entry of
        ``topic_words`` that differs from the placeholder. The input is not modified.
    """
    return [
        candidate
        for candidate in topic_words
        if candidate != [''] * top_n_words
    ]

# Concatenate all documents of a topic into one string per topic, then run the
# result through BERTopic's own text preprocessing.
documents_df = pd.DataFrame({"Document": documents,
                             "ID": range(len(documents)),
                             "Topic": topics})
documents_per_topic = documents_df.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]

# Keep only the descriptor strings of each topic (weights are dropped).
topics_dict = topic_model.get_topics()
top_n_words = bertopic_parameters['top_n_words']
topic_words = [[term for term, _ in term_weights] for term_weights in topics_dict.values()]
topic_term_matrix = topic_model.c_tf_idf_.toarray()

# remove_empty_topics returns a new list; the result must be assigned or the filter has no effect.
topic_words = remove_empty_topics(topic_words, top_n_words)

# NOTE(review): the matrix stored under 'topic-document-matrix' is the c-TF-IDF matrix and is
# not filtered together with topic_words — confirm neither metric reads it.
octis_topics = {'topics': topic_words, 'topic-document-matrix': topic_term_matrix}

coherence = Coherence(texts = tokens, measure='c_npmi')
diversity = TopicDiversity(topk=top_n_words)

diversity_score = diversity.score(octis_topics)
coherence_score = coherence.score(octis_topics)

print(f'Diversity: {diversity_score}')
print(f'Coherence: {coherence_score}')

Diversity: 0.7769014084507042
Coherence: -0.0733694153393749


In [46]:
# Rescale coherence to [0, 1] (assumes the c_npmi score lies in [-1, 1]) so it is
# on the same scale as diversity, then combine the two with a harmonic mean.
coherence_score_normalized = (1 + coherence_score) / 2
harmonic_mean = (
    2 * (diversity_score * coherence_score_normalized)
    / (diversity_score + coherence_score_normalized)
)
print(harmonic_mean)

0.5804635640553681


# Hierarchical Clustering

In [47]:
# Compute the topic hierarchy from the fitted model and the corpus; reused by the hierarchy plot below.
hierarchical_topics = topic_model.hierarchical_topics(documents)

100%|██████████| 69/69 [00:00<00:00, 144.41it/s]


In [48]:
# Build the hierarchy figure from the precomputed hierarchical_topics and display it.
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
fig.show()

# Exploring the topics

In [49]:
def get_topic_info(df_topic_info, topic_indices):
    """Print the representation and all documents of each requested topic.

    Parameters
    ----------
    df_topic_info : pandas.DataFrame
        Frame with at least the columns ``Topic``, ``Representation`` and
        ``Document``.
    topic_indices : iterable
        Topic ids to print, in the given order.

    Returns
    -------
    None
        Everything is written to stdout. A topic id with no rows in
        ``df_topic_info`` is reported on one line and skipped, instead of
        raising ``IndexError`` on the first-row lookup.
    """
    for topic_index in topic_indices:
        my_df_topic_info = df_topic_info[df_topic_info['Topic'] == topic_index]
        if my_df_topic_info.empty:
            # Nothing to show for this id; keep going with the remaining topics.
            print(f'Topic: {topic_index} (no documents assigned)')
            continue
        # Topic id and representation are taken from the first matching row.
        print(f'Topic: {my_df_topic_info["Topic"].iloc[0]}')
        print(f'Representation: {my_df_topic_info["Representation"].iloc[0]}')
        print('Documents:')
        for doc in my_df_topic_info['Document'].tolist():
            print(f'\t{doc}\n')

In [50]:
# Print the representation and every document assigned to topic 2.
get_topic_info(df_topic_info, [2])

Topic: 2
Representation: ['research data', 'the university', 'of research', 'research data management', 'this policy', 'data management', 'open research', 'on open', 'and the', 'of research data', 'of the', 'university of', 'open research data', 'on open research', 'open access', 'concordat on', 'concordat on open', 'the research', 'concordat on open research', 'data and', 'principles on', 'common principles on', 'the university of', 'on open research data', 'common principles']
Documents:
	1. INTRODUCTION AND PURPOSE  

The maintenance of accurate and retrievable data arising from research projects is an essential component of good practice in the conduct of research and a key component of research integrity and research reproducibility. Consideration around research data management and evidence of good research data management should be seen as an integral part of the research process, which is what Aston University expects of all researchers. This is in line with Aston’s values and 