### Mounting drive and installing packages

In [None]:
# Mount Google Drive so that the data file and the stopword list can be imported from it.
# This cell relies on the google.colab package; the Drive is mounted at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# installing packages
!pip install bertopic
!pip install joblib==1.1.0

### Playing with Topic Modelling

Loading the data and stopword list

In [None]:
import pandas as pd
import re

# load data
df_original = pd.read_csv("file_name.csv")

# working copy used for pre-processing; df_original keeps the untouched data for when saving the output
df_all = df_original.copy()

# load stopwords. This results in a list with one entry per line of the txt-file.
# The encoding is stated explicitly so that non-ASCII stopwords are read the same way on every platform.
# Surrounding whitespace (including the line break) is stripped and blank lines are skipped,
# so every entry in the list is a clean word.
with open('stopword_list_name.txt', encoding="utf-8") as f:
    stopwords = [line.strip() for line in f if line.strip()]

Pre-processing Example

In [None]:
# Suggestion for pre-processing of Twitter data. This cell can be skipped if no additional pre-processing is needed.
# The text column of df_all is assumed to be called "text"; change the column name below if it differs.
# Each step maps over the text column itself instead of applying a function row by row over the
# whole dataframe, which avoids building a row object for every document.
df_all["text"] = (
    df_all["text"]
    # example of removing links (the text is also lower-cased in this step)
    .map(lambda text: re.sub(r"http\S+", "", text).lower())
    # example of removing twitter handles: drop every whitespace-separated token that starts with "@"
    .map(lambda text: " ".join(token for token in text.split() if not token.startswith("@")))
    # example of removing symbols: everything that is not a-z/A-Z becomes a single space
    # (note that this also removes digits and non-ASCII letters)
    .map(lambda text: " ".join(re.sub("[^a-zA-Z]+", " ", text).split()))
)

In [None]:
# Stopwords should not be removed, because BERTopic needs them for contextual information.
# However, if the data contains words that supply no contextual information, these can be removed with the following function.

def remove_random_words(sentence, stopword_list):
    """Return `sentence` without the tokens that occur in `stopword_list`.

    The sentence is split on single spaces, every token found in
    `stopword_list` is dropped, and the remaining tokens are joined back
    together with single spaces.
    """
    kept_tokens = []
    for token in sentence.split(" "):
        if token in stopword_list:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# words that carry no contextual information and should be dropped from every document
random_words = [""]

# run every document through remove_random_words and write the result back to the text column
df_all["text"] = list(
    map(lambda document: remove_random_words(document, random_words), df_all["text"])
)

Saving all documents as list

In [None]:
# the pre-processed text column as a plain Python list, which is what is handed to BERTopic below
documents = df_all["text"].tolist()

Creating topics with BERTopic

In [None]:
from hdbscan import HDBSCAN
import numpy as np
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

# Optional: the random state of UMAP can be fixed to make the topics reproducible.
# If this line is enabled, also enable the umap_model argument in the BERTopic call below.
#umap_model = UMAP(random_state=550)

# CountVectorizer that disregards the words of the stopword list when the topic representations
# are generated; this is useful for getting more easily interpretable topics
vectorizer_model = CountVectorizer(stop_words=stopwords)

# HDBSCAN instance holding the clustering parameters
hdbscan_model = HDBSCAN(
    # metric used for calculating distance in vector space
    metric='euclidean',
    # minimum number of documents in each cluster
    min_cluster_size=70,
    # how conservative the clustering algorithm should be when it comes to outliers;
    # higher values generate a larger outlier topic
    min_samples=10,
    # this speeds up later predictions
    # NOTE(review): possibly also needed for calculate_probabilities=True below -- confirm
    prediction_data=True,
)

# the BERTopic instance, wired up with the components defined above
topic_model = BERTopic(
    # the HDBSCAN settings initialised above
    hdbscan_model=hdbscan_model,
    # the CountVectorizer from above, which removes stopwords from the topic representations
    vectorizer_model=vectorizer_model,
    # the multilingual embedding model; an alternative for English is "all-mpnet-base-v2"
    embedding_model="paraphrase-multilingual-mpnet-base-v2",
    # "auto" merges similar topics; alternatively a number of topics can be specified
    nr_topics="auto",
    # calculate the probabilities of all topics for each document
    calculate_probabilities=True,
    # only use if the UMAP model should be used for replication of results:
    #umap_model=umap_model,
    # additional parameter that can be used to diversify the resulting topic representations;
    # if set to None, MMR will not be used:
    #diversity=0.2,
)

# applying BERTopic to the data: one topic per document plus the per-document topic probabilities
topics, probabilities = topic_model.fit_transform(documents)


Inspecting the topics with top 4 words from topic representations

In [None]:
# overview of the topics of the fitted model; as the last expression of the cell it is shown as the cell output
topic_model.get_topic_info()

Assigning additional documents to each topic group (only if applicable to the specific use case)

In [None]:
# minimum topic probability a document needs in order to be (re)assigned to its most probable topic
probability_threshold = 0.30

# A document whose highest topic probability reaches the threshold is assigned to that
# most probable topic; every other document keeps the topic it already has.
reassigned_topics = []
for doc_index, doc_probabilities in enumerate(probabilities):
    if max(doc_probabilities) >= probability_threshold:
        reassigned_topics.append(np.argmax(doc_probabilities))
    else:
        reassigned_topics.append(topics[doc_index])
topics = reassigned_topics

### Saving the data with a column indicating the topics

In [None]:
# adding topics to original data
df_original["topic"] = topics

# Saving the output of the BERTopic analysis. As a minimum this will contain a column with the
# original text from each document and a column with the topics.
# The output is written to its own file so that the input data ("file_name.csv") is not overwritten,
# and index=False keeps the dataframe's row index from being written as an extra unnamed column.
df_original.to_csv("file_name_with_topics.csv", index=False)