### Mounting drive and installing packages

In [None]:
# Mount Google Drive at /content/drive so that the tweet data and the stopword
# list stored under /content/drive/MyDrive can be read by the cells below

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# installing packages
# NOTE(review): only joblib is version-pinned; the other packages install whatever
# release is current, so results may differ between runs made at different times.
# NOTE(review): the joblib pin is presumably a compatibility workaround for one of
# the other packages - confirm whether it is still needed.
# NOTE(review): spacy_langdetect and lexicalrichness are not used in the cells shown
# here - confirm they are needed elsewhere.

!pip install datasets
!pip install bertopic
!pip install joblib==1.1.0
!pip install spacy_langdetect
!pip install lexicalrichness

### Topic Modelling with BERTopic

Loading the data and preprocessing

In [None]:
import pandas as pd
import re

# loading data
df_all = pd.read_csv("/content/drive/MyDrive/nlp_model/tweets_data.csv")

# loading list of stopwords (one word per line). Original can be found at
# https://gist.github.com/berteltorp/0cf8a0c7afea7f25ed754f24cfc2467b#file-stopord-txt
# The encoding is passed explicitly so that non-ASCII letters in the Danish words
# are decoded the same way regardless of the machine's default locale.
with open('/content/drive/MyDrive/nlp_model/stopord.txt', encoding="utf-8") as f:
    # saving all stopwords in a list: surrounding whitespace / line breaks are
    # stripped and blank lines are skipped so no empty "stopword" is produced
    stopwords = [line.strip() for line in f if line.strip()]

# The general stopwords are deliberately NOT removed from the tweets, because BERTopic needs them for context information.
# However, there are certain Twitter handles and random words that we would like to remove,
# since they convey no contextual information that we are interested in.

def remove_mystopwords(sentence, stopword_list):
    """Return `sentence` without the tokens listed in `stopword_list`.

    The sentence is split on single spaces, every token that exactly matches
    (case-sensitively) an entry of `stopword_list` is dropped, and the
    remaining tokens are joined with single spaces again.

    Parameters
    ----------
    sentence : str
        Text whose tokens are separated by single spaces.
    stopword_list : iterable of str
        Tokens to remove.

    Returns
    -------
    str
        The filtered sentence.
    """
    # A set gives constant-time membership tests; the result is the same as
    # looking each token up in the original list.
    blocked = frozenset(stopword_list)
    kept_tokens = [word for word in sentence.split(" ") if word not in blocked]
    return " ".join(kept_tokens)

# creating list of words to be removed
random_words = ["link", "rt", "amp", "user"]

# patterns used by the cleaning helper below
_LINK_PATTERN = re.compile(r"http\S+")
_NON_LETTER_PATTERN = re.compile("[^a-zA-ZÆæØøÅå]+")


def _clean_tweet(text):
    """Apply all cleaning steps to a single tweet and return the cleaned string.

    Steps, in order: remove links and lowercase, drop whitespace-separated
    tokens that start with "@", replace every run of characters outside
    a-z/A-Z/Æ/æ/Ø/ø/Å/å with a single space, and finally remove the words
    listed in `random_words`.
    """
    text = _LINK_PATTERN.sub("", text).lower()  # removing links
    text = " ".join(token for token in text.split() if token[0] != "@")  # removing twitter handles
    text = " ".join(_NON_LETTER_PATTERN.sub(" ", text).split())  # removing symbols
    return remove_mystopwords(text, random_words)  # removing words in the list "random_words"


df_all["original_tweet"] = df_all["text"]  # saving the original tweets in a new column

# One pass over the tweets instead of three row-wise DataFrame.apply calls plus a
# fourth pass for the stopword removal; the resulting text is the same.
df_all["text"] = [_clean_tweet(tweet) for tweet in df_all["original_tweet"]]

tweets = df_all["text"].to_list()  # saving the pre-processed text as a list

Creating topics with BERTopic

In [None]:
from hdbscan import HDBSCAN
import numpy as np
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

# Dimensionality reduction: the seed is pinned so that repeated runs give the same topics.
# NOTE(review): handing BERTopic a custom UMAP instance replaces the UMAP configuration it
# would otherwise build itself, so every setting except random_state is umap-learn's own
# default here - confirm this is intended.
umap_model = UMAP(random_state=550)

# Topic representations: words from the Danish stopword list are ignored when the
# topic keywords are generated (the tweets themselves keep their stopwords).
vectorizer_model = CountVectorizer(stop_words=stopwords)

# Clustering: HDBSCAN with the parameters chosen for this data set.
hdbscan_settings = {
    "min_cluster_size": 70,             # minimum number of documents in each cluster
    "metric": 'euclidean',              # distance measure used in the reduced vector space
    "cluster_selection_method": 'eom',
    "prediction_data": True,
    "min_samples": 10,
}
hdbscan_model = HDBSCAN(**hdbscan_settings)

# Putting the pieces together.
topic_model_hdb = BERTopic(
    embedding_model="paraphrase-multilingual-mpnet-base-v2",  # the v2 multilingual sentence-embedding model
    nr_topics="auto",                   # let the model merge topics that are quite similar
    calculate_probabilities=True,       # also return the per-topic probabilities for every tweet
    vectorizer_model=vectorizer_model,  # keeps the stopwords out of the topic descriptions
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,              # seeded, for reproducible results
)

# Fitting on the pre-processed tweets: one topic label and one probability row per tweet.
topics_hdb, probs_hdb = topic_model_hdb.fit_transform(tweets)

# Overview of the topics that were found (displayed as the cell output).
topic_model_hdb.get_topic_info()


Unnamed: 0,Topic,Count,Name
0,-1,1357,-1_dkpol_dag_dkmedier_sldk
1,0,1040,0_år_dag_tror_tak
2,1,260,1_dkpol_politik_df_valg
3,2,232,2_dkpol_dkmedier_dkbiz_danske
4,3,168,3_hold_sldk_kampe_mål
5,4,153,4_danmark_eu_dkpol_dansk
6,5,125,5_børn_uddpol_dkpol_unge
7,6,118,6_paludan_ytringsfrihed_ytringsfriheden_dkpol
8,7,111,7_dkpol_dkmedier_medier_fake
9,8,95,8_dkpol_sundpol_pension_sundhedsreform


Getting top 10 words for each topic representation

In [None]:
# Print the top words (with their scores) of every topic the fitted model actually
# contains, in ascending topic-id order. Iterating over the model's own topics
# avoids a hard-coded id range that can ask for topics that do not exist or miss
# topics beyond the range.
for topic_id in sorted(topic_model_hdb.get_topics()):
    print(topic_id, topic_model_hdb.get_topic(topic_id))

-1 [('dkpol', 0.02770480283708928), ('dag', 0.012704453246107137), ('dkmedier', 0.012279566044288188), ('sldk', 0.011486384119350103), ('år', 0.010966686894915756), ('siger', 0.009423362030432536), ('tv', 0.009237898850773047), ('går', 0.008284245902130609), ('gang', 0.008154694372904605), ('tror', 0.008099108615260683)]
0 [('år', 0.014349897370796533), ('dag', 0.014322010916983947), ('tror', 0.013713037019162435), ('tak', 0.013018417427172399), ('ret', 0.012403996964966949), ('første', 0.011338621630117895), ('tid', 0.01113892389442149), ('blevet', 0.010602453717040328), ('hele', 0.010547280983321567), ('går', 0.010519877198372272)]
1 [('dkpol', 0.05413188564433684), ('politik', 0.03819049425506205), ('df', 0.034034896836674766), ('valg', 0.024050462055089674), ('vælgerne', 0.022063059119880195), ('partier', 0.021623910293133278), ('regering', 0.02156787075933461), ('folketinget', 0.021266608695401035), ('vælgere', 0.020781929104307848), ('venstre', 0.020076208247651085)]
2 [('dkpol',

Assigning additional tweets to each topic group

In [None]:
probability_threshold = 0.30 # threshold to use for assigning additional tweets to each topic group

# probs_hdb has one row per tweet; the column index of the largest value in a row
# is used as that tweet's most probable topic id. Both the winning column and its
# probability are computed once for all rows instead of tweet by tweet.
best_topic = np.argmax(probs_hdb, axis=1)
best_probability = np.max(probs_hdb, axis=1)

# A tweet moves to its most probable topic only when that probability reaches the
# threshold; otherwise it keeps the topic assigned during fitting.
new_topics = np.where(best_probability >= probability_threshold, best_topic, topics_hdb).tolist()

# Adding the topics to the original data
df_new_topics = df_all.copy()
df_new_topics["old_topic"] = topics_hdb
df_new_topics["new_topic"] = new_topics
df_new_topics["probabilities"] = best_probability

### Saving sub-groups

In [None]:
# Dropping the column indicating the old topic before reassignment.
# errors="ignore" keeps this cell re-runnable: on a second run the column is
# already gone and the drop would otherwise raise a KeyError.
df_new_topics = df_new_topics.drop(columns=["old_topic"], errors="ignore").reset_index(drop=True)

# Saving the result to Drive; the (reset) row index is written as the first, unnamed column.
df_new_topics.to_csv("/content/drive/MyDrive/nlp_model/sub_groups_16122022.csv")