## Loading Necessary Libraries

In [1]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/numpy warnings raised by later cells.
warnings.simplefilter('ignore')

## Loading datasets

In [2]:
# Read the two comment tables from CSV files in the working directory.
data_2020_long = pd.read_csv(filepath_or_buffer='data_2020_long.csv')
data_2022_long = pd.read_csv(filepath_or_buffer='data_2022_long.csv')

In [3]:
# Convert the 'timestamp' column of both frames to datetimes. With
# errors='coerce', values that cannot be parsed become NaT instead of raising.
for frame in (data_2020_long, data_2022_long):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], errors='coerce')


## Embeddings

In [None]:
## 2020-2021 embeddings
# Cut data_2020_long into 30 consecutive row ranges. Each range holds
# `split_size` rows, except the last one, which also takes the rows left over
# by the integer division.
n_rows_2020 = len(data_2020_long)
split_size = n_rows_2020 // 30

# Exclusive end position of every range; the final one is the end of the frame
range_stops = [(k + 1) * split_size for k in range(30 - 1)] + [n_rows_2020]

sequential_splits = [
    data_2020_long.iloc[k * split_size:stop]
    for k, stop in enumerate(range_stops)
]

In [None]:
# Load the sentence-embedding model used to encode the comments
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Collects the encoder output of every split, in split order
embeddings_list = []

for i, chunk in enumerate(sequential_splits):
    # Encode the whole 'text' column of this split in one call
    chunk_vectors = sentence_model.encode(chunk['text'].tolist())
    embeddings_list.append(chunk_vectors)

    print(f'Processed sequential part {i+1}/{30}')

In [None]:
# Keep the embeddings of every split: later cells look rows up in this array by
# their row position in data_2020_long, so dropping any split would leave the
# tail of the frame without embeddings.
embeddings_2020 = embeddings_list
flattened_embeddings_2020 = np.vstack(embeddings_2020)

# Fail loudly if the stacked embeddings and the source frame are out of step.
assert len(flattened_embeddings_2020) == len(data_2020_long), (
    "embedding rows do not match data_2020_long rows"
)
len(flattened_embeddings_2020)

In [None]:
# Write the stacked 2020-2021 embeddings to disk; a later cell reloads this file.
np.save(file="flattened_embeddings_2020.npy", arr=flattened_embeddings_2020)

In [None]:
## 2022-2023 embeddings
# Rows per split when data_2022_long is cut into 20 parts (integer division)
split_size = data_2022_long.shape[0] // 20

In [None]:
# Cut data_2022_long into 20 consecutive row ranges of `split_size` rows
# (`split_size` is set in the preceding cell). The last range runs to the end
# of the frame so the rows left over by the integer division are not lost.
range_stops = [(k + 1) * split_size for k in range(20 - 1)] + [len(data_2022_long)]

sequential_splits2 = [
    data_2022_long.iloc[k * split_size:stop]
    for k, stop in enumerate(range_stops)
]

In [None]:
# Load the sentence-embedding model used to encode the comments
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Collects the encoder output of every split, in split order
embeddings_list_2 = []

for i, chunk in enumerate(sequential_splits2):
    # Encode the whole 'text' column of this split in one call
    chunk_vectors = sentence_model.encode(chunk['text'].tolist())
    embeddings_list_2.append(chunk_vectors)

    print(f'Processed sequential part {i+1}/{20}')

In [None]:
# Stack the per-split results of all splits into a single array and show how
# many rows it has.
embeddings_2022 = embeddings_list_2
flattened_embeddings_2022 = np.vstack(embeddings_2022)
flattened_embeddings_2022.shape[0]

In [None]:
# Write the stacked 2022-2023 embeddings to disk in NumPy's .npy format
np.save(file="flattened_embeddings_2022.npy", arr=flattened_embeddings_2022)

## Read in embeddings

In [7]:
# Reload the stacked embeddings from the .npy files written by the cells above
flattened_embeddings_2020 = np.load(file="flattened_embeddings_2020.npy")
flattened_embeddings_2022 = np.load(file="flattened_embeddings_2022.npy")

## Topic Modelling for 2023 comments

In [None]:
# Select the comments posted in 2023. `.copy()` makes the selection an
# independent frame, so the `topic` column added further down is not written
# onto a slice of data_2022_long.
year_mask = (data_2022_long['timestamp'].dt.year == 2023).to_numpy()
filtered_df = data_2022_long[year_mask].copy()

# Row positions of the selected comments inside data_2022_long. Positions are
# used instead of index labels because the embeddings array is addressed by
# row position.
filtered_positions = np.flatnonzero(year_mask)
filtered_indices = filtered_positions.tolist()

# Fetch the matching embedding rows with a single vectorised lookup
filtered_embeddings = flattened_embeddings_2022[filtered_positions]

# Sanity check: one embedding row per selected comment
len(filtered_df) == len(filtered_embeddings)

In [None]:
# HDBSCAN settings the author labelled as the best ones (the tuning itself is
# not part of this notebook section)
best_hdbscan = HDBSCAN(
    min_cluster_size=300,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
)

# BERTopic model that clusters with the HDBSCAN instance above
topic_model = BERTopic(hdbscan_model=best_hdbscan)

# Fit on the selected comments, reusing the precomputed embeddings
doc_embeddings = np.array(filtered_embeddings)
topics, probabilities = topic_model.fit_transform(filtered_df["text"], doc_embeddings)

In [None]:
# Topic summary indexed by topic id. The -1 row is removed *before*
# truncating, so at most 15 topics remain even when the model produced no -1
# row (taking 16 rows first and dropping -1 afterwards would keep 16 then).
topic_info = topic_model.get_topic_info()
top_15_topics = (
    topic_info[topic_info['Topic'] != -1]
    .head(15)
    .set_index('Topic')[['Count', 'Name', 'Representation']]
)

top_15_topics

In [None]:
# Share of the selected comments that is counted in the listed topics
top_15_topics['Count'].sum() / filtered_df.shape[0]

In [None]:
# Attach the topic id of every comment as a new `topic` column. `assign`
# returns a new frame instead of writing into the filtered selection, which
# avoids pandas' chained-assignment (SettingWithCopy) pitfall and keeps the
# cell safe to re-run.
filtered_df = filtered_df.assign(topic=topics)

In [None]:
# Keep the comments whose topic id is at least 0 and below 15
topic_ids = filtered_df['topic']
in_top15 = topic_ids.lt(15) & topic_ids.ge(0)
data_top15 = filtered_df[in_top15]

data_top15.head()

In [None]:
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing the two result tables.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

data_top15.to_csv(output_dir / "2023_comments_top15topics.csv")
top_15_topics.to_csv(output_dir / "2023top15topics.csv")

## Topic Modelling for 2022 comments

In [None]:
# Select the comments posted in 2022. `.copy()` makes the selection an
# independent frame, so the `topic` column added further down is not written
# onto a slice of data_2022_long.
year_mask = (data_2022_long['timestamp'].dt.year == 2022).to_numpy()
filtered_df = data_2022_long[year_mask].copy()

# Row positions of the selected comments inside data_2022_long. Positions are
# used instead of index labels because the embeddings array is addressed by
# row position.
filtered_positions = np.flatnonzero(year_mask)
filtered_indices = filtered_positions.tolist()

# Fetch the matching embedding rows with a single vectorised lookup
filtered_embeddings = flattened_embeddings_2022[filtered_positions]

# Sanity check: one embedding row per selected comment
len(filtered_df) == len(filtered_embeddings)

In [None]:
# HDBSCAN settings the author labelled as the best ones (the tuning itself is
# not part of this notebook section)
best_hdbscan = HDBSCAN(
    min_cluster_size=300,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
)

# BERTopic model that clusters with the HDBSCAN instance above
topic_model = BERTopic(hdbscan_model=best_hdbscan)

# Fit on the selected comments, reusing the precomputed embeddings
doc_embeddings = np.array(filtered_embeddings)
topics, probabilities = topic_model.fit_transform(filtered_df["text"], doc_embeddings)

In [None]:
# Topic summary indexed by topic id. The -1 row is removed *before*
# truncating, so at most 15 topics remain even when the model produced no -1
# row (taking 16 rows first and dropping -1 afterwards would keep 16 then).
topic_info = topic_model.get_topic_info()
top_15_topics = (
    topic_info[topic_info['Topic'] != -1]
    .head(15)
    .set_index('Topic')[['Count', 'Name', 'Representation']]
)

top_15_topics

In [None]:
# Share of the selected comments that is counted in the listed topics
top_15_topics['Count'].sum() / filtered_df.shape[0]

In [None]:
# Attach the topic id of every comment as a new `topic` column. `assign`
# returns a new frame instead of writing into the filtered selection, which
# avoids pandas' chained-assignment (SettingWithCopy) pitfall and keeps the
# cell safe to re-run.
filtered_df = filtered_df.assign(topic=topics)

In [None]:
# Keep the comments whose topic id is at least 0 and below 15
topic_ids = filtered_df['topic']
in_top15 = topic_ids.lt(15) & topic_ids.ge(0)
data_top15 = filtered_df[in_top15]

data_top15.head()

In [None]:
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing the two result tables.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

data_top15.to_csv(output_dir / "2022_comments_top15topics.csv")
top_15_topics.to_csv(output_dir / "2022top15topics.csv")

## Topic Modelling for 2021 comments

In [10]:
# Select the comments posted in 2021. `.copy()` makes the selection an
# independent frame, so the `topic` column added further down is not written
# onto a slice of data_2020_long.
year_mask = (data_2020_long['timestamp'].dt.year == 2021).to_numpy()
filtered_df = data_2020_long[year_mask].copy()

# Row positions of the selected comments inside data_2020_long. Positions are
# used instead of index labels because the embeddings array is addressed by
# row position.
filtered_positions = np.flatnonzero(year_mask)
filtered_indices = filtered_positions.tolist()

# Fetch the matching embedding rows with a single vectorised lookup
filtered_embeddings = flattened_embeddings_2020[filtered_positions]

# Sanity check: one embedding row per selected comment
len(filtered_df) == len(filtered_embeddings)

True

In [None]:
# HDBSCAN settings the author labelled as the best ones (the tuning itself is
# not part of this notebook section)
best_hdbscan = HDBSCAN(
    min_cluster_size=300,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
)

# BERTopic model that clusters with the HDBSCAN instance above
topic_model = BERTopic(hdbscan_model=best_hdbscan)

# Fit on the selected comments, reusing the precomputed embeddings
doc_embeddings = np.array(filtered_embeddings)
topics, probabilities = topic_model.fit_transform(filtered_df["text"], doc_embeddings)

In [None]:
# Topic summary indexed by topic id. The -1 row is removed *before*
# truncating, so at most 15 topics remain even when the model produced no -1
# row (taking 16 rows first and dropping -1 afterwards would keep 16 then).
topic_info = topic_model.get_topic_info()
top_15_topics = (
    topic_info[topic_info['Topic'] != -1]
    .head(15)
    .set_index('Topic')[['Count', 'Name', 'Representation']]
)

top_15_topics

In [None]:
# Share of the selected comments that is counted in the listed topics
top_15_topics['Count'].sum() / filtered_df.shape[0]

In [None]:
# Attach the topic id of every comment as a new `topic` column. `assign`
# returns a new frame instead of writing into the filtered selection, which
# avoids pandas' chained-assignment (SettingWithCopy) pitfall and keeps the
# cell safe to re-run.
filtered_df = filtered_df.assign(topic=topics)

In [None]:
# Keep the comments whose topic id is at least 0 and below 15
topic_ids = filtered_df['topic']
in_top15 = topic_ids.lt(15) & topic_ids.ge(0)
data_top15 = filtered_df[in_top15]

data_top15.head()

In [None]:
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing the two result tables.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

data_top15.to_csv(output_dir / "2021_comments_top15topics.csv")
top_15_topics.to_csv(output_dir / "2021top15topics.csv")

## Topic Modelling for 2020 comments

In [11]:
# Select the comments posted in 2020. `.copy()` makes the selection an
# independent frame, so the `topic` column added further down is not written
# onto a slice of data_2020_long.
year_mask = (data_2020_long['timestamp'].dt.year == 2020).to_numpy()
filtered_df = data_2020_long[year_mask].copy()

# Row positions of the selected comments inside data_2020_long. Positions are
# used instead of index labels because the embeddings array is addressed by
# row position.
filtered_positions = np.flatnonzero(year_mask)
filtered_indices = filtered_positions.tolist()

# Fetch the matching embedding rows with a single vectorised lookup
filtered_embeddings = flattened_embeddings_2020[filtered_positions]

# Sanity check: one embedding row per selected comment
len(filtered_df) == len(filtered_embeddings)

True

In [13]:
# HDBSCAN settings the author labelled as the best ones (the tuning itself is
# not part of this notebook section)
best_hdbscan = HDBSCAN(
    min_cluster_size=300,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
)

# BERTopic model that clusters with the HDBSCAN instance above
topic_model = BERTopic(hdbscan_model=best_hdbscan)

# Fit on the selected comments, reusing the precomputed embeddings
doc_embeddings = np.array(filtered_embeddings)
topics, probabilities = topic_model.fit_transform(filtered_df["text"], doc_embeddings)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [15]:
# Topic summary indexed by topic id. The -1 row is removed *before*
# truncating, so at most 15 topics remain even when the model produced no -1
# row (taking 16 rows first and dropping -1 afterwards would keep 16 then).
topic_info = topic_model.get_topic_info()
top_15_topics = (
    topic_info[topic_info['Topic'] != -1]
    .head(15)
    .set_index('Topic')[['Count', 'Name', 'Representation']]
)

top_15_topics

In [16]:
# Share of the selected comments that is counted in the listed topics
top_15_topics['Count'].sum() / filtered_df.shape[0]

0.13673076204823026

In [17]:
# Attach the topic id of every comment as a new `topic` column. `assign`
# returns a new frame instead of writing into the filtered selection, which
# avoids pandas' chained-assignment (SettingWithCopy) pitfall and keeps the
# cell safe to re-run.
filtered_df = filtered_df.assign(topic=topics)

In [None]:
# Keep the 2020 comments whose topic id is at least 0 and below 15. This has
# to be rebuilt here before the export in the next cell; otherwise
# `data_top15` would still hold the selection made for a different year and
# that selection would be written to the 2020 file.
data_top15 = filtered_df[(filtered_df['topic'] < 15) & (filtered_df['topic'] >= 0)]

data_top15.head(5)

In [None]:
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing the two result tables.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

data_top15.to_csv(output_dir / "2020_comments_top15topics.csv")
top_15_topics.to_csv(output_dir / "2020top15topics.csv")