### Import libraries

In [None]:
# Standard library
import re
import string
import warnings

# Third-party
import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn
import spacy
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import RandomizedSearchCV

# bertopic installs hdbscan and umap-learn as dependencies, so the imports
# below must come after the install line.
!pip install bertopic
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import hdbscan
import umap

warnings.simplefilter('ignore')

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB

### Loading in datasets

In [None]:
# Mount Google Drive at /content/drive so the CSV files read below are reachable
# (this import only exists inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the two comment datasets from the mounted Drive folder.
# NOTE(review): the "_long" suffix suggests data_2020_long was already filtered by
# comment length before it was saved -- confirm; data_2022 is filtered further down.
data_2020_long = pd.read_csv('/content/drive/My Drive/UNI/data_2020_long.csv')
data_2022 = pd.read_csv('/content/drive/My Drive/UNI/data_2022.csv')

In [None]:
# Normalise the raw frame in one chain:
#   * timestamp      -> parsed to datetime
#   * comment_length -> number of pieces when the text is split on single spaces
#                       (taken from the text as loaded, before the str cast)
#   * text           -> forced to str
#   * first column   -> discarded
# NOTE(review): the drop is positional, so running this cell a second time removes
# whichever column is first by then.
data_2022 = data_2022.assign(
    timestamp=pd.to_datetime(data_2022['timestamp']),
    comment_length=data_2022['text'].str.split(' ').str.len(),
    text=data_2022['text'].astype(str),
).drop(columns=data_2022.columns[0])

# Size of the dataset before restricting it to the longer comments
data_2022.shape

(1823746, 11)

In [None]:
# Keep only comments whose comment_length is above 8. The explicit .copy() makes
# data_2022_long an independent frame, so the 'year' and 'month' columns added to it
# in later cells are not written onto a slice of data_2022.
data_2022_long = data_2022[data_2022['comment_length'] > 8].copy()
data_2022_long.shape #about 50% of the dataset has comments longer than 8

(922522, 11)

## Random Sampling

In [None]:
# Calendar year of every comment, each frame derived from its OWN timestamps.
data_2022_long['year'] = data_2022_long['timestamp'].dt.year
# data_2020_long's timestamps have not been parsed yet at this point in the notebook,
# so convert them here before reading the year.
data_2020_long['year'] = pd.to_datetime(data_2020_long['timestamp']).dt.year

# One frame per calendar year
data_2022_text = data_2022_long[data_2022_long['year'] == 2022]
data_2023_text = data_2022_long[data_2022_long['year'] == 2023]
data_2020_text = data_2020_long[data_2020_long['year'] == 2020]
# NOTE(review): no separate 2021 frame is loaded in the visible part of the notebook;
# the 2021 rows are assumed to live in data_2020_long (the "2020-2021 dataset") -- confirm.
data_2021_text = data_2020_long[data_2020_long['year'] == 2021]

In [None]:
# Calendar month of each comment; 'month' is one of the two keys the sampling
# cell below groups by.
data_2022_long['month'] = data_2022_long['timestamp'].dt.month

In [None]:
# Sanity check: a 23,000-row sample relative to the 922,522 rows of data_2022_long.
23000/922522 #Sampling about 2.5% of the data

In [None]:
# Proportionally stratified sample of data_2022_long: every (month, subreddit_id)
# stratum contributes rows in proportion to its share of the frame.
total_sample_size = 23000
strata_keys = ['month', 'subreddit_id']

# Row count per stratum, its share of all rows, and the rounded per-stratum quota
strata = data_2022_long.groupby(strata_keys).size().reset_index(name='counts')
strata['sampling_fraction'] = strata['counts'] / strata['counts'].sum()
strata['samples'] = (strata['sampling_fraction'] * total_sample_size).round().astype(int)
quota = strata.set_index(strata_keys)['samples']

# One pass over the groups instead of re-filtering the whole frame for every stratum.
# The quota is read from an integer Series, so n is always an integer (iterating the
# strata table with iterrows can upcast a mixed-dtype row).
samples = [
    stratum.sample(n=int(quota.loc[key]), random_state=1)
    for key, stratum in data_2022_long.groupby(strata_keys)
]

# Keep the source index for now so the top-up can skip rows that were already drawn
sampled_df_2022 = pd.concat(samples)

# Rounding the quotas can leave the sample short of total_sample_size. Top it up from
# rows not drawn yet and store the result in sampled_df_2022, which is the frame the
# later cells actually use.
shortfall = total_sample_size - len(sampled_df_2022)
if shortfall > 0:
    not_yet_sampled = data_2022_long.loc[~data_2022_long.index.isin(sampled_df_2022.index)]
    remaining_rows = not_yet_sampled.sample(n=shortfall, random_state=1)
    sampled_df_2022 = pd.concat([sampled_df_2022, remaining_rows])

sampled_df_2022 = sampled_df_2022.reset_index(drop=True)


In [None]:
# Parse the 2020 frame's timestamps and derive the calendar month, which the
# sampling cell below uses as a grouping key.
data_2020_long['timestamp'] = pd.to_datetime(data_2020_long['timestamp'])
data_2020_long['month'] = data_2020_long['timestamp'].dt.month

In [None]:
# Sanity check: a 33,000-row sample relative to 1,345,325 rows
# (presumably len(data_2020_long) -- TODO confirm).
33000/1345325 #Sampling about 2.5% of the data

In [None]:
# Proportionally stratified sample of data_2020_long: every (month, subreddit_id)
# stratum contributes rows in proportion to its share of the frame.
total_sample_size = 33000
strata_keys = ['month', 'subreddit_id']

# Row count per stratum, its share of all rows, and the rounded per-stratum quota
strata = data_2020_long.groupby(strata_keys).size().reset_index(name='counts')
strata['sampling_fraction'] = strata['counts'] / strata['counts'].sum()
strata['samples'] = (strata['sampling_fraction'] * total_sample_size).round().astype(int)
quota = strata.set_index(strata_keys)['samples']

# One pass over the groups instead of re-filtering the whole frame for every stratum.
# The quota is read from an integer Series, so n is always an integer (iterating the
# strata table with iterrows can upcast a mixed-dtype row).
samples = [
    stratum.sample(n=int(quota.loc[key]), random_state=1)
    for key, stratum in data_2020_long.groupby(strata_keys)
]

# Keep the source index for now so the top-up can skip rows that were already drawn
sampled_df_2020 = pd.concat(samples)

# Rounding the quotas can leave the sample short of total_sample_size. The shortfall is
# measured on the SAMPLE (not on the full frame), topped up from rows not drawn yet,
# and stored in sampled_df_2020, which is the frame the later cells actually use.
shortfall = total_sample_size - len(sampled_df_2020)
if shortfall > 0:
    not_yet_sampled = data_2020_long.loc[~data_2020_long.index.isin(sampled_df_2020.index)]
    remaining_rows = not_yet_sampled.sample(n=shortfall, random_state=1)
    sampled_df_2020 = pd.concat([sampled_df_2020, remaining_rows])

sampled_df_2020 = sampled_df_2020.reset_index(drop=True)

In [None]:
# Stack the two per-dataset samples into one frame with a fresh 0..n-1 index;
# this combined sample is what the tuning section embeds and clusters.
sample_data = pd.concat([sampled_df_2020, sampled_df_2022], ignore_index=True)

## Generating Embeddings for 2020-2021 dataset and 2022-2023 dataset

In [None]:
# Step 1: Convert Text Data to Embeddings
# Encodes every comment of the two FULL frames (not the stratified sample).
# NOTE(review): only data_2022's text column is cast to str earlier in the notebook;
# data_2020_long['text'] is passed as loaded -- confirm it contains no missing values.
model = SentenceTransformer("all-MiniLM-L6-v2")  # Use an SBERT model for embedding
embeddings_2020 = model.encode(data_2020_long['text'].tolist(), show_progress_bar=True)
embeddings_2022 = model.encode(data_2022_long['text'].tolist(), show_progress_bar=True)

In [None]:
# Save each embedding array to its own .npy file on Drive so the encoding step
# above does not have to be repeated.
np.save("/content/drive/My Drive/UNI/flattened_embeddings_2020.npy", embeddings_2020)
np.save("/content/drive/My Drive/UNI/flattened_embeddings_2022.npy", embeddings_2022)

## Tuning

In [None]:
# Seed handed to UMAP and RandomizedSearchCV in this section
# (the sampling cells above use random_state=1 instead).
SEED = 42

In [None]:
# Step 1: generate embeddings for the combined stratified sample only.
# The model is loaded again here, so this section does not depend on the
# full-dataset embedding cell having been run.
model = SentenceTransformer("all-MiniLM-L6-v2")  # Use an SBERT model for embedding
embeddings = model.encode(sample_data['text'].tolist(), show_progress_bar=True)

ValueError: cannot reshape array of size 322699232 into shape (1345325,384)

In [None]:
# Step 2: Reduce Dimensionality with UMAP
# Fit UMAP on the sample embeddings (5 output components, seeded with SEED) and keep
# both the fitted reducer and the reduced coordinates of the training points.
fitted_umap = umap.UMAP(n_components=5, n_neighbors=30, min_dist=0.0, random_state=SEED).fit(embeddings)
umap_embeddings = fitted_umap.embedding_

In [None]:
# Step 3: Define DBCV Score Function for RandomizedSearchCV
def calculate_core_distances(X, min_samples):
    """Return, for every point, the ``min_samples``-th smallest pairwise distance.

    Each row of the pairwise-distance matrix is ranked in ascending order and the
    entry at position ``min_samples`` is returned. Position 0 is the smallest entry
    of the row, which is normally the point's zero distance to itself.

    Parameters
    ----------
    X : array-like with one row per point
        Passed unchanged to ``sklearn.metrics.pairwise_distances`` with its
        default metric.
    min_samples : int
        Rank to read from every ascending row.

    Returns
    -------
    numpy.ndarray with one value per row of X.

    Raises
    ------
    IndexError
        If ``min_samples`` is not a valid position in a row of length ``len(X)``.
    """
    from sklearn.metrics import pairwise_distances

    n_points = len(X)
    if not -n_points <= min_samples < n_points:
        raise IndexError(
            f"min_samples={min_samples} is out of range for {n_points} points"
        )

    distances = pairwise_distances(X)
    # Only one order statistic per row is needed, so select it with a partition
    # instead of fully sorting every row of the n x n matrix.
    return np.partition(distances, min_samples, axis=1)[:, min_samples]

In [None]:
def calculate_density_reachable(X, labels, core_distances):
    """Mark pairs of same-cluster points that lie within each other's reach.

    Two points i and j are linked when they share a label other than -1 and their
    Euclidean distance is at most ``max(core_distances[i], core_distances[j])``.

    Parameters
    ----------
    X : array-like with one row per point.
    labels : array-like of cluster labels, one per point; -1 is skipped.
    core_distances : array-like with one value per point.

    Returns
    -------
    numpy.ndarray of shape (n_points, n_points), symmetric, holding 1.0 for every
    linked pair and 0.0 elsewhere (including the diagonal).
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    core_distances = np.asarray(core_distances)

    n_points = X.shape[0]
    # One flat coordinate vector per point, whatever the trailing shape of X is
    coords = X.reshape(n_points, -1)
    density_reachable = np.zeros((n_points, n_points))

    for cluster in np.unique(labels):
        if cluster == -1:
            continue
        members = np.flatnonzero(labels == cluster)
        # Compare each member with the members after it in one vectorised step,
        # rather than looping over every pair of points in Python.
        for position, i in enumerate(members[:-1]):
            later = members[position + 1:]
            distances = np.linalg.norm(coords[later] - coords[i], axis=1)
            reach = np.maximum(core_distances[later], core_distances[i])
            linked = later[distances <= reach]
            density_reachable[i, linked] = 1
            density_reachable[linked, i] = 1
    return density_reachable

In [None]:
def dbcv_score(X, labels, min_samples):
    """Score a clustering from its density-reachable point pairs.

    For every cluster with more than one point, the number of reachable pairs
    inside it is counted; the score is the mean of those counts divided by the sum
    of the whole reachability matrix.

    NOTE(review): every reachable pair lies inside exactly one cluster, so the
    denominator is twice the sum of the per-cluster pair counts and the result
    reduces to 1 / (2 * k), where k is the number of clusters with more than one
    point. The score therefore depends only on the number of clusters. If the
    published DBCV index is what is wanted, ``hdbscan.validity.validity_index``
    looks like the implementation to use -- confirm before relying on this score.

    Parameters
    ----------
    X : array-like with one row per point.
    labels : array-like of cluster labels, one per point; -1 is ignored.
    min_samples : int
        Forwarded to ``calculate_core_distances``.

    Returns
    -------
    The score, or -1 when there is no multi-point cluster or no reachable pair.
    """
    labels = np.asarray(labels)
    core_distances = calculate_core_distances(X, min_samples)
    density_reachable = calculate_density_reachable(X, labels, core_distances)
    total_reachability = np.sum(density_reachable)

    # With no reachable pair the ratio below would be 0/0 (NaN), which a parameter
    # search cannot rank; use the same penalty as the "no clusters" case.
    if total_reachability == 0:
        return -1

    clusters = np.unique(labels)
    clusters = clusters[clusters != -1]

    cluster_validity = []
    for cluster in clusters:
        in_cluster = labels == cluster
        if np.count_nonzero(in_cluster) > 1:
            # The matrix is symmetric, so half the block sum counts each pair once
            block = density_reachable[np.ix_(in_cluster, in_cluster)]
            cluster_validity.append(np.sum(block) / 2)

    if cluster_validity:
        return np.mean(cluster_validity) / total_reachability
    return -1

In [None]:
def custom_dbcv_scorer(estimator, X, y=None):
    """Scorer callable for clustering estimators (higher is better).

    The signature follows scikit-learn's scorer-callable convention
    ``scorer(estimator, X, y)``; ``y`` is accepted for compatibility and ignored.

    The estimator is re-fitted on the ``X`` it is handed via ``fit_predict`` and the
    resulting labels are scored with ``dbcv_score`` using ``estimator.min_samples``.

    Returns
    -------
    The ``dbcv_score`` value, or -1 when ``fit_predict`` produced a single distinct
    label (for example, every point marked as noise).
    """
    labels = estimator.fit_predict(X)
    if len(np.unique(labels)) > 1:
        return dbcv_score(X, labels, estimator.min_samples)
    return -1  # Penalize configurations with no valid clusters

In [None]:
# custom_dbcv_scorer already has the (estimator, X) call shape that `scoring=` accepts
# as a callable, so it is used directly. make_scorer is meant for metric functions of
# the form metric(y_true, y_pred) and would call it with the wrong arguments.
dbcv_scorer = custom_dbcv_scorer

In [None]:
# Step 4: Set up Parameter Distributions for RandomizedSearchCV
# The four lists span 4 * 4 * 2 * 2 = 64 candidate combinations.
# NOTE(review): the scoring helpers in this notebook always measure Euclidean distance,
# so "manhattan" candidates are clustered with one metric and scored with another --
# confirm this is intended.
param_distributions = {
    "min_samples": [5, 10, 15, 20],
    "min_cluster_size": [250, 300, 350, 400],
    "cluster_selection_method": ["eom", "leaf"],
    "metric": ["euclidean", "manhattan"],
}

In [None]:
# Step 5: Initialize HDBSCAN and RandomizedSearchCV
# n_iter=50 draws 50 parameter settings from param_distributions. No `cv` argument is
# passed, so scikit-learn's default cross-validation splitting applies and the scorer
# is evaluated on each held-out split.
hdb = hdbscan.HDBSCAN()
random_search = RandomizedSearchCV(
    hdb,
    param_distributions=param_distributions,
    n_iter=50,
    scoring=dbcv_scorer,
    random_state=SEED,
)

In [None]:
# Step 6: Fit RandomizedSearchCV on UMAP Embeddings
# Unsupervised search: only the reduced embeddings are passed, no target.
random_search.fit(umap_embeddings)

In [None]:
# Get Best Parameters and Score
# best_score_ is on the scale returned by the custom scorer (higher is better).
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best DBCV Score: {random_search.best_score_}")

In [None]:
# Step 7: Use Best HDBSCAN Model in BERTopic
# Keep the HDBSCAN instance configured with the winning parameters for later use.
best_hdbscan = random_search.best_estimator_

In [None]:
#best parameters are: min_samples = 5, min_cluster_size = 300, cluster_selection_method = "eom", metric = "euclidean"