### Import libraries

In [None]:
import pandas as pd
import warnings
import gensim
import numpy as np
import plotly.express as px
import sklearn

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string
import re
import spacy

import matplotlib.pyplot as plt

!pip install bertopic
from bertopic import BERTopic
import umap
import numpy as np
import hdbscan
from hdbscan import HDBSCAN
from itertools import product
import random

!pip install sentence-transformers # Install the necessary library
from sentence_transformers import SentenceTransformer # Import the SentenceTransformer class

warnings.simplefilter('ignore')

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB

### Loading in datasets

In [None]:
# Mount Google Drive at /content/drive so the CSV files read below are reachable.
# This cell only works inside Google Colab (it imports google.colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load data_2022_long.csv from the mounted Drive; a later cell splits it into
# 2022 and 2023 subsets by the year of its 'timestamp' column.
data_2022_long = pd.read_csv('/content/drive/My Drive/UNI/data_2022_long.csv')

In [None]:
# Load data_2020_long.csv from the mounted Drive; a later cell keeps its 2020
# rows by the year of its 'timestamp' column.
data_2020_long = pd.read_csv('/content/drive/My Drive/UNI/data_2020_long.csv')

## Random Sampling

In [None]:
# Seed Python's built-in `random` module.
# NOTE(review): the pandas `.sample(...)` calls in the sampling cells below pass
# their own `random_state=1`, so they do not depend on this seed.
random.seed(42)

In [None]:
import pandas as pd
# Parse the raw 'timestamp' columns into pandas datetimes (both frames first).
data_2022_long['timestamp'] = pd.to_datetime(data_2022_long['timestamp'])
data_2020_long['timestamp'] = pd.to_datetime(data_2020_long['timestamp'])

# Derive the calendar year and month of every row, one frame at a time.
data_2022_long['year'] = data_2022_long['timestamp'].dt.year
data_2022_long['month'] = data_2022_long['timestamp'].dt.month

data_2020_long['year'] = data_2020_long['timestamp'].dt.year
data_2020_long['month'] = data_2020_long['timestamp'].dt.month

# Per-year subsets: the "2022" file contributes the 2022 and 2023 rows, the
# "2020" file contributes the 2020 rows.
data_2022_text = data_2022_long.loc[data_2022_long['year'].eq(2022)]
data_2023_text = data_2022_long.loc[data_2022_long['year'].eq(2023)]
data_2020_text = data_2020_long.loc[data_2020_long['year'].eq(2020)]
# NOTE: no 2021 subset is built here; a data_2021_long frame is not loaded in
# the cells above. If it is added, filter it the same way:
# data_2021_text = data_2021_long[data_2021_long['year'] == 2021]

In [None]:
# Target number of rows in the stratified sample drawn from data_2022_long.
total_sample_size = 23000

# One stratum per (month, subreddit_id) combination, together with its row count.
strata_keys = ['month', 'subreddit_id']
strata = data_2022_long.groupby(strata_keys).size().reset_index(name='counts')

# Proportional allocation: a stratum's share of the sample equals its share of the rows.
strata['sampling_fraction'] = strata['counts'] / strata['counts'].sum()
strata['samples'] = (strata['sampling_fraction'] * total_sample_size).round().astype(int)

# Draw each stratum's quota. Iterating the groupby visits every stratum once instead of
# re-scanning the whole frame with a boolean mask per stratum; the quota is capped at the
# stratum size so `.sample` can never be asked for more rows than exist.
quota = strata.set_index(strata_keys)['samples']
samples = [
    stratum.sample(n=min(int(quota.loc[key]), len(stratum)), random_state=1)
    for key, stratum in data_2022_long.groupby(strata_keys)
]

# Keep the original row labels for now so already-drawn rows can be excluded below.
sampled_df_2022 = pd.concat(samples)

# Rounding the per-stratum quotas can leave the sample short of `total_sample_size`.
# Top it up with rows that have NOT been sampled yet (so no row appears twice) and store
# the result back into `sampled_df_2022`, which is the frame the later cells use.
shortfall = total_sample_size - len(sampled_df_2022)
if shortfall > 0:
    unsampled = data_2022_long.drop(index=sampled_df_2022.index)
    remaining_rows = unsampled.sample(n=min(shortfall, len(unsampled)), random_state=1)
    sampled_df_2022 = pd.concat([sampled_df_2022, remaining_rows])

# Final sample gets a fresh 0..n-1 index.
sampled_df_2022 = sampled_df_2022.reset_index(drop=True)


In [None]:
# Quick check of the sampling rate: 33,000 sampled rows divided by 1,345,325 rows
# (presumably len(data_2020_long) — TODO confirm and compute it from the frame).
33000/1345325 #Sampling about 2.5% of the data

0.024529388809395498

In [None]:
# Target number of rows in the stratified sample drawn from data_2020_long.
total_sample_size = 33000

# One stratum per (month, subreddit_id) combination, together with its row count.
strata_keys = ['month', 'subreddit_id']
strata = data_2020_long.groupby(strata_keys).size().reset_index(name='counts')

# Proportional allocation: a stratum's share of the sample equals its share of the rows.
strata['sampling_fraction'] = strata['counts'] / strata['counts'].sum()
strata['samples'] = (strata['sampling_fraction'] * total_sample_size).round().astype(int)

# Draw each stratum's quota. Iterating the groupby visits every stratum once instead of
# re-scanning the whole frame with a boolean mask per stratum; the quota is capped at the
# stratum size so `.sample` can never be asked for more rows than exist.
quota = strata.set_index(strata_keys)['samples']
samples = [
    stratum.sample(n=min(int(quota.loc[key]), len(stratum)), random_state=1)
    for key, stratum in data_2020_long.groupby(strata_keys)
]

# Keep the original row labels for now so already-drawn rows can be excluded below.
sampled_df_2020 = pd.concat(samples)

# Rounding the per-stratum quotas can leave the sample short of `total_sample_size`.
# The shortfall must be measured on the SAMPLE (not on the full dataset), and the top-up
# must be stored back into `sampled_df_2020`, which is the frame the later cells use.
# Top-up rows are drawn from rows not sampled yet so no row appears twice.
shortfall = total_sample_size - len(sampled_df_2020)
if shortfall > 0:
    unsampled = data_2020_long.drop(index=sampled_df_2020.index)
    remaining_rows = unsampled.sample(n=min(shortfall, len(unsampled)), random_state=1)
    sampled_df_2020 = pd.concat([sampled_df_2020, remaining_rows])

# Final sample gets a fresh 0..n-1 index.
sampled_df_2020 = sampled_df_2020.reset_index(drop=True)

In [None]:
# Stack the 2020 and 2022 stratified samples into one frame with a fresh 0..n-1 index.
sample_data = pd.concat([sampled_df_2020, sampled_df_2022], ignore_index=True)

## Tuning

In [None]:
#step 1 - generate embeddings
# Encode every entry of sample_data['text'] with the all-MiniLM-L6-v2 sentence-transformer;
# the resulting `embeddings` are the input to the HDBSCAN tuning below.
# NOTE(review): assumes the 'text' column holds only strings (no NaN) — TODO confirm.
model = SentenceTransformer("all-MiniLM-L6-v2")  # Use an SBERT model for embedding
embeddings = model.encode(sample_data['text'].tolist(), show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1750 [00:00<?, ?it/s]

In [None]:
# Tuning HDBSCAN parameters with a randomized search scored by `relative_validity_`,
# hdbscan's fast approximation of the DBCV score.
def randomized_hdbscan_search(embeddings, min_samples_values, min_cluster_sizes, n_iter=10, seed=None):
    """Randomly search HDBSCAN hyper-parameters and keep the best-scoring fit.

    Args:
        embeddings: Array-like of embedding vectors passed to ``HDBSCAN.fit``;
            converted to float64 first.
        min_samples_values: Candidate values for ``min_samples``.
        min_cluster_sizes: Candidate values for ``min_cluster_size``.
        n_iter: Maximum number of (min_cluster_size, min_samples) pairs to try;
            capped at the number of distinct pairs.
        seed: Seed for drawing the pairs. ``None`` keeps the historical seed 42,
            so existing calls draw the same pairs as before.

    Returns:
        Tuple ``(best_model, best_params, best_dbvc_score)`` where ``best_params``
        is ``(min_cluster_size, min_samples)``. If no fit produced at least two
        clusters the tuple is ``(None, None, -inf)``.
    """
    # Private RNG: honours `seed` and leaves the global `random` state untouched.
    rng = random.Random(42 if seed is None else seed)

    best_dbvc_score = -np.inf
    best_params = None
    best_model = None

    # Convert embeddings to dtype float64 for HDBSCAN compatibility
    embeddings = np.asarray(embeddings, dtype=np.float64)

    # Sample parameter combinations randomly (without replacement)
    param_combinations = list(product(min_cluster_sizes, min_samples_values))
    sampled_combinations = rng.sample(param_combinations, min(n_iter, len(param_combinations)))

    for min_cluster_size, min_samples in sampled_combinations:
        # `relative_validity_` is computed from the minimum spanning tree, which hdbscan
        # only keeps when gen_min_span_tree=True; without it the attribute access raises.
        clusterer = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric='euclidean',
            cluster_selection_method='eom',
            gen_min_span_tree=True,
        )
        clusterer.fit(embeddings)

        # Only consider models with more than one real cluster; label -1 marks noise
        # points and must not be counted as a cluster.
        n_clusters = len(set(clusterer.labels_) - {-1})
        if n_clusters > 1:
            dbvc_score = clusterer.relative_validity_
            if dbvc_score > best_dbvc_score:
                best_dbvc_score = dbvc_score
                best_params = (min_cluster_size, min_samples)
                best_model = clusterer

    return best_model, best_params, best_dbvc_score

In [None]:
# Candidate values for the randomized HDBSCAN search (randomized_hdbscan_search, not
# scikit-learn's RandomizedSearchCV): 4 x 4 = 16 possible (min_cluster_size, min_samples) pairs.
min_samples_values = [5, 10, 15, 20]
min_cluster_sizes = [250, 300, 350, 400]

In [None]:
# Run the randomized search on the sentence embeddings, trying at most n_iter=10
# randomly drawn parameter pairs; `best_params` is (min_cluster_size, min_samples).
best_model, best_params, best_dbvc_score = randomized_hdbscan_search(
    embeddings, min_samples_values, min_cluster_sizes, n_iter=10)

In [None]:
# Report the winning (min_cluster_size, min_samples) pair and its validity score.
print(f"Best Parameters: {best_params}")
print(f"Best DBCV Score: {best_dbvc_score}")

In [None]:
# Best parameters found by the search: min_cluster_size = 300, min_samples = 5