### Import libraries

In [21]:
import pandas as pd
import warnings
import numpy as np
import sklearn
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import BaseEstimator, ClusterMixin

!pip install bertopic
from bertopic import BERTopic
import umap
import hdbscan
from hdbscan import HDBSCAN
from itertools import product
import random

#!pip install sentence-transformers # Install the necessary library
from sentence_transformers import SentenceTransformer # Import the SentenceTransformer class

warnings.simplefilter('ignore')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Loading in datasets

In [17]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so files under it can be read by path.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
# Read the two long-format datasets from the mounted Drive folder (2022 first, then 2020).
drive_csv_2022 = '/content/drive/My Drive/UNI/data_2022_long.csv'
drive_csv_2020 = '/content/drive/My Drive/UNI/data_2020_long.csv'
data_2022_long = pd.read_csv(drive_csv_2022)
data_2020_long = pd.read_csv(drive_csv_2020)

In [23]:
# Read the two long-format datasets from the current working directory (2020 first, then 2022).
# NOTE(review): the Drive-based load cell assigns these same two names; whichever
# cell ran last determines what the rest of the notebook sees.
local_csv_2020 = 'data_2020_long.csv'
local_csv_2022 = 'data_2022_long.csv'
data_2020_long = pd.read_csv(local_csv_2020)
data_2022_long = pd.read_csv(local_csv_2022)

## Random Sampling

In [24]:
# Seed Python's built-in `random` module so draws made through it repeat across runs.
SEED = 42
random.seed(SEED)

In [25]:
# Parse the raw 'timestamp' column into datetimes, then derive the calendar
# 'year' and 'month' columns from it. Both frames get the same three steps,
# so they are processed in one loop (each frame is modified in place).
for frame in (data_2022_long, data_2020_long):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
    frame['year'] = frame['timestamp'].dt.year
    frame['month'] = frame['timestamp'].dt.month

# Per-year subsets (filtering each frame on a single 'year' value) are not
# built here; that step is currently disabled.

In [26]:
# Target number of rows to draw from the 2022 dataset.
total_sample_size = 23000

# One stratum per (month, subreddit_id) combination, together with its row count.
# NOTE(review): strata are keyed on calendar month only; if this frame spans more
# than one year, same-numbered months are pooled into one stratum - confirm intended.
strata = data_2022_long.groupby(['month', 'subreddit_id']).size().reset_index(name='counts')

# Proportional allocation: each stratum's share of the sample equals its share of the rows.
strata['sampling_fraction'] = strata['counts'] / strata['counts'].sum()

# Rows to draw per stratum. Rounding means the allocations may not add up to
# total_sample_size exactly; a shortfall is topped up below.
strata['samples'] = (strata['sampling_fraction'] * total_sample_size).round().astype(int)

# Draw the allocated number of rows from each stratum.
samples = []
for row in strata.itertuples(index=False):
    in_stratum = (
        (data_2022_long['month'] == row.month)
        & (data_2022_long['subreddit_id'] == row.subreddit_id)
    )
    # int(...) keeps `n` an integer regardless of how the row value was boxed;
    # DataFrame.sample rejects non-integer n.
    samples.append(data_2022_long[in_stratum].sample(n=int(row.samples), random_state=1))

# Keep the original row labels for now so the top-up can exclude rows already drawn.
stratified_rows = pd.concat(samples)
sampled_df_2022 = stratified_rows.reset_index(drop=True)

# If rounding left the sample short of the target, top it up with rows that were
# not drawn yet. The result is stored back in sampled_df_2022, which is the frame
# the rest of the notebook uses.
shortfall = total_sample_size - len(sampled_df_2022)
if shortfall > 0:
    not_yet_sampled = data_2022_long.drop(index=stratified_rows.index)
    remaining_rows = not_yet_sampled.sample(n=shortfall, random_state=1)
    sampled_df_2022 = pd.concat([sampled_df_2022, remaining_rows], ignore_index=True)


In [27]:
# Target number of rows to draw from the 2020 dataset.
total_sample_size = 33000

# One stratum per (month, subreddit_id) combination, together with its row count.
strata = data_2020_long.groupby(['month', 'subreddit_id']).size().reset_index(name='counts')

# Proportional allocation: each stratum's share of the sample equals its share of the rows.
strata['sampling_fraction'] = strata['counts'] / strata['counts'].sum()

# Rows to draw per stratum. Rounding means the allocations may not add up to
# total_sample_size exactly; a shortfall is topped up below.
strata['samples'] = (strata['sampling_fraction'] * total_sample_size).round().astype(int)

# Draw the allocated number of rows from each stratum.
samples = []
for row in strata.itertuples(index=False):
    in_stratum = (
        (data_2020_long['month'] == row.month)
        & (data_2020_long['subreddit_id'] == row.subreddit_id)
    )
    # int(...) keeps `n` an integer regardless of how the row value was boxed;
    # DataFrame.sample rejects non-integer n.
    samples.append(data_2020_long[in_stratum].sample(n=int(row.samples), random_state=1))

# Keep the original row labels for now so the top-up can exclude rows already drawn.
stratified_rows = pd.concat(samples)
sampled_df_2020 = stratified_rows.reset_index(drop=True)

# If rounding left the sample short of the target, top it up with rows that were
# not drawn yet. The check compares the *sample* size (not the size of the full
# dataset) with the target, and the result is stored back in sampled_df_2020,
# which is the frame the rest of the notebook uses.
shortfall = total_sample_size - len(sampled_df_2020)
if shortfall > 0:
    not_yet_sampled = data_2020_long.drop(index=stratified_rows.index)
    remaining_rows = not_yet_sampled.sample(n=shortfall, random_state=1)
    sampled_df_2020 = pd.concat([sampled_df_2020, remaining_rows], ignore_index=True)

In [23]:
33_000 / 1_345_325  # Sampling about 2.5% of the data

0.024529388809395498

In [28]:
# Stack the 2020 sample on top of the 2022 sample and renumber the rows from 0.
yearly_samples = [sampled_df_2020, sampled_df_2022]
sample_data = pd.concat(yearly_samples, ignore_index=True)

## Tuning

In [29]:
# Step 1 - generate embeddings
# Load the SBERT encoder, then embed every entry of the 'text' column.
model = SentenceTransformer("all-MiniLM-L6-v2")
texts = sample_data['text'].tolist()
embeddings = model.encode(texts, show_progress_bar=True)

Batches:   0%|          | 0/1750 [00:00<?, ?it/s]

In [30]:
# Step 2: Reduce dimensionality with UMAP
# Settings are collected in one place; random_state is fixed for repeatable output.
umap_settings = dict(n_components=5, n_neighbors=30, min_dist=0.0, random_state=42)
fitted_umap = umap.UMAP(**umap_settings).fit(embeddings)

# The fitted reducer exposes the low-dimensional coordinates as `embedding_`.
umap_embeddings = fitted_umap.embedding_

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [124]:
# Step 3: Define the candidate values for each HDBSCAN hyperparameter
min_cluster_sizes = list(range(250, 401, 50))   # 250, 300, 350, 400
min_samples_values = list(range(5, 21, 5))      # 5, 10, 15, 20

In [126]:
# Step 4: Define a custom HDBSCAN estimator wrapper
class HDBSCANWrapper(BaseEstimator, ClusterMixin):
    """scikit-learn style wrapper around HDBSCAN so it can be used in a parameter search.

    Parameters
    ----------
    min_cluster_size : int, default=5
        Passed through to ``HDBSCAN(min_cluster_size=...)``.
    min_samples : int, default=5
        Passed through to ``HDBSCAN(min_samples=...)``.
    metric : str, default='euclidean'
        Passed through to ``HDBSCAN(metric=...)``.

    Attributes
    ----------
    model : HDBSCAN or None
        The underlying fitted HDBSCAN object; ``None`` until ``fit`` is called.
    labels_ : array
        Cluster labels of the most recent fit (set by ``fit``). Exposing it is
        what makes the ``fit_predict`` inherited from ``ClusterMixin`` work.
    """

    def __init__(self, min_cluster_size=5, min_samples=5, metric='euclidean'):
        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.metric = metric
        self.model = None

    def fit(self, X, y=None):
        """Fit a fresh HDBSCAN model on ``X`` with the stored parameters.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        self.model = HDBSCAN(
            min_cluster_size=self.min_cluster_size,
            min_samples=self.min_samples,
            metric=self.metric,
            cluster_selection_method='eom',
            gen_min_span_tree=True  # Ensures minimum spanning tree generation
        )
        self.model.fit(X)
        # ClusterMixin.fit_predict returns self.labels_, so mirror the labels here.
        self.labels_ = self.model.labels_
        return self

    def score(self, X, y=None):
        """Return the relative validity (DBCV-style) score of the fitted clustering.

        ``X`` and ``y`` are ignored: the value describes the data the model was
        last fitted on, not the data passed in here. Inside a cross-validated
        search this therefore reflects the fit fold rather than the held-out fold.

        Returns ``-np.inf`` when the fit produced a single distinct label.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. (This exception also derives from
            ``AttributeError``, which is what the unguarded access used to raise.)
        """
        if self.model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "HDBSCANWrapper is not fitted yet; call fit() before score()."
            )

        # This counts distinct label values, so it passes as soon as there are
        # two of them. NOTE(review): hdbscan labels noise as -1, so "one cluster
        # plus noise" also passes - confirm that is the intended threshold.
        distinct_labels = set(self.model.labels_)
        if len(distinct_labels) > 1:
            return self.model.relative_validity_
        return -np.inf  # Degenerate result: rank it below every real score

# Step 5: Candidate values the randomized search samples from
param_dist = dict(
    min_cluster_size=min_cluster_sizes,
    min_samples=min_samples_values,
)

# Step 6: Build the randomized search around the custom HDBSCANWrapper
search_settings = dict(
    n_iter=10,        # number of random configurations to try
    random_state=42,  # fixes which configurations get sampled
    n_jobs=-1,        # use all available processors
)
random_search = RandomizedSearchCV(HDBSCANWrapper(), param_dist, **search_settings)

In [127]:
# Step 7: Run the search on the UMAP-reduced embeddings
random_search.fit(umap_embeddings)
# fit() hands back the search object itself, so this is the same object under a second name.
search_results = random_search

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [128]:
# Step 8: Pull the winning estimator, its parameters and its score out of the search
best_model, best_params, best_score = (
    search_results.best_estimator_,
    search_results.best_params_,
    search_results.best_score_,
)

In [129]:
# Report the selected configuration and its score, one per line.
print(
    f"Best Parameters: {best_params}",
    f"Best DBCV Score: {best_score}",
    sep="\n",
)

Best Parameters: {'min_samples': 5, 'min_cluster_size': 350}
Best DBCV Score: 0.18316611700521318


In [None]:
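# Best parameters: min_samples = 5, min_cluster_size = 300 or 350 (the selected value differs between runs).
# NOTE(review): this was attributed to HDBSCAN being stochastic, but HDBSCAN should be deterministic
# for a fixed input; the run-to-run variation more likely originates upstream - confirm.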