### Import libraries

In [8]:
import pandas as pd
import warnings
import gensim
import numpy as np
import plotly.express as px
import sklearn

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string
import re
import spacy

import numpy as np
import hdbscan
from sklearn.datasets import make_blobs
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.metrics import make_scorer

import matplotlib.pyplot as plt

!pip install bertopic
from bertopic import BERTopic
import umap

!pip install sentence-transformers # Install the necessary library
from sentence_transformers import SentenceTransformer # Import the SentenceTransformer class

# Ignore every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries imported above.
warnings.simplefilter('ignore')



### Loading in datasets

In [9]:
# Mount Google Drive at /content/drive (Colab-only) so the CSV files read below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Read data_2022_long.csv from the mounted Drive; a later cell splits it into 2022 and 2023 rows by timestamp year.
data_2022_long = pd.read_csv('/content/drive/My Drive/UNI/data_2022_long.csv')

In [11]:
# Read data_2020_long.csv from the mounted Drive; a later cell keeps its rows whose timestamp year is 2020.
data_2020_long = pd.read_csv('/content/drive/My Drive/UNI/data_2020_long.csv')

## Random Sampling

In [12]:
import pandas as pd
# Turn 'timestamp' into datetime values, then derive 'year' and 'month' columns on both frames.
for frame in (data_2022_long, data_2020_long):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
    frame['year'] = frame['timestamp'].dt.year
    frame['month'] = frame['timestamp'].dt.month

# Per-calendar-year subsets; the 2022 file also supplies the 2023 rows.
data_2022_text = data_2022_long.loc[data_2022_long['year'].eq(2022)]
data_2023_text = data_2022_long.loc[data_2022_long['year'].eq(2023)]
data_2020_text = data_2020_long.loc[data_2020_long['year'].eq(2020)]
# A 2021 subset would follow the same pattern once a data_2021_long frame is loaded:
# data_2021_text = data_2021_long.loc[data_2021_long['year'].eq(2021)]

In [15]:
# Target number of rows in the stratified sample
total_sample_size = 23000

# One stratum per observed ('month', 'subreddit_id') combination, with its row count
strata = data_2022_long.groupby(['month', 'subreddit_id']).size().reset_index(name='counts')

# Proportional allocation: a stratum's share of the sample equals its share of the rows
strata['sampling_fraction'] = strata['counts'] / strata['counts'].sum()

# Number of rows to draw from each stratum (rounded, so the total may miss the target slightly)
strata['samples'] = (strata['sampling_fraction'] * total_sample_size).round().astype(int)

# Sample from each stratum
samples = []
for _, row in strata.iterrows():
    stratum = data_2022_long[(data_2022_long['month'] == row['month']) & (data_2022_long['subreddit_id'] == row['subreddit_id'])]
    # int(): iterrows() may upcast the allocation to float, which DataFrame.sample rejects
    sampled_rows = stratum.sample(n=int(row['samples']), random_state=1)
    samples.append(sampled_rows)

# Keep the original row labels for now so the top-up below can exclude rows already drawn
# (assumes data_2022_long has unique index labels, as produced by read_csv without index_col)
sampled_df_2022 = pd.concat(samples)

# If rounding left the sample short of total_sample_size, top it up with rows that were not
# sampled yet, and store the result in sampled_df_2022 so the later concat actually uses it
shortfall = total_sample_size - len(sampled_df_2022)
if shortfall > 0:
    not_yet_sampled = data_2022_long.drop(index=sampled_df_2022.index)
    remaining_rows = not_yet_sampled.sample(n=min(shortfall, len(not_yet_sampled)), random_state=1)
    sampled_df_2022 = pd.concat([sampled_df_2022, remaining_rows])

sampled_df_2022 = sampled_df_2022.reset_index(drop=True)


In [17]:
# Ratio of the 33,000-row sample target to 1,345,325 (presumably the row count of the 2020 data; confirm).
33000/1345325 #Sampling about 2.5% of the data

0.024529388809395498

In [18]:
# Target number of rows in the stratified sample
total_sample_size = 33000

# One stratum per observed ('month', 'subreddit_id') combination, with its row count
strata = data_2020_long.groupby(['month', 'subreddit_id']).size().reset_index(name='counts')

# Proportional allocation: a stratum's share of the sample equals its share of the rows
strata['sampling_fraction'] = strata['counts'] / strata['counts'].sum()

# Number of rows to draw from each stratum (rounded, so the total may miss the target slightly)
strata['samples'] = (strata['sampling_fraction'] * total_sample_size).round().astype(int)

# Sample from each stratum
samples = []
for _, row in strata.iterrows():
    stratum = data_2020_long[(data_2020_long['month'] == row['month']) & (data_2020_long['subreddit_id'] == row['subreddit_id'])]
    # int(): iterrows() may upcast the allocation to float, which DataFrame.sample rejects
    sampled_rows = stratum.sample(n=int(row['samples']), random_state=1)
    samples.append(sampled_rows)

# Keep the original row labels for now so the top-up below can exclude rows already drawn
# (assumes data_2020_long has unique index labels, as produced by read_csv without index_col)
sampled_df_2020 = pd.concat(samples)

# If rounding left the sample short of total_sample_size, top it up with rows that were not
# sampled yet. The check compares the size of the *sample* (not of the full dataset), and the
# result is stored in sampled_df_2020 so the later concat actually uses it
shortfall = total_sample_size - len(sampled_df_2020)
if shortfall > 0:
    not_yet_sampled = data_2020_long.drop(index=sampled_df_2020.index)
    remaining_rows = not_yet_sampled.sample(n=min(shortfall, len(not_yet_sampled)), random_state=1)
    sampled_df_2020 = pd.concat([sampled_df_2020, remaining_rows])

sampled_df_2020 = sampled_df_2020.reset_index(drop=True)

In [19]:
# Stack the 2020 sample on top of the 2022 sample and renumber the rows from 0.
sample_data = pd.concat([sampled_df_2020, sampled_df_2022], ignore_index=True)

## Tuning

In [21]:
# Seed passed as random_state to UMAP below.
# NOTE(review): the sampling cells above use random_state=1 rather than this constant.
SEED = 42

In [23]:
#step 1 - generate embeddings
# Encode every entry of sample_data['text'] with the pretrained "all-MiniLM-L6-v2" sentence-transformer.
# NOTE(review): assumes the 'text' column contains only strings (no NaN) — confirm before encoding.
model = SentenceTransformer("all-MiniLM-L6-v2")  # Use an SBERT model for embedding
embeddings = model.encode(sample_data['text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/1750 [00:00<?, ?it/s]

In [24]:
# Step 2: Reduce Dimensionality with UMAP
# Fit UMAP on the sentence embeddings: 10 output components, cosine metric, random_state=SEED.
fitted_umap = umap.UMAP(n_neighbors=15, n_components=10, metric='cosine', low_memory=False, random_state = SEED).fit(embeddings)
# Reduced coordinates of the fitted points; this array is what the HDBSCAN tuning below clusters.
umap_embeddings = fitted_umap.embedding_

In [43]:
import numpy as np
import hdbscan
from itertools import product
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import euclidean_distances


In [44]:
# Define a function to calculate the DBCV score based on the paper's methodology
def calculate_dbvc_score(data, labels):
    """Return the DBCV (density-based clustering validation) score of a labelling.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature vectors that were clustered (raw coordinates, not a distance matrix).
    labels : array-like of shape (n_samples,)
        Cluster label of each sample, e.g. ``HDBSCAN.labels_``.

    Returns
    -------
    float
        The value returned by ``hdbscan.validity.validity_index`` with the
        Euclidean metric.
    """
    # With metric='euclidean', validity_index expects feature vectors and computes the
    # distances it needs itself. Handing it a precomputed n x n distance matrix would make
    # it treat every row of that matrix as an n-dimensional point, and building the matrix
    # needs O(n^2) memory, so pass the coordinates directly.
    # Cast to float64: the routine works in double precision and embeddings are often float32.
    features = np.asarray(data, dtype=np.float64)
    return hdbscan.validity.validity_index(features, np.asarray(labels), metric='euclidean')


In [48]:
# Function to tune HDBSCAN parameters using DBCV score
def tune_hdbscan(data, min_cluster_sizes, min_samples_values):
    """Grid-search HDBSCAN and keep the parameter pair with the highest DBCV score.

    Every combination of ``min_cluster_sizes`` x ``min_samples_values`` is fitted with
    the Euclidean metric and the 'eom' cluster selection method, then scored with
    ``calculate_dbvc_score``.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Points to cluster.
    min_cluster_sizes : iterable
        Candidate ``min_cluster_size`` values (coerced to int).
    min_samples_values : iterable
        Candidate ``min_samples`` values (coerced to int).

    Returns
    -------
    tuple
        ``(best_model, best_params, best_dbvc_score)`` where ``best_params`` is
        ``(min_cluster_size, min_samples)``. If no combination produces at least two
        clusters, the result is ``(None, None, -inf)``.
    """
    best_dbvc_score = -np.inf
    best_params = None
    best_model = None

    # Convert parameter values to ensure they are integers
    min_cluster_sizes = [int(x) for x in min_cluster_sizes]
    min_samples_values = [int(x) for x in min_samples_values]

    for min_cluster_size, min_samples in product(min_cluster_sizes, min_samples_values):
        # Initialize HDBSCAN with specified parameters
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric='euclidean',  # You can change this to 'manhattan' if you decide to test it
            cluster_selection_method='eom'
        )
        clusterer.fit(data)

        # Count real clusters only: HDBSCAN labels noise as -1, so counting distinct labels
        # would let "one cluster + noise" through. DBCV is only scored for two or more clusters.
        n_clusters = len(set(clusterer.labels_) - {-1})
        if n_clusters < 2:
            continue

        dbvc_score = calculate_dbvc_score(data, clusterer.labels_)
        if dbvc_score > best_dbvc_score:
            best_dbvc_score = dbvc_score
            best_params = (min_cluster_size, min_samples)
            best_model = clusterer

    return best_model, best_params, best_dbvc_score

In [53]:
# Candidate values for the exhaustive grid search in tune_hdbscan (every pair is tried; no RandomizedSearchCV is involved)
min_samples_values = [5, 10, 15, 20]
min_cluster_sizes = [250, 300, 350, 400]

In [None]:
# Run the grid search on the UMAP-reduced embeddings; yields the best fitted model,
# its (min_cluster_size, min_samples) pair and the corresponding DBCV score.
best_model, best_params, best_dbvc_score = tune_hdbscan(umap_embeddings, min_cluster_sizes, min_samples_values)

In [None]:
# tune_hdbscan leaves best_params as None when no parameter pair was scored;
# report that case instead of failing on best_params[0].
if best_params is None:
    print("No parameter combination produced a scorable clustering.")
else:
    print(f"Best Parameters: min_cluster_size={best_params[0]}, min_samples={best_params[1]}")
    print(f"Best DBCV Score: {best_dbvc_score}")

In [33]:
#best parameters are: min_samples = 5, min_cluster_size = 300, cluster_selection_method = "eom", metric = "euclidean"