# Library

In [None]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../../../../Util')
import TextClustering as tc
import BERTopicUtils as btu
from tqdm import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.representation import MaximalMarginalRelevance
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import hdbscan
from sklearn.metrics import silhouette_score, davies_bouldin_score
import csv
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.corpora import Dictionary
import torch

# Grid Search

In [3]:
# Load the cleaned dataset from the zipped archive through the project helper.
# NOTE(review): the second argument is presumably the name of the text column
# to keep/filter on (it matches the column later handed to TextClustering) —
# confirm against BERTopicUtils.load_data_filtered.
df = btu.load_data_filtered('../cleaned_data_name_thread.zip', 'name_thread')

66735


In [None]:
# Pick the compute device: GPU when torch can see one, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Sentence encoder used to embed every thread name.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Wrap the dataframe's 'name_thread' column in the project clustering helper
# and embed the corpus with the encoder above.
tc1 = tc.TextClustering(df, 'name_thread')
tc1.encode_corpus(
    model,
    batch_size=64,
    to_tensor=False,
)

In [None]:
# --- Building blocks of the baseline BERTopic pipeline ----------------------
# Dimensionality reduction (seeded so runs are repeatable).
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
# Density-based clustering on the reduced vectors.
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=300,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
# Topic representation: bag-of-words -> c-TF-IDF -> MMR re-ranking.
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = MaximalMarginalRelevance(diversity=0.3)

# NOTE(review): BERTopic documents that `n_gram_range` is not used when a
# custom `vectorizer_model` is supplied, so the (1, 2) below has no effect
# unless the CountVectorizer is given a matching ngram_range.
topic_model = BERTopic(
    embedding_model=model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    top_n_words=10,
    n_gram_range=(1, 2),
    verbose=True,
)

# Fit on the corpus, reusing the embeddings that were already computed.
topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)

In [8]:
# Print the fitted topics via the project helper (the cell output lists each
# topic id followed by its (word, score) pairs).
btu.print_topics(topic_model, topics)

Topic 0:
[('weed', 0.3703934700138675), ('cannabis', 0.3478357489521716), ('cart', 0.3440817555469596), ('thc', 0.3126469759049928), ('shatter', 0.31239258293349326), ('distillate', 0.300227529265498), ('vape', 0.299007567472003), ('indoor', 0.2964173092968111), ('hash', 0.29596516071202483), ('bud', 0.2951408565699177)]
Topic 1:
[('cocaine', 0.5402634636403887), ('coke', 0.5089352976568257), ('heroin', 0.4730594763095486), ('drug', 0.4130049758815167), ('fishscale', 0.3394246005132988), ('colombian', 0.33092593298959), ('uncut', 0.3268121662569789), ('peruvian', 0.3202249365910443), ('brazil', 0.3186727902919751), ('crack', 0.30846562332723415)]
Topic 2:
[('package', 0.4678882986886639), ('shipping', 0.4370462216641005), ('delivery', 0.4252661869902795), ('usps', 0.39731727676882883), ('order', 0.39157332152387053), ('shipped', 0.3703190030264491), ('informed', 0.34773063848945995), ('delivered', 0.3441932686114708), ('ship', 0.3411896102274624), ('delay', 0.3241192257741199)]
Topic 3

In [7]:
def topics_meet_criteria(topic_model: "BERTopic", check_gun_topic: bool = False) -> bool:
    """
    Check whether every required theme is represented by its own topic.

    A topic represents a theme when at least one of its representative words
    belongs to that theme's keyword set. When several topics match a theme,
    the last one in ``get_topics()`` order is kept for that theme. The check
    passes only if every required theme is matched and all of them ended up
    on different topic ids.

    The outlier topic (-1) is ignored: it is not a cluster.

    :param topic_model: Fitted model exposing ``get_topics()`` returning
        ``{topic_id: [(word, score), ...]}``
    :param check_gun_topic: When True, the hacker theme is required in
        addition to drug / scam-bitcoin / market (four distinct topics).
        NOTE(review): despite the name, no gun keywords have ever been
        consulted by this check — confirm whether a gun theme was intended.
    :return: True if the topics meet the criteria, False otherwise
    """
    theme_keywords = {
        # 'heroin' is the term that actually occurs in the corpus; the
        # previous spelling 'heroine' is kept for backward compatibility.
        'drug': {'drug', 'cocaine', 'ketamine', 'weed', 'mdma', 'coke', 'lsd',
                 'heroin', 'heroine', 'xanax'},
        'scam_bitcoin': {'transfer', 'bitcoin', 'wallet', 'paypal', 'bank'},
        'market': {'market', 'vendor', 'darknet', 'scammer', 'scammed'},
        'hacker': {'hacker', 'hack', 'exploit', 'exploited', 'exploitation',
                   'exploiting', 'exploiter', 'exploits'},
    }

    required_themes = ['drug', 'scam_bitcoin', 'market']
    if check_gun_topic:
        required_themes.append('hacker')

    # Topic id currently assigned to each required theme (None = not found).
    theme_topic_id = dict.fromkeys(required_themes)

    for topic_id, topic in topic_model.get_topics().items():
        if topic_id == -1:
            continue
        words = {word for word, _ in topic}
        for theme in required_themes:
            if words & theme_keywords[theme]:
                theme_topic_id[theme] = topic_id

    # Compare against None explicitly: topic id 0 is a valid id but falsy,
    # so a truthiness filter would silently drop it from the count.
    matched_ids = [tid for tid in theme_topic_id.values() if tid is not None]
    all_found = len(matched_ids) == len(required_themes)

    # Only the required themes are counted, so an unrelated match cannot
    # make up for two required themes sharing one topic.
    return all_found and len(set(matched_ids)) >= len(required_themes)

def cluster_size_within_threshold(topic_model: "BERTopic", threshold: float = 0.15) -> bool:
    """
    Check if the sizes of the top 4 clusters are within the threshold.

    The four largest non-outlier topics are compared pairwise in descending
    size order; each neighbouring pair must differ by at most ``threshold``
    relative to the larger of the two.

    :param topic_model: Fitted model exposing ``get_topic_freq()`` returning a
        DataFrame with ``Topic`` and ``Count`` columns
    :param threshold: Maximum allowed relative size difference between
        neighbouring clusters among the top 4
    :return: True if the sizes are within the threshold (also when fewer than
        two clusters exist), False otherwise
    """
    topic_freq = topic_model.get_topic_freq()

    # Remove the outlier bucket by label rather than by position: the frame is
    # ordered by size, so the -1 row is not guaranteed to be first (or to
    # exist at all).
    clusters = topic_freq[topic_freq["Topic"] != -1]
    top_counts = clusters["Count"].sort_values(ascending=False).head(4).tolist()

    for larger, smaller in zip(top_counts, top_counts[1:]):
        # Counts are sorted descending, so `larger` is the pair's maximum.
        if larger > 0 and (larger - smaller) / larger > threshold:
            return False

    return True

def compute_clustering_scores(topic_model: "BERTopic", topics: list, embeddings: np.ndarray) -> tuple:
    """
    Compute the silhouette score and Davies-Bouldin score for the clustering results.

    Outlier documents (topic -1) are excluded before scoring. When the scores
    are undefined for the remaining points, the sentinel pair
    ``(-1, float('inf'))`` is returned instead of raising.

    :param topic_model: The BERTopic model used for clustering (not consulted
        by the computation; kept for interface compatibility)
    :param topics: The topic id assigned to each document
    :param embeddings: The embeddings of the documents, one row per document
        in the same order as ``topics``
    :return: The silhouette score and Davies-Bouldin score
    """
    # Filter out outliers (cluster -1) with one boolean mask instead of
    # rebuilding the arrays row by row in Python.
    labels = np.asarray(topics)
    keep = labels != -1
    valid_topics = labels[keep]
    valid_embeddings = np.asarray(embeddings)[keep]

    # Both metrics need at least two clusters and fewer clusters than samples;
    # scikit-learn raises ValueError outside that range.
    n_clusters = np.unique(valid_topics).size
    if 1 < n_clusters < valid_topics.size:
        silhouette = silhouette_score(valid_embeddings, valid_topics)
        davies_bouldin = davies_bouldin_score(valid_embeddings, valid_topics)
        return silhouette, davies_bouldin

    return -1, float('inf')
        
def grid_search_clusters(initial_model: BERTopic, initial_topics: list,
                         log_file: str = "grid_search_log_hdbscan_min_cluster.csv", target_outliers: int = 25000,
                         max_iters: int = 10, cluster_size_threshold: float = 0.20, min_silhouette: float = 0.3) -> tuple:
    """
    Sweep HDBSCAN's ``min_cluster_size`` and keep the best resulting BERTopic model.

    For every candidate value a fresh BERTopic model (fixed UMAP settings,
    zero-shot topic list read from the intent-crime CSV) is fitted on the
    notebook-level ``tc1`` corpus and embeddings, scored, and appended as one
    row to ``log_file``.

    Selection rule: a configuration is eligible when ``topics_meet_criteria``
    passes and its silhouette score is at least ``min_silhouette``; among
    eligible configurations the one whose outlier count is closest to
    ``target_outliers`` wins (ties go to the higher silhouette). If nothing is
    eligible the initial model/topics are returned with ``None`` as probs.
    NOTE(review): this rule is derived from the documented parameters —
    confirm it matches the intended notion of "best".

    :param initial_model: The initial BERTopic model to start the search from
    :param initial_topics: The initial topics assigned by the model
    :param log_file: CSV file that receives a header plus one row per configuration
    :param target_outliers: The target number of outliers to have in the final model
    :param max_iters: The maximum number of configurations to evaluate
    :param cluster_size_threshold: The threshold for the difference in cluster
        sizes (currently unused: the cluster-size check is not part of the search)
    :param min_silhouette: The minimum acceptable silhouette score
    :return: The best BERTopic model, the topics assigned by the model, and the probabilities of each topic
    """
    best_model, best_topics, best_probs = initial_model, initial_topics, None
    best_rank = None  # (distance to target_outliers, -silhouette) of the current best

    # Candidate values for the swept parameter, capped at max_iters.
    min_cluster_size_range = [200, 300, 500, 600, 1000, 1100, 1200, 1300, 1400]
    candidates = min_cluster_size_range[:max(max_iters, 0)]

    # Representation components are shared by every candidate model.
    # NOTE(review): BERTopic documents that `n_gram_range` is not used when a
    # custom `vectorizer_model` is supplied, so the (1, 2) passed below has no
    # effect with this CountVectorizer.
    vectorizer_model = CountVectorizer(stop_words="english")
    representation_model = MaximalMarginalRelevance(diversity=0.3)
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    zeroshot_topic_list = pd.read_csv('../../../../Datasets/IntentCrime/intent_crime.csv')['intent'].tolist()

    # Start the log with a fresh header; rows are appended per configuration
    # so partial results survive an interrupted run.
    with open(log_file, mode='w', newline='', encoding='utf-8') as log_csv:
        csv.writer(log_csv).writerow(["min_cluster_size", "outliers", "silhouette", "davies_bouldin",
                                      "meets_criteria", "cluster_sizes", "representative_words"])

    print("Iteration 0: Adjusting parameters...")

    # Reduced embeddings used only for scoring. The UMAP settings and seed are
    # the same for every candidate, so the projection is computed once, on a
    # separate instance so the fitted models are not refitted after training.
    scoring_embeddings = None

    for min_clu in tqdm(candidates, desc="min_cluster_size"):
        umap_model = UMAP(n_neighbors=15, n_components=10, metric='cosine', random_state=42)
        hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=min_clu, metric='euclidean',
                                        cluster_selection_method='eom', prediction_data=True)

        topic_model = BERTopic(
            top_n_words=10,
            n_gram_range=(1, 2),
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model,
            representation_model=representation_model,
            zeroshot_topic_list=zeroshot_topic_list,
            zeroshot_min_similarity=.1,
            verbose=True
        )

        topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
        outlier_count = sum(1 for topic in topics if topic == -1)

        print("Computing clustering scores...")
        if scoring_embeddings is None:
            scoring_embeddings = UMAP(n_neighbors=15, n_components=10, metric='cosine',
                                      random_state=42).fit_transform(tc1.corpus_embeddings)
        # The helper drops outliers and returns (-1, inf) instead of raising
        # when a configuration yields too few clusters to score, so one
        # degenerate configuration cannot abort the whole sweep.
        silhouette, davies_bouldin = compute_clustering_scores(topic_model, topics, scoring_embeddings)

        meets_criteria = topics_meet_criteria(topic_model)

        # Cluster sizes and space-joined representative words, in topic-info order.
        topic_info = topic_model.get_topic_info()
        cluster_sizes = topic_info['Count'].to_list()
        formatted_representative_words = [
            " ".join(word for word, _ in topic_model.get_topic(topic_id))
            for topic_id in topic_info['Topic'].to_list()
        ]

        with open(log_file, mode='a', newline='', encoding='utf-8') as log_csv:
            csv.writer(log_csv).writerow([min_clu, outlier_count, silhouette, davies_bouldin,
                                          meets_criteria, cluster_sizes, formatted_representative_words])

        # Keep the eligible configuration closest to the outlier target.
        if meets_criteria and silhouette >= min_silhouette:
            rank = (abs(outlier_count - target_outliers), -silhouette)
            if best_rank is None or rank < best_rank:
                best_rank = rank
                best_model, best_topics, best_probs = topic_model, topics, probs

    return best_model, best_topics, best_probs

In [None]:
# Run the min_cluster_size sweep, logging each configuration to the MiniLM
# CSV, and rebind the notebook-level model/topics/probs to what it returns.
topic_model, topics, probs = grid_search_clusters(
    initial_model=topic_model,
    initial_topics=topics,
    target_outliers=25000,
    log_file="grid_search_log_hdbscan_MiniLM.csv",
)