In [None]:
import numpy as np
import pandas as pd
import torch
import plotly.express as px
import plotly.graph_objects as go

from src.cluster_labeller import Clusterer, ParamHDBSCAN
from src.topic_modeller import vectorize_docs, embed_docs, embed_words, extract_keywords

In [None]:
# Input locations used by the rest of the notebook (relative to the working directory).
data_path = "./datasets/News_Category_Dataset_v3.json"  # read below with pd.read_json(..., lines=True)
model_path = "./models/all-mpnet-base-v2"  # passed to Clusterer as embedding_model_path
emb_dir = "./emb_dir"  # directory handed to clusterer.load_embeddings

In [None]:
# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print('Using device:', device)

In [None]:
# Load the dataset; lines=True reads the file as one JSON object per line.
df = pd.read_json(data_path, lines=True)

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Clean the frame in a single non-mutating chain. The column drop tolerates columns that
# are already gone, so re-running this cell gives the same result instead of raising
# KeyError on the second run.
drop_col = ["link", "authors"]
df = (
    df.drop(columns=drop_col, errors="ignore")  # columns not used by this notebook
    .rename(columns={"headline": "message"})    # later cells read the text from "message"
    .drop_duplicates(subset=["message"])        # one row per distinct headline
    .head(50000)                                # cap the number of documents
    # Contiguous 0..n-1 index: a later cell uses row positions to look up embeddings.
    .reset_index(drop=True)
)

In [None]:
# Summarise columns, dtypes and non-null counts after cleaning.
df.info()

In [None]:
# Headlines (the "message" column) as a plain list in row order; this is the clusterer's input.
docs = df["message"].to_list()

In [None]:
# Build the project Clusterer from the local embedding model path, the headline list
# and the torch device selected above.
clusterer = Clusterer(embedding_model_path=model_path, docs=docs, device=device)

In [None]:
# Load embeddings from emb_dir; `result` is read later through .emb_2d and .emb_source.
# NOTE(review): the commented-out call is presumably the step that computes the embeddings
# and writes them to emb_dir on a first run — confirm in src.cluster_labeller.
# result = clusterer.generate_emb(emb_dir=emb_dir)
result = clusterer.load_embeddings(embeddings_dir=emb_dir)

In [None]:
# Baseline clustering: cluster() is called without arguments, so it runs with whatever
# defaults Clusterer.cluster defines.
clusters, score, params = clusterer.cluster()
unique, counts = np.unique(clusters, return_counts=True)

# Count only non-negative labels. Subtracting 1 from len(unique) is off by one whenever
# the run produces no noise label.
n_clusters = int((unique >= 0).sum())

# MOVE THIS TO CLASS
print(f"Number of docs: {len(docs)}")
print(f"DBCV score: {score:.4f}")
print(f"Params: {params}")
print(f"Number of Classes (Not including noise): {n_clusters}")
print(f"Coverage: {(clusters >= 0).sum()/len(clusters)*100:.2f}%")

In [None]:
# Candidate values for each HDBSCAN setting, passed to clusterer.tune_HDBSCAN below.
# NOTE(review): presumably every combination of these lists is evaluated — confirm in
# src.cluster_labeller.
params = ParamHDBSCAN(
    min_samples=[None, 5, 6, 10],
    min_cluster_size=[5, 10, 15],
    cluster_selection_epsilon=[0.0, 0.1, 0.2],
    cluster_selection_method=["eom"],
    metric=["euclidean"]
)

In [None]:
# Run the HDBSCAN hyperparameter search. The result is read below through the keys
# "num_clusters", "DBCV_score", "coverage" and "params".
tuning_results = clusterer.tune_HDBSCAN(params)

In [None]:
def plot_hyperparam_tuning(tuning_results) -> None:
    """Plot DBCV score and coverage against the number of clusters for every tuning run.

    Two marker traces share the x axis (number of clusters): one for the DBCV score
    (blue) and one for the coverage (red). Hovering a marker shows the run's DBCV
    score, its coverage multiplied by 100 as a percentage, the run's position in
    ``tuning_results["params"]`` and the parameters of that run, so a run can be
    picked by index straight from the plot.

    Args:
        tuning_results: Mapping with equal-length sequences under the keys
            "num_clusters", "DBCV_score", "coverage" and "params"; entry ``i`` of
            each sequence describes tuning run ``i``.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # One row per tuning run; "index" is the run's position in the input sequences.
    runs = pd.DataFrame({
        'num_clusters': tuning_results["num_clusters"],
        'DBCV_score': tuning_results["DBCV_score"],
        'coverage': tuning_results["coverage"],
        'params': tuning_results["params"],
        'index': list(range(len(tuning_results["params"])))
    })

    # Built once and shared by both traces. The parameters are included so the hover
    # label really shows them, as the layout comment below always claimed.
    hover_text = [
        f'DBCV_score: {dbcv:.4f}, Coverage: {cov*100:.2f}%, Index: {idx}<br>Params: {run_params}'
        for dbcv, cov, idx, run_params in zip(
            runs['DBCV_score'], runs['coverage'], runs['index'], runs['params']
        )
    ]

    fig = go.Figure()

    # Same x axis and hover labels for both metrics; only the y column and colour differ.
    for column, colour in (('DBCV_score', 'blue'), ('coverage', 'red')):
        fig.add_trace(go.Scatter(
            x=runs['num_clusters'],
            y=runs[column],
            mode='markers',
            marker=dict(size=10, color=colour),
            text=hover_text,
            name=column
        ))

    # Update layout to include hover mode and show the legend
    fig.update_layout(
        title='Num Clusters vs DBCV_score and Coverage',
        xaxis_title='Num Clusters',
        yaxis_title='Score/Coverage',
        hovermode='closest'
    )

    # Show only the custom hover text (scores, index and params) on hover
    fig.update_traces(hoverinfo='text')

    # Show the plot
    fig.show()

In [None]:
# Plot the tuning runs; the "Index" shown on hover is the position used in the next cell
# to select a parameter set from tuning_results["params"].
plot_hyperparam_tuning(tuning_results)

In [None]:
# Position of the tuning run to re-fit, read off the "Index" field of the hover labels
# in the tuning plot.
selected_run = 16
selected_params: dict = tuning_results["params"][selected_run]

clusters, score, params = clusterer.cluster(**selected_params)
# `counts` stays aligned with `unique` (no in-place sort), so sizes can be matched to labels.
unique, counts = np.unique(clusters, return_counts=True)

# Sizes of real clusters only: negative labels are noise. Sorting all counts and taking
# counts[-2] was only correct when the noise bucket existed and was the largest one.
cluster_sizes = counts[unique >= 0]

# MOVE THIS TO CLASS
print(f"Number of docs: {len(docs)}")
print(f"DBCV score: {score:.4f}")
print(f"Params: {params}")
print(f"Number of Classes (Not including noise): {cluster_sizes.size}")
print(f"Coverage: {(clusters >= 0).sum()/len(clusters)*100:.2f}%")
if cluster_sizes.size:
    print(f"Largest Cluster: {cluster_sizes.max()}")
    # Observed minimum, not the min_cluster_size parameter. This also removes the nested
    # double quotes inside the f-string, which are a SyntaxError before Python 3.12.
    print(f"Smallest Cluster: {cluster_sizes.min()}")
else:
    # Every document was labelled as noise, so there is no cluster size to report.
    print("Largest Cluster: n/a")
    print("Smallest Cluster: n/a")

In [None]:
# Attach each document's cluster label and 2-D coordinates to the frame. Labels are
# stored as strings; later cells compare them against string labels such as '-1'.
projection = result.emb_2d
df["clusters"] = clusters.astype(str)
df["x"], df["y"] = projection[:, 0], projection[:, 1]

In [None]:
# Confirm the "clusters", "x" and "y" columns were added.
df.info()

In [None]:
# Keywords per cluster, keyed by the cluster label as a string. Noise ('-1') is skipped.
keywords_dict = {}

# Filter instead of list.remove('-1'): remove() raises ValueError when the clustering
# produced no noise label at all.
unique_no_noise: list = [label for label in unique.astype(str).tolist() if label != '-1']

for cluster in unique_no_noise:
    print(cluster)

    # Build the membership mask once and reuse it for both the documents and the embeddings.
    in_cluster = (df["clusters"] == cluster).to_numpy()
    # Row positions rather than index labels, so the embedding lookup stays correct even
    # if df's index is not a clean 0..n-1 range.
    indices = np.flatnonzero(in_cluster).tolist()
    cluster_docs = df.loc[in_cluster, "message"].to_list()

    try:
        words, count_matrix = vectorize_docs(cluster_docs)

        word_emb = embed_words(words, emb_model=clusterer.emb_model)
        # Reuse the already-loaded document embeddings instead of re-embedding the cluster.
        doc_emb = result.emb_source[indices]

        keywords = extract_keywords(cluster_docs, words, count_matrix, doc_emb, word_emb)

        keywords_dict[cluster] = keywords["keyword"]
    except ValueError as ve:
        # Best effort: a cluster whose keyword extraction fails gets a placeholder and
        # the error is reported, so the remaining clusters are still processed.
        keywords_dict[cluster] = ['']
        print(ve)

In [None]:
# Attach each row's cluster keywords. Labels with no entry in keywords_dict (such as the
# noise label '-1', which is excluded when the dict is built) map to NaN.
df["topics"] = df["clusters"].map(keywords_dict)

In [None]:
# Inspect the first 30 rows with their cluster and topic assignments.
df.head(30)

In [None]:
# Interactive 2-D map of the documents, coloured by cluster label.
fig = (
    px.scatter(df, x='x', y='y', color='clusters', text='message')
    # mode="markers" keeps the headline text off the canvas; hovertemplate=None drops
    # the template that plotly express generates.
    .update_traces(mode="markers", hovertemplate=None)
    .update_layout(legend_title='Cluster')
)
fig.show()