In [1]:
import pandas as pd
import umap
import hdbscan
from sentence_transformers import SentenceTransformer
import plotly.express as px
from keybert import KeyBERT
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [2]:
df = pd.read_csv("final_dataset_Preprocessed.csv")

In [3]:
df.drop("Heading", axis=1, inplace=True)
df.drop("URL", axis=1, inplace=True)

In [4]:
df.dropna(inplace=True)

In [5]:
embedder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)
embeddings = embedder.encode(df["Body"].values)
print(embeddings)

In [6]:
np.save('embeddings_final_dataset_Preprocessed.npy', embeddings)

In [7]:
# embeddings = np.load(r"C:\Users\Dhruv\Downloads\embeddings_Headings_final_data_Preprocessed.npy")

In [8]:
print(embeddings.shape)

(11583, 768)


In [165]:
reduced_embeddings = umap.UMAP(n_components=2, n_neighbors=100, min_dist=0.02).fit_transform(embeddings)
print(reduced_embeddings.shape)

(11583, 2)


In [183]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=35)
labels = clusterer.fit_predict(reduced_embeddings)
df["label"] = [str(label) for label in labels]
print(f"Num of clusters: {labels.max()}")

Num of clusters: 30


In [184]:
# put the values of the two dimensions inside the dataframe
df["x"] = reduced_embeddings[:, 0]
df["y"] = reduced_embeddings[:, 1]
# substring of the full text, for visualization purposes
df["text_short"] = df["Body"].str[:200]

In [None]:
hover_data = {
    "text_short": True,
    "x": False,
    "y": False
}
fig = px.scatter(df, x="x", y="y", template="plotly_dark",
                   title="Embeddings", hover_data=hover_data)
fig.update_layout(showlegend=False)
fig.show()

In [186]:
num_outliers = len(df[df["label"] == "-1"])
print(f"Num of outliers: {num_outliers} ({num_outliers / len(df) * 100:.2f} % of total)")

Num of outliers: 3456 (29.84 % of total)


In [None]:
df_no_outliers = df[df["label"] != "-1"]

# scatter plot
hover_data = {
    "text_short": True,
    "x": False,
    "y": False
}
fig = px.scatter(df_no_outliers, x="x", y="y", template="plotly_dark",
                   title="Embeddings", color="label", hover_data=hover_data)
fig.show()

In [190]:
cluster = "1"
df_subset = df[df["label"] == cluster].reset_index()
texts_concat = ". ".join(df_subset["Heading"].values)
keywords_and_scores = KeyBERT().extract_keywords(texts_concat,
                                    keyphrase_ngram_range=(1, 1), top_n=10)
print(keywords_and_scores)

[('schumacher', 0.5447), ('alonso', 0.5014), ('ferrari', 0.4603), ('prix', 0.4576), ('prixs', 0.4172), ('racing', 0.3931), ('jenson', 0.3871), ('ferraris', 0.375), ('ricciardo', 0.3729), ('fia', 0.368)]


In [191]:
def filter_keywords(keywords, n_keep=3):
    new_keywords = []
    for candidate_keyword in keywords:
        is_ok = True
        for compare_keyword in keywords:
            if candidate_keyword == compare_keyword:
                continue
            if compare_keyword in candidate_keyword:
                is_ok = False
                break
        if is_ok:
            new_keywords.append(candidate_keyword)
            if len(new_keywords) >= n_keep:
                break
    return new_keywords

keywords = [t[0] for t in keywords_and_scores]
keywords_filtered = filter_keywords(keywords)
print(keywords_filtered)

['schumacher', 'alonso', 'ferrari']


In [None]:
df_no_outliers

In [None]:
# assign a meaningful name to each cluster
def get_cluster_name(df, cluster):
    df_subset = df[df["label"] == cluster].reset_index()
    texts_concat = ". ".join(df_subset["Heading"])
    kw_model = KeyBERT()
    keywords_and_scores = kw_model.extract_keywords(texts_concat, keyphrase_ngram_range=(1, 1),
                                        top_n=10)
    keywords = [t[0] for t in keywords_and_scores]
    keywords_filtered = filter_keywords(keywords)
    return " - ".join(keywords_filtered)

# get all the new cluster names
all_clusters = df_no_outliers["label"].unique()
d_cluster_name_mapping = {}
for cluster in all_clusters:
    if cluster == "-1":
        d_cluster_name_mapping[cluster] = "outliers"
    else:
        d_cluster_name_mapping[cluster] = get_cluster_name(df_no_outliers, cluster)

# rename clusters
df_no_outliers["label"] = df_no_outliers["label"].apply(lambda label: d_cluster_name_mapping[label])

In [None]:
clusters = df_no_outliers['label'].unique()

for cluster in clusters:
    # Combine all text for a specific cluster
    cluster_text = ' '.join(df_no_outliers[df_no_outliers['label'] == cluster]['Body'])
    
    # Generate a word cloud for the cluster's text
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(cluster_text)
    
    # Display the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for Cluster {cluster}')
    plt.axis('off')
    plt.show()