In [24]:
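# Dependencies (scikit-learn is imported below as `sklearn`; UMAP must come from
# the `umap-learn` distribution -- a "No module named 'umap.umap_'" error
# usually means a different package named `umap` is installed instead; verify
# with `pip show umap-learn`).
#!pip install pandas sentence-transformers hdbscan numpy matplotlib umap-learn scikit-learn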


import json
from sentence_transformers import SentenceTransformer
import umap.umap_ as umap
import hdbscan
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from collections import defaultdict
import numpy as np
import pandas as pd
import random

# Load the extracted events. The file is decoded explicitly as UTF-8 (the
# standard JSON encoding) instead of relying on the platform's default locale
# encoding, which can mis-decode or fail on non-ASCII text on some systems.
with open('extracted_events.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the nested structure into a flat list of strings: `data` is iterated
# as a sequence of sub-sequences, and every entry is indexed by the
# 'Phrase Summary' key (a KeyError is raised if an entry lacks that key).
texts = [
    entry['Phrase Summary']
    for sublist in data
    for entry in sublist
]


ModuleNotFoundError: No module named 'umap.umap_'

In [23]:
# Load the pretrained sentence-embedding model by name
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
)

# Encode every entry of `texts`, displaying a progress bar while batches run
embeddings = model.encode(
    texts,
    show_progress_bar=True,
)


Batches:   0%|          | 0/296 [00:00<?, ?it/s]

In [None]:
# Dimensionality Reduction with UMAP

# Project the sentence embeddings with UMAP using a fixed seed.
# NOTE(review): `umap_model` / `umap_embeddings` are reassigned by the later
# visualization cell (which refits UMAP on the noise-filtered embeddings), so
# this projection is not consumed by the clustering cells in between.
umap_params = dict(n_neighbors=15, min_dist=0.05, random_state=42)
umap_model = umap.UMAP(**umap_params)
umap_embeddings = umap_model.fit_transform(embeddings)


In [None]:
# HDBSCAN Parameter Tuning

# Function to test multiple HDBSCAN parameter configurations
def tune_hdbscan(embeddings, min_cluster_sizes, min_samples_values, metric='euclidean'):
    """Fit HDBSCAN for every (min_cluster_size, min_samples) pair and tabulate the outcome.

    Args:
        embeddings: Data passed unchanged to ``HDBSCAN.fit_predict``.
        min_cluster_sizes: Iterable of ``min_cluster_size`` values to try.
        min_samples_values: Iterable of ``min_samples`` values to try.
        metric: Distance metric forwarded to ``hdbscan.HDBSCAN``. Defaults to
            ``'euclidean'``, the value that used to be hard-coded.

    Returns:
        pandas.DataFrame with one row per parameter combination and the
        columns ``min_cluster_size``, ``min_samples``, ``num_clusters``
        (distinct labels, not counting the noise label -1) and
        ``noise_points`` (number of points labelled -1). The columns are
        present even when a grid is empty.
    """
    # Materialize the inner grid so a one-shot iterable (e.g. a generator) is
    # not exhausted after the first outer iteration.
    min_samples_values = list(min_samples_values)

    columns = ['min_cluster_size', 'min_samples', 'num_clusters', 'noise_points']
    results = []
    for min_cluster_size in min_cluster_sizes:
        for min_samples in min_samples_values:
            clusterer = hdbscan.HDBSCAN(
                min_cluster_size=min_cluster_size,
                min_samples=min_samples,
                metric=metric,
            )
            labels = np.asarray(clusterer.fit_predict(embeddings))

            # Label -1 marks noise; it is counted separately and is not a cluster.
            is_noise = labels == -1
            noise_points = int(np.count_nonzero(is_noise))
            num_clusters = int(np.unique(labels[~is_noise]).size)

            results.append({
                'min_cluster_size': min_cluster_size,
                'min_samples': min_samples,
                'num_clusters': num_clusters,
                'noise_points': noise_points
            })
            print(f"min_cluster_size={min_cluster_size}, min_samples={min_samples} -> Clusters: {num_clusters}, Noise Points: {noise_points}")

    return pd.DataFrame(results, columns=columns)

# Candidate values for the two HDBSCAN parameters explored by the grid search
min_cluster_sizes = [5, 10, 15]
min_samples_values = [1, 2, 5]

# Evaluate every combination on the embeddings and print the resulting table
hdbscan_results = tune_hdbscan(
    embeddings,
    min_cluster_sizes=min_cluster_sizes,
    min_samples_values=min_samples_values,
)
print(hdbscan_results)


In [None]:
# Choosing Optimal HDBSCAN Parameters

# Final clustering with the parameter pair picked from the tuning results
best_clusterer = hdbscan.HDBSCAN(
    min_cluster_size=10,
    min_samples=5,
    metric='euclidean',
)
cluster_labels = best_clusterer.fit_predict(embeddings)

# Boolean mask that is False for noise points (label -1); the same mask is
# applied to the embeddings, the labels and the texts so they stay aligned
valid_points = cluster_labels != -1
filtered_embeddings = embeddings[valid_points]
filtered_labels = cluster_labels[valid_points]
filtered_texts = [text for text, keep in zip(texts, valid_points) if keep]


In [None]:
# Visualizing Clusters with UMAP

# Apply UMAP on filtered embeddings (noise points already removed)
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.05, random_state=42)
umap_embeddings = umap_model.fit_transform(filtered_embeddings)

# Further apply t-SNE on the UMAP-reduced data.
# The iteration count is intentionally left at its default of 1000 (the value
# previously passed explicitly): the keyword was renamed from `n_iter` to
# `max_iter` in scikit-learn 1.5, so naming either one breaks on some versions.
# t-SNE requires perplexity < number of samples, so clamp it for small inputs.
tsne_perplexity = min(50, len(umap_embeddings) - 1)
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
tsne_embeddings = tsne.fit_transform(umap_embeddings)

# Plot with cluster annotations
plt.figure(figsize=(12, 8))
scatter = plt.scatter(tsne_embeddings[:, 0], tsne_embeddings[:, 1],
                      c=filtered_labels, cmap='tab10', alpha=0.7, s=15)
plt.title('t-SNE Clustering with Annotated Samples')
plt.colorbar(scatter, label='Cluster ID')

# Annotate up to three sample points per cluster. A dedicated seeded generator
# keeps the chosen samples identical across re-runs, like the seeded UMAP and
# t-SNE steps, without touching the global `random` state.
annotation_rng = random.Random(42)
for cluster_id in np.unique(filtered_labels):
    # .tolist() because random sampling needs a real sequence, not an ndarray
    cluster_indices = np.flatnonzero(filtered_labels == cluster_id).tolist()
    random_indices = annotation_rng.sample(cluster_indices, min(3, len(cluster_indices)))
    for i in random_indices:
        plt.annotate(filtered_texts[i], (tsne_embeddings[i, 0], tsne_embeddings[i, 1]), fontsize=8)

plt.show()


In [None]:
# Extract Keywords for Each Cluster

def extract_keywords(texts, labels):
    """Collect up to five vocabulary terms for each cluster.

    Args:
        texts: Sequence of documents.
        labels: Cluster label of each document, aligned with ``texts``.
            Documents labelled -1 (noise) are ignored.

    Returns:
        ``defaultdict(list)`` mapping each non-noise cluster label to the
        feature names reported by a ``TfidfVectorizer(max_features=5,
        stop_words='english')`` fitted on that cluster's documents only.
        Clusters are inserted in ascending label order. A cluster whose
        documents yield no vocabulary maps to an empty array instead of
        raising.
    """
    # Group the documents by cluster in a single pass instead of rescanning
    # the whole corpus once per cluster.
    texts_by_cluster = defaultdict(list)
    for text, label in zip(texts, labels):
        if label != -1:  # Skip noise points
            texts_by_cluster[label].append(text)

    vectorizer = TfidfVectorizer(max_features=5, stop_words='english')
    cluster_keywords = defaultdict(list)
    for cluster in sorted(texts_by_cluster):
        try:
            # Only the fitted vocabulary is needed, so the TF-IDF matrix
            # itself is never built.
            vectorizer.fit(texts_by_cluster[cluster])
        except ValueError:
            # Raised for an empty vocabulary, e.g. when the cluster's
            # documents contain nothing but stop words.
            cluster_keywords[cluster] = np.array([], dtype=object)
            continue
        cluster_keywords[cluster] = vectorizer.get_feature_names_out()

    return cluster_keywords

# Compute the per-cluster keywords, then print one line per cluster
keywords = extract_keywords(filtered_texts, filtered_labels)
for cluster in keywords:
    words = keywords[cluster]
    print(f"Cluster {cluster} Keywords: {', '.join(words)}")


In [None]:
# Summarize and Review Clusters

# Build one text summary per cluster (keywords plus up to three example
# texts), store it in `cluster_summaries` and print it for manual review
cluster_summaries = {}
for cluster_id in set(filtered_labels):
    if cluster_id == -1:
        continue

    member_texts = [
        text
        for text, label in zip(filtered_texts, filtered_labels)
        if label == cluster_id
    ]
    examples = member_texts[:3]
    keywords_list = keywords[cluster_id]

    # Assemble the summary from its pieces: header, theme sentence, examples
    parts = [
        f"**Cluster {cluster_id} - Theme**: {', '.join(keywords_list)}\n",
        "This section includes events and discussions focused on topics like ",
        ", ".join(keywords_list) + ".\n",
        "Example events include:\n",
    ]
    parts.extend(f"- {example}\n" for example in examples)
    summary = "".join(parts)

    cluster_summaries[cluster_id] = summary
    print(summary)  # Review summaries manually


In [None]:
# Evaluate Clustering Performance

# `umap_embeddings` was last fitted on the noise-filtered points, so it has to
# be scored against `filtered_labels` (same length). The full `cluster_labels`
# array still contains the noise points and would make silhouette_score raise
# because the sample counts differ.
n_clusters = len(set(filtered_labels))

# The silhouette score is only defined for 2 <= n_clusters <= n_samples - 1.
if 2 <= n_clusters < len(filtered_labels):
    score = silhouette_score(umap_embeddings, filtered_labels)
    print(f'Silhouette Score: {score:.2f}')
else:
    score = float('nan')
    print(f'Silhouette Score: undefined for {n_clusters} cluster(s)')
