In [3]:
import json
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

# Builds a short two-word label for every cluster of queries.
#
# Input : 'query_to_cluster_id.json' -- a JSON object mapping query -> cluster id.
# Output: 'cluster_labels.json'      -- a JSON object mapping cluster id -> label.

# Step 1: Load the input dictionary.
# The encoding is pinned to UTF-8 so that queries containing non-ASCII
# characters are decoded the same way on every platform instead of depending
# on the locale's default encoding.
with open('query_to_cluster_id.json', 'r', encoding='utf-8') as f:
    query_to_cluster = json.load(f)

# Step 2: Group queries by cluster
cluster_to_queries = defaultdict(list)
for query, cluster_id in query_to_cluster.items():
    cluster_to_queries[cluster_id].append(query)

# Step 3: Generate a 2-word label for each cluster
# Label used whenever fewer than two usable keywords can be extracted.
fallback_label = "Miscellaneous Cluster"
cluster_labels = {}

# Every list in cluster_to_queries holds at least one query (a key is only
# created by the append above), so no empty-list check is needed here.
for cluster_id, queries in cluster_to_queries.items():
    # Use TF-IDF (fitted on this cluster's queries only) to extract top keywords
    vectorizer = TfidfVectorizer(stop_words='english', max_features=50)

    # Only the fit can legitimately fail with the "empty vocabulary" error
    # (e.g. all queries consist of stop words), so only that call is guarded;
    # a ValueError raised anywhere else should surface instead of being hidden.
    try:
        X = vectorizer.fit_transform(queries)
    except ValueError:  # empty vocabulary error
        cluster_labels[cluster_id] = fallback_label
        continue

    feature_names = vectorizer.get_feature_names_out()
    if len(feature_names) < 2:
        # Not enough distinct terms to form a two-word label.
        label = fallback_label
    else:
        # Average each term's TF-IDF weight over the cluster's queries
        # (.A1 flattens the 1 x n_terms result to a 1-D array), then keep
        # the two highest-scoring terms, best first.
        tfidf_scores = X.mean(axis=0).A1
        top_indices = tfidf_scores.argsort()[::-1][:2]
        label = ' '.join(feature_names[idx] for idx in top_indices)

    cluster_labels[cluster_id] = label

# Step 4: Persist the labels
with open('cluster_labels.json', 'w', encoding='utf-8') as f:
    json.dump(cluster_labels, f, indent=2)
print("Cluster labels saved to 'cluster_labels.json'.")


Cluster labels saved to 'cluster_labels.json'.
