In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
# Fetch the 20 newsgroups corpus through scikit-learn's dataset loader,
# requesting the 'all' subset.
newsgroups = fetch_20newsgroups(subset='all')

# Convert the raw posts to a TF-IDF matrix: English stop words are dropped
# and the vocabulary is limited to 1000 features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)
X = vectorizer.fit_transform(newsgroups.data)

# Cluster the TF-IDF vectors with K-means. k is the number of clusters and
# can be tuned; random_state is pinned to a fixed seed.
k = 20
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X)

# For every centroid, order the vocabulary indices from the largest centroid
# weight down to the smallest (ascending argsort, then reversed per row).
terms = vectorizer.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

# Print the five highest-weighted terms of each cluster, numbering from 1.
for i, ranked in enumerate(order_centroids):
    print(f"Cluster {i + 1}:")
    top_terms = [terms[ind] for ind in ranked[:5]]
    print(top_terms)

Cluster 1:
['drive', 'scsi', 'ide', 'drives', 'controller']
Cluster 2:
['edu', 'people', 'don', 'com', 'just']
Cluster 3:
['game', 'team', 'ca', 'hockey', 'games']
Cluster 4:
['andrew', 'cmu', 'edu', 'pittsburgh', 'host']
Cluster 5:
['access', 'digex', 'pat', 'net', 'communications']
Cluster 6:
['gun', 'fbi', 'batf', 'koresh', 'guns']
Cluster 7:
['org', 'writes', 'article', 'edu', 'lines']
Cluster 8:
['windows', 'edu', 'file', 'card', 'thanks']
Cluster 9:
['uiuc', 'cso', 'edu', 'henry', 'illinois']
Cluster 10:
['israel', 'israeli', 'jews', 'armenian', 'turkish']
Cluster 11:
['edu', 'university', 'host', 'nntp', 'posting']
Cluster 12:
['god', 'jesus', 'bible', 'christian', 'christ']
Cluster 13:
['cs', 'edu', 'pitt', 'ohio', 'science']
Cluster 14:
['com', 'netcom', 'writes', 'article', 'subject']
Cluster 15:
['sale', '00', 'edu', 'offer', 'condition']
Cluster 16:
['hp', 'com', 'edu', 'organization', 'lines']
Cluster 17:
['bike', 'dod', 'com', 'edu', 'ca']
Cluster 18:
['nasa', 'gov', 'spa