In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter

In [2]:
dataset = ["I love playing football on the weekends",
 "I enjoy hiking and camping in the mountains",
 "I like to read books and watch movies",
 "I prefer playing video games over sports",
 "I love listening to music and going to concerts"]

In [3]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

In [12]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)


In [13]:
# Predict the clusters for each document
y_pred = km.predict(X)

# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Page 3 CISB5123 Nur Laila Ab Ghani

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0

Top terms per cluster:
Cluster 0:
 to
 and
 read
 watch
 movies
 like
 books
 concerts
 going
 music

Cluster 1:
 playing
 the
 weekends
 on
 football
 video
 sports
 prefer
 over
 games



In [11]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.4


In [8]:
print(X)

  (0, 26)	0.4493418549869352
  (0, 22)	0.36252617931707154
  (0, 16)	0.4493418549869352
  (0, 5)	0.4493418549869352
  (0, 18)	0.36252617931707154
  (0, 12)	0.36252617931707154
  (1, 13)	0.4049070906159763
  (1, 9)	0.4049070906159763
  (1, 2)	0.4049070906159763
  (1, 0)	0.27117099759647145
  (1, 8)	0.4049070906159763
  (1, 4)	0.4049070906159763
  (1, 22)	0.3266764912066096
  (2, 14)	0.4049070906159763
  (2, 25)	0.4049070906159763
  (2, 1)	0.4049070906159763
  (2, 20)	0.4049070906159763
  (2, 23)	0.3266764912066096
  (2, 10)	0.4049070906159763
  (2, 0)	0.27117099759647145
  (3, 21)	0.4206690600631704
  (3, 17)	0.4206690600631704
  (3, 6)	0.4206690600631704
  (3, 24)	0.4206690600631704
  (3, 19)	0.4206690600631704
  (3, 18)	0.3393931489111758
  (4, 3)	0.3603026019368333
  (4, 7)	0.3603026019368333
  (4, 15)	0.3603026019368333
  (4, 11)	0.3603026019368333
  (4, 23)	0.5813797411859515
  (4, 0)	0.24129885168269352
  (4, 12)	0.29068987059297574
