In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter

In [2]:
# Toy corpus: five short first-person sentences, one document per entry.
dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts",
]

In [4]:

# Normalise case so that e.g. "I" and "i" are treated as the same token.
dataset = list(map(str.lower, dataset))

In [5]:
# Whitespace tokenisation: one list of words per document.
tokenized_dataset = list(map(str.split, dataset))

In [8]:
# Load NLTK and fetch the data packages this notebook asks for.
import nltk
# 'punkt' is downloaded for word_tokenize; note that the cells shown in this
# notebook tokenize with str.split(), so word_tokenize is imported but not called there.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# The stop-word corpus is read in the following cell through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amaknabil2001\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amaknabil2001\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# English stop words from the NLTK corpus; remove_stopwords() filters tokens against this.
# Note: the variable shares its name with the nltk.corpus.stopwords reader it is built from.
stopwords = nltk.corpus.stopwords.words('english')

In [13]:
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document, in order.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords``
        collection is used, which is the original behaviour.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords
    # A set gives O(1) membership tests instead of scanning the whole list per token.
    blocked = set(stop_words)
    return [token for token in text if token not in blocked]
# Drop stop words from every document's token list.
tokenized_dataset = list(map(remove_stopwords, tokenized_dataset))



In [14]:

#importing the Stemming function from nltk library
from nltk.stem.porter import PorterStemmer

In [15]:
# Single Porter stemmer instance, used by stemming() for every token.
porter_stemmer = PorterStemmer()

In [16]:
def stemming(text):
    """Return the Porter stem of every token in ``text``, preserving order.

    ``text`` is an iterable of word strings; the result is a new list of the
    same length produced by the notebook-level ``porter_stemmer``.
    """
    return [porter_stemmer.stem(token) for token in text]

In [17]:
# Stem every token of every document.
tokenized_dataset = list(map(stemming, tokenized_dataset))

In [18]:
# Inspect the token lists after stop-word removal and stemming.
print(tokenized_dataset)

[['love', 'play', 'footbal', 'weekend'], ['enjoy', 'hike', 'camp', 'mountain'], ['like', 'read', 'book', 'watch', 'movi'], ['prefer', 'play', 'video', 'game', 'sport'], ['love', 'listen', 'music', 'go', 'concert']]


In [19]:
# Fetch the WordNet data package ahead of creating the WordNetLemmatizer in the following cells.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amaknabil2001\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
#importing the Lemmatizer function from nltk library
from nltk.stem import WordNetLemmatizer

In [21]:

# Single WordNet lemmatizer instance, used by lemmatizer() for every token.
wordnet_lemmatizer = WordNetLemmatizer()

In [22]:
def lemmatizer(text):
    """Return the WordNet lemma of every token in ``text``, preserving order.

    ``text`` is an iterable of word strings; each one is passed to the
    notebook-level ``wordnet_lemmatizer`` with its default settings.
    """
    return [wordnet_lemmatizer.lemmatize(token) for token in text]

In [23]:
# Lemmatize every token of every document.
# NOTE(review): the tokens were already Porter-stemmed in an earlier cell
# (e.g. 'footbal', 'movi'), so the lemmatizer receives stems rather than
# dictionary words -- confirm that running both steps in this order is intended.
tokenized_dataset = list(map(lemmatizer, tokenized_dataset))

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer takes one string per document, so glue each token list
# back together with single spaces (this joins per document; nothing is merged
# across documents).
flattened_tokenized_dataset = list(map(" ".join, tokenized_dataset))

# Learn the vocabulary and IDF weights, and build the document-term matrix in one step.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(flattened_tokenized_dataset)


In [29]:
k = 2  # Number of clusters
# n_init is pinned to 10 (the default this run was already using, per the
# FutureWarning it emitted) so the behaviour does not shift with the library
# version, and random_state seeds the centroid initialisation so that
# re-running the notebook yields the same cluster assignment.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
km.fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [30]:
# Assign each document to its nearest centroid.
y_pred = km.predict(X)

# Show every document next to the cluster id it received.
table_data = [
    ["Document", "Predicted Cluster"],
    *([doc, cluster] for doc, cluster in zip(dataset, y_pred)),
]
print(tabulate(table_data, headers="firstrow"))

# For each centroid, list its ten highest-weighted TF-IDF terms.
print("\nTop terms per cluster:")
terms = vectorizer.get_feature_names_out()
# argsort is ascending, so reverse the columns to get heaviest terms first.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(k):
    print("Cluster %d:" % i)
    top_indices = order_centroids[i, :10]
    for ind in top_indices:
        print(' %s' % terms[ind])
    print()


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
i love playing football on the weekends                            1
i enjoy hiking and camping in the mountains                        0
i like to read books and watch movies                              1
i prefer playing video games over sports                           1
i love listening to music and going to concerts                    1

Top terms per cluster:
Cluster 0:
 camp
 enjoy
 hike
 mountain
 weekend
 listen
 concert
 footbal
 game
 go

Cluster 1:
 love
 play
 footbal
 weekend
 go
 sport
 music
 concert
 video
 game



In [31]:
# "Purity" as computed by this notebook.
# NOTE(review): only the predicted labels are counted here -- no ground-truth
# classes are involved -- so the number printed is the share of documents that
# landed in the largest cluster, not clustering purity in the usual sense.
# A true purity score needs a reference label per document.
cluster_label_counts = [Counter(y_pred)]
total_samples = len(y_pred)
majority_total = 0
for label_counts in cluster_label_counts:
    majority_total += max(label_counts.values())
purity = majority_total / total_samples
print("Purity:", purity)

Purity: 0.8


In [33]:
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter

In [34]:
# Train word embeddings on the preprocessed token lists.
# A fixed seed plus a single worker thread makes training repeatable between
# runs; multi-threaded training applies updates in a scheduler-dependent order.
# The corpus is only a handful of sentences, so extra threads gained nothing.
# (Reproducibility across interpreter launches may additionally require
# setting PYTHONHASHSEED -- see the gensim documentation.)
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100,
                          window=5, min_count=1, workers=1, seed=42)

In [35]:
# Represent each document by the mean of its word vectors.
# The vectors are looked up with the SAME preprocessed tokens the model was
# trained on (tokenized_dataset). Splitting the raw sentences instead would
# look up unstemmed words such as "playing", which are not in the vocabulary
# and would be silently dropped from the average.
doc_vectors = []
for tokens in tokenized_dataset:
    known_vectors = [word2vec_model.wv[token] for token in tokens
                     if token in word2vec_model.wv]
    if known_vectors:
        doc_vectors.append(np.mean(known_vectors, axis=0))
    else:
        # No in-vocabulary token: fall back to a zero vector so X stays a
        # rectangular, NaN-free matrix that KMeans can consume.
        doc_vectors.append(np.zeros(word2vec_model.wv.vector_size))
X = np.array(doc_vectors)

In [36]:
k = 2  # Number of clusters
# n_init is pinned to 10 (the default this run was already using, per the
# FutureWarning it emitted) and random_state seeds the centroid initialisation,
# so re-running the notebook yields the same cluster assignment.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
km.fit(X)
# Predict the clusters for each document
y_pred = km.predict(X)
# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# "Purity" as computed by this notebook.
# NOTE(review): only the predicted labels are counted here -- no ground-truth
# classes are involved -- so the number printed is the share of documents in
# the largest cluster, not clustering purity in the usual sense.
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

  super()._check_params_vs_input(X, default_n_init=10)


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
i love playing football on the weekends                            1
i enjoy hiking and camping in the mountains                        0
i like to read books and watch movies                              1
i prefer playing video games over sports                           1
i love listening to music and going to concerts                    1
Purity: 0.8
