In [27]:
# TEXT CLUSTERING USING TF-IDF VECTORIZER WITH PRE-PROCESSING
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

dataset = ["I love playing football on the weekends", 
           "I enjoy hiking and camping in the mountains", 
           "I like to read books and watch movies", 
           "I prefer playing video games over sports", 
           "I love listening to music and going to concerts"]

nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

preprocessed_dataset = []
for document in dataset:
    tokens = word_tokenize(document)
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words]
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    preprocessed_document = " ".join(stemmed_tokens)
    preprocessed_dataset.append(preprocessed_document)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_dataset)

k = 2

km = KMeans(n_clusters=k)
km.fit(X)

y_pred = km.predict(X)

table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()

total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    1

Top terms per cluster:
Cluster 0:
 camp
 enjoy
 hike
 mountain
 weekend
 listen
 concert
 footbal
 game
 go

Cluster 1:
 love
 play
 footbal
 weekend
 go
 sport
 music
 concert
 video
 game

Purity: 0.8


In [20]:
# TEXT CLUSTERING USING WORD2VEC VECTORIZER PRE-PROCESSING
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from string import punctuation

dataset = ["I love playing football on the weekends", 
           "I enjoy hiking and camping in the mountains", 
           "I like to read books and watch movies", 
           "I prefer playing video games over sports", 
           "I love listening to music and going to concerts"]

nltk.download('punkt')
punctuation_set = set(punctuation)
tokenized_dataset = []
for document in dataset:
    tokens = word_tokenize(document)
    tokens = [token.lower() for token in tokens if token not in punctuation_set]
    tokenized_dataset.append(tokens)

word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, 
                          window=5, min_count=1, workers=4)

X = np.array([np.mean([word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv], axis=0) 
              for tokens in tokenized_dataset])

k = 2

km = KMeans(n_clusters=k)
km.fit(X)

y_pred = km.predict(X)

table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\james\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  super()._check_params_vs_input(X, default_n_init=10)


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0
Purity: 0.6


In [40]:
#Q2
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from collections import Counter

df = pd.read_csv('customer_complaints_1.csv')

df.dropna(subset=['text'], inplace=True)

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(tokens)

df['Cleaned Text'] = df['text'].apply(preprocess_text)

tfidf_vectorizer = TfidfVectorizer(max_features=1000) 
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Cleaned Text'])

k = 2 
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(tfidf_matrix)

print("Top terms per cluster:")
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names_out()
for i in range(k):
    print(f"Cluster {i}:")
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    print("  ", ", ".join(top_terms))
    print()

total_samples = len(clusters)
cluster_label_counts = [Counter(clusters)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)


  super()._check_params_vs_input(X, default_n_init=10)


Top terms per cluster:
Cluster 0:
   rude, service, day, rep, joke, charge, fee, local, bill, people

Cluster 1:
   internet, service, customer, comcast, contract, would, speed, xfinity, month, mbps

Purity: 0.8421052631578947
