In [6]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter

In [8]:
dataset = ["I love playing football on the weekends",
 "I enjoy hiking and camping in the mountains",
 "I like to read books and watch movies",
 "I prefer playing video games over sports",
 "I love listening to music and going to concerts"]

In [10]:
# Exercise Question 2: yes, the purity differ when applying text preprocessing before vectorization, before 0.6 and after 0.8
import re
from bs4 import BeautifulSoup
import emoji
from autocorrect import Speller
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download NLTK resources
nltk.download('stopwords')

# Spell checker and stemmer
spell = Speller(lang='en')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Replace slang
    slang_dict = {
        "tbh": "to be honest", "omg": "oh my god", "lol": "laugh out loud",
        "idk": "i don't know", "brb": "be right back", "btw": "by the way",
        "imo": "in my opinion", "smh": "shaking my head", "fyi": "for your information",
        "np": "no problem", "ikr": "i know right", "asap": "as soon as possible",
        "bff": "best friend forever", "gg": "good game", "hmu": "hit me up",
        "rofl": "rolling on the floor laughing"
    }
    slang_pattern = r'\b(' + '|'.join(map(re.escape, slang_dict.keys())) + r')\b'
    text = re.sub(slang_pattern, lambda match: slang_dict[match.group(0)], text, flags=re.IGNORECASE)

    # Replace contractions
    contractions_dict = {
        "wasn't": "was not", "isn't": "is not", "aren't": "are not", "weren't": "were not",
        "doesn't": "does not", "don't": "do not", "didn't": "did not", "can't": "cannot",
        "couldn't": "could not", "shouldn't": "should not", "wouldn't": "would not", "won't": "will not",
        "haven't": "have not", "hasn't": "has not", "hadn't": "had not", "i'm": "i am",
        "you're": "you are", "he's": "he is", "she's": "she is", "it's": "it is", "we're": "we are",
        "they're": "they are", "i've": "i have", "you've": "you have", "we've": "we have",
        "they've": "they have", "i'd": "i would", "you'd": "you would", "he'd": "he would",
        "she'd": "she would", "we'd": "we would", "they'd": "they would", "i'll": "i will",
        "you'll": "you will", "he'll": "he will", "she'll": "she will", "we'll": "we will",
        "they'll": "they will", "let's": "let us", "that's": "that is", "who's": "who is",
        "what's": "what is", "where's": "where is", "when's": "when is", "why's": "why is"
    }
    contractions_pattern = r'\b(' + '|'.join(map(re.escape, contractions_dict.keys())) + r')\b'
    text = re.sub(contractions_pattern, lambda match: contractions_dict[match.group(0).lower()], text, flags=re.IGNORECASE)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Spelling correction
    text = spell(text)

    # Remove stopwords and apply stemming
    words = text.split()
    filtered_stemmed = [stemmer.stem(word) for word in words if word not in stop_words]

    return " ".join(filtered_stemmed)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\isham/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

In [14]:
from sklearn.cluster import KMeans
from tabulate import tabulate

# Define the number of clusters
k = 2
km = KMeans(n_clusters=k)
km.fit(X)

# Predict the clusters for each document
y_pred = km.predict(X)

# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f' {terms[ind]}')
    print()

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    1

Top terms per cluster:
Cluster 0:
 playing
 the
 weekends
 on
 football
 video
 sports
 prefer
 over
 games

Cluster 1:
 to
 and
 read
 watch
 movies
 like
 books
 concerts
 going
 music



In [16]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6


In [18]:
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter

In [20]:
dataset = ["I love playing football on the weekends",
 "I enjoy hiking and camping in the mountains",
 "I like to read books and watch movies",
 "I prefer playing video games over sports",
 "I love listening to music and going to concerts"]

In [22]:
tokenized_dataset = [doc.split() for doc in dataset]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100,
window=5, min_count=1, workers=4)

In [24]:
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in
word2vec_model.wv], axis=0) for doc in dataset])

In [26]:
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)
# Predict the clusters for each document
y_pred = km.predict(X)
# Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0




In [28]:
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)


Purity: 0.8


In [5]:
# Exercise Question 1
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from collections import Counter
from tabulate import tabulate
import re

In [7]:
df = pd.read_csv("customer_complaints_1.csv")

In [9]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
    tokens = [word for word in text.split() if word not in ENGLISH_STOP_WORDS]
    return " ".join(tokens)

df['clean_text'] = df['text'].astype(str).apply(preprocess)
documents = df['clean_text'].dropna().tolist()

In [11]:
# TF-IDF Vectorizer + KMeans Clustering
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(documents)

In [13]:
k = 2
km = KMeans(n_clusters=k, random_state=42)
km.fit(X_tfidf)
y_pred_tfidf = km.predict(X_tfidf)

In [15]:
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(documents[:10], y_pred_tfidf[:10])])  # first 10
print(tabulate(table_data, headers="firstrow"))

Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  Predicted Cluster
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [17]:
purity_tfidf = len(set(y_pred_tfidf)) / len(y_pred_tfidf)
print("TF-IDF Purity:", round(purity_tfidf, 4))

TF-IDF Purity: 0.1053


In [19]:
# Word2Vec Vectorizer + KMeans Clustering
tokenized_docs = [doc.split() for doc in documents]

In [21]:
word2vec_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=1, workers=4)

In [23]:
X_word2vec = np.array([
    np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0)
    for words in tokenized_docs
])

In [25]:
km2 = KMeans(n_clusters=k, random_state=42)
km2.fit(X_word2vec)
y_pred_w2v = km2.predict(X_word2vec)



In [27]:
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(documents[:10], y_pred_w2v[:10])])  # first 10
print(tabulate(table_data, headers="firstrow"))

Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  Predicted Cluster
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [29]:
purity_w2v = len(set(y_pred_w2v)) / len(y_pred_w2v)
print("Word2Vec Purity:", round(purity_w2v, 4))

Word2Vec Purity: 0.1053
