In [1]:
import numpy as np 
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer 
from tabulate import tabulate 
from collections import Counter 

In [2]:
dataset = ["I love playing football on the weekends", 
           "I enjoy hiking and camping in the mountains", 
           "I like to read books and watch movies", 
           "I prefer playing video games over sports", 
           "I love listening to music and going to concerts"]

In [3]:
vectorizer = TfidfVectorizer() 
X = vectorizer.fit_transform(dataset)

In [4]:
k = 2  # Define the number of clusters 
km = KMeans(n_clusters=k) 
km.fit(X)

In [5]:
# Predict the clusters for each document 
y_pred = km.predict(X) 
 
# Display the document and its predicted cluster in a table 
table_data = [["Document", "Predicted Cluster"]] 
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)]) 
print(tabulate(table_data, headers="firstrow"))

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              1
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    0


In [7]:
# Print top terms per cluster 
print("\nTop terms per cluster:") 
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 
terms = vectorizer.get_feature_names_out() 
for i in range(k): 
    print("Cluster %d:" % i) 
    for ind in order_centroids[i, :10]: 
        print(' %s' % terms[ind]) 
        print() 


Top terms per cluster:
Cluster 0:
 playing

 love

 to

 weekends

 on

 football

 video

 sports

 prefer

 over

Cluster 1:
 and

 movies

 books

 camping

 enjoy

 hiking

 in

 like

 watch

 mountains



In [8]:
# Calculate purity 
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity) 

Purity: 0.6


In [9]:
import numpy as np 
from sklearn.cluster import KMeans 
from gensim.models import Word2Vec 
from tabulate import tabulate 
from collections import Counter 

In [10]:
dataset = ["I love playing football on the weekends", 
           "I enjoy hiking and camping in the mountains", 
           "I like to read books and watch movies", 
           "I prefer playing video games over sports", 
           "I love listening to music and going to concerts"] 

In [11]:
tokenized_dataset = [doc.split() for doc in dataset] 
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, 
window=5, min_count=1, workers=4)

In [12]:
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in 
word2vec_model.wv], axis=0) for doc in dataset]) 

In [13]:
k = 2  # Define the number of clusters 
km = KMeans(n_clusters=k) 
km.fit(X) 
 
# Predict the clusters for each document 
y_pred = km.predict(X) 
 
# Tabulate the document and predicted cluster 
table_data = [["Document", "Predicted Cluster"]] 
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)]) 
print(tabulate(table_data, headers="firstrow")) 


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        1
I like to read books and watch movies                              1
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    1




In [14]:
# Calculate purity 
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity) 

Purity: 0.8


<h1>Exercise 1</h1>

In [23]:
import pandas as pd 
import re 
import emoji 
import string 
import nltk 
 
from bs4 import BeautifulSoup 
from autocorrect import Speller 
from nltk.corpus import stopwords, wordnet 
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag 
 
nltk.download('stopwords') 
nltk.download('wordnet')                    # For lemmatization 
nltk.download('omw-1.4')                     # WordNet lexical database 
nltk.download('averaged_perceptron_tagger_eng')  # For POS tagging 
nltk.download('punkt')                       # For tokenization 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haoho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\haoho\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\haoho\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\haoho\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\haoho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
slang_dict = {
    "tbh": "to be honest",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "idk": "I don't know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "smh": "shaking my head",
    "fyi": "for your information",
    "np": "no problem",
    "ikr": "I know right",
    "asap": "as soon as possible",
    "bff": "best friend forever",
    "gg": "good game",
    "hmu": "hit me up",
    "rofl": "rolling on the floor laughing",
    "lmao": "laughing my ass off",
    "wtf": "what the fuck",
    "wth": "what the hell",
    "afk": "away from keyboard",
    "ftw": "for the win",
    "irl": "in real life",
    "ama": "ask me anything",
    "tldr": "too long didn't read",
    "nsfw": "not safe for work",
    "atm": "at the moment",
    "bday": "birthday",
    "dm": "direct message",
    "rn": "right now",
    "ty": "thank you",
    "yw": "you're welcome",
    "wbu": "what about you",
    "wyd": "what are you doing",
    "ily": "I love you",
    "idc": "I don't care",
    "yolo": "you only live once",
    "sus": "suspicious",
    "fr": "for real",
    "cap": "lie",
    "no cap": "no lie",
    "salty": "bitter or upset",
    "savage": "brutally honest or bold",
    "based": "confident and unapologetically true",
    "simp": "someone who does too much for someone they like",
    "fam": "close friend or family",
    "vibe": "a mood or atmosphere",
    "mfw": "my face when",
    "tfw": "that feeling when",
    "afaik": "as far as I know",
    "fomo": "fear of missing out",
    "bruh": "bro or seriously?",
    "thx": "thanks",
    "pls": "please",
    "rip": "rest in peace",
    "pog": "awesome or amazing moment",
    "rizz": "charisma or charm, especially in flirting",
    "delulu": "delusional",
    "solulu": "solution",
    "trululu": "truth",
    "brain rot": "mental fatigue from consuming low-quality or excessive online content",
    "let them cook": "encouragement to let someone continue their actions or ideas",
    "yassified": "glamorized or made fabulous in an exaggerated way",
    "mid": "mediocre or average",
    "unhinged": "wild or chaotic behavior",
    "lore": "background story or context",
    "chat": "a group of people, often used to address an audience",
    "yap": "to talk excessively or gossip",
    "sigma": "a term describing a confident, independent person",
    "gigachad": "an idealized, hyper-masculine man",
    "baddie": "a confident and attractive woman",
    "mewing": "a facial exercise technique aimed at improving jawline definition",
    "beta maxing": "exhibiting submissive or non-dominant behavior",
    "skibidi": "nonsensical term from a viral meme, often used humorously",
    "gyatt": "exclamation of surprise or admiration",
    "fanum tax": "stealing food from someone",
    "bop": "a derogatory term used to shame individuals for perceived promiscuity",
    "serving cunt": "exuding confidence and style in a bold, unapologetic manner",
    "fujoing out": "obsessing over male-male fictional relationships",
    "AI slop": "low-quality content generated by artificial intelligence",
    "tim cheese": "a fictional meme character from a viral trend",
    "vibe check": "assessing someone's mood or energy",
    "main character energy": "behaving as if one is the protagonist of a story",
    "clout": "influence or fame, especially on social media",
    "ghosting": "suddenly cutting off communication without explanation",
    "thirst trap": "a photo or post intended to attract attention",
    "receipts": "evidence or proof, often in the form of screenshots",
    "shadowban": "a stealth ban where a user's content is hidden without their knowledge",
    "flex": "to show off",
    "ratio": "a situation where replies to a post outnumber likes, indicating disapproval",
    "thirsty": "desperate for attention",
    "cheugy": "out of date or trying too hard",
    "slay": "to excel or do something exceptionally well",
    "extra": "over the top; dramatic",
    "period": "emphasis on a statement; end of discussion",
    "bet": "agreement or affirmation",
    "cray": "crazy",
    "pwn": "dominate or defeat",
    "kda": "kill/death/assist ratio in gaming",
    "rpg": "role-playing game",
    "noob": "newbie or inexperienced person",
    "goated": "exceptionally good; the best",
    "drip": "stylish or fashionable",
    "stan": "overzealous fan",
    "snatched": "looking good or fashionable",
    "lowkey": "slightly or secretly",
    "highkey": "very or openly",
    "fire": "excellent or amazing",
    "lit": "exciting or excellent",
    "suss": "suspicious",
    "woke": "socially aware",
    "karen": "entitled or demanding woman",
    "chad": "confident and attractive man",
    "yeet": "throw or discard forcefully",
    "boomer": "older person",
    "gen z": "generation born between mid-1990s and early 2010s",
    "gen alpha": "generation born from early 2010s onwards",
    "af": "as fuck",
    "tf": "the fuck",
    "tysm": "thank you so much",
    "omw": "on my way",
    "bbl": "be back later",
    "gtg": "got to go",
    "ttyl": "talk to you later",
    "ilysm": "I love you so much",
    "lmk": "let me know",
    "nvm": "never mind",
    "tmi": "too much information",
    "hbu": "how about you",
    "that's tea": "that's gossip or the truth",
    "aura": "unique vibe or energy someone gives off"
}
slang_dict = {k.lower(): v for k, v in slang_dict.items()}
spell = Speller(lang='en')
nltk.download('stopwords') 
stop_words = set(stopwords.words('english')) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haoho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
def lower_case(text):
    return text.lower()

def url_removal(text):
    return re.sub(r'http\S+|www\S+', '', text)  

def html_removal(text):
    return BeautifulSoup(text, "html.parser").get_text()

def punctuation_special_character_removal(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def numbers_removal(text):
    return re.sub(r"\d", "", text)

def emoji_removal(text):
    return emoji.replace_emoji(text, replace="")

def internet_slang_replacement(text):
    escaped_slang_words = []
    for word in slang_dict.keys(): 
        escaped_word = re.escape(word)  # Ensure special characters are escaped 
        escaped_slang_words.append(escaped_word)  # Add to list 
 
    # Join the words using '|' 
    slang_pattern = r'\b(' + '|'.join(escaped_slang_words) + r')\b' 
 
    # Define a replacement function 
    def replace_match(match): 
        slang_word = match.group(0).lower()  # Extract matched slang word 
        return slang_dict.get(slang_word, slang_word)   
 
    # Use regex to replace slang words with full forms 
    replaced_text = re.sub(slang_pattern, replace_match, text, flags=re.IGNORECASE) 
 
    return replaced_text 

def spelling_correction(text):
    return spell(text)

def standardizing_cleaning_pipeline(text):
    if not isinstance(text, str):
        return ""
    text = lower_case(text)
    text = url_removal(text)
    text = html_removal(text)
    text = emoji_removal(text)
    text = internet_slang_replacement(text)
    text = punctuation_special_character_removal(text)
    text = numbers_removal(text)
    text = spelling_correction(text)
    return text

for i in range(len(dataset)):
    dataset[i] = standardizing_cleaning_pipeline(dataset[i])

In [26]:
dataset

['i love playing football on the weekends',
 'i enjoy hiking and camping in the mountains',
 'i like to read books and watch movies',
 'i prefer playing video games over sports',
 'i love listening to music and going to concerts']

In [29]:
def remove_stopwords(text): 
    words = text.split()   
    filtered_words = [] 
    for word in words:  
        lower_word = word.lower() 
        if lower_word not in stop_words: 
            filtered_words.append(word) 
    return " ".join(filtered_words) 

for i in range(len(dataset)):
    dataset[i] = remove_stopwords(dataset[i])

In [30]:
dataset

['love playing football weekends',
 'enjoy hiking camping mountains',
 'like read books watch movies',
 'prefer playing video games sports',
 'love listening music going concerts']

In [31]:
from nltk.stem.porter import *

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

def stem_text(text): 
    if not isinstance(text, str): 
        return "" 
    
    words = text.split() 
    stemmed_words = [stemmer.stem(word) for word in words]  # Apply stemming 
    return " ".join(stemmed_words)

def get_wordnet_pos(nltk_tag): 
    if nltk_tag.startswith('J'):  # Adjective 
        return wordnet.ADJ 
    elif nltk_tag.startswith('V'):  # Verb 
        return wordnet.VERB 
    elif nltk_tag.startswith('N'):  # Noun 
        return wordnet.NOUN 
    elif nltk_tag.startswith('R'):  # Adverb 
        return wordnet.ADV 
    else: 
        return wordnet.NOUN  # Default to noun 

def lemmatize_text(text):
    if not isinstance(text, str):  # Ensure input is a string 
       return "" 
 
    words = word_tokenize(text) 
    pos_tags = pos_tag(words)  
     
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    return " ".join(lemmatized_words) 

for i in range(len(dataset)):
    dataset[i] = stem_text(dataset[i])
    dataset[i] = lemmatize_text(dataset[i])

dataset

['love play footbal weekend',
 'enjoy hike camp mountain',
 'like read book watch movi',
 'prefer play video game sport',
 'love listen music go concert']

In [32]:
dataset2 = dataset.copy()
vectorizer = TfidfVectorizer() 
X = vectorizer.fit_transform(dataset2)

In [33]:
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in 
word2vec_model.wv], axis=0) for doc in dataset2]) 

In [34]:
k = 2  # Define the number of clusters 
km = KMeans(n_clusters=k) 
km.fit(X) 
 
# Predict the clusters for each document 
y_pred = km.predict(X) 
 
# Tabulate the document and predicted cluster 
table_data = [["Document", "Predicted Cluster"]] 
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset2, y_pred)]) 
print(tabulate(table_data, headers="firstrow")) 


Document                        Predicted Cluster
----------------------------  -------------------
love play footbal weekend                       0
enjoy hike camp mountain                        1
like read book watch movi                       1
prefer play video game sport                    1
love listen music go concert                    0




In [35]:
# Calculate purity 
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity) 

Purity: 0.6


<h2>The purity is still the same after preprocessing for tfidf</h2>

In [36]:
tokenized_dataset = [doc.split() for doc in dataset] 
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, 
window=5, min_count=1, workers=4)

In [37]:
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in 
word2vec_model.wv], axis=0) for doc in dataset]) 

In [38]:
k = 2  # Define the number of clusters 
km = KMeans(n_clusters=k) 
km.fit(X) 
 
# Predict the clusters for each document 
y_pred = km.predict(X) 
 
# Tabulate the document and predicted cluster 
table_data = [["Document", "Predicted Cluster"]] 
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)]) 
print(tabulate(table_data, headers="firstrow")) 


Document                        Predicted Cluster
----------------------------  -------------------
love play footbal weekend                       1
enjoy hike camp mountain                        0
like read book watch movi                       1
prefer play video game sport                    1
love listen music go concert                    1




In [39]:
# Calculate purity 
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity) 

Purity: 0.8


<h2>No changes for Word2Vector as well</h2>