In [121]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [122]:
file_path = "Lab_03_data1.txt"

In [123]:
# reading the file data to variable text
with open(file_path,'r') as f:
    text = f.read()

## Question 1

In [124]:
# Remove punctuation marks and quotation marks
text = text.translate(str.maketrans('', '', string.punctuation +  "‘’“”'"))


In [125]:
# Tokenization
tokens = word_tokenize(text)


In [126]:
# Stopword Removal
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if not token.lower() in stop_words]


In [127]:
# Lemmatiztion
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(token) for token in tokens]

In [128]:
# Combine tokens back into a string
processed_text = ' '.join(tokens)


## Question 2

In [129]:
# Implement TF-IDF model from scratch
# Compute document frequencies
doc_freq = defaultdict(int)
for token in tokens:
    doc_freq[token] += 1

In [130]:
# Compute inverse document frequencies
num_docs = len(doc_freq)
idf = {}
for token, freq in doc_freq.items():
    idf[token] = math.log(num_docs / (freq + 1))


In [131]:
# Compute TF-IDF weights for each token
tf_idf = {}
for token in tokens:
    tf_idf[token] = (tokens.count(token) / len(tokens)) * idf[token]


In [132]:
# Print the top 10 tokens by TF-IDF weight
sorted_tokens = sorted(tf_idf.items(), key=lambda x: x[1], reverse=True)
print('Top 10 tokens by TF-IDF weight:')
for token, weight in sorted_tokens[:10]:
    print(token, weight)


Top 10 tokens by TF-IDF weight:
system 0.046525191668186576
information 0.04278862693241877
energy 0.037546532484194955
thermodynamics 0.02900313374368789
environment 0.02900313374368789
organism 0.025929788661136176
one 0.02434181763845234
said 0.02434181763845234
Business 0.02434181763845234
law 0.022715925329262112


## Question 3

In [133]:
# Implement TF-IDF model with Scikit-learn package
vectorizer = TfidfVectorizer(use_idf=True)
tf_idf_matrix = vectorizer.fit_transform([processed_text])
feature_names = vectorizer.get_feature_names()


In [134]:
# Print the top 10 tokens by TF-IDF weight using Scikit-learn
tf_idf_scores = tf_idf_matrix.toarray()[0]
sorted_scores = np.argsort(tf_idf_scores)[::-1]
print('Top 10 tokens by TF-IDF weight (Scikit-learn):')
for index in sorted_scores[:10]:
    print(feature_names[index], tf_idf_scores[index])


Top 10 tokens by TF-IDF weight (Scikit-learn):
system 0.271883104297442
information 0.2621729934296762
energy 0.2039123282230815
thermodynamics 0.16507188475201834
environment 0.15536177388425257
life 0.135941552148721
entropy 0.1262314412809552
one 0.1262314412809552
organism 0.1262314412809552
business 0.11652133041318942


## Question 4

In [136]:
# Implement K-means clustering on the processed data
num_clusters = int(input("Enter the number of clusters: "))
kmeans = KMeans(n_clusters=num_clusters,random_state=42)
kmeans.fit(tf_idf_matrix)
labels = kmeans.labels_