In [None]:
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
sentences = [
    "Welcome to Great Learning, Now start learning",
    "Learning is a good practice"
]

In [None]:
sentences = [sentence.lower() for sentence in sentences]

In [None]:
stop_words = set(stopwords.words('english'))
processed_sentences = []
for sentence in sentences:
    words = re.findall(r'\b\w+\b', sentence)
    filtered_words = [word for word in words if word not in stop_words]
    processed_sentences.append(" ".join(filtered_words))


In [None]:
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(processed_sentences)

In [None]:
print("Vocabulary:", vectorizer.get_feature_names_out())
print("Bag of Words Matrix:\n", bow_matrix.toarray())

Vocabulary: ['good' 'great' 'learning' 'practice' 'start' 'welcome']
Bag of Words Matrix:
 [[0 1 2 0 1 1]
 [1 0 1 1 0 0]]


In [None]:
import nltk
import re
import numpy as np
from nltk.corpus import gutenberg
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('gutenberg')
nltk.download('punkt')
try:
    nltk.download('punkt_tab')
except Exception as e:
    print("Warning: Could not download 'punkt_tab'. Falling back to regex-based sentence splitting.")

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
raw_text = gutenberg.raw('shakespeare-hamlet.txt')
try:
    sentences = nltk.sent_tokenize(raw_text.lower())
except LookupError:
    sentences = re.split(r'(?<=[.!?])\s+', raw_text.lower())


In [None]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sentences)

In [None]:
print("Vocabulary (first 20 words):", vectorizer.get_feature_names_out()[:20])
print("TF-IDF Matrix shape:", tfidf_matrix.shape)

Vocabulary (first 20 words): ['1599' 'abhominably' 'abhorred' 'abilitie' 'aboord' 'aboue' 'about'
 'abridgements' 'abroad' 'absent' 'absolute' 'abstinence' 'abstracts'
 'absurd' 'abus' 'abuse' 'abuses' 'accent' 'accepts' 'accesse']
TF-IDF Matrix shape: (2353, 4688)


In [None]:
print("\nTF-IDF Scores for the first sentence:")
feature_names = vectorizer.get_feature_names_out()
first_sentence_tfidf = tfidf_matrix[0].toarray()[0]
for word, score in zip(feature_names[:20], first_sentence_tfidf[:20]):
    print(f"{word}: {score:.4f}")


TF-IDF Scores for the first sentence:
1599: 0.3938
abhominably: 0.0000
abhorred: 0.0000
abilitie: 0.0000
aboord: 0.0000
aboue: 0.0000
about: 0.0000
abridgements: 0.0000
abroad: 0.0000
absent: 0.0000
absolute: 0.0000
abstinence: 0.0000
abstracts: 0.0000
absurd: 0.0000
abus: 0.0000
abuse: 0.0000
abuses: 0.0000
accent: 0.0000
accepts: 0.0000
accesse: 0.0000


In [19]:
import nltk
import re
import numpy as np
from nltk.corpus import gutenberg
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from scipy.sparse import csr_matrix
nltk.download('gutenberg')
nltk.download('punkt')
raw_text = gutenberg.raw('shakespeare-hamlet.txt')

try:
    sentences = nltk.sent_tokenize(raw_text.lower())
except LookupError:
    sentences = re.split(r'(?<=[.!?])\s+', raw_text.lower())

vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(sentences)

tfidf_transformer_nosmooth = TfidfTransformer(smooth_idf=False)
tfidf_nosmooth = tfidf_transformer_nosmooth.fit_transform(count_matrix)

tfidf_transformer_add1 = TfidfTransformer(smooth_idf=True)
tfidf_add1 = tfidf_transformer_add1.fit_transform(count_matrix)


k = 0.5
k_matrix = csr_matrix(np.full(count_matrix.shape, k))
count_matrix_addk = count_matrix + k_matrix

tfidf_transformer_addk = TfidfTransformer(smooth_idf=True)
tfidf_addk = tfidf_transformer_addk.fit_transform(count_matrix_addk)


index = 11
feature_names = vectorizer.get_feature_names_out()

print("Sentence at index", index, ":\n", sentences[index])

print("\nTF-IDF Scores for sentence index", index, "without smoothing:")
sentence_vector_nosmooth = tfidf_nosmooth[index].toarray()[0]
for word, score in zip(feature_names, sentence_vector_nosmooth):
    if score > 0:
        print(f"{word}: {score:.4f}")

print("\nTF-IDF Scores for sentence index", index, "with Add-1 smoothing:")
sentence_vector_add1 = tfidf_add1[index].toarray()[0]
for word, score in zip(feature_names, sentence_vector_add1):
    if score > 0:
        print(f"{word}: {score:.4f}")

print(f"\nTF-IDF Scores for sentence index {index} with Add-K smoothing (k={k}):")
sentence_vector_addk = tfidf_addk[index].toarray()[0]
for word, score in zip(feature_names, sentence_vector_addk):
    if score > 0:
        print(f"{word}: {score:.4f}")


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Sentence at index 11 :
 you come most carefully vpon your houre

   bar.

TF-IDF Scores for sentence index 11 without smoothing:
bar: 0.4260
carefully: 0.5476
come: 0.2644
houre: 0.4471
most: 0.2821
vpon: 0.3007
you: 0.1751
your: 0.2194

TF-IDF Scores for sentence index 11 with Add-1 smoothing:
bar: 0.4318
carefully: 0.5214
come: 0.2727
houre: 0.4504
most: 0.2908
vpon: 0.3097
you: 0.1809
your: 0.2265

TF-IDF Scores for sentence index 11 with Add-K smoothing (k=0.5):
1599: 0.0145
abhominably: 0.0145
abhorred: 0.0145
abilitie: 0.0145
aboord: 0.0145
aboue: 0.0145
about: 0.0145
abridgements: 0.0145
abroad: 0.0145
absent: 0.0145
absolute: 0.0145
abstinence: 0.0145
abstracts: 0.0145
absurd: 0.0145
abus: 0.0145
abuse: 0.0145
abuses: 0.0145
accent: 0.0145
accepts: 0.0145
accesse: 0.0145
accident: 0.0145
accidentall: 0.0145
accord: 0.0145
according: 0.0145
account: 0.0145
accounted: 0.0145
accurst: 0.0145
accuse: 0.0145
acquaint: 0.0145
acquire: 0.0145
acquittance: 0.0145
act: 0.0145
acte: 0.01

#CONCLUSION
In this experiment, we implemented Bag of Words (BoW) and TF-IDF for text vectorization. BoW provides a simple frequency-based representation but does not account for word importance, resulting in a sparse matrix. In contrast, TF-IDF assigns weights based on term relevance, reducing the impact of common words while highlighting distinctive terms. Our results showed that TF-IDF effectively differentiates between important and unimportant words, making it more suitable for applications like document retrieval and text classification. However, both methods ignore word order and meaning, limiting their ability to capture context. Overall, TF-IDF offers a more informative representation than BoW, though advanced models like word embeddings provide better contextual understanding.

Since TF-IDF encountered with zero probabilty I used Add-one and add0-k smoothning to fix it.