# NLP Lab: Bag-of-Words, TF-IDF, and Word2Vec

In [9]:


# Install the third-party packages this lab imports (NLTK, scikit-learn, gensim).
# NOTE(review): versions are unpinned, so a fresh install may pull releases that
# behave differently from the ones that produced the saved outputs - consider pinning.
%pip install nltk scikit-learn gensim


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [10]:

import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize

# word_tokenize (used in the Word2Vec section) needs the Punkt tokenizer data.
# Newer NLTK releases (3.9+) load it from the 'punkt_tab' package rather than the
# older pickled 'punkt' package, and raise LookupError if it is missing. Fetch
# both so the notebook runs regardless of which NLTK version pip installed.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shaik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Sample Dataset

In [11]:

# Toy corpus shared by every section below: three short English sentences.
documents = [
    "I love natural language processing",
    "Natural language processing is very interesting",
    "I love learning NLP concepts",
]

# Echo the corpus as a bulleted list, one document per line.
print("Documents:", *(f"- {text}" for text in documents), sep="\n")


Documents:
- I love natural language processing
- Natural language processing is very interesting
- I love learning NLP concepts


## Bag of Words – Count Occurrence

In [12]:

# Learn the vocabulary from the corpus first, then encode each document as a
# row of raw term counts (one column per vocabulary entry).
count_vectorizer = CountVectorizer().fit(documents)
bow_counts = count_vectorizer.transform(documents)

# Show the learned vocabulary next to the dense count matrix so the columns
# can be read off against the words they represent.
bow_vocabulary = count_vectorizer.get_feature_names_out()
print("Vocabulary:", bow_vocabulary)
print("BoW Count Matrix:\n", bow_counts.toarray())


Vocabulary: ['concepts' 'interesting' 'is' 'language' 'learning' 'love' 'natural'
 'nlp' 'processing' 'very']
BoW Count Matrix:
 [[0 0 0 1 0 1 1 0 1 0]
 [0 1 1 1 0 0 1 0 1 1]
 [1 0 0 0 1 1 0 1 0 0]]


## Bag of Words – Normalized Count Occurrence

In [13]:
# Normalized count occurrence: rescale each document's raw count row to unit
# L2 length, so the values reflect relative word usage rather than document size.
#
# Step 1: Count Occurrence
# Reuse `count_vectorizer` and `bow_counts` from the "Count Occurrence" cell above
# instead of refitting an identical CountVectorizer on the same corpus. They are
# printed again only so the raw and normalized matrices can be compared in one place.
# (`normalize` is imported in the notebook's import cell.)
print("Vocabulary:", count_vectorizer.get_feature_names_out())
print("BoW Count Matrix:\n", bow_counts.toarray())

# Step 2: Normalized Count Occurrence (L2 normalization)
# `normalize` returns a new matrix here; `bow_counts` itself is left as raw counts.
bow_normalized = normalize(bow_counts, norm='l2')

print("Normalized BoW Matrix:\n", bow_normalized.toarray())


Vocabulary: ['concepts' 'interesting' 'is' 'language' 'learning' 'love' 'natural'
 'nlp' 'processing' 'very']
BoW Count Matrix:
 [[0 0 0 1 0 1 1 0 1 0]
 [0 1 1 1 0 0 1 0 1 1]
 [1 0 0 0 1 1 0 1 0 0]]
Normalized BoW Matrix:
 [[0.         0.         0.         0.5        0.         0.5
  0.5        0.         0.5        0.        ]
 [0.         0.40824829 0.40824829 0.40824829 0.         0.
  0.40824829 0.         0.40824829 0.40824829]
 [0.5        0.         0.         0.         0.5        0.5
  0.         0.5        0.         0.        ]]


## TF-IDF

In [14]:

# Fit the TF-IDF weighting on the corpus, then encode each document as a row
# of TF-IDF scores over the learned vocabulary.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(documents)
tfidf_matrix = tfidf_vectorizer.transform(documents)

# Print the vocabulary with the dense matrix so each column can be matched to its term.
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
print("Vocabulary:", tfidf_vocabulary)
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())


Vocabulary: ['concepts' 'interesting' 'is' 'language' 'learning' 'love' 'natural'
 'nlp' 'processing' 'very']
TF-IDF Matrix:
 [[0.         0.         0.         0.5        0.         0.5
  0.5        0.         0.5        0.        ]
 [0.         0.45954803 0.45954803 0.34949812 0.         0.
  0.34949812 0.         0.34949812 0.45954803]
 [0.52863461 0.         0.         0.         0.52863461 0.40204024
  0.         0.52863461 0.         0.        ]]


## Word2Vec Embeddings

In [15]:

# Tokenize sentences: lower-case each document and split it into word tokens.
tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]

# Train a small Word2Vec model on the tokenized corpus.
# Training is stochastic, so the randomness is pinned explicitly:
#   - seed=1 is gensim's default seed, written out so the choice is visible;
#   - workers=1 because gensim documents that multi-threaded training introduces
#     ordering jitter that prevents fully reproducible vectors.
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for reproducibility
# across interpreter launches - confirm whether that matters for the installed version.
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=50,  # length of each embedding vector
    window=5,
    min_count=1,     # keep every token; each word in this tiny corpus is rare
    workers=1,
    seed=1,
)

# Look up one word's embedding once and report both the vector and its length.
word = "language"
embedding = w2v_model.wv[word]
print(f"Embedding vector for '{word}':")
print(embedding)
print("Vector size:", len(embedding))


Embedding vector for 'language':
[-0.01631583  0.0089916  -0.00827415  0.00164907  0.01699724 -0.00892435
  0.009035   -0.01357392 -0.00709698  0.01879702 -0.00315531  0.00064274
 -0.00828126 -0.01536538 -0.00301602  0.00493959 -0.00177605  0.01106732
 -0.00548595  0.00452013  0.01091159  0.01669191 -0.00290748 -0.01841629
  0.0087411   0.00114357  0.01488382 -0.00162657 -0.00527683 -0.01750602
 -0.00171311  0.00565313  0.01080286  0.01410531 -0.01140624  0.00371764
  0.01217773 -0.0095961  -0.00621452  0.01359526  0.00326295  0.00037983
  0.00694727  0.00043555  0.01923765  0.01012121 -0.01783478 -0.01408312
  0.00180291  0.01278507]
Vector size: 50
