In [9]:
%pip install -q scikit-learn gensim

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

**Dataset**

In [11]:
# Toy corpus: three short, lowercase sentences with overlapping vocabulary.
# The same list feeds every vectorization example that follows.
documents = [
    "the cat is loafing on the mat",
    "the dog is sitting on the mat",
    "the cat and dog are friends"
]

**Count Occurrence (Bag of Words)**

In [12]:
# Learn the vocabulary from the corpus and count how often each term
# appears in each document (one row per document, one column per term).
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(documents)

# Column order of the matrix below follows this vocabulary array.
print("Vocabulary:", count_vectorizer.get_feature_names_out(), sep="\n")

# Densify only for display; the corpus is small enough to print in full.
print("\nBag of Words Matrix (Count Occurrence):", bow_matrix.toarray(), sep="\n")

Vocabulary:
['and' 'are' 'cat' 'dog' 'friends' 'is' 'loafing' 'mat' 'on' 'sitting'
 'the']

Bag of Words Matrix (Count Occurrence):
[[0 0 1 0 0 1 1 1 1 0 2]
 [0 0 0 1 0 1 0 1 1 1 2]
 [1 1 1 1 1 0 0 0 0 0 1]]


**Normalized Count Occurrence**

In [13]:
# Dense copy of the count matrix (rows = documents, columns = vocabulary terms).
bow_array = bow_matrix.toarray()

# Total token count per document, kept as a column vector so it broadcasts
# across every term column of the matching row.
row_totals = bow_array.sum(axis=1, keepdims=True)

# Relative term frequency: each count divided by its document's total, so every
# non-empty row sums to 1. A document with no in-vocabulary tokens has a total
# of 0; `where` skips the division for that row and leaves it as zeros instead
# of producing NaN values and a RuntimeWarning.
normalized_bow = np.divide(
    bow_array,
    row_totals,
    out=np.zeros(bow_array.shape, dtype=float),
    where=row_totals != 0,
)

print("\nNormalized Bag of Words:")
print(normalized_bow)


Normalized Bag of Words:
[[0.         0.         0.14285714 0.         0.         0.14285714
  0.14285714 0.14285714 0.14285714 0.         0.28571429]
 [0.         0.         0.         0.14285714 0.         0.14285714
  0.         0.14285714 0.14285714 0.14285714 0.28571429]
 [0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.
  0.         0.         0.         0.         0.16666667]]


**TF-IDF Vectorization**

In [14]:
# Fit a TF-IDF model on the corpus: term counts are reweighted so that terms
# appearing in many documents contribute less than rarer ones.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Column order of the matrix below follows this vocabulary array.
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out(), sep="\n")

# Densify only for display; the corpus is small enough to print in full.
print("\nTF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")

Vocabulary:
['and' 'are' 'cat' 'dog' 'friends' 'is' 'loafing' 'mat' 'on' 'sitting'
 'the']

TF-IDF Matrix:
[[0.         0.         0.35047243 0.         0.         0.35047243
  0.46082913 0.35047243 0.35047243 0.         0.54434622]
 [0.         0.         0.         0.35047243 0.         0.35047243
  0.         0.35047243 0.35047243 0.46082913 0.54434622]
 [0.4711101  0.4711101  0.35829137 0.35829137 0.4711101  0.
  0.         0.         0.         0.         0.27824521]]


**Prepare Data for Word2Vec**

In [15]:
# Word2Vec is trained on lists of tokens, so split every sentence on whitespace.
tokenized_docs = list(map(str.split, documents))
print(tokenized_docs)

[['the', 'cat', 'is', 'loafing', 'on', 'the', 'mat'], ['the', 'dog', 'is', 'sitting', 'on', 'the', 'mat'], ['the', 'cat', 'and', 'dog', 'are', 'friends']]


**Train Word2Vec Model**

In [16]:
# Train a small Word2Vec model on the tokenized corpus.
#   vector_size=50 : length of each word's embedding vector.
#   window=3       : maximum distance between a target word and its context words.
#   min_count=1    : keep every word, including those that occur only once.
#   seed=1         : gensim's default seed, stated explicitly for reproducibility.
#   workers=1      : single training thread. The gensim documentation notes that
#                    more than one worker introduces thread-scheduling jitter,
#                    so results can differ from run to run.
# NOTE(review): the gensim docs also mention PYTHONHASHSEED for reproducibility
# across interpreter launches -- confirm whether that is needed for the
# installed gensim version.
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=50,
    window=3,
    min_count=1,
    seed=1,
    workers=1,
)

**Generate Word Embeddings**

In [17]:
# Look up the learned vector for 'cat' once and reuse it for both printouts.
cat_vector = w2v_model.wv['cat']

print("Embedding for word 'cat':", cat_vector, sep="\n")

# The length equals the vector_size the model was trained with.
print("\nEmbedding vector size:", len(cat_vector))

Embedding for word 'cat':
[ 2.8740549e-03 -5.2920175e-03 -1.4147566e-02 -1.5610614e-02
 -1.8243574e-02 -1.1870339e-02 -3.6948491e-03 -8.6477427e-03
 -1.2921341e-02 -7.4346447e-03  8.5783172e-03 -7.4780867e-03
  1.6756350e-02  3.0679870e-03 -1.4484639e-02  1.8867597e-02
  1.5262425e-02  1.0986564e-02 -1.3697691e-02  1.1645358e-02
  8.0181863e-03  1.0370739e-02  8.5118031e-03  3.8795089e-03
 -6.3403249e-03  1.6707690e-02  1.9224361e-02  7.5852061e-03
 -5.6739901e-03  1.4255047e-05  2.4376370e-03 -1.6916649e-02
 -1.6447891e-02 -4.6203137e-04  2.4745751e-03 -1.1486761e-02
 -9.4505474e-03 -1.4692149e-02  1.6657231e-02  2.4259568e-04
 -9.0187974e-03  1.1403411e-02  1.8360030e-02 -8.1997439e-03
  1.5929364e-02  1.0750868e-02  1.1758246e-02  1.0251808e-03
  1.6426168e-02 -1.4038081e-02]

Embedding vector size: 50
