## Word To Vector (word2vec):

- Continuous bag of words(cbow
- Skip gram

In [8]:
#Import Libraries
from gensim.models import Word2Vec

In [9]:
# Four sample tokenized sentences (documents)
tokenizedSentences = [
    ["natural", "language", "processing", "is", "fun"],
    ["gensim", "makes", "topic", "modeling", "easy"],
    ["we", "can", "train", "word2vec", "on", "our", "own", "corpus"],
    ["this", "example", "uses", "simple", "tokenized", "sentences"]
]

# Building CBOW Word2Vec model(sg parameter is set to 0)
cbowModel= Word2Vec(tokenizedSentences,vector_size=100,window=2,sg=1, min_count=1)

# Building skip-gram Word2Vec model
skipGramModel = Word2Vec(tokenizedSentences,vector_size=10,window=2,sg=1,min_count=1)

In [10]:

# Example of word for similarity comparison
targetWord = "natural"

# Get similar words using CBOW model
similarWordsCbow = cbowModel.wv.most_similar(targetWord, topn=2)
print("CBOW Similar Words:", similarWordsCbow)

# Get similar words using Skip-gram model
similarWordSkipgram = skipGramModel.wv.most_similar(targetWord, topn=5)
print("Skip-gram Similar Words:", similarWordSkipgram)


CBOW Similar Words: [('word2vec', 0.17132115364074707), ('uses', 0.1566610485315323)]
Skip-gram Similar Words: [('word2vec', 0.6313896775245667), ('tokenized', 0.41193631291389465), ('on', 0.35338476300239563), ('gensim', 0.3428654074668884), ('train', 0.2825911343097687)]


In [11]:

# Words to compare
word1 = "natural"
word2 = "language"

# Get vectors from CBOW model
vector1 = cbowModel.wv[word1]
vector2 = cbowModel.wv[word2]

# Method 1: Using gensim’s internal cosine similarity
cosine_similarity = cbowModel.wv.similarity(word1, word2)

# Method 2 (manual with numpy, for learning)
import numpy as np

manual_cosine = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

print(f"Cosine similarity between '{word1}' and '{word2}' (CBOW): {cosine_similarity:.4f}")
print(f"Manual cosine similarity: {manual_cosine:.4f}")


Cosine similarity between 'natural' and 'language' (CBOW): -0.2156
Manual cosine similarity: -0.2156


In [12]:
# Example: build a Dictionary and corpus
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

# Create a mapping (word -> id)
dictionary = Dictionary(tokenizedSentences)

# Convert each sentence into bag‑of‑words (list of (id, count))
corpus = [dictionary.doc2bow(text) for text in tokenizedSentences]

# (Optional) Fit a TF-IDF model on that corpus
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

print("Dictionary token-to-id mapping:")
print(dictionary.token2id)
print("\nBoW corpus:")
print(corpus)
print("\nTF-IDF corpus:")
print([list(doc) for doc in corpus_tfidf])

Dictionary token-to-id mapping:
{'fun': 0, 'is': 1, 'language': 2, 'natural': 3, 'processing': 4, 'easy': 5, 'gensim': 6, 'makes': 7, 'modeling': 8, 'topic': 9, 'can': 10, 'corpus': 11, 'on': 12, 'our': 13, 'own': 14, 'train': 15, 'we': 16, 'word2vec': 17, 'example': 18, 'sentences': 19, 'simple': 20, 'this': 21, 'tokenized': 22, 'uses': 23}

BoW corpus:
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)], [(18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)]]

TF-IDF corpus:
[[(0, 0.4472135954999579), (1, 0.4472135954999579), (2, 0.4472135954999579), (3, 0.4472135954999579), (4, 0.4472135954999579)], [(5, 0.4472135954999579), (6, 0.4472135954999579), (7, 0.4472135954999579), (8, 0.4472135954999579), (9, 0.4472135954999579)], [(10, 0.35355339059327373), (11, 0.35355339059327373), (12, 0.35355339059327373), (13, 0.35355339059327373), (14, 0.35355339059327373), (15, 0.35355339059327