In [1]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


# 1. Importing Required Modules

In [4]:
from gensim.models import Word2Vec

# 2. Defining Sample Sentences

In [5]:
# Toy training corpus: four short, lowercase, punctuation-free sentences.
sentences = [
    "machine learning is great",
    "natural language processing and machine learning are amazing",
    "word embeddings are useful for many natural language processing tasks",
    "deep learning models can outperform traditional machine learning algorithms",
]


# 3. Tokenizing Sentences


In [8]:
# Whitespace-tokenize every sentence; str.split() with no arguments splits on
# any run of whitespace, producing one list of word tokens per sentence.
tokenized_sentences = list(map(str.split, sentences))
print(tokenized_sentences)

[['machine', 'learning', 'is', 'great'], ['natural', 'language', 'processing', 'and', 'machine', 'learning', 'are', 'amazing'], ['word', 'embeddings', 'are', 'useful', 'for', 'many', 'natural', 'language', 'processing', 'tasks'], ['deep', 'learning', 'models', 'can', 'outperform', 'traditional', 'machine', 'learning', 'algorithms']]


# 4. Training CBOW Model 

In [9]:
# Train a CBOW Word2Vec model (sg=0) on the tokenized corpus.
cbow_model = Word2Vec(
    sentences=tokenized_sentences,
    sg=0,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

- min_count=1: Ignores all words with a total frequency lower than this value; with a value of 1, every word in the corpus is kept in the vocabulary.
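- workers=4: Sets the number of worker threads used to train the model in parallel, leveraging multicore machines for faster training. Note that with more than one worker thread, thread scheduling makes the resulting vectors not exactly reproducible from run to run; use workers=1 when fully deterministic results are needed.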
- sg=0: Specifies the training algorithm. sg=0 indicates CBOW (Continuous Bag of Words).
- vector_size=100: Sets the dimensionality of the word vectors to 100.
- window=5: Defines the maximum distance between the current and predicted word within a sentence (context window).

# 5. Training Skip-gram model 

In [11]:
# Train a Skip-gram Word2Vec model (sg=1) on the same tokenized corpus,
# with the same hyperparameters as the CBOW model.
skipgram_model = Word2Vec(
    sentences=tokenized_sentences,
    sg=1,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

- sg=1: Specifies the training algorithm. sg=1 indicates Skip-gram.

# 6. Saving the Models


In [15]:
# Persist both trained models to disk; the relative paths resolve against the
# current working directory.
cbow_model.save("cbow_word2vec.model")
skipgram_model.save("skipgram_word2vec.model")

# 7. Loading the Models

In [16]:
# Reload both models from the saved files, rebinding the existing names so the
# cells below operate on the deserialized models.
cbow_model = Word2Vec.load("cbow_word2vec.model")
skipgram_model = Word2Vec.load("skipgram_word2vec.model")

# 8. Using the CBOW model 

In [17]:
# Query the CBOW embeddings: the raw vector for 'machine', its five nearest
# neighbours, and its similarity to 'learning'.
cbow_vector = cbow_model.wv['machine']
cbow_similar_words = cbow_model.wv.most_similar('machine', topn=5)
cbow_similarity = cbow_model.wv.similarity('machine', 'learning')

# Report the results in the same order they were computed.
print("CBOW Vector for 'machine':", cbow_vector)
print("CBOW Words similar to 'machine':", cbow_similar_words)
print("CBOW Similarity between 'machine' and 'learning':", cbow_similarity)

CBOW Vector for 'machine': [-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.6379971e-03 -5.0889552e-

- cbow_model.wv['machine']: Retrieves the vector representation of the word 'machine' from the CBOW model.
- cbow_model.wv.most_similar('machine', topn=5): Finds the top 5 words most similar to 'machine' based on cosine similarity in the CBOW model.
- cbow_model.wv.similarity('machine', 'learning'): Computes the cosine similarity between the vectors of 'machine' and 'learning' in the CBOW model.

# 9. Using the Skip-gram Model

In [18]:
# Query the Skip-gram embeddings: the raw vector for 'machine', its five
# nearest neighbours, and its similarity to 'learning'.
skipgram_vector = skipgram_model.wv['machine']
skipgram_similar_words = skipgram_model.wv.most_similar('machine', topn=5)
skipgram_similarity = skipgram_model.wv.similarity('machine', 'learning')

# Report the results in the same order they were computed.
print("Skip-gram Vector for 'machine':", skipgram_vector)
print("Skip-gram Words similar to 'machine':", skipgram_similar_words)
print("Skip-gram Similarity between 'machine' and 'learning':", skipgram_similarity)


Skip-gram Vector for 'machine': [-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.6379971e-03 -5.0889