In [1]:
!pip install gensim




In [2]:
from gensim.models import Word2Vec

sentences = [
    ["king", "queen", "man", "woman"],
    ["paris", "france", "berlin", "germany"],
    ["cat", "dog", "animal"],
]

model = Word2Vec(sentences, vector_size=10, min_count=1, workers=2)

v = model.wv["king"]
print(v)
print("Vector length:", len(v))
print(model.wv.similarity("king", "queen"))


[-0.08619688  0.03665738  0.05189884  0.05741938  0.07466918 -0.06167675
  0.01105614  0.06047282 -0.0284005  -0.06173522]
Vector length: 10
-0.1525032


In [3]:
# Install NLTK
!pip install nltk



In [6]:
import nltk

nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

Now that we have NLTK and the corpus, let's prepare the data for Word2Vec training. We'll load a specific book from the corpus and split it into sentences.

In [7]:
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize, word_tokenize
import re

# Loading a book
alice = gutenberg.raw('carroll-alice.txt')

# Clean and tokenize the text into sentences and then words
sentences = []
for sentence in sent_tokenize(alice):
    cleaned_sentence = re.sub(r'[^a-zA-Z\s]', '', sentence.lower())
    words = word_tokenize(cleaned_sentence)
    if words:
        sentences.append(words)

print(f"Number of sentences: {len(sentences)}")
print("First sentence:", sentences[0])

Number of sentences: 1625
First sentence: ['alices', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', 'chapter', 'i']


In [8]:
from gensim.models import Word2Vec

model_nltk = Word2Vec(sentences, vector_size=50, window=5, min_count=5, workers=4)

print("\nWords similar to 'king':")
try:
    print(model_nltk.wv.most_similar("king"))
except KeyError:
    print("'king' not in vocabulary with min_count=5")

print("\nWords similar to 'queen':")
try:
     print(model_nltk.wv.most_similar("queen"))
except KeyError:
    print("'queen' not in vocabulary with min_count=5")

print("\nSimilarity between 'king' and 'queen':")
try:
    print(model_nltk.wv.similarity("king", "queen"))
except KeyError:
     print("One or both words not in vocabulary with min_count=5")


Words similar to 'king':
[('on', 0.9992731809616089), ('by', 0.999186635017395), ('him', 0.9991767406463623), ('an', 0.9991732239723206), ('its', 0.9991611242294312), ('it', 0.9991517663002014), ('the', 0.9991498589515686), ('was', 0.9991344809532166), ('voice', 0.9991249442100525), ('were', 0.999116063117981)]

Words similar to 'queen':
[('and', 0.9992676377296448), ('its', 0.9992329478263855), ('at', 0.9992161989212036), ('was', 0.9991904497146606), ('she', 0.9991714954376221), ('her', 0.9991247057914734), ('then', 0.9990798830986023), ('into', 0.9990713000297546), ('the', 0.9990606904029846), ('or', 0.9990500807762146)]

Similarity between 'king' and 'queen':
0.9985246
